2018-zoekmachines-goeievraagle/backend/app.py

from flask import Flask, jsonify, request
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections
from elasticsearch.exceptions import NotFoundError
from tqdm import tqdm
from beeprint import pp
from datetime import datetime
import csv
import click
import requests

from .question import Question

# Flask application object; the route and CLI commands below register on it.
app = Flask(__name__)

# Register the default elasticsearch_dsl connection, pointing at localhost.
connections.create_connection(hosts=["localhost"])
# NOTE(review): presumably creates the Question index/mapping in Elasticsearch
# at import time — confirm against question.py.
Question.init()


_PAGE_SIZE = 10  # hits per page; equals Elasticsearch's default "size"


def _round_sigfig(value, figures):
    """Return *value* rounded to *figures* significant digits, as a float."""
    return float(format(value, f".{figures}g"))


def _build_search_body(query, categories, years, page):
    """Build the Elasticsearch request body for one page of results.

    Args:
        query: User query, passed verbatim to a ``query_string`` query.
        categories: Comma-separated category names to keep, or None.
        years: Comma-separated years to keep, or None.
        page: Zero-based page index.

    Returns:
        A dict suitable for ``Search.from_dict``.
    """
    body = {
        "from": page * _PAGE_SIZE,
        "size": _PAGE_SIZE,
        "query": {
            "bool": {
                "must": [
                    {"query_string": {"query": query}},
                    {"term": {"dead": False}},
                ]
            }
        },
        "aggregations": {
            "category": {
                "terms": {"field": "category"},
            },
            "date": {
                "date_histogram": {
                    "field": "date", "interval": "year",
                },
            },
            "chips": {
                "significant_terms": {
                    "field": "body",
                    "mutual_information": {"include_negatives": True},
                    "size": 40,
                },
            }
        },
    }

    # Facet selections are sent as post_filter, which Elasticsearch applies to
    # the hits after the aggregations have been computed, so the facet counts
    # keep describing the whole query.
    filters = []

    if categories is not None:
        filters.append({"terms": {"category": categories.split(",")}})

    if years is not None:
        # One range per requested year; "||/y" rounds both bounds to the year,
        # so each range covers that whole calendar year.
        filters.append({
            "bool": {
                "should": [
                    {
                        "range": {
                            "date": {
                                "gte": f"{year}||/y",
                                "lte": f"{year}||/y",
                                "format": "yyyy"
                            }
                        }
                    } for year in years.split(",") if year
                ]
            }
        })

    if filters:
        body["post_filter"] = {"bool": {"must": filters}}

    return body


@app.route("/api/")
def index():
    """Search questions and return hits, facets and chips as JSON.

    Query parameters:
        q: Query in Elasticsearch ``query_string`` syntax (required).
        categories: Optional comma-separated categories to keep.
        years: Optional comma-separated years to keep.
        page: Optional 1-based page number; missing, non-numeric or
            non-positive values fall back to the first page.

    Returns:
        A JSON response with ``facets``, ``chips``, ``results``, ``hits`` and
        ``took`` (seconds), or a 400 JSON error when ``q`` is missing.
    """
    from datetime import timezone

    query = request.args.get("q")
    if query is None:
        # Without this guard a null query is sent on to Elasticsearch.
        return jsonify(error="Missing required query parameter 'q'."), 400

    categories = request.args.get("categories", None)
    years = request.args.get("years", None)
    # type=int makes a non-numeric value fall back to the default; max() keeps
    # the "from" offset from going negative for page <= 0.
    page = max(request.args.get("page", 1, type=int), 1) - 1

    search = Search.from_dict(
        _build_search_body(query, categories, years, page)
    )
    response = search.execute()

    # Bucket keys are epoch milliseconds at UTC year boundaries; converting in
    # UTC keeps the year from shifting on servers west of UTC.
    date_facets = [
        {
            "key": datetime.fromtimestamp(
                bucket.key / 1000, tz=timezone.utc
            ).year,
            "count": bucket.doc_count,
        }
        for bucket in response.aggregations.date.buckets
    ]
    category_facets = [
        {"category": bucket.key, "count": _round_sigfig(bucket.doc_count, 3)}
        for bucket in response.aggregations.category.buckets
    ]

    chips = [{"key": bucket.key, "count": bucket.doc_count}
             for bucket in response.aggregations.chips.buckets]

    results = [
        {
            "id": hit.meta.id, "score": hit.meta.score, "title": hit.title,
            "body": Question.summary(hit), "category": hit.category,
            "date": hit.date, "url": Question.url(hit),
            # Hits stored without a "dead" field are reported as alive.
            "dead": getattr(hit, "dead", False),
        }
        for hit in response
    ]

    return jsonify(
        facets={"dates": date_facets, "categories": category_facets},
        chips=chips,
        results=results,
        hits=_round_sigfig(response.hits.total, 4),
        took=response.took / 1000,  # Elasticsearch reports milliseconds
    )


def _count_lines(path):
    """Return the number of physical lines in the file at *path*."""
    with open(path) as file:
        return sum(1 for _ in file)


def _load_categories(path):
    """Read the categories CSV and return a ``{category id: name}`` dict.

    Column 0 is used as the integer id and column 2 as the name.
    """
    # The line count is only a progress-bar total; rows with quoted newlines
    # span several lines, so it may overestimate the number of rows.
    total = _count_lines(path)
    with open(path, newline="") as csv_file:
        rows = tqdm(csv.reader(csv_file), desc="Reading categories",
                    total=total)
        return {int(row[0]): row[2] for row in rows}


def _import_questions(path, categories_dict):
    """Index every well-formed row of the questions CSV as a Question."""
    total = _count_lines(path)
    with open(path, newline="") as csv_file:
        rows = tqdm(csv.reader(csv_file), desc="Reading questions",
                    total=total)
        for row in rows:
            try:
                question = Question(meta={"id": int(row[0])})

                question.date = row[1]
                question.category = categories_dict[int(row[3])]
                question.title = row[4]
                # Any columns past the title are treated as body text.
                question.body = "\n".join(row[5:])

                # save() stays inside the try so a ValueError it raises skips
                # the row, as it always has.
                # NOTE(review): presumably this covers elasticsearch_dsl
                # validation errors such as bad dates — confirm.
                question.save()
            except (IndexError, ValueError, KeyError):
                # Best-effort import: skip short rows, non-numeric ids and
                # rows that reference an unknown category id.
                continue


def _import_answers(path):
    """Append each answer in the answers CSV to its stored Question."""
    with open(path, newline="") as csv_file:
        rows = tqdm(csv.reader(csv_file), desc="Reading answers")
        for row in rows:
            try:
                question = Question.get(id=int(row[3]))
                if question.answers is None:
                    question.answers = row[4]
                else:
                    question.answers += "\n\n" + row[4]
                question.save()
            except (IndexError, ValueError, NotFoundError):
                # Skip malformed rows and answers whose question is not
                # in the index.
                continue


@app.cli.command()
@click.argument("questions")
@click.argument("categories")
@click.argument("answers")
def import_data(questions, categories, answers):
    """Import questions and their answers from CSV files into Elasticsearch.

    Args:
        questions: Path to the questions CSV, or ``"skip"`` to skip it.
        categories: Path to the categories CSV (always read).
        answers: Path to the answers CSV, or ``"skip"`` to skip it.
    """
    categories_dict = _load_categories(categories)

    if questions != "skip":
        _import_questions(questions, categories_dict)

    if answers != "skip":
        _import_answers(answers)


@app.cli.command()
def cleanup_database():
    """Probe each unclassified question's URL and store whether it is dead.

    Questions that already have ``dead`` set or ``error`` flagged are skipped.
    For the rest a HEAD request is sent to ``question.url()``:

    * 404 marks the question dead,
    * 302 marks it alive,
    * 500 flags it as an error,
    * any other status, or a failed request, leaves it untouched so a later
      run can retry it.

    One progress character is printed per question (``_`` skipped, ``.`` dead,
    ``#`` alive, ``!`` error, ``?`` request failed), followed by a summary.
    """
    request_timeout = 10  # seconds; without one a stalled host blocks the scan
    dead_count = 0
    alive_count = 0

    for question in Question.search().scan():
        if question.dead is not None or question.error:
            # Classified on an earlier run: tally by the stored state.
            print(end="_")
            if question.dead:
                dead_count += 1
            elif question.dead is not None:
                alive_count += 1
            continue

        try:
            # requests.head does not follow redirects by default, which is
            # what lets the 302 check below see the redirect itself.
            response = requests.head(question.url(), timeout=request_timeout)
        except requests.RequestException:
            # A network failure says nothing about the question; retry later.
            print(end="?")
            continue

        status = response.status_code
        if status == 404:
            question.dead = True
            dead_count += 1
            marker = "."
        elif status == 302:
            question.dead = False
            alive_count += 1
            marker = "#"
        elif status == 500:
            question.error = True
            marker = "!"
        else:
            continue

        # Single write per classified question.
        question.save()
        print(end=marker)

    print(f"\n{dead_count} dead, {alive_count} alive")
Initial commit 2018-10-19 07:25:54 +00:00			`from flask import Flask, jsonify, request`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`from elasticsearch_dsl import Search`
Initial commit 2018-10-19 07:25:54 +00:00			`from elasticsearch_dsl.connections import connections`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`from elasticsearch.exceptions import NotFoundError`
Initial commit 2018-10-19 07:25:54 +00:00			`from tqdm import tqdm`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`from beeprint import pp`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`from datetime import datetime`
Initial commit 2018-10-19 07:25:54 +00:00			`import csv`
			`import click`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`import requests`
Initial commit 2018-10-19 07:25:54 +00:00
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`from .question import Question`
Initial commit 2018-10-19 07:25:54 +00:00
			`app = Flask(__name__)`

			`connections.create_connection(hosts=["localhost"])`
			`Question.init()`


			`@app.route("/api/")`
			`def index():`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`def round_sigfig(value, figures):`
			`return float(format(value, f".{figures}g"))`

Initial commit 2018-10-19 07:25:54 +00:00			`query = request.args.get("q")`
			`categories = request.args.get("categories", None)`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`years = request.args.get("years", None)`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`page = int(request.args.get("page", 1)) - 1`
Initial commit 2018-10-19 07:25:54 +00:00
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`search_dict = {`
			`"from": page * 10,`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`"query": {`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`"bool": {`
			`"must": [`
			`{"query_string": {"query": query}},`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`{"term": {"dead": False}},`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`]`
			`}`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`},`
			`"aggregations": {`
			`"category": {`
			`"terms": {"field": "category"},`
			`},`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`"date": {`
			`"date_histogram": {`
			`"field": "date", "interval": "year",`
			`},`
			`},`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`"chips": {`
Add chips to improve precision 2018-10-23 11:16:23 +00:00			`"significant_terms": {`
			`"field": "body",`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`"mutual_information": {"include_negatives": True},`
Add chips to improve precision 2018-10-23 11:16:23 +00:00			`"size": 40,`
			`},`
			`}`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`},`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`}`
Initial commit 2018-10-19 07:25:54 +00:00
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`if categories is not None or years is not None:`
			`search_dict["post_filter"] = {"bool": {"must": []}}`

Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`if categories is not None:`
			`category_list = categories.split(",")`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`search_dict["post_filter"]["bool"]["must"].append({`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`"terms": {"category": category_list},`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`})`

			`if years is not None:`
			`year_list = years.split(",")`
			`search_dict["post_filter"]["bool"]["must"].append({`
			`"bool": {`
			`"should": [`
			`{`
			`"range": {`
			`"date": {`
			`"gte": f"{year}\|\|/y",`
			`"lte": f"{year}\|\|/y",`
			`"format": "yyyy"`
			`}`
			`}`
			`} for year in year_list if year`
			`]`
			`}`
			`})`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00
			`search = Search.from_dict(search_dict)`
Initial commit 2018-10-19 07:25:54 +00:00			`response = search.execute()`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`pp(response.to_dict())`
Initial commit 2018-10-19 07:25:54 +00:00
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`date_facets = [{"key": datetime.fromtimestamp(bucket.key / 1000).year,`
			`"count": bucket.doc_count}`
			`for bucket in response.aggregations.date.buckets]`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`category_facets = [`
			`{"category": bucket.key, "count": round_sigfig(bucket.doc_count, 3)}`
			`for bucket in response.aggregations.category.buckets`
			`]`

Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`chips = [{"key": bucket.key, "count": bucket.doc_count}`
			`for bucket in response.aggregations.chips.buckets]`
Add chips to improve precision 2018-10-23 11:16:23 +00:00
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`results = []`
			`for hit in response:`
			`summary = Question.summary(hit)`
			`url = Question.url(hit)`

Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`try:`
			`dead = hit.dead`
			`except AttributeError:`
			`dead = False`

Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`results.append({`
Add chips to improve precision 2018-10-23 11:16:23 +00:00			`"id": hit.meta.id, "score": hit.meta.score, "title": hit.title,`
			`"body": summary, "category": hit.category, "date": hit.date,`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`"url": url, "dead": dead,`
Slowly remove DSL and split frontend into components 2018-10-19 18:23:52 +00:00			`})`
Initial commit 2018-10-19 07:25:54 +00:00
			`return jsonify(`
Finishing touches before presentation 2018-10-26 13:15:49 +00:00			`facets={"dates": date_facets, "categories": category_facets},`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`chips=chips,`
Initial commit 2018-10-19 07:25:54 +00:00			`results=results,`
			`hits=round_sigfig(response.hits.total, 4),`
			`took=response.took / 1000,`
			`)`


			`@app.cli.command()`
			`@click.argument("questions")`
			`@click.argument("categories")`
Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`@click.argument("answers")`
			`def import_data(questions, categories, answers):`
Initial commit 2018-10-19 07:25:54 +00:00			`categories_dict = {}`
			`num_lines = sum(1 for line in open(categories))`
			`with open(categories, newline="") as csv_file:`
			`reader = csv.reader(csv_file)`
			`for row in tqdm(reader, desc="Reading categories", total=num_lines):`
			`id_ = int(row[0])`
			`category = row[2]`

			`categories_dict[id_] = category`

Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`if questions != "skip":`
			`num_lines = sum(1 for line in open(questions))`
			`with open(questions, newline="") as csv_file:`
			`reader = csv.reader(csv_file)`

			`it = tqdm(reader, desc="Reading questions", total=num_lines)`
			`for i, row in enumerate(it):`
			`try:`
			`id_ = int(row[0])`
			`category_id = int(row[3])`

			`question = Question(meta={"id": id_})`

			`question.date = row[1]`
			`question.category = categories_dict[category_id]`
			`question.title = row[4]`
			`question.body = "\n".join(row[5:])`

			`question.save()`
			`except (IndexError, ValueError):`
			`continue`

			`if answers != "skip":`
			`with open(answers, newline="") as csv_file:`
			`reader = csv.reader(csv_file)`

			`it = tqdm(reader, desc="Reading answers")`
			`for i, row in enumerate(it):`
			`try:`
			`question_id = int(row[3])`
			`question = Question.get(id=question_id)`
			`if question.answers is None:`
			`question.answers = row[4]`
			`else:`
			`question.answers += "\n\n" + row[4]`
			`question.save()`
			`except (IndexError, ValueError, NotFoundError):`
			`continue`
Initial commit 2018-10-19 07:25:54 +00:00

Add pagination and some other stuff 2018-10-26 09:10:46 +00:00			`@app.cli.command()`
			`def cleanup_database():`
			`dead_count = 0`
			`alive_count = 0`

			`for question in Question.search().scan():`
			`if question.dead is not None or question.error:`
			`print(end="_")`
			`dead_count += 1`
			`continue`

			`url = question.url()`
			`response = requests.head(url)`

			`if response.status_code == 404:`
			`dead_count += 1`
			`question.dead = True`
			`question.save()`
			`print(end=".")`
			`elif response.status_code == 302:`
			`alive_count += 1`
			`question.dead = False`
			`print(end="#")`
			`elif response.status_code == 500:`
			`question.error = True`
			`print(end="!")`
			`else:`
			`continue`

			`question.save()`