2018-zoekmachines-goeievraagle/backend/app.py

211 lines
6.2 KiB
Python
Raw Normal View History

2018-10-19 07:25:54 +00:00
from flask import Flask, jsonify, request
from elasticsearch_dsl import Search
2018-10-19 07:25:54 +00:00
from elasticsearch_dsl.connections import connections
2018-10-26 09:10:46 +00:00
from elasticsearch.exceptions import NotFoundError
2018-10-19 07:25:54 +00:00
from tqdm import tqdm
from beeprint import pp
2018-10-26 13:15:49 +00:00
from datetime import datetime
2018-10-19 07:25:54 +00:00
import csv
import click
2018-10-26 09:10:46 +00:00
import requests
2018-10-19 07:25:54 +00:00
from .question import Question
2018-10-19 07:25:54 +00:00
# Flask application object; the route and CLI commands below register on it.
app = Flask(__name__)
# Register the default elasticsearch_dsl connection against a local node.
connections.create_connection(hosts=["localhost"])
# NOTE(review): presumably creates the index/mapping for Question using the
# connection registered above, so it has to stay after it — confirm in
# the .question module.
Question.init()
@app.route("/api/")
def index():
    """Search questions and return results, facets and chips as JSON.

    Query parameters:
        q: Elasticsearch ``query_string`` expression (required).
        categories: optional comma-separated category names to filter on.
        years: optional comma-separated four-digit years to filter on.
        page: optional 1-based page number; invalid or non-positive values
            fall back to the first page.

    The category/year filters are sent as a ``post_filter`` so the
    aggregations (facets and chips) are still computed over the whole query.

    Returns:
        A JSON response with ``facets``, ``chips``, ``results``, ``hits``
        and ``took``; or a 400 JSON response with an ``error`` message when
        ``q`` is missing.
    """
    # Function-scope import: only ``datetime`` is imported at file level.
    from datetime import timezone

    def round_sigfig(value, figures):
        """Round ``value`` to ``figures`` significant digits."""
        return float(format(value, f".{figures}g"))

    query = request.args.get("q")
    if query is None:
        # Without this guard ``None`` ends up inside the query_string clause
        # and the request fails inside Elasticsearch instead of cleanly here.
        return jsonify(error="Missing required parameter: q"), 400

    categories = request.args.get("categories", None)
    years = request.args.get("years", None)
    # ``type=int`` makes werkzeug fall back to the default on bad input;
    # clamping keeps the "from" offset non-negative.
    page = max(request.args.get("page", 1, type=int) - 1, 0)

    search_dict = {
        "from": page * 10,
        "query": {
            "bool": {
                "must": [
                    {"query_string": {"query": query}},
                    {"term": {"dead": False}},
                ]
            }
        },
        "aggregations": {
            "category": {
                "terms": {"field": "category"},
            },
            "date": {
                "date_histogram": {
                    "field": "date", "interval": "year",
                },
            },
            "chips": {
                "significant_terms": {
                    "field": "body",
                    "mutual_information": {"include_negatives": True},
                    "size": 40,
                },
            }
        },
    }

    # Drop empty / malformed entries so that e.g. "categories=" or
    # "years=abc" means "no filter" rather than a match-nothing filter or an
    # Elasticsearch date-parsing error.
    category_list = [
        name for name in (categories or "").split(",") if name
    ]
    year_list = [
        year.strip() for year in (years or "").split(",")
        if year.strip().isdigit() and len(year.strip()) == 4
    ]

    post_filters = []
    if category_list:
        post_filters.append({"terms": {"category": category_list}})
    if year_list:
        # One range per requested year, OR-ed together; "||/y" rounds the
        # bounds to the start/end of that year.
        post_filters.append({
            "bool": {
                "should": [
                    {
                        "range": {
                            "date": {
                                "gte": f"{year}||/y",
                                "lte": f"{year}||/y",
                                "format": "yyyy"
                            }
                        }
                    } for year in year_list
                ]
            }
        })
    if post_filters:
        search_dict["post_filter"] = {"bool": {"must": post_filters}}

    search = Search.from_dict(search_dict)
    response = search.execute()

    # Bucket keys are epoch milliseconds in UTC. Converting in the server's
    # local timezone would label every bucket with the previous year on any
    # host west of UTC, so convert explicitly in UTC.
    date_facets = [
        {
            "key": datetime.fromtimestamp(
                bucket.key / 1000, tz=timezone.utc
            ).year,
            "count": bucket.doc_count,
        }
        for bucket in response.aggregations.date.buckets
    ]
    category_facets = [
        {"category": bucket.key, "count": round_sigfig(bucket.doc_count, 3)}
        for bucket in response.aggregations.category.buckets
    ]
    chips = [{"key": bucket.key, "count": bucket.doc_count}
             for bucket in response.aggregations.chips.buckets]

    results = []
    for hit in response:
        summary = Question.summary(hit)
        url = Question.url(hit)
        results.append({
            "id": hit.meta.id, "score": hit.meta.score, "title": hit.title,
            "body": summary, "category": hit.category, "date": hit.date,
            # Hits without a "dead" attribute are reported as alive.
            "url": url, "dead": getattr(hit, "dead", False),
        })

    return jsonify(
        facets={"dates": date_facets, "categories": category_facets},
        chips=chips,
        results=results,
        hits=round_sigfig(response.hits.total, 4),
        took=response.took / 1000,
    )
def _count_lines(path):
    """Return the number of lines in the file at ``path``, closing it after."""
    with open(path) as handle:
        return sum(1 for _ in handle)


@app.cli.command()
@click.argument("questions")
@click.argument("categories")
@click.argument("answers")
def import_data(questions, categories, answers):
    """Import questions and answers from CSV files into the Question index.

    QUESTIONS, CATEGORIES and ANSWERS are paths to CSV files. Passing the
    literal string "skip" for QUESTIONS or ANSWERS skips that step; the
    categories file is always read.

    Malformed question/answer rows are skipped. A malformed row in the
    categories file raises (IndexError / ValueError) and aborts the import.
    """
    # Map category id (column 0) -> category name (column 2).
    categories_dict = {}
    num_lines = _count_lines(categories)
    with open(categories, newline="") as csv_file:
        reader = csv.reader(csv_file)
        for row in tqdm(reader, desc="Reading categories", total=num_lines):
            id_ = int(row[0])
            category = row[2]
            categories_dict[id_] = category

    if questions != "skip":
        # Physical line count; only used as the progress-bar total.
        num_lines = _count_lines(questions)
        with open(questions, newline="") as csv_file:
            reader = csv.reader(csv_file)
            for row in tqdm(reader, desc="Reading questions",
                            total=num_lines):
                try:
                    id_ = int(row[0])
                    category_id = int(row[3])
                    question = Question(meta={"id": id_})
                    question.date = row[1]
                    question.category = categories_dict[category_id]
                    question.title = row[4]
                    # Everything from column 5 onwards is joined into the body.
                    question.body = "\n".join(row[5:])
                    question.save()
                except (IndexError, ValueError, KeyError):
                    # KeyError: the row references an unknown category id.
                    # Skip it like any other malformed row instead of
                    # aborting the whole import.
                    continue

    if answers != "skip":
        with open(answers, newline="") as csv_file:
            reader = csv.reader(csv_file)
            for row in tqdm(reader, desc="Reading answers"):
                try:
                    question_id = int(row[3])
                    question = Question.get(id=question_id)
                    # Answers for the same question are appended, separated
                    # by a blank line.
                    if question.answers is None:
                        question.answers = row[4]
                    else:
                        question.answers += "\n\n" + row[4]
                    question.save()
                except (IndexError, ValueError, NotFoundError):
                    # Malformed row, or an answer to a question that was
                    # never imported.
                    continue
2018-10-19 07:25:54 +00:00
2018-10-26 09:10:46 +00:00
@app.cli.command()
def cleanup_database():
    """Probe the URL of every unchecked question and record the outcome.

    Questions whose ``dead`` flag is already set, or that are flagged with
    ``error``, are skipped. For the rest a HEAD request is sent and the
    status code decides what is stored:

        404 -> dead = True      (progress marker ".")
        302 -> dead = False     (progress marker "#")
        500 -> error = True     (progress marker "!")

    Any other status leaves the question untouched. Skipped questions print
    "_" and requests that fail or time out print "?".
    """
    # Upper bound per request so one unresponsive host cannot stall the scan.
    timeout_seconds = 10

    for question in Question.search().scan():
        if question.dead is not None or question.error:
            print(end="_", flush=True)
            continue

        try:
            response = requests.head(question.url(), timeout=timeout_seconds)
        except requests.RequestException:
            # Network failure or timeout: leave the question unclassified so
            # a later run retries it, instead of aborting the whole scan.
            print(end="?", flush=True)
            continue

        status = response.status_code
        if status == 404:
            question.dead = True
            marker = "."
        elif status == 302:
            question.dead = False
            marker = "#"
        elif status == 500:
            question.error = True
            marker = "!"
        else:
            continue

        # Single save per question (the 404 path previously saved twice).
        question.save()
        # Flush so the marker shows up while the scan is still running.
        print(end=marker, flush=True)