from flask import Flask, jsonify, request
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections
from elasticsearch.exceptions import NotFoundError
from tqdm import tqdm
from beeprint import pp
from datetime import datetime
import csv
import click
import requests

from .question import Question

app = Flask(__name__)

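# Open a global Elasticsearch connection and create the Question index
# and mapping if they do not already exist.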
connections.create_connection(hosts=["localhost"])
Question.init()
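

# GET /api/ -- full-text search endpoint. Example request (parameter
# values are illustrative): /api/?q=flask&categories=Tech&years=2013,2014&page=2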
@app.route("/api/")
|
|
def index():
|
|
def round_sigfig(value, figures):
|
|
return float(format(value, f".{figures}g"))
|
|

    # Request parameters; `page` is converted to a zero-based index.
    # Defaulting q to "*" (match-all) keeps a missing ?q= from sending
    # a null query_string to Elasticsearch.
    query = request.args.get("q", "*")
    categories = request.args.get("categories", None)
    years = request.args.get("years", None)
    page = int(request.args.get("page", 1)) - 1
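
    # Base query plus the aggregations that power the category and date
    # facets and the significant-terms "chips".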
    search_dict = {
        "from": page * 10,
        "query": {
            "bool": {
                "must": [
                    {"query_string": {"query": query}},
                    {"term": {"dead": False}},
                ]
            }
        },
        "aggregations": {
            "category": {
                "terms": {"field": "category"},
            },
            "date": {
                "date_histogram": {
                    "field": "date", "interval": "year",
                },
            },
            "chips": {
                "significant_terms": {
                    "field": "body",
                    "mutual_information": {"include_negatives": True},
                    "size": 40,
                },
            },
        },
    }

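    # Facet selections arrive as comma-separated lists and are applied
    # as a post_filter so the facet counts reflect the unfiltered query.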
    if categories is not None or years is not None:
        search_dict["post_filter"] = {"bool": {"must": []}}

    if categories is not None:
        category_list = categories.split(",")
        search_dict["post_filter"]["bool"]["must"].append({
            "terms": {"category": category_list},
        })

    if years is not None:
        year_list = years.split(",")
        search_dict["post_filter"]["bool"]["must"].append({
            "bool": {
                "should": [
                    {
                        "range": {
                            "date": {
                                # Date math: gte rounds ||/y down to
                                # Jan 1, lte rounds it up to Dec 31,
                                # so each clause spans a whole year.
                                "gte": f"{year}||/y",
                                "lte": f"{year}||/y",
                                "format": "yyyy",
                            }
                        }
                    } for year in year_list if year
                ]
            }
        })

    search = Search.from_dict(search_dict)
    response = search.execute()
    pp(response.to_dict())  # debug: pretty-print the raw ES response

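    # Flatten aggregation buckets into the facet lists the frontend
    # renders; date_histogram bucket keys are epoch milliseconds.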
date_facets = [{"key": datetime.fromtimestamp(bucket.key / 1000).year,
|
|
"count": bucket.doc_count}
|
|
for bucket in response.aggregations.date.buckets]
|
|
category_facets = [
|
|
{"category": bucket.key, "count": round_sigfig(bucket.doc_count, 3)}
|
|
for bucket in response.aggregations.category.buckets
|
|
]
|
|
|
|
chips = [{"key": bucket.key, "count": bucket.doc_count}
|
|
for bucket in response.aggregations.chips.buckets]
|
|
|
|
    results = []
    for hit in response:
        summary = Question.summary(hit)
        url = Question.url(hit)

        # Some documents predate the `dead` field.
        try:
            dead = hit.dead
        except AttributeError:
            dead = False

        results.append({
            "id": hit.meta.id, "score": hit.meta.score, "title": hit.title,
            "body": summary, "category": hit.category, "date": hit.date,
            "url": url, "dead": dead,
        })

    return jsonify(
        facets={"dates": date_facets, "categories": category_facets},
        chips=chips,
        results=results,
        hits=round_sigfig(response.hits.total, 4),
        took=response.took / 1000,  # Elasticsearch reports took in ms
    )
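

# CLI command: import questions, categories, and answers from CSV files.
# Passing the literal string "skip" for questions or answers skips that
# file; categories are always loaded because question rows reference them.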
@app.cli.command()
@click.argument("questions")
@click.argument("categories")
@click.argument("answers")
def import_data(questions, categories, answers):
    # Build an id -> name lookup for categories first.
    categories_dict = {}
    with open(categories) as f:
        num_lines = sum(1 for line in f)
    with open(categories, newline="") as csv_file:
        reader = csv.reader(csv_file)
        for row in tqdm(reader, desc="Reading categories", total=num_lines):
            id_ = int(row[0])
            category = row[2]

            categories_dict[id_] = category

if questions != "skip":
|
|
num_lines = sum(1 for line in open(questions))
|
|
with open(questions, newline="") as csv_file:
|
|
reader = csv.reader(csv_file)
|
|
|
|
it = tqdm(reader, desc="Reading questions", total=num_lines)
|
|
for i, row in enumerate(it):
|
|
try:
|
|
id_ = int(row[0])
|
|
category_id = int(row[3])
|
|
|
|
question = Question(meta={"id": id_})
|
|
|
|
question.date = row[1]
|
|
question.category = categories_dict[category_id]
|
|
question.title = row[4]
|
|
question.body = "\n".join(row[5:])
|
|
|
|
question.save()
|
|
except (IndexError, ValueError):
|
|
continue
|
|
|
|
if answers != "skip":
|
|
with open(answers, newline="") as csv_file:
|
|
reader = csv.reader(csv_file)
|
|
|
|
it = tqdm(reader, desc="Reading answers")
|
|
for i, row in enumerate(it):
|
|
try:
|
|
question_id = int(row[3])
|
|
question = Question.get(id=question_id)
|
|
if question.answers is None:
|
|
question.answers = row[4]
|
|
else:
|
|
question.answers += "\n\n" + row[4]
|
|
question.save()
|
|
except (IndexError, ValueError, NotFoundError):
|
|
continue
|
|
|
|
|
|
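

# CLI command: probe each question's source URL with a HEAD request and
# flag documents whose pages have disappeared. Progress markers: "_"
# already checked, "." newly dead (404), "#" alive (302), "!" error (500).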
@app.cli.command()
def cleanup_database():
    dead_count = 0
    alive_count = 0

    for question in Question.search().scan():
        # Skip questions that were already checked or previously errored.
        if question.dead is not None or question.error:
            print(end="_", flush=True)
            dead_count += 1
            continue

        url = question.url()
        response = requests.head(url)

        if response.status_code == 404:
            dead_count += 1
            question.dead = True
            print(end=".", flush=True)
        elif response.status_code == 302:
            alive_count += 1
            question.dead = False
            print(end="#", flush=True)
        elif response.status_code == 500:
            question.error = True
            print(end="!", flush=True)
        else:
            # Unexpected status code: leave the document untouched.
            continue

        question.save()

    print(f"\nDead: {dead_count}, alive: {alive_count}")