123 lines
3.6 KiB
Python
123 lines
3.6 KiB
Python
from datetime import datetime
|
|
from flask import Flask, jsonify, request
|
|
from elasticsearch_dsl import Document, Date, Integer, Keyword, Text, FacetedSearch, TermsFacet, DateHistogramFacet
|
|
from elasticsearch_dsl.connections import connections
|
|
from tqdm import tqdm
|
|
import csv
|
|
import click
|
|
import re
|
|
|
|
class Question(Document):
|
|
title = Text(analyzer="snowball")
|
|
body = Text(analyzer="snowball")
|
|
category = Keyword()
|
|
date = Date()
|
|
|
|
class Index:
|
|
name = "goeievraag"
|
|
|
|
def save(self, **kwargs):
|
|
return super(Question, self).save(**kwargs)
|
|
|
|
@property
|
|
def url(self):
|
|
id_ = self.meta.id
|
|
if not self.category:
|
|
return f"https://www.startpagina.nl/v/vraag/{id_}"
|
|
|
|
category = self.category.lower()
|
|
category = re.sub("[^A-Za-z ]", "", category).replace(" ", "-")
|
|
return f"https://www.startpagina.nl/v/{category}/vraag/{id_}"
|
|
|
|
@property
|
|
def summary(self, length=128):
|
|
if len(self.body) > length:
|
|
return self.body[:length - 3] + "..."
|
|
|
|
return self.body
|
|
|
|
|
|
|
|
class QuestionSearch(FacetedSearch):
|
|
doc_types = Question,
|
|
fields = "category", "title", "body"
|
|
|
|
facets = {
|
|
"date_frequency": DateHistogramFacet(field="date", interval="month"),
|
|
"category": TermsFacet(field="category")
|
|
}
|
|
|
|
app = Flask(__name__)
|
|
|
|
connections.create_connection(hosts=["localhost"])
|
|
Question.init()
|
|
|
|
def round_sigfig(value, figures):
|
|
return float(format(value, f".{figures}g"))
|
|
|
|
@app.route("/api/")
|
|
def index():
|
|
query = request.args.get("q")
|
|
categories = request.args.get("categories", None)
|
|
|
|
facets = {}
|
|
if categories is not None:
|
|
category_list = categories.split(",")
|
|
facets["category"] = category_list
|
|
|
|
search = QuestionSearch(query, facets)
|
|
|
|
response = search.execute()
|
|
|
|
date_facets = [{"timestamp": date.timestamp(), "count": count}
|
|
for date, count, _ in response.facets.date_frequency]
|
|
category_facets = [{"category": category, "count": round_sigfig(count, 3)}
|
|
for category, count, _ in response.facets.category]
|
|
|
|
results = [{"id": hit.meta.id, "score": hit.meta.score, "title": hit.title,
|
|
"body": hit.summary, "category": hit.category,
|
|
"date": hit.date, "url": hit.url}
|
|
for hit in response]
|
|
|
|
return jsonify(
|
|
facets={"months": date_facets, "categories": category_facets},
|
|
results=results,
|
|
hits=round_sigfig(response.hits.total, 4),
|
|
took=response.took / 1000,
|
|
)
|
|
|
|
|
|
@app.cli.command()
|
|
@click.argument("questions")
|
|
@click.argument("categories")
|
|
def import_data(questions, categories):
|
|
categories_dict = {}
|
|
num_lines = sum(1 for line in open(categories))
|
|
with open(categories, newline="") as csv_file:
|
|
reader = csv.reader(csv_file)
|
|
for row in tqdm(reader, desc="Reading categories", total=num_lines):
|
|
id_ = int(row[0])
|
|
category = row[2]
|
|
|
|
categories_dict[id_] = category
|
|
|
|
num_lines = sum(1 for line in open(questions))
|
|
with open(questions, newline="") as csv_file:
|
|
reader = csv.reader(csv_file)
|
|
|
|
it = tqdm(reader, desc="Reading questions", total=num_lines)
|
|
for i, row in enumerate(it):
|
|
try:
|
|
id_ = int(row[0])
|
|
category_id = int(row[3])
|
|
|
|
question = Question(meta={"id": id_})
|
|
|
|
question.date = row[1]
|
|
question.category = categories_dict[category_id]
|
|
question.title = row[4]
|
|
question.body = "\n".join(row[5:])
|
|
|
|
question.save()
|
|
except (IndexError, ValueError):
|
|
continue
|