25 Commits

Author SHA1 Message Date
schihei bc3491d8f8 Added software engineering to search string 2024-07-29 10:55:34 +02:00
Heiko Joerg Schick 2a3172850e Minor changes in category handling 2023-09-12 14:56:12 +02:00
Heiko Joerg Schick d79b0b5b5e Minor formatting changes in e-mail layout 2023-09-12 09:32:59 +02:00
Heiko Joerg Schick 02ee89dcc5 Removed warning by explicitly setting dual to True. 2023-09-11 20:31:18 +02:00
Heiko Joerg Schick 4c5fbe4786 Using SMTP and Docker environment variables for configuration 2023-09-08 09:42:52 +02:00
Heiko Joerg Schick 696d81f49a Disabled password and sudo for appuser user 2023-09-08 09:34:51 +02:00
Heiko Joerg Schick 695c8d1c22 Added function to filter by primary category 2023-09-07 22:38:17 +02:00
Heiko Joerg Schick 514c2a929d Minor changes in pagination 2023-09-07 20:30:37 +02:00
Heiko Joerg Schick 169db54df2 Added query string as configurable (Docker) environment variable 2023-09-07 16:02:09 +02:00
Heiko Joerg Schick 6a4b1176c6 Modified volume paths 2023-09-06 11:37:53 +02:00
Heiko Joerg Schick f58cbcc98c Opening arxiv link in a new frame rather the same window 2023-09-06 11:36:25 +02:00
Heiko Joerg Schick 9d3ad6a896 Increased query number back to 2000. 2023-09-01 01:51:53 +02:00
Heiko Joerg Schick d1c485240c Open PDF links in new tab 2023-08-31 14:24:26 +02:00
schihei 781e9099cb Merge branch 'feature/pagination' into 'wip/h3132'
Feature/pagination

See merge request schihei/arxiv-sanity-lite!3
2023-08-31 12:00:56 +00:00
Heiko Joerg Schick b17e98b3f4 Improved pagination 2023-08-31 13:59:44 +02:00
schihei dd37f6689a Merge branch 'feature/PDF-link' into 'wip/h3132'
Feature/pdf link

See merge request schihei/arxiv-sanity-lite!2
2023-08-31 11:51:21 +00:00
Heiko Joerg Schick e324bb91b1 Prevent thumbnail from increasing width (useful on mobile) 2023-08-31 13:48:41 +02:00
Heiko Joerg Schick 8dc5f4ef0c Added paper link to thumbnail 2023-08-31 13:34:53 +02:00
schihei 2e85c9075c Merge branch 'feature/Docker' into 'wip/h3132'
Feature/docker

See merge request schihei/arxiv-sanity-lite!1
2023-08-30 17:29:57 +00:00
Heiko Joerg Schick 6eb4cfc56b Enlarged query string for arxiv. 2023-08-30 19:08:10 +02:00
Heiko Joerg Schick f181ae609a Removed unused files 2023-08-30 14:17:11 +02:00
Heiko Joerg Schick 381b4ba7ff Minor changes in Docker container 2023-08-30 14:16:25 +02:00
Heiko Joerg Schick bee3df79f4 Minor changes in Docker container 2023-08-30 14:15:14 +02:00
Heiko Joerg Schick 1aff234cf6 Added initial Docker files 2023-08-30 04:09:41 +02:00
Heiko Joerg Schick aeb7ecf96a Change DB access flag to create/write to DB 2023-08-29 23:23:44 +02:00
12 changed files with 276 additions and 44 deletions
+34
View File
@@ -0,0 +1,34 @@
# Include any files or directories that you don't want to be copied to your
# container here (e.g., local build artifacts, temporary files, etc.).
#
# For more help, visit the .dockerignore file reference guide at
# https://docs.docker.com/engine/reference/builder/#dockerignore-file
**/.DS_Store
**/__pycache__
**/.venv
**/.classpath
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/*.*proj.user
**/*.dbmdl
**/*.jfm
**/bin
**/charts
**/docker-compose*
**/compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
LICENSE
README.md
+82
View File
@@ -0,0 +1,82 @@
# syntax=docker/dockerfile:1
# Comments are provided throughout this file to help you get started.
# If you need more help, visit the Dockerfile reference guide at
# https://docs.docker.com/engine/reference/builder/
# Python version of the base image; override at build time with --build-arg.
ARG PYTHON_VERSION=3.10.1
FROM python:${PYTHON_VERSION}-slim as base
# Prevents Python from writing pyc files.
ENV PYTHONDONTWRITEBYTECODE=1
# Keeps Python from buffering stdout and stderr to avoid situations where
# the application crashes without emitting any logs due to buffering.
ENV PYTHONUNBUFFERED=1
# Application configuration, declared empty here. Real values are expected to
# be supplied when the container is started (e.g. via docker compose).
# ARXIV_QUERY is the arXiv search string; the SMTP_* variables configure the
# mail server used for sending recommendation e-mails.
ENV ARXIV_QUERY=""
ENV SMTP_ADDRESS=""
ENV SMTP_PORT=""
ENV SMTP_USER_NAME=""
ENV SMTP_PASSWORD=""
# Install required binary packages.
RUN apt-get update && apt-get install -y \
imagemagick \
&& rm -rf /var/lib/apt/lists/*
# Add PDF processing to the ImageMagick policy: the stock policy.xml sets
# rights="none" for the PDF coder; rewrite that entry to rights="read|write".
RUN sed -i 's/<policy domain="coder" rights="none" pattern="PDF" \/>/<policy domain="coder" rights="read|write" pattern="PDF" \/>/g' /etc/ImageMagick-6/policy.xml
# DEBUG. Only for debug purposes: interactive troubleshooting tools.
RUN apt-get update && apt-get install -y \
curl \
procps \
sudo \
vim \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Create a non-privileged user that the app will run under.
# See https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#user
ARG UID=10001
# Original adduser-based variant, kept for reference; replaced by the
# useradd call below.
#RUN adduser \
# --disabled-password \
# --gecos "" \
# --home "/nonexistent" \
# --shell "/sbin/nologin" \
# --no-create-home \
# --uid "${UID}" \
# appuser
# DEBUG. Only for debug purposes.
# Create the system account "appuser" with the UID above, no login shell and
# /nonexistent as its home directory.
RUN useradd -r -u ${UID} -s /sbin/nologin -d /nonexistent appuser
# Disabled on purpose: these would give appuser a password and passwordless sudo.
# RUN echo "appuser:12345678" | chpasswd
# RUN echo 'appuser ALL=(ALL) NOPASSWD:ALL' | tee -a /etc/sudoers
# Upgrade pip
RUN python -m pip install --upgrade pip
# Download dependencies as a separate step to take advantage of Docker's caching.
# Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
# Leverage a bind mount to requirements.txt to avoid having to copy it into
# this layer.
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=requirements.txt,target=requirements.txt \
python -m pip install -r requirements.txt
# Switch to the non-privileged user to run the application.
USER appuser
# Copy the source code into the container, owned by appuser.
COPY --chown=appuser . .
# Change file mode(s): make cron.sh executable for its owner (rwxr--r--).
RUN chmod 0744 cron.sh
# Expose the port that the application listens on.
EXPOSE 5000
# Run the application: point Flask at serve.py and listen on all interfaces
# so the exposed port is reachable from outside the container.
CMD export FLASK_APP=serve.py; flask run --host=0.0.0.0
+5 -1
View File
@@ -9,6 +9,7 @@ import time
import random
import logging
import argparse
import os
from aslite.arxiv import get_response, parse_response
from aslite.db import get_papers_db, get_metas_db
@@ -30,7 +31,10 @@ if __name__ == '__main__':
"""
# query string of papers to look for
q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO'
# The query string is configurable through the ARXIV_QUERY environment
# variable; an unset or empty variable selects the built-in default below.
q = os.environ.get('ARXIV_QUERY', '')
if not q:
    print("No query string provided, will use default.")
    # Every category term is joined with +OR+ (the last term previously
    # lacked the operator, so cs.SE was not OR-ed like the other categories).
    q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO+OR+cat:cs.AR+OR+cat:cs.DC+OR+cat:cs.SE'
pdb = get_papers_db(flag='c')
mdb = get_metas_db(flag='c')
+5 -5
View File
@@ -104,27 +104,27 @@ PAPERS_DB_FILE = os.path.join(DATA_DIR, 'papers.db')
# stores account-relevant info, like which tags exist for which papers
DICT_DB_FILE = os.path.join(DATA_DIR, 'dict.db')
def get_papers_db(flag='r', autocommit=True):
def get_papers_db(flag='c', autocommit=True):
assert flag in ['r', 'c']
pdb = CompressedSqliteDict(PAPERS_DB_FILE, tablename='papers', flag=flag, autocommit=autocommit)
return pdb
def get_metas_db(flag='r', autocommit=True):
def get_metas_db(flag='c', autocommit=True):
assert flag in ['r', 'c']
mdb = SqliteDict(PAPERS_DB_FILE, tablename='metas', flag=flag, autocommit=autocommit)
return mdb
def get_tags_db(flag='r', autocommit=True):
def get_tags_db(flag='c', autocommit=True):
assert flag in ['r', 'c']
tdb = CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
return tdb
def get_last_active_db(flag='r', autocommit=True):
def get_last_active_db(flag='c', autocommit=True):
assert flag in ['r', 'c']
ladb = SqliteDict(DICT_DB_FILE, tablename='last_active', flag=flag, autocommit=autocommit)
return ladb
def get_email_db(flag='r', autocommit=True):
def get_email_db(flag='c', autocommit=True):
assert flag in ['r', 'c']
edb = SqliteDict(DICT_DB_FILE, tablename='email', flag=flag, autocommit=autocommit)
return edb
+56
View File
@@ -0,0 +1,56 @@
# Comments are provided throughout this file to help you get started.
# If you need more help, visit the Docker compose reference guide at
# https://docs.docker.com/compose/compose-file/
# Here the instructions define your application as a service called "server".
# This service is built from the Dockerfile in the current directory.
# You can add other services your application may depend on here, such as a
# database or a cache. For examples, see the Awesome Compose repository:
# https://github.com/docker/awesome-compose
services:
  server:
    build:
      context: .
    ports:
      - 5000:5000
    volumes:
      # Host directories that persist the databases and the generated thumbnails.
      - /Users/schihei/git/arxiv-sanity-lite/container-data/data/:/app/data
      - /Users/schihei/git/arxiv-sanity-lite/container-data/static/thumb/:/app/static/thumb
    environment:
      # arXiv categories to fetch; every term must be joined with +OR+.
      - ARXIV_QUERY=cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO+OR+cat:cs.AR+OR+cat:cs.DC+OR+cat:cs.SE
      - SMTP_ADDRESS=mail.h3132.de
      - SMTP_PORT=465
      - SMTP_USER_NAME=arxiv@h3132.de
      # Do not commit the mail password. It is taken from the shell environment
      # or from a .env file next to this compose file; `docker compose up`
      # aborts with the message below when it is missing.
      - SMTP_PASSWORD=${SMTP_PASSWORD:?set SMTP_PASSWORD in the environment or in a .env file}
# The commented out section below is an example of how to define a PostgreSQL
# database that your application can use. `depends_on` tells Docker Compose to
# start the database before your application. The `db-data` volume persists the
# database data between container restarts. The `db-password` secret is used
# to set the database password. You must create `db/password.txt` and add
# a password of your choosing to it before running `docker compose up`.
# depends_on:
# db:
# condition: service_healthy
# db:
# image: postgres
# restart: always
# user: postgres
# secrets:
# - db-password
# volumes:
# - db-data:/var/lib/postgresql/data
# environment:
# - POSTGRES_DB=example
# - POSTGRES_PASSWORD_FILE=/run/secrets/db-password
# expose:
# - 5432
# healthcheck:
# test: [ "CMD", "pg_isready" ]
# interval: 10s
# timeout: 5s
# retries: 5
# volumes:
# db-data:
# secrets:
# db-password:
# file: db/password.txt
+10
View File
@@ -0,0 +1,10 @@
#!/bin/bash
# Fetch up to 2000 papers with arxiv_daemon.py. Its exit status decides what
# happens next: 0 triggers the feature recomputation, anything else skips it.
if python3 /app/arxiv_daemon.py --num 2000; then
    echo "New papers detected! Running compute.py"
    python3 /app/compute.py
else
    echo "No new papers were added, skipping feature computation"
fi
+6 -5
View File
@@ -1,5 +1,6 @@
feedparser==6.0.8
Flask==2.0.2
numpy==1.21.4
scikit-learn==1.0.1
sqlitedict==1.7.0
feedparser
flask
numpy
requests
scikit-learn
sqlitedict
+50 -29
View File
@@ -5,8 +5,8 @@ I run this script in a cron job to send out emails to the users with their
recommendations. There's a bit of copy paste code here but I expect that
the recommendations may become more complex in the future, so this is ok for now.
You'll notice that the file sendgrid_api_key.txt is not in the repo, you'd have
to manually register with sendgrid yourself, get an API key and put it in the file.
The SMTP credentials are not hard-coded in this script; you have to supply your
own SMTP server details through the SMTP_ADDRESS, SMTP_PORT, SMTP_USER_NAME and
SMTP_PASSWORD environment variables.
"""
import os
@@ -17,8 +17,9 @@ import argparse
import numpy as np
from sklearn import svm
import sendgrid
from sendgrid.helpers.mail import Email, To, Content, Mail
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from aslite.db import load_features
from aslite.db import get_tags_db
@@ -37,6 +38,7 @@ template = """
<style>
body {
font-family: Arial, sans-serif;
font-size: 14px;
}
.s {
font-weight: bold;
@@ -51,25 +53,23 @@ body {
margin-bottom: 10px;
}
.f {
color: #933;
color: #fb0007;
display: inline-block;
}
</style>
</head>
<body>
<br><br>
<div>Hi! Here are your <a href="https://arxiv-sanity-lite.com">arxiv-sanity-lite</a> recommendations. __STATS__</div>
<br><br>
<div>Hi! Here are your <a href="https://arxiv.h3132.de">arxiv-sanity-lite</a> recommendations.</div>
<br>
<div>__STATS__</div>
<br>
<div>
__CONTENT__
</div>
<br><br>
<br>
<div>
To stop these emails remove your email in your <a href="https://arxiv-sanity-lite.com/profile">account</a> settings. (your account is __ACCOUNT__).
To stop these emails remove your email in your <a href="https://arxiv.h3132.de/profile">account</a> settings. (your account is __ACCOUNT__).
</div>
<div> <3, arxiv-sanity-lite. </div>
@@ -105,7 +105,7 @@ def calculate_recommendation(
y[ptoi[pid]] = 1.0
# classify
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01)
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01, dual=True)
clf.fit(x, y)
s = clf.decision_function(x)
sortix = np.argsort(-s)
@@ -159,15 +159,16 @@ def render_recommendations(user, tags, tag_pids, tag_scores):
if len(summary) == 500:
summary += '...'
# create the url that will feature this paper on top and also show the most similar papers
url = 'https://arxiv-sanity-lite.com/?rank=pid&pid=' + pid
url = 'https://arxiv.h3132.de/?rank=pid&pid=' + pid
parts.append(
"""
<tr>
<td valign="top"><div class="s">%.2f</div></td>
<td>
<a href="%s">%s</a> <div class="f">(%s)</div>
<div class="a">%s</div>
<div class="a"><br>%s</div><br>
<div class="u">%s</div>
<hr size="1">
</td>
</tr>
""" % (score, url, p['title'], max_source_tag[pid], authors, summary)
@@ -196,27 +197,47 @@ def render_recommendations(user, tags, tag_pids, tag_scores):
return out
# -----------------------------------------------------------------------------
# send the actual html via sendgrid
# send the actual html via smtp
def send_email(to, html):
    """Send one HTML recommendations e-mail to `to` over SMTP-over-SSL.

    The server settings are read from the environment variables SMTP_ADDRESS,
    SMTP_PORT, SMTP_USER_NAME and SMTP_PASSWORD; the user name is also used as
    the sender address. An empty SMTP_PORT selects smtplib's default SSL port.

    Nothing is sent when the script runs with --dry-run. Delivery is
    best-effort: any failure is printed and swallowed, so one bad recipient
    or a mail-server problem does not abort the caller.

    Args:
        to: recipient e-mail address.
        html: the complete HTML document used as the message body.
    """
    # Setup the necessary details from environment variables
    smtp_server = os.environ.get('SMTP_ADDRESS', '')
    port = os.environ.get('SMTP_PORT', '')
    sender = os.environ.get('SMTP_USER_NAME', '')
    password = os.environ.get('SMTP_PASSWORD', '')

    # Create the message
    msg = MIMEMultipart()
    msg['Subject'] = tnow_str + " Arxiv Sanity Lite recommendations"
    msg['From'] = sender
    msg['To'] = to
    msg.attach(MIMEText(html, 'html'))

    # a dry run must never touch the mail server
    if args.dry_run:
        return

    # hope for the best :)
    try:
        # The context manager sends QUIT and closes the socket even when
        # login or sendmail raises, so failed sends do not leak connections.
        # The port is converted inside the try so a malformed SMTP_PORT is
        # reported like any other delivery problem.
        with smtplib.SMTP_SSL(smtp_server, int(port) if port else 0) as server:
            server.login(sender, password)
            server.sendmail(sender, to, msg.as_string())
    except Exception as e:
        print('Something went wrong.', e)
# -----------------------------------------------------------------------------
+12
View File
@@ -10,6 +10,7 @@ ideas:
import os
import re
import time
import math
from random import shuffle
import numpy as np
@@ -214,6 +215,7 @@ def main():
opt_q = request.args.get('q', '') # search request in the text box
opt_tags = request.args.get('tags', default_tags) # tags to rank by if opt_rank == 'tag'
opt_pid = request.args.get('pid', '') # pid to find nearest neighbors to
opt_category_filter = request.args.get('category_filter', '') # primary category to filter
opt_time_filter = request.args.get('time_filter', default_time_filter) # number of days to filter by
opt_skip_have = request.args.get('skip_have', default_skip_have) # hide papers we already have?
opt_svm_c = request.args.get('svm_c', '') # svm C parameter
@@ -245,6 +247,14 @@ def main():
else:
raise ValueError("opt_rank %s is not a thing" % (opt_rank, ))
# filter by primary category
if opt_category_filter:
pdb = get_papers()
kv = {k:v for k,v in pdb.items()} # read all of metas to memory at once, for efficiency
keep = [i for i,pid in enumerate(pids) if (kv[pid]['arxiv_primary_category']['term'])
== opt_category_filter]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
# filter by time
if opt_time_filter:
mdb = get_metas()
@@ -262,6 +272,7 @@ def main():
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
# crop the number of results to RET_NUM, and paginate
total_pages = math.ceil(len(pids) / RET_NUM)
try:
page_number = max(1, int(opt_page_number))
except ValueError:
@@ -297,6 +308,7 @@ def main():
context['gvars']['search_query'] = opt_q
context['gvars']['svm_c'] = str(C)
context['gvars']['page_number'] = str(page_number)
context['gvars']['total_pages'] = str(total_pages)
return render_template('index.html', **context)
@app.route('/inspect', methods=['GET'])
+2 -2
View File
@@ -38,12 +38,12 @@ const Paper = props => {
return (
<div class='rel_paper'>
<div class="rel_score">{p.weight.toFixed(2)}</div>
<div class='rel_title'><a href={'http://arxiv.org/abs/' + p.id}>{p.title}</a></div>
<div class='rel_title'><a href={'http://arxiv.org/abs/' + p.id} target="_blank">{p.title}</a></div>
<div class='rel_authors'>{p.authors}</div>
<div class="rel_time">{p.time}</div>
<div class='rel_tags'>{p.tags}</div>
{utag_controls}
{thumb_img}
<a href={'http://arxiv.org/pdf/' + p.id} target="_blank">{thumb_img}</a>
<div class='rel_abs'>{p.summary}</div>
<div class='rel_more'><a href={similar_url}>similar</a></div>
<div class='rel_inspect'><a href={inspect_url}>inspect</a></div>
+5 -1
View File
@@ -119,7 +119,11 @@ body {
text-decoration: none;
}
/* The "similar" link is rendered with class rel_more; the former selector
   ".rel more" matched a <more> element inside .rel and so never applied. */
.rel_more {
font-size: 10px;
}
.rel_img { /* prevent thumbnail from increasing width (useful on mobile) */
overflow: hidden;
}
#sbox {
width: 100%;
+9 -1
View File
@@ -54,6 +54,10 @@ var move_page = function(int_offset) {
<label for="pid">pid: </label>
<input name="pid" type="text" id="pid_field" value="{{ gvars.pid }}">
<!-- current category, simply in a text field -->
<label for="category">category: </label>
<input name="category_filter" type="text" id="category_filter__field" value="{{ gvars.category_filter }}" size="8">
<!-- current time_filter, in a text field -->
<label for="time_filter">time_filter (days): </label>
<input name="time_filter" type="text" id="time_filter_field" value="{{ gvars.time_filter }}">
@@ -102,9 +106,13 @@ var move_page = function(int_offset) {
<!-- links to previous and next pages -->
<div id="pagination">
{% if gvars.page_number|int > 1 %}
<span id="link-prev-page" onclick='move_page(-1);'>prev</span>
<span>current page: {{ gvars.page_number }} </span>
{% endif %}
<span>page {{ gvars.page_number }} of {{ gvars.total_pages }}</span>
{% if gvars.page_number|int < gvars.total_pages|int %}
<span id="link-next-page" onclick='move_page(1);'>next</span>
{% endif %}
</div>
{% endblock %}