diff --git a/README.md b/README.md
index eea5e09..213331b 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,8 @@ export FLASK_APP=serve.py; flask run
All of the database will be stored inside the `data` directory. Finally, if you'd like to run your own instance on the interwebs I recommend simply running the above on a [Linode](https://www.linode.com), e.g. I am running this code currently on the smallest "Nanode 1 GB" instance indexing about 30K papers, which costs $5/month.
+Finally, if you'd like to send periodic emails to users about new papers, see the `send_emails.py` script. I run this script in a daily cron job.
+
#### todos
- I need a proper requirements.txt and such
diff --git a/send_emails.py b/send_emails.py
new file mode 100644
index 0000000..b1100cf
--- /dev/null
+++ b/send_emails.py
@@ -0,0 +1,220 @@
+"""
+Compose and send recommendation emails to arxiv-sanity-lite users!
+
+I run this script in a cron job to send out emails to the users with their
+recommendations. There's a bit of copy paste code here but I expect that
+the recommendations may become more complex in the future, so this is ok for now.
+
+You'll notice that the file sendgrid_api_key.txt is not in the repo, you'd have
+to manually register with sendgrid yourself, get an API key and put it in the file.
+"""
+
+import os
+import time
+import numpy as np
+from sklearn import svm
+
+import sendgrid
+from sendgrid.helpers.mail import Email, To, Content, Mail
+
+from aslite.db import load_features
+from aslite.db import get_tags_db
+from aslite.db import get_metas_db
+from aslite.db import get_papers_db
+from aslite.db import get_email_db
+
+# -----------------------------------------------------------------------------
+# the html template for the email
+
+template = """
+
+
+
+
+
+
+
+
+
+
+Good morning! Here are your daily
arxiv-sanity-lite recommendations of very recent papers:
+
+
+
+ __CONTENT__
+
+
+
+
+To stop these emails remove your email in your
account settings.
+
+
+
+
+
+"""
+
+# -----------------------------------------------------------------------------
+
+def calculate_recommendation(
+ tags,
+ time_delta = 3, # how recent papers are we recommending? in days
+ ):
+
+ # a bit of preprocessing
+ x, pids = features['x'], features['pids']
+ n, d = x.shape
+ ptoi, itop = {}, {}
+ for i, p in enumerate(pids):
+ ptoi[p] = i
+ itop[i] = p
+
+ # construct the positive set via simple union of all tags
+ y = np.zeros(n, dtype=np.float32)
+ for tag, pids in tags.items():
+ for pid in pids:
+ y[ptoi[pid]] = 1.0
+
+ # classify
+ clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
+ clf.fit(x, y)
+ s = clf.decision_function(x)
+ sortix = np.argsort(-s)
+ pids = [itop[ix] for ix in sortix]
+ scores = [100*float(s[ix]) for ix in sortix]
+
+ # filter by time to only recent papers
+ deltat = time_delta*60*60*24 # allowed time delta in seconds
+ keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
+ pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
+
+ # finally exclude the papers we already have tagged
+ have = set().union(*tags.values())
+ keep = [i for i,pid in enumerate(pids) if pid not in have]
+ pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
+
+ return pids, scores
+
+# -----------------------------------------------------------------------------
+
+def render_recommendations(pids, scores, num_recommendations = 10):
+ # render the paper recommendations into the html template
+
+ parts = []
+ n = min(len(scores), num_recommendations)
+ for score, pid in zip(scores[:n], pids[:n]):
+ p = pdb[pid]
+ authors = ', '.join(a['name'] for a in p['authors'])
+ # crop the abstract
+ summary = p['summary']
+ summary = summary[:min(500, len(summary))]
+ if len(summary) == 500:
+ summary += '...'
+ parts.append(
+"""
+
+%.2f |
+
+%s
+ %s
+%s
+ |
+
+""" % (score, p['link'], p['title'], authors, summary)
+ )
+
+ final = ''
+ out = template.replace('__CONTENT__', final)
+ return out
+
+# -----------------------------------------------------------------------------
+# send the actual html via sendgrid
+
+def send_email(to, html):
+
+ # init the api
+ assert os.path.isfile('sendgrid_api_key.txt')
+ api_key = open('sendgrid_api_key.txt', 'r').read().strip()
+ sg = sendgrid.SendGridAPIClient(api_key=api_key)
+
+ # construct the email
+ from_email = Email("arxiv-sanity-lite-admin@arxiv-sanity-lite.com")
+ to_email = To(to)
+ subject = tnow_str + " Arxiv Sanity Lite recommendations"
+ content = Content("text/html", html)
+ mail = Mail(from_email, to_email, subject, content)
+
+ # hope for the best :)
+ response = sg.client.mail.send.post(request_body=mail.get())
+ print(response.status_code)
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+
+ TIME_DELTA = 3 # how recent papers are we recommending? in days
+ NUM_RECCOMENDATIONS = 20 # how many papers to recommend?
+
+ tnow = time.time()
+ tnow_str = time.strftime('%b %d', time.localtime(tnow)) # e.g. "Nov 27"
+
+ # read entire db simply into RAM
+ with get_tags_db() as tags_db:
+ tags = {k:v for k,v in tags_db.items()}
+
+ # read entire db simply into RAM
+ with get_metas_db() as mdb:
+ metas = {k:v for k,v in mdb.items()}
+
+ # read entire db simply into RAM
+ with get_email_db() as edb:
+ emails = {k:v for k,v in edb.items()}
+
+ # read tfidf features into RAM
+ features = load_features()
+
+ # keep the papers as only a handle, since this can be larger
+ pdb = get_papers_db()
+
+ # iterate all users, create recommendations, send emails
+ for user, tags in tags.items():
+
+ # verify that we have an email for this user
+ email = emails.get(user, None)
+ if not email:
+ print("skipping user %s, no email" % (user, ))
+ continue
+
+ # calculate the recommendations
+ pids, scores = calculate_recommendation(tags, time_delta=TIME_DELTA)
+ print("user %s has %d recommendations over last %d days" % (user, len(pids), TIME_DELTA))
+
+ # render the html
+ print("rendering top %d recommendations into a report..." % (NUM_RECCOMENDATIONS, ))
+ html = render_recommendations(pids, scores, num_recommendations=NUM_RECCOMENDATIONS)
+ # temporarily for debugging write recommendations to disk for manual inspection
+ with open('recco/%s.html' % (user, ), 'w') as f:
+ f.write(html)
+
+ # actually send the email
+ print("sending email...")
+ send_email(email, html)
+
+
+ print("done.")