first leet codes

2021-11-12 20:40:19 -08:00
parent 089adab199
commit 194b7f4b22
12 changed files with 820 additions and 0 deletions
@@ -0,0 +1,83 @@
+"""
+This script is intended to wake up every 30 min or so (e.g. via cron).
+It checks for any new arXiv papers via the arXiv API and stashes
+them into the SQLite database papers.db.
+"""
+
+import sys
+import time
+import random
+import logging
+import argparse
+
+from aslite.arxiv import get_response, parse_response
+from aslite.db import SqliteDict, CompressedSqliteDict
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Arxiv Daemon')
+    parser.add_argument('-n', '--num', type=int, default=100, help='how many papers to fetch')
+    parser.add_argument('-s', '--start', type=int, default=0, help='start at what index')
+    args = parser.parse_args()
+    print(args)
+    logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s %(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
+
+    # query string of papers to look for
+    q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO'
+
+    # flag='c': default mode, open for read/write, creating the db/table if necessary.
+    pdb = CompressedSqliteDict('papers.db', tablename='papers', flag='c', autocommit=True)
+    mdb = SqliteDict('papers.db', tablename='metas', flag='c', autocommit=True)
+    prevn = len(pdb)
+
+    def store(p):
+        pdb[p['_id']] = p
+        mdb[p['_id']] = {'_time': p['_time']}
+
+    # fetch the latest papers
+    for k in range(args.start, args.start + args.num, 100):
+        logging.info('querying arxiv api for query %s at start_index %d' % (q, k))
+
+        # attempt to fetch a batch of papers from arxiv api
+        ntried = 0
+        while True:
+            try:
+                resp = get_response(search_query=q, start_index=k)
+                papers = parse_response(resp)
+                time.sleep(0.5)
+                if len(papers) == 100:
+                    break # otherwise we have to try again
+            except Exception as e:
+                print(e)
+                print("will try again in a bit...")
+                ntried += 1
+                if ntried > 1000:
+                    print("ok we tried 1,000 times, something is srsly wrong. exitting.")
+                    sys.exit()
+                time.sleep(2 + random.uniform(0, 4))
+
+        # process the batch of retrieved papers
+        nhad, nnew, nreplace = 0, 0, 0
+        for p in papers:
+            pid = p['_id']
+            if pid in pdb:
+                if p['_time'] > pdb[pid]['_time']:
+                    # replace, this one is newer
+                    store(p)
+                    nreplace += 1
+                else:
+                    # we already had this paper, nothing to do
+                    nhad += 1
+            else:
+                # new, simple store into database
+                store(p)
+                nnew += 1
+        prevn = len(pdb)
+
+        # print some diagnostic information
+        print(papers[0]['_time_str'])
+        print("k=%d, out of %d: had %d, replaced %d, new %d. now have: %d" %
+             (k, len(papers), nhad, nreplace, nnew, prevn))
+
+        # zzz
+        time.sleep(2 + random.uniform(0, 4))