first leet codes

This commit is contained in:
Andrej Karpathy
2021-11-12 20:40:19 -08:00
parent 089adab199
commit 194b7f4b22
12 changed files with 820 additions and 0 deletions
+83
View File
@@ -0,0 +1,83 @@
"""
This script is intended to wake up every 30 min or so (eg via cron),
it checks for any new arxiv papers via the arxiv API and stashes
them into a sqlite database papers.db
"""
import sys
import time
import random
import logging
import argparse
from aslite.arxiv import get_response, parse_response
from aslite.db import SqliteDict, CompressedSqliteDict
if __name__ == '__main__':
    # Script entry point: fetch batches of papers from the arxiv API and
    # upsert them into the sqlite database papers.db.

    parser = argparse.ArgumentParser(description='Arxiv Daemon')
    parser.add_argument('-n', '--num', type=int, default=100, help='how many papers to fetch')
    parser.add_argument('-s', '--start', type=int, default=0, help='start at what index')
    args = parser.parse_args()
    print(args)

    logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s %(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    # query string of papers to look for
    q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO'

    # Start indices advance by this much, and a batch is only accepted when it
    # holds exactly this many papers.
    # NOTE(review): presumably this matches the default page size of
    # aslite.arxiv.get_response -- confirm there.
    batch_size = 100

    # Upper bound on failed attempts per batch before the script gives up.
    max_tries = 1000

    # flag='c': default mode, open for read/write, creating the db/table if necessary.
    pdb = CompressedSqliteDict('papers.db', tablename='papers', flag='c', autocommit=True)
    mdb = SqliteDict('papers.db', tablename='metas', flag='c', autocommit=True)

    def store(p):
        """Save paper dict `p` under its '_id' in the papers table, and its
        '_time' under the same key in the metas table."""
        pdb[p['_id']] = p
        mdb[p['_id']] = {'_time': p['_time']}

    # fetch the latest papers, one batch per start index
    for k in range(args.start, args.start + args.num, batch_size):
        logging.info('querying arxiv api for query %s at start_index %d', q, k)

        # Attempt to fetch a full batch of papers from the arxiv api. Every
        # failed attempt -- an exception OR a batch of the wrong size -- counts
        # against max_tries and is followed by a randomized pause, so a
        # persistently short or empty response can neither spin forever nor
        # hit the API in a tight loop.
        ntried = 0
        while True:
            try:
                resp = get_response(search_query=q, start_index=k)
                papers = parse_response(resp)
                time.sleep(0.5)
                if len(papers) == batch_size:
                    break  # got a full batch, go process it
                print("got %d papers instead of %d" % (len(papers), batch_size))
            except Exception as e:
                # best-effort: any fetch/parse failure is reported and retried
                print(e)
            print("will try again in a bit...")
            ntried += 1
            if ntried > max_tries:
                print("ok we tried 1,000 times, something is srsly wrong. exitting.")
                # non-zero status so the caller (e.g. cron) can see the failure
                sys.exit(1)
            time.sleep(2 + random.uniform(0, 4))

        # process the batch of retrieved papers
        nhad, nnew, nreplace = 0, 0, 0
        for p in papers:
            pid = p['_id']
            if pid not in pdb:
                # new, simple store into database
                store(p)
                nnew += 1
            elif p['_time'] > pdb[pid]['_time']:
                # replace, this one is newer than the stored copy
                store(p)
                nreplace += 1
            else:
                # we already had this paper, nothing to do
                nhad += 1

        # print some diagnostic information
        print(papers[0]['_time_str'])
        print("k=%d, out of %d: had %d, replaced %d, new %d. now have: %d" %
              (k, len(papers), nhad, nreplace, nnew, len(pdb)))

        # zzz: pause between batches before the next request
        time.sleep(2 + random.uniform(0, 4))