first leet codes

2021-11-12 20:40:19 -08:00
parent 089adab199
commit 194b7f4b22
12 changed files with 820 additions and 0 deletions
@@ -0,0 +1,81 @@
+"""
+Utils for dealing with arxiv API and related processing
+"""
+
+import time
+import logging
+import urllib.request
+import feedparser
+from collections import OrderedDict
+
+logger = logging.getLogger(__name__)
+
+def get_response(search_query, start_index=0, max_results=100):
+    """
+    Query the arxiv.org export API and return the raw response body.
+
+    Args:
+        search_query: arxiv API search expression. It is interpolated into the
+            URL verbatim, so the caller is responsible for any URL-encoding.
+        start_index: offset of the first result (the API's `start` parameter).
+        max_results: number of results requested in this batch (default 100).
+
+    Returns:
+        The response body exactly as read from the connection (undecoded).
+
+    Note:
+        A status other than 200 is only logged; the body is still returned.
+    """
+    base_url = 'http://export.arxiv.org/api/query?'
+    # results are ordered by last update; 'sortBy=submittedDate' is the alternative
+    add_url = 'search_query=%s&sortBy=lastUpdatedDate&start=%d&max_results=%d' % (
+        search_query, start_index, max_results)
+    # keep the caller's query and the full request URL in separate names
+    request_url = base_url + add_url
+    logger.info(f"Searching arxiv for {request_url}")
+    with urllib.request.urlopen(request_url) as resp:
+        response = resp.read()
+        # inspect the status while the response object is still open
+        if resp.status != 200:
+            logger.error("arxiv did not return status 200 response")
+
+    return response
+
+def encode_feedparser_dict(d):
+    """
+    Recursively rebuild a feedparser structure out of plain containers.
+
+    Mappings (FeedParserDict or dict) become plain dicts, lists become new
+    lists, and any other value is returned unchanged.
+    """
+    if isinstance(d, (feedparser.FeedParserDict, dict)):
+        plain = {}
+        for key in d.keys():
+            plain[key] = encode_feedparser_dict(d[key])
+        return plain
+    if isinstance(d, list):
+        return [encode_feedparser_dict(item) for item in d]
+    # leaf value: nothing to convert
+    return d
+
+def parse_arxiv_url(url):
+    """
+    Split an arxiv URL into its id components.
+
+    Example: http://arxiv.org/abs/1512.08756v2 yields
+    ('1512.08756v2', '1512.08756', 2).
+
+    Returns:
+        A tuple (idv, raw id, version): the text after the last '/', the part
+        of it before the 'v', and the part after the 'v' as an int.
+
+    Raises:
+        AssertionError: if the url contains no '/', or the text after the last
+            '/' does not split into exactly two parts on 'v'.
+        ValueError: if the version part is not an integer.
+    """
+    ix = url.rfind('/')
+    # Raised explicitly rather than via `assert` so the checks are not stripped
+    # under `python -O`; the exception type is kept for existing callers.
+    if ix < 0:
+        raise AssertionError('bad url: ' + url)
+    idv = url[ix + 1:]  # extract just the id (and the version)
+    parts = idv.split('v')
+    if len(parts) != 2:
+        raise AssertionError('error splitting id and version in idv string: ' + idv)
+    return idv, parts[0], int(parts[1])
+
+def parse_response(response):
+    """
+    Parse a raw arxiv API response into a list of plain-dict entries.
+
+    Each feed entry is converted with encode_feedparser_dict and then
+    annotated with:
+        _idv      id with version, from parse_arxiv_url on the entry's 'id'
+        _id       id without the version
+        _version  version as an int
+        _time     'updated_parsed' as seconds since the epoch (float)
+        _time_str 'updated_parsed' formatted as '%b %d %Y'
+    The 'summary_detail' and 'title_detail' keys are dropped when present.
+
+    Returns:
+        A list with one dict per entry, in feed order.
+    """
+    import calendar  # local import: only needed for the UTC conversion below
+
+    out = []
+    parse = feedparser.parse(response)
+    for e in parse.entries:
+        j = encode_feedparser_dict(e)
+        # extract / parse id information
+        idv, rawid, version = parse_arxiv_url(j['id'])
+        j['_idv'] = idv
+        j['_id'] = rawid
+        j['_version'] = version
+        # feedparser reports parsed dates in UTC; time.mktime would read the
+        # struct as local time and skew the result by the host's UTC offset
+        j['_time'] = float(calendar.timegm(j['updated_parsed']))
+        j['_time_str'] = time.strftime('%b %d %Y', j['updated_parsed'])
+        # drop apparently spurious and redundant information; tolerate entries
+        # that do not carry these keys
+        j.pop('summary_detail', None)
+        j.pop('title_detail', None)
+        out.append(j)
+
+    return out
+
+def filter_latest_version(idvs):
+    """
+    Reduce a list of versioned ids to the highest version of each paper.
+
+    Args:
+        idvs: iterable of strings of the form '<id>v<version>'.
+
+    Returns:
+        A list of '<id>v<version>' strings, one per distinct id, ordered by
+        the first appearance of each id in the input.
+
+    Raises:
+        ValueError: if an entry has no 'v' separator or a non-integer version.
+    """
+
+    pid_to_v = OrderedDict()
+    for idv in idvs:
+        # split on the LAST 'v' only, so ids that themselves contain a 'v'
+        # (e.g. 'solv-int/9901001v1') still unpack into exactly two parts
+        pid, v = idv.rsplit('v', 1)
+        pid_to_v[pid] = max(int(v), pid_to_v.get(pid, 0))
+
+    return [f"{pid}v{v}" for pid, v in pid_to_v.items()]
@@ -0,0 +1,21 @@
+"""
+Database support functions
+"""
+
+import sqlite3, zlib, pickle
+from sqlitedict import SqliteDict
+
+# -----------------------------------------------------------------------------
+
+class CompressedSqliteDict(SqliteDict):
+    """ SqliteDict whose values are pickled and then zlib-compressed before storage """
+
+    def __init__(self, *args, **kwargs):
+
+        def _pack(obj):
+            # pickle -> zlib -> sqlite BLOB wrapper
+            pickled = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
+            return sqlite3.Binary(zlib.compress(pickled))
+
+        def _unpack(obj):
+            # inverse of _pack: stored bytes -> zlib -> pickle
+            raw = zlib.decompress(bytes(obj))
+            return pickle.loads(raw)
+
+        # all caller arguments are forwarded; only the two codecs are fixed here
+        super().__init__(*args, encode=_pack, decode=_unpack, **kwargs)