first leet codes
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
Utils for dealing with arxiv API and related processing
|
||||
"""
|
||||
|
||||
import time
|
||||
import logging
|
||||
import urllib.request
|
||||
import feedparser
|
||||
from collections import OrderedDict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_response(search_query, start_index=0):
|
||||
""" pings arxiv.org API to fetch a batch of 100 papers """
|
||||
# fetch raw response
|
||||
base_url = 'http://export.arxiv.org/api/query?'
|
||||
add_url = 'search_query=%s&sortBy=lastUpdatedDate&start=%d&max_results=100' % (search_query, start_index)
|
||||
#add_url = 'search_query=%s&sortBy=submittedDate&start=%d&max_results=100' % (search_query, start_index)
|
||||
search_query = base_url + add_url
|
||||
logger.info(f"Searching arxiv for {search_query}")
|
||||
with urllib.request.urlopen(search_query) as url:
|
||||
response = url.read()
|
||||
|
||||
if url.status != 200:
|
||||
logger.error(f"arxiv did not return status 200 response")
|
||||
|
||||
return response
|
||||
|
||||
def encode_feedparser_dict(d):
|
||||
""" helper function to strip feedparser objects using a deep copy """
|
||||
if isinstance(d, feedparser.FeedParserDict) or isinstance(d, dict):
|
||||
return {k: encode_feedparser_dict(d[k]) for k in d.keys()}
|
||||
elif isinstance(d, list):
|
||||
return [encode_feedparser_dict(k) for k in d]
|
||||
else:
|
||||
return d
|
||||
|
||||
def parse_arxiv_url(url):
|
||||
"""
|
||||
examples is http://arxiv.org/abs/1512.08756v2
|
||||
we want to extract the raw id (1512.08756) and the version (2)
|
||||
"""
|
||||
ix = url.rfind('/')
|
||||
assert ix >= 0, 'bad url: ' + url
|
||||
idv = url[ix+1:] # extract just the id (and the version)
|
||||
parts = idv.split('v')
|
||||
assert len(parts) == 2, 'error splitting id and version in idv string: ' + idv
|
||||
return idv, parts[0], int(parts[1])
|
||||
|
||||
def parse_response(response):
|
||||
|
||||
out = []
|
||||
parse = feedparser.parse(response)
|
||||
for e in parse.entries:
|
||||
j = encode_feedparser_dict(e)
|
||||
# extract / parse id information
|
||||
idv, rawid, version = parse_arxiv_url(j['id'])
|
||||
j['_idv']= idv
|
||||
j['_id'] = rawid
|
||||
j['_version'] = version
|
||||
j['_time'] = time.mktime(j['updated_parsed'])
|
||||
j['_time_str'] = time.strftime('%b %d %Y', j['updated_parsed'])
|
||||
# delete apparently spurious and redundant information
|
||||
del j['summary_detail']
|
||||
del j['title_detail']
|
||||
out.append(j)
|
||||
|
||||
return out
|
||||
|
||||
def filter_latest_version(idvs):
|
||||
"""
|
||||
for each idv filter the list down to only the most recent version
|
||||
"""
|
||||
|
||||
pid_to_v = OrderedDict()
|
||||
for idv in idvs:
|
||||
pid, v = idv.split('v')
|
||||
pid_to_v[pid] = max(int(v), pid_to_v.get(pid, 0))
|
||||
|
||||
filt = [f"{pid}v{v}" for pid, v in pid_to_v.items()]
|
||||
return filt
|
||||
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
Database support functions
|
||||
"""
|
||||
|
||||
import sqlite3, zlib, pickle
|
||||
from sqlitedict import SqliteDict
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class CompressedSqliteDict(SqliteDict):
|
||||
""" overrides the encode/decode methods to use zlib, so we get compressed storage """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
def encode(obj):
|
||||
return sqlite3.Binary(zlib.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)))
|
||||
|
||||
def decode(obj):
|
||||
return pickle.loads(zlib.decompress(bytes(obj)))
|
||||
|
||||
super().__init__(*args, **kwargs, encode=encode, decode=decode)
|
||||
Reference in New Issue
Block a user