Files
2021-11-12 22:49:29 -08:00

81 lines
2.6 KiB
Python

"""
Utils for dealing with arxiv API and related processing
"""
import time
import logging
import urllib.request
import feedparser
from collections import OrderedDict
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def get_response(search_query, start_index=0):
    """Fetch one page of results from the arxiv.org query API.

    The request asks for up to 100 entries sorted by lastUpdatedDate,
    beginning at ``start_index``.

    Args:
        search_query: query expression; it is interpolated into the URL
            as-is (no escaping is performed here).
        start_index: offset of the first result to request.

    Returns:
        The raw body read from the HTTP response. The body is returned even
        when the status is not 200; that case is only logged as an error.
    """
    # (sortBy=submittedDate is the alternative ordering that was tried here)
    query_string = 'search_query=%s&sortBy=lastUpdatedDate&start=%d&max_results=100' % (search_query, start_index)
    request_url = 'http://export.arxiv.org/api/query?' + query_string
    logger.debug(f"Searching arxiv for {request_url}")

    with urllib.request.urlopen(request_url) as reply:
        body = reply.read()
        if reply.status != 200:
            logger.error("arxiv did not return status 200 response")
    return body
def encode_feedparser_dict(d):
    """Recursively rebuild a feedparser structure from plain dicts and lists.

    Mappings (``feedparser.FeedParserDict`` or ``dict``) become new plain
    dicts and lists become new lists, with every contained value converted
    the same way. Any other value is returned unchanged (not copied).
    """
    if isinstance(d, (feedparser.FeedParserDict, dict)):
        plain = {}
        for key in d.keys():
            # read each value through d[key], i.e. the mapping's own item lookup
            plain[key] = encode_feedparser_dict(d[key])
        return plain

    if isinstance(d, list):
        return [encode_feedparser_dict(item) for item in d]

    return d
def parse_arxiv_url(url):
    """Split an arxiv URL into its versioned id, raw id and version number.

    Example: ``http://arxiv.org/abs/1512.08756v2`` yields
    ``('1512.08756v2', '1512.08756', 2)``.

    Args:
        url: URL whose last path component has the form ``<id>v<version>``.

    Returns:
        A tuple ``(idv, rawid, version)``: the last path component, the part
        of it before the 'v', and the part after the 'v' converted to int.

    Raises:
        AssertionError: if ``url`` contains no '/', or if the last path
            component does not split into exactly two parts on 'v'.
        ValueError: if the version part is not an integer.
    """
    ix = url.rfind('/')
    # The checks raise explicitly instead of using `assert`, so they are not
    # stripped when Python runs with -O. AssertionError is kept so callers
    # see the same exception type as before.
    if ix < 0:
        raise AssertionError('bad url: ' + url)

    idv = url[ix + 1:]  # just the id together with its version suffix
    parts = idv.split('v')
    if len(parts) != 2:
        raise AssertionError('error splitting id and version in idv string: ' + idv)

    return idv, parts[0], int(parts[1])
def parse_response(response):
    """Parse a raw arxiv API response into a list of plain-dict entries.

    Each feed entry is converted with ``encode_feedparser_dict`` and then
    annotated with these extra keys:

    * ``_idv``, ``_id``, ``_version``: from ``parse_arxiv_url(entry['id'])``
    * ``_time``: ``time.mktime`` of the entry's ``updated_parsed``
    * ``_time_str``: ``updated_parsed`` formatted as ``'%b %d %Y'``

    The ``summary_detail`` and ``title_detail`` keys are removed.

    Args:
        response: raw feed data, handed to ``feedparser.parse``.

    Returns:
        List of dicts, one per entry, in feed order.
    """
    feed = feedparser.parse(response)
    papers = []
    for entry in feed.entries:
        paper = encode_feedparser_dict(entry)

        # identifier pieces derived from the entry's id URL
        idv, rawid, version = parse_arxiv_url(paper['id'])
        updated = paper['updated_parsed']
        # NOTE(review): time.mktime interprets its argument as local time; if
        # updated_parsed is expressed in UTC (confirm against feedparser docs),
        # _time is shifted by the host's timezone offset.
        paper.update({
            '_idv': idv,
            '_id': rawid,
            '_version': version,
            '_time': time.mktime(updated),
            '_time_str': time.strftime('%b %d %Y', updated),
        })

        # drop fields treated here as spurious / redundant
        for redundant in ('summary_detail', 'title_detail'):
            del paper[redundant]

        papers.append(paper)
    return papers
def filter_latest_version(idvs):
    """Reduce versioned ids to the highest version seen for each paper.

    Args:
        idvs: iterable of strings of the form ``<pid>v<version>``.

    Returns:
        List of ``<pid>v<version>`` strings, one per distinct pid, ordered by
        the first appearance of each pid and carrying the largest version
        number seen for it.

    Raises:
        ValueError: if an entry contains no 'v' separator, or its version
            part is not an integer.
    """
    pid_to_v = OrderedDict()
    for idv in idvs:
        # Split on the LAST 'v' only: the version is always the trailing
        # suffix, while the paper id itself may contain a 'v' (e.g. an
        # old-style id such as 'solv-int/9901001v2').
        pid, v = idv.rsplit('v', 1)
        pid_to_v[pid] = max(int(v), pid_to_v.get(pid, 0))
    return [f"{pid}v{v}" for pid, v in pid_to_v.items()]