add an option to break out early once we have most likely pulled in all new papers
This commit is contained in:
+17
-1
@@ -18,10 +18,16 @@ if __name__ == '__main__':
|
|||||||
logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s %(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
|
logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s %(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Arxiv Daemon')
|
parser = argparse.ArgumentParser(description='Arxiv Daemon')
|
||||||
parser.add_argument('-n', '--num', type=int, default=100, help='how many papers to fetch')
|
parser.add_argument('-n', '--num', type=int, default=100, help='up to how many papers to fetch')
|
||||||
parser.add_argument('-s', '--start', type=int, default=0, help='start at what index')
|
parser.add_argument('-s', '--start', type=int, default=0, help='start at what index')
|
||||||
|
parser.add_argument('-b', '--break-after', type=int, default=3, help='how many 0 new papers in a row would cause us to stop early? or 0 to disable.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
print(args)
|
print(args)
|
||||||
|
"""
|
||||||
|
Quick note on the break_after argument: In a typical setting where one wants to update
|
||||||
|
the papers database you'd choose a slightly higher num than strictly needed, but then break out early in case
|
||||||
|
we've reached older papers that are already part of the database, to spare the arxiv API.
|
||||||
|
"""
|
||||||
|
|
||||||
# query string of papers to look for
|
# query string of papers to look for
|
||||||
q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO'
|
q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO'
|
||||||
@@ -35,6 +41,7 @@ if __name__ == '__main__':
|
|||||||
mdb[p['_id']] = {'_time': p['_time']}
|
mdb[p['_id']] = {'_time': p['_time']}
|
||||||
|
|
||||||
# fetch the latest papers
|
# fetch the latest papers
|
||||||
|
zero_updates_in_a_row = 0
|
||||||
for k in range(args.start, args.start + args.num, 100):
|
for k in range(args.start, args.start + args.num, 100):
|
||||||
logging.info('querying arxiv api for query %s at start_index %d' % (q, k))
|
logging.info('querying arxiv api for query %s at start_index %d' % (q, k))
|
||||||
|
|
||||||
@@ -79,5 +86,14 @@ if __name__ == '__main__':
|
|||||||
logging.info("k=%d, out of %d: had %d, replaced %d, new %d. now have: %d" %
|
logging.info("k=%d, out of %d: had %d, replaced %d, new %d. now have: %d" %
|
||||||
(k, len(papers), nhad, nreplace, nnew, prevn))
|
(k, len(papers), nhad, nreplace, nnew, prevn))
|
||||||
|
|
||||||
|
# early termination criteria
|
||||||
|
if nnew == 0:
|
||||||
|
zero_updates_in_a_row += 1
|
||||||
|
if args.break_after > 0 and zero_updates_in_a_row >= args.break_after:
|
||||||
|
logging.info("breaking out early, no new papers %d times in a row" % (args.break_after, ))
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
zero_updates_in_a_row = 0
|
||||||
|
|
||||||
# zzz
|
# zzz
|
||||||
time.sleep(1 + random.uniform(0, 3))
|
time.sleep(1 + random.uniform(0, 3))
|
||||||
|
|||||||
Reference in New Issue
Block a user