sequester gross details about database instantiation in the filesystem away from the scripts

This commit is contained in:
Andrej Karpathy
2021-11-12 21:12:09 -08:00
parent 194b7f4b22
commit 13a1d5ff48
4 changed files with 46 additions and 23 deletions
+4 -5
View File
@@ -1,7 +1,7 @@
""" """
This script is intended to wake up every 30 min or so (eg via cron), This script is intended to wake up every 30 min or so (eg via cron),
it checks for any new arxiv papers via the arxiv API and stashes it checks for any new arxiv papers via the arxiv API and stashes
them into a sqlite database papers.db them into a sqlite database.
""" """
import sys import sys
@@ -11,7 +11,7 @@ import logging
import argparse import argparse
from aslite.arxiv import get_response, parse_response from aslite.arxiv import get_response, parse_response
from aslite.db import SqliteDict, CompressedSqliteDict from aslite.db import get_papers_db, get_metas_db
if __name__ == '__main__': if __name__ == '__main__':
@@ -25,9 +25,8 @@ if __name__ == '__main__':
# query string of papers to look for # query string of papers to look for
q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO' q = 'cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.CL+OR+cat:cs.AI+OR+cat:cs.NE+OR+cat:cs.RO'
# flag='c': default mode, open for read/write, creating the db/table if necessary. pdb = get_papers_db(flag='c', autocommit=True)
pdb = CompressedSqliteDict('papers.db', tablename='papers', flag='c', autocommit=True) mdb = get_metas_db(flag='c', autocommit=True)
mdb = SqliteDict('papers.db', tablename='metas', flag='c', autocommit=True)
prevn = len(pdb) prevn = len(pdb)
def store(p): def store(p):
+26
View File
@@ -19,3 +19,29 @@ class CompressedSqliteDict(SqliteDict):
return pickle.loads(zlib.decompress(bytes(obj))) return pickle.loads(zlib.decompress(bytes(obj)))
super().__init__(*args, **kwargs, encode=encode, decode=decode) super().__init__(*args, **kwargs, encode=encode, decode=decode)
# -----------------------------------------------------------------------------
"""
some docs to self:
flag='c': default mode, open for read/write, and creating the db/table if necessary
flag='r': open for read-only
"""
# sqlite file that stores info about papers ('papers' table) and also their
# lighter-weight metadata ('metas' table)
PAPERS_DB_FILE = 'papers.db'
# sqlite file that stores account-relevant info, like which tags exist for
# which papers ('tags' table)
DICT_DB_FILE = 'dict.db'
def get_papers_db(flag='r', autocommit=True):
    """
    Open the 'papers' table of PAPERS_DB_FILE as a CompressedSqliteDict.

    flag: 'r' opens read-only; 'c' opens for read/write, creating the
          db/table if necessary.
    autocommit: passed straight through to the underlying dict.

    Raises ValueError on any other flag. An explicit raise is used instead
    of `assert` so the check still runs when Python is started with -O.
    """
    if flag not in ('r', 'c'):
        raise ValueError("flag must be 'r' or 'c', got %r" % (flag, ))
    return CompressedSqliteDict(PAPERS_DB_FILE, tablename='papers', flag=flag, autocommit=autocommit)
def get_metas_db(flag='r', autocommit=True):
    """
    Open the 'metas' table of PAPERS_DB_FILE as a plain SqliteDict.

    flag: 'r' opens read-only; 'c' opens for read/write, creating the
          db/table if necessary.
    autocommit: passed straight through to the underlying dict.

    Raises ValueError on any other flag. An explicit raise is used instead
    of `assert` so the check still runs when Python is started with -O.
    """
    if flag not in ('r', 'c'):
        raise ValueError("flag must be 'r' or 'c', got %r" % (flag, ))
    return SqliteDict(PAPERS_DB_FILE, tablename='metas', flag=flag, autocommit=autocommit)
def get_tags_db(flag='r', autocommit=True):
    """
    Open the 'tags' table of DICT_DB_FILE as a CompressedSqliteDict.

    flag: 'r' opens read-only; 'c' opens for read/write, creating the
          db/table if necessary. Callers that assign into the returned
          dict must pass flag='c', because the default is read-only.
    autocommit: passed straight through to the underlying dict.

    Raises ValueError on any other flag. An explicit raise is used instead
    of `assert` so the check still runs when Python is started with -O.
    """
    if flag not in ('r', 'c'):
        raise ValueError("flag must be 'r' or 'c', got %r" % (flag, ))
    return CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
+2 -2
View File
@@ -10,7 +10,7 @@ import argparse
import numpy as np import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from aslite.db import SqliteDict, CompressedSqliteDict from aslite.db import get_papers_db
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@@ -31,7 +31,7 @@ if __name__ == '__main__':
norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
max_df=args.max_df, min_df=args.min_df) max_df=args.max_df, min_df=args.min_df)
pdb = CompressedSqliteDict('papers.db', tablename='papers', flag='r') pdb = get_papers_db(flag='r')
def make_corpus(): def make_corpus():
for p, d in pdb.items(): for p, d in pdb.items():
+14 -16
View File
@@ -18,7 +18,7 @@ from flask import Flask, request, redirect, url_for
from flask import render_template from flask import render_template
from flask import g # global session-level object from flask import g # global session-level object
from aslite.db import SqliteDict, CompressedSqliteDict from aslite.db import get_papers_db, get_metas_db, get_tags_db
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# TODO: user accounts / password login are necessary... # TODO: user accounts / password login are necessary...
@@ -30,19 +30,19 @@ def get_tags():
if not hasattr(g, '_tags'): if not hasattr(g, '_tags'):
user = 'root' # root for now, the only default user user = 'root' # root for now, the only default user
print("reading tags for user %s" % (user, )) print("reading tags for user %s" % (user, ))
with CompressedSqliteDict('dict.db', tablename='tags', flag='r') as dict_db: with get_tags_db() as tags_db:
tags_dict = dict_db[user] if user in dict_db else {} tags_dict = tags_db[user] if user in tags_db else {}
g._tags = tags_dict g._tags = tags_dict
return g._tags return g._tags
def get_papers(): def get_papers():
if not hasattr(g, '_pdb'): if not hasattr(g, '_pdb'):
g._pdb = CompressedSqliteDict('papers.db', tablename='papers', flag='r') g._pdb = get_papers_db()
return g._pdb return g._pdb
def get_metas(): def get_metas():
if not hasattr(g, '_mdb'): if not hasattr(g, '_mdb'):
g._mdb = SqliteDict('papers.db', tablename='metas', flag='r') g._mdb = get_metas_db()
return g._mdb return g._mdb
def render_pids(pids): def render_pids(pids):
@@ -231,14 +231,14 @@ def search():
@app.route('/add/<pid>/<tag>') @app.route('/add/<pid>/<tag>')
def add(pid=None, tag=None): def add(pid=None, tag=None):
user = 'root' user = 'root'
with CompressedSqliteDict('dict.db', tablename='tags', flag='c') as dict_db: with get_tags_db(flag='c') as tags_db:
# create the user if we don't know about them yet with an empty library # create the user if we don't know about them yet with an empty library
if not user in dict_db: if not user in tags_db:
dict_db[user] = {} tags_db[user] = {}
# fetch the user library object # fetch the user library object
d = dict_db[user] d = tags_db[user]
# add the paper to the tag # add the paper to the tag
if tag not in d: if tag not in d:
@@ -246,8 +246,7 @@ def add(pid=None, tag=None):
d[tag].add(pid) d[tag].add(pid)
# write back to database # write back to database
dict_db[user] = d tags_db[user] = d
dict_db.commit()
print("added paper %s to tag %s for user %s" % (pid, tag, user)) print("added paper %s to tag %s for user %s" % (pid, tag, user))
return "ok: " + str(d) # return back the user library for debugging atm return "ok: " + str(d) # return back the user library for debugging atm
@@ -255,12 +254,12 @@ def add(pid=None, tag=None):
@app.route('/del/<tag>') @app.route('/del/<tag>')
def delete_tag(tag=None): def delete_tag(tag=None):
user = 'root' user = 'root'
with CompressedSqliteDict('dict.db', tablename='tags', flag='c') as dict_db: with get_tags_db() as tags_db:
if user not in dict_db: if user not in tags_db:
return "user does not have a library" return "user does not have a library"
d = dict_db[user] d = tags_db[user]
if tag not in d: if tag not in d:
return "user does not have this tag" return "user does not have this tag"
@@ -269,8 +268,7 @@ def delete_tag(tag=None):
del d[tag] del d[tag]
# write back to database # write back to database
dict_db[user] = d tags_db[user] = d
dict_db.commit()
print("deleted tag %s for user %s" % (tag, user)) print("deleted tag %s for user %s" % (tag, user))
return "ok: " + str(d) # return back the user library for debugging atm return "ok: " + str(d) # return back the user library for debugging atm