Files
arxiv-sanity-lite/aslite/db.py
T

73 lines
2.4 KiB
Python

"""
Database support functions.
The idea is that none of the individual scripts deal directly with the file system.
All of the file system I/O and the associated settings live in this single file.
"""
import os
import sqlite3, zlib, pickle
from sqlitedict import SqliteDict
# root directory that the database and feature file paths in this module are joined onto;
# it is a relative path, so it resolves against the process's current working directory
DATA_DIR = 'data'
# -----------------------------------------------------------------------------
class CompressedSqliteDict(SqliteDict):
    """ a SqliteDict whose values are pickled and then zlib-compressed before being stored """

    @staticmethod
    def _deflate(value):
        """ python object -> pickle bytes -> zlib bytes -> sqlite binary blob """
        pickled = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
        return sqlite3.Binary(zlib.compress(pickled))

    @staticmethod
    def _inflate(blob):
        """ exact inverse of _deflate: stored blob -> zlib bytes -> python object """
        raw = zlib.decompress(bytes(blob))
        return pickle.loads(raw)

    def __init__(self, *args, **kwargs):
        # everything the caller passes is forwarded untouched; only the two
        # (de)serialization hooks are supplied by this subclass
        super().__init__(*args, encode=self._deflate, decode=self._inflate, **kwargs)
# -----------------------------------------------------------------------------
"""
some docs to self:
flag='c': default mode, open for read/write, and creating the db/table if necessary
flag='r': open for read-only
"""
# stores info about papers, and also their lighter-weight metadata
# (one sqlite file, opened by the accessors below as the 'papers' and 'metas' tables)
PAPERS_DB_FILE = os.path.join(DATA_DIR, 'papers.db')
# stores account-relevant info, like which tags exist for which papers
# (opened by the accessor below as the 'tags' table)
DICT_DB_FILE = os.path.join(DATA_DIR, 'dict.db')
def get_papers_db(flag='r', autocommit=True):
    """
    Opens the 'papers' table of PAPERS_DB_FILE as a CompressedSqliteDict.

    flag: 'r' opens read-only; 'c' opens for read/write, creating the db/table if necessary
    autocommit: forwarded unchanged to the underlying SqliteDict
    Returns the opened dict. Raises ValueError for any other flag, before the db is touched.
    """
    # an explicit raise instead of `assert`: asserts are stripped under `python -O`,
    # which would let unsupported flags pass straight through to SqliteDict
    if flag not in ('r', 'c'):
        raise ValueError("flag must be 'r' or 'c', got %r" % (flag,))
    return CompressedSqliteDict(PAPERS_DB_FILE, tablename='papers', flag=flag, autocommit=autocommit)
def get_metas_db(flag='r', autocommit=True):
    """
    Opens the 'metas' table of PAPERS_DB_FILE as a plain (uncompressed) SqliteDict.

    flag: 'r' opens read-only; 'c' opens for read/write, creating the db/table if necessary
    autocommit: forwarded unchanged to the underlying SqliteDict
    Returns the opened dict. Raises ValueError for any other flag, before the db is touched.
    """
    # an explicit raise instead of `assert`: asserts are stripped under `python -O`,
    # which would let unsupported flags pass straight through to SqliteDict
    if flag not in ('r', 'c'):
        raise ValueError("flag must be 'r' or 'c', got %r" % (flag,))
    return SqliteDict(PAPERS_DB_FILE, tablename='metas', flag=flag, autocommit=autocommit)
def get_tags_db(flag='r', autocommit=True):
    """
    Opens the 'tags' table of DICT_DB_FILE as a CompressedSqliteDict.

    flag: 'r' opens read-only; 'c' opens for read/write, creating the db/table if necessary
    autocommit: forwarded unchanged to the underlying SqliteDict
    Returns the opened dict. Raises ValueError for any other flag, before the db is touched.
    """
    # an explicit raise instead of `assert`: asserts are stripped under `python -O`,
    # which would let unsupported flags pass straight through to SqliteDict
    if flag not in ('r', 'c'):
        raise ValueError("flag must be 'r' or 'c', got %r" % (flag,))
    return CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
# -----------------------------------------------------------------------------
"""
our "feature store" is currently just a pickle file, may want to consider hdf5 in the future
"""
# stores tfidf features and a bunch of other metadata, as a single pickle file
FEATURES_FILE = os.path.join(DATA_DIR, 'features.p')
def save_features(features):
    """ pickles the features dict into FEATURES_FILE, replacing whatever the file held before """
    with open(FEATURES_FILE, 'wb') as fout:
        # default pickle protocol, same as a bare pickle.dump(features, fout)
        pickle.Pickler(fout).dump(features)
def load_features():
    """ reads FEATURES_FILE and returns the object that was pickled into it """
    with open(FEATURES_FILE, 'rb') as fin:
        return pickle.load(fin)