149 lines
4.8 KiB
Python
149 lines
4.8 KiB
Python
"""
|
|
Database support functions.
|
|
The idea is that none of the individual scripts deal directly with the file system.
|
|
Any of the file system I/O and the associated settings are in this single file.
|
|
"""
|
|
|
|
import os
|
|
import sqlite3, zlib, pickle, tempfile
|
|
from sqlitedict import SqliteDict
|
|
from contextlib import contextmanager
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# global configuration
|
|
|
|
DATA_DIR = 'data'  # relative directory; every db / feature file path in this module is joined onto it
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# utilities for safe writing of a pickle file
|
|
|
|
# Context managers for atomic writes courtesy of
|
|
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python
|
|
@contextmanager
def _tempfile(*args, **kws):
    """ Context for temporary file.

    Will find a free temporary filename upon entering
    and will try to delete the file on leaving

    Parameters
    ----------
    args, kws : mixed
        forwarded unchanged to :code:`tempfile.mkstemp`
        (for example :code:`suffix`, :code:`prefix` or :code:`dir`)

    Yields
    ------
    name : string
        path of the freshly created (and already closed) temporary file
    """
    fd, name = tempfile.mkstemp(*args, **kws)
    # mkstemp hands back an open OS-level descriptor; close it immediately
    # because users of this context reopen the file by name
    os.close(fd)
    try:
        yield name
    finally:
        try:
            os.remove(name)
        except FileNotFoundError:
            # the file no longer exists under this name (e.g. it was moved
            # away inside the with-body), so there is nothing to clean up;
            # any other OSError still propagates to the caller
            pass
|
|
|
|
|
|
@contextmanager
def open_atomic(filepath, *args, **kwargs):
    """ Open temporary file object that atomically moves to destination upon
    exiting.

    Allows reading and writing to and from the same filename.

    If the with-body raises, the move is skipped: the destination is left
    untouched and the temporary file is removed.

    Parameters
    ----------
    filepath : string
        the file path to be opened
    fsync : bool
        whether to force write the file to disk
    args, kwargs : mixed
        Any valid positional / keyword arguments for :code:`open`
        (the mode is typically passed positionally, e.g. :code:`'wb'`)
    """
    # fsync is our own option and must not be passed on to open()
    fsync = kwargs.pop('fsync', False)

    # the temporary file is created in the destination's own directory, so the
    # final move stays inside that directory
    with _tempfile(dir=os.path.dirname(filepath)) as tmppath:
        with open(tmppath, *args, **kwargs) as f:
            yield f
            if fsync:
                f.flush()
                os.fsync(f.fileno())
        # the file is closed at this point. os.replace overwrites an existing
        # destination on every platform, whereas os.rename fails on Windows
        # when filepath already exists
        os.replace(tmppath, filepath)
|
|
|
|
def safe_pickle_dump(obj, fname):
    """
    pickles obj into fname without ever exposing a partially written file.
    a reader in another process could otherwise open the pickle while it is
    still being written and crash. to avoid that, the data is written to a
    temporary file first and only then moved over fname.
    """
    with open_atomic(fname, 'wb') as out:
        # a negative protocol number selects the same highest binary protocol
        pickle.dump(obj, out, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
# -----------------------------------------------------------------------------
|
|
|
|
class CompressedSqliteDict(SqliteDict):
    """ a SqliteDict whose encode/decode hooks pickle and zlib-compress the stored values """

    def __init__(self, *args, **kwargs):

        def pack(value):
            # object -> pickle bytes -> zlib-compressed bytes -> sqlite binary
            pickled = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
            return sqlite3.Binary(zlib.compress(pickled))

        def unpack(blob):
            # exact inverse of pack
            raw = zlib.decompress(bytes(blob))
            return pickle.loads(raw)

        super().__init__(*args, encode=pack, decode=unpack, **kwargs)
|
|
|
|
# -----------------------------------------------------------------------------
|
|
"""
|
|
some docs to self:
|
|
flag='c': default mode, open for read/write, and creating the db/table if necessary
|
|
flag='r': open for read-only
|
|
"""
|
|
|
|
# stores info about papers, and also their lighter-weight metadata
|
|
PAPERS_DB_FILE = os.path.join(DATA_DIR, 'papers.db')  # opened by get_papers_db ('papers' table) and get_metas_db ('metas' table)
|
|
# stores account-relevant info, like which tags exist for which papers
|
|
DICT_DB_FILE = os.path.join(DATA_DIR, 'dict.db')  # opened by get_tags_db, get_last_active_db and get_email_db (one table each)
|
|
|
|
def get_papers_db(flag='r', autocommit=True):
    """ opens the 'papers' table of PAPERS_DB_FILE as a CompressedSqliteDict

    flag must be 'r' (read-only) or 'c' (read/write, creating if necessary);
    flag and autocommit are passed through to the constructor
    """
    assert flag in ('r', 'c')
    return CompressedSqliteDict(
        PAPERS_DB_FILE,
        tablename='papers',
        flag=flag,
        autocommit=autocommit,
    )
|
|
|
|
def get_metas_db(flag='r', autocommit=True):
    """ opens the 'metas' table of PAPERS_DB_FILE as a plain SqliteDict

    flag must be 'r' (read-only) or 'c' (read/write, creating if necessary);
    flag and autocommit are passed through to the constructor
    """
    assert flag in ('r', 'c')
    return SqliteDict(
        PAPERS_DB_FILE,
        tablename='metas',
        flag=flag,
        autocommit=autocommit,
    )
|
|
|
|
def get_tags_db(flag='r', autocommit=True):
    """ opens the 'tags' table of DICT_DB_FILE as a CompressedSqliteDict

    flag must be 'r' (read-only) or 'c' (read/write, creating if necessary);
    flag and autocommit are passed through to the constructor
    """
    assert flag in ('r', 'c')
    return CompressedSqliteDict(
        DICT_DB_FILE,
        tablename='tags',
        flag=flag,
        autocommit=autocommit,
    )
|
|
|
|
def get_last_active_db(flag='r', autocommit=True):
    """ opens the 'last_active' table of DICT_DB_FILE as a plain SqliteDict

    flag must be 'r' (read-only) or 'c' (read/write, creating if necessary);
    flag and autocommit are passed through to the constructor
    """
    assert flag in ('r', 'c')
    return SqliteDict(
        DICT_DB_FILE,
        tablename='last_active',
        flag=flag,
        autocommit=autocommit,
    )
|
|
|
|
def get_email_db(flag='r', autocommit=True):
    """ opens the 'email' table of DICT_DB_FILE as a plain SqliteDict

    flag must be 'r' (read-only) or 'c' (read/write, creating if necessary);
    flag and autocommit are passed through to the constructor
    """
    assert flag in ('r', 'c')
    return SqliteDict(
        DICT_DB_FILE,
        tablename='email',
        flag=flag,
        autocommit=autocommit,
    )
|
|
|
|
# -----------------------------------------------------------------------------
|
|
"""
|
|
our "feature store" is currently just a pickle file, may want to consider hdf5 in the future
|
|
"""
|
|
|
|
# stores tfidf features and a bunch of other metadata
|
|
FEATURES_FILE = os.path.join(DATA_DIR, 'features.p')  # pickle file written by save_features and read by load_features
|
|
|
|
def save_features(features):
    """ writes the features dict to FEATURES_FILE as a pickle, via the atomic safe_pickle_dump """
    safe_pickle_dump(obj=features, fname=FEATURES_FILE)
|
|
|
|
def load_features():
    """ reads the pickled features dict back from FEATURES_FILE and returns it """
    with open(FEATURES_FILE, 'rb') as fh:
        return pickle.load(fh)
|