diff --git a/aslite/db.py b/aslite/db.py index 39a7e97..14aca0c 100644 --- a/aslite/db.py +++ b/aslite/db.py @@ -1,5 +1,7 @@ """ -Database support functions +Database support functions. +The idea is that none of the individual scripts deal directly with the file system. +Any of the file system I/O and the associated settings are in this single file. """ import sqlite3, zlib, pickle @@ -21,7 +23,6 @@ class CompressedSqliteDict(SqliteDict): super().__init__(*args, **kwargs, encode=encode, decode=decode) # ----------------------------------------------------------------------------- - """ some docs to self: flag='c': default mode, open for read/write, and creating the db/table if necessary @@ -45,3 +46,21 @@ def get_tags_db(flag='r', autocommit=True): assert flag in ['r', 'c'] ddb = CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit) return ddb + +# ----------------------------------------------------------------------------- +""" +our "feature store" is currently just a pickle file, may want to consider hdf5 in the future +""" + +FEATURES_FILE = 'features.p' # stores tfidf features and a bunch of other metadata + +def save_features(features): + """ takes the features dict and saves it to disk in a simple pickle file """ + with open(FEATURES_FILE, 'wb') as f: + pickle.dump(features, f) + +def load_features(): + """ loads the features dict from disk """ + with open(FEATURES_FILE, 'rb') as f: + features = pickle.load(f) + return features diff --git a/compute.py b/compute.py index 7e93c1a..6961fa9 100644 --- a/compute.py +++ b/compute.py @@ -1,7 +1,5 @@ """ -Extracts features from all paper abstracts. -Saves them into one big features.p pickle file holding the numpy array -of features for all the paper abstracts... +Extracts tfidf features from all paper abstracts and saves them to disk. 
""" import pickle @@ -10,7 +8,7 @@ import argparse import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer -from aslite.db import get_papers_db +from aslite.db import get_papers_db, save_features # ----------------------------------------------------------------------------- @@ -45,11 +43,11 @@ if __name__ == '__main__': x = v.transform(make_corpus()).astype(np.float32) print(x.shape) - print("saving to features.p") + print("saving to features to disk...") features = { 'pids': list(pdb.keys()), 'x': x, 'vocab': v.vocabulary_, 'idf': v._tfidf.idf_, } - pickle.dump(features, open('features.p', 'wb' )) + save_features(features) diff --git a/serve.py b/serve.py index 99fbf1a..43fd67d 100644 --- a/serve.py +++ b/serve.py @@ -19,6 +19,7 @@ from flask import render_template from flask import g # global session-level object from aslite.db import get_papers_db, get_metas_db, get_tags_db +from aslite.db import load_features # ----------------------------------------------------------------------------- # TODO: user accounts / password login are necessary... @@ -89,8 +90,7 @@ def svm_rank(tags: str = '', pid: str = ''): assert tags or pid # load all of the features - with open('features.p', 'rb') as f: - features = pickle.load(f) + features = load_features() x, pids = features['x'], features['pids'] n, d = x.shape ptoi, itop = {}, {} @@ -242,8 +242,7 @@ def inspect(): return "error, malformed pid" # todo: better error handling # load the tfidf vectors, the vocab, and the idf table - with open('features.p', 'rb') as f: - features = pickle.load(f) + features = load_features() x = features['x'] idf = features['idf'] ivocab = {v:k for k,v in features['vocab'].items()}