sequester all file system IO ops only to db.py, so it's not total chaos

This commit is contained in:
Andrej Karpathy
2021-11-25 13:28:04 -08:00
parent bd11865a49
commit 77279e1777
3 changed files with 28 additions and 12 deletions
+21 -2
View File
@@ -1,5 +1,7 @@
"""
Database support functions
Database support functions.
The idea is that none of the individual scripts deal directly with the file system.
Any of the file system I/O and the associated settings are in this single file.
"""
import sqlite3, zlib, pickle
@@ -21,7 +23,6 @@ class CompressedSqliteDict(SqliteDict):
super().__init__(*args, **kwargs, encode=encode, decode=decode)
# -----------------------------------------------------------------------------
"""
some docs to self:
flag='c': default mode, open for read/write, and creating the db/table if necessary
@@ -45,3 +46,21 @@ def get_tags_db(flag='r', autocommit=True):
assert flag in ['r', 'c']
ddb = CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
return ddb
# -----------------------------------------------------------------------------
"""
our "feature store" is currently just a pickle file, may want to consider hdf5 in the future
"""
FEATURES_FILE = 'features.p' # stores tfidf features and a bunch of other metadata
def save_features(features):
    """ pickles the given features dict and writes it out to FEATURES_FILE """
    # binary mode is required, pickle emits bytes
    with open(FEATURES_FILE, 'wb') as sink:
        pickle.dump(features, sink)
def load_features():
    """ reads FEATURES_FILE from disk and returns the unpickled features dict """
    with open(FEATURES_FILE, 'rb') as source:
        return pickle.load(source)
+4 -6
View File
@@ -1,7 +1,5 @@
"""
Extracts features from all paper abstracts.
Saves them into one big features.p pickle file holding the numpy array
of features for all the paper abstracts...
Extracts tfidf features from all paper abstracts and saves them to disk.
"""
import pickle
@@ -10,7 +8,7 @@ import argparse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from aslite.db import get_papers_db
from aslite.db import get_papers_db, save_features
# -----------------------------------------------------------------------------
@@ -45,11 +43,11 @@ if __name__ == '__main__':
x = v.transform(make_corpus()).astype(np.float32)
print(x.shape)
print("saving to features.p")
print("saving to features to disk...")
features = {
'pids': list(pdb.keys()),
'x': x,
'vocab': v.vocabulary_,
'idf': v._tfidf.idf_,
}
pickle.dump(features, open('features.p', 'wb' ))
save_features(features)
+3 -4
View File
@@ -19,6 +19,7 @@ from flask import render_template
from flask import g # global session-level object
from aslite.db import get_papers_db, get_metas_db, get_tags_db
from aslite.db import load_features
# -----------------------------------------------------------------------------
# TODO: user accounts / password login are necessary...
@@ -89,8 +90,7 @@ def svm_rank(tags: str = '', pid: str = ''):
assert tags or pid
# load all of the features
with open('features.p', 'rb') as f:
features = pickle.load(f)
features = load_features()
x, pids = features['x'], features['pids']
n, d = x.shape
ptoi, itop = {}, {}
@@ -242,8 +242,7 @@ def inspect():
return "error, malformed pid" # todo: better error handling
# load the tfidf vectors, the vocab, and the idf table
with open('features.p', 'rb') as f:
features = pickle.load(f)
features = load_features()
x = features['x']
idf = features['idf']
ivocab = {v:k for k,v in features['vocab'].items()}