sequester all file system IO ops only to db.py, so it's not total chaos
This commit is contained in:
+21
-2
@@ -1,5 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Database support functions
|
Database support functions.
|
||||||
|
The idea is that none of the individual scripts deal directly with the file system.
|
||||||
|
Any of the file system I/O and the associated settings are in this single file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sqlite3, zlib, pickle
|
import sqlite3, zlib, pickle
|
||||||
@@ -21,7 +23,6 @@ class CompressedSqliteDict(SqliteDict):
|
|||||||
super().__init__(*args, **kwargs, encode=encode, decode=decode)
|
super().__init__(*args, **kwargs, encode=encode, decode=decode)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
"""
|
"""
|
||||||
some docs to self:
|
some docs to self:
|
||||||
flag='c': default mode, open for read/write, and creating the db/table if necessary
|
flag='c': default mode, open for read/write, and creating the db/table if necessary
|
||||||
@@ -45,3 +46,21 @@ def get_tags_db(flag='r', autocommit=True):
|
|||||||
assert flag in ['r', 'c']
|
assert flag in ['r', 'c']
|
||||||
ddb = CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
|
ddb = CompressedSqliteDict(DICT_DB_FILE, tablename='tags', flag=flag, autocommit=autocommit)
|
||||||
return ddb
|
return ddb
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
"""
|
||||||
|
our "feature store" is currently just a pickle file, may want to consider hdf5 in the future
|
||||||
|
"""
|
||||||
|
|
||||||
|
FEATURES_FILE = 'features.p' # stores tfidf features and a bunch of other metadata
|
||||||
|
|
||||||
|
def save_features(features):
|
||||||
|
""" takes the features dict and save it to disk in a simple pickle file """
|
||||||
|
with open(FEATURES_FILE, 'wb') as f:
|
||||||
|
pickle.dump(features, f)
|
||||||
|
|
||||||
|
def load_features():
|
||||||
|
""" loads the features dict from disk """
|
||||||
|
with open(FEATURES_FILE, 'rb') as f:
|
||||||
|
features = pickle.load(f)
|
||||||
|
return features
|
||||||
|
|||||||
+4
-6
@@ -1,7 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Extracts features from all paper abstracts.
|
Extracts tfidf features from all paper abstracts and saves them to disk.
|
||||||
Saves them into one big features.p pickle file holding the numpy array
|
|
||||||
of features for all the paper abstracts...
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
@@ -10,7 +8,7 @@ import argparse
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
from aslite.db import get_papers_db
|
from aslite.db import get_papers_db, save_features
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
@@ -45,11 +43,11 @@ if __name__ == '__main__':
|
|||||||
x = v.transform(make_corpus()).astype(np.float32)
|
x = v.transform(make_corpus()).astype(np.float32)
|
||||||
print(x.shape)
|
print(x.shape)
|
||||||
|
|
||||||
print("saving to features.p")
|
print("saving to features to disk...")
|
||||||
features = {
|
features = {
|
||||||
'pids': list(pdb.keys()),
|
'pids': list(pdb.keys()),
|
||||||
'x': x,
|
'x': x,
|
||||||
'vocab': v.vocabulary_,
|
'vocab': v.vocabulary_,
|
||||||
'idf': v._tfidf.idf_,
|
'idf': v._tfidf.idf_,
|
||||||
}
|
}
|
||||||
pickle.dump(features, open('features.p', 'wb' ))
|
save_features(features)
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ from flask import render_template
|
|||||||
from flask import g # global session-level object
|
from flask import g # global session-level object
|
||||||
|
|
||||||
from aslite.db import get_papers_db, get_metas_db, get_tags_db
|
from aslite.db import get_papers_db, get_metas_db, get_tags_db
|
||||||
|
from aslite.db import load_features
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# TODO: user accounts / password login are necessary...
|
# TODO: user accounts / password login are necessary...
|
||||||
@@ -89,8 +90,7 @@ def svm_rank(tags: str = '', pid: str = ''):
|
|||||||
assert tags or pid
|
assert tags or pid
|
||||||
|
|
||||||
# load all of the features
|
# load all of the features
|
||||||
with open('features.p', 'rb') as f:
|
features = load_features()
|
||||||
features = pickle.load(f)
|
|
||||||
x, pids = features['x'], features['pids']
|
x, pids = features['x'], features['pids']
|
||||||
n, d = x.shape
|
n, d = x.shape
|
||||||
ptoi, itop = {}, {}
|
ptoi, itop = {}, {}
|
||||||
@@ -242,8 +242,7 @@ def inspect():
|
|||||||
return "error, malformed pid" # todo: better error handling
|
return "error, malformed pid" # todo: better error handling
|
||||||
|
|
||||||
# load the tfidf vectors, the vocab, and the idf table
|
# load the tfidf vectors, the vocab, and the idf table
|
||||||
with open('features.p', 'rb') as f:
|
features = load_features()
|
||||||
features = pickle.load(f)
|
|
||||||
x = features['x']
|
x = features['x']
|
||||||
idf = features['idf']
|
idf = features['idf']
|
||||||
ivocab = {v:k for k,v in features['vocab'].items()}
|
ivocab = {v:k for k,v in features['vocab'].items()}
|
||||||
|
|||||||
Reference in New Issue
Block a user