first leet codes

This commit is contained in:
Andrej Karpathy
2021-11-12 20:40:19 -08:00
parent 089adab199
commit 194b7f4b22
12 changed files with 820 additions and 0 deletions
+53
View File
@@ -0,0 +1,53 @@
"""
Extracts tf-idf features from all paper abstracts.
Saves them into one big features.p pickle file holding a dict with the
paper ids ('pids') and the matrix of features ('x') for all the paper abstracts...
"""
import pickle
import argparse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from aslite.db import SqliteDict, CompressedSqliteDict
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: fit a tf-idf vectorizer over every paper in
    # papers.db, transform the same corpus into a feature matrix, and pickle
    # the result (together with the paper ids) to features.p.

    # command-line knobs controlling the size / pruning of the vocabulary
    parser = argparse.ArgumentParser(description='Arxiv Computor')
    parser.add_argument('-n', '--num', type=int, default=10000, help='number of tfidf features')
    parser.add_argument('--min_df', type=int, default=5, help='min df')
    parser.add_argument('--max_df', type=float, default=0.5, help='max df')
    args = parser.parse_args()
    print(args)

    # word-level unigrams + bigrams; tokens must start with a letter or
    # underscore and be at least two characters long (see token_pattern)
    v = TfidfVectorizer(
        input='content',
        encoding='utf-8', decode_error='replace', strip_accents='unicode',
        lowercase=True, analyzer='word', stop_words='english',
        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
        ngram_range=(1, 2), max_features=args.num,
        norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
        max_df=args.max_df, min_df=args.min_df,
    )

    # read-only handle on the papers table
    pdb = CompressedSqliteDict('papers.db', tablename='papers', flag='r')

    def make_corpus():
        """Yield one document string per paper: title, summary and author names."""
        for _, d in pdb.items():
            author_str = ' '.join(a['name'] for a in d['authors'])
            yield ' '.join([d['title'], d['summary'], author_str])

    # the corpus is streamed twice (fit, then transform) instead of being
    # materialized in memory; make_corpus() builds a fresh generator each time
    print("training tfidf vectors...")
    v.fit(make_corpus())

    print("running inference...")
    x = v.transform(make_corpus()).astype(np.float32)
    print(x.shape)

    print("saving to features.p")
    features = {
        # NOTE(review): assumes pdb.keys() iterates in the same order as
        # pdb.items(), so that pids[i] corresponds to row i of x -- confirm
        'pids': list(pdb.keys()),
        'x': x,
    }
    # use a context manager so the file is flushed and closed deterministically
    # (the original passed a bare open(...) handle that was never closed)
    with open('features.p', 'wb') as f:
        pickle.dump(features, f)