Big new feature: the ability to inspect any paper and see the raw tf-idf tokens, with their weights, that summarize the paper and power the SVM recommendation engine. It is mostly a debugging / insight feature, but it is also a really good sanity check that papers are being represented properly.

This commit is contained in:
Andrej Karpathy
2021-11-21 20:51:01 -08:00
parent e5798ddb2f
commit cf1bef6f53
6 changed files with 161 additions and 13 deletions
+2
View File
@@ -49,5 +49,7 @@ if __name__ == '__main__':
features = {
'pids': list(pdb.keys()),
'x': x,
'vocab': v.vocabulary_,
'idf': v._tfidf.idf_,
}
pickle.dump(features, open('features.p', 'wb' ))
+35 -1
View File
@@ -89,7 +89,8 @@ def svm_rank(tags: str = '', pid: str = ''):
assert tags or pid
# load all of the features
features = pickle.load(open('features.p', 'rb'))
with open('features.p', 'rb') as f:
features = pickle.load(f)
x, pids = features['x'], features['pids']
n, d = x.shape
ptoi, itop = {}, {}
@@ -231,6 +232,39 @@ def search():
context['gvars']['search_query'] = q
return render_template('index.html', **context)
@app.route('/inspect', methods=['GET'])
def inspect():
    """Render the inspect page for one paper.

    Reads the `pid` query argument, looks the paper up, and lists every token
    that has a non-zero weight in the paper's row of the feature matrix stored
    in features.p, together with that weight and the token's idf value. Tokens
    are ordered from the highest weight to the lowest.

    Returns the rendered inspect.html template, or a plain error string when
    the pid is unknown or has no feature row.
    """
    # fetch the paper of interest based on the pid
    pid = request.args.get('pid', '')
    pdb = get_papers()
    if pid not in pdb:
        return "error, malformed pid" # todo: better error handling

    # load the tfidf vectors, the vocab, and the idf table
    with open('features.p', 'rb') as f:
        features = pickle.load(f)
    x = features['x']
    idf = features['idf']
    # the vocab maps token -> column index; invert it to go from column to token
    ivocab = {v:k for k,v in features['vocab'].items()}

    # a paper can be in the database without being in features.p (the feature
    # file is produced separately), so a missing pid must not crash the request
    try:
        pix = features['pids'].index(pid)
    except ValueError:
        return "error, no features have been computed for this pid"

    # densify the paper's row once and read the weights from it, instead of
    # doing one sparse scalar lookup per token
    row = np.asarray(x[pix].todense()).ravel()
    wixs = np.flatnonzero(row)
    words = [
        {
            'word': ivocab[ix],
            'weight': float(row[ix]),
            'idf': float(idf[ix]),
        }
        for ix in wixs
    ]
    words.sort(key=lambda w: w['weight'], reverse=True)

    # package everything up and render
    paper = render_pids([pid])[0]
    context = dict(
        paper = paper,
        words = words,
    )
    return render_template('inspect.html', **context)
@app.route('/add/<pid>/<tag>')
def add(pid=None, tag=None):
+43
View File
@@ -0,0 +1,43 @@
'use strict';
const PaperLite = props => {
const p = props.paper;
return (
<div class='rel_paper'>
<div class='rel_title'><a href={'http://arxiv.org/abs/' + p.id}>{p.title}</a></div>
<div class='rel_authors'>{p.authors}</div>
<div class="rel_time">{p.time}</div>
<div class='rel_tags'>{p.tags}</div>
<div class='rel_abs'>{p.summary}</div>
</div>
)
}
const Word = props => {
const p = props.word;
// word, weight, idf
return (
<div class='rel_word'>
<div class='rel_word_weight'>{p.weight.toFixed(2)}</div>
{/* <div class='rel_word_idf'>{p.idf.toFixed(2)}</div> */}
<div class="rel_word_txt">{p.word}</div>
</div>
)
}
const WordList = props => {
const lst = props.words;
const wlst = lst.map((jword, ix) => <Word key={ix} word={jword} />);
return (
<div>
<div>The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!</div>
<div id="wordList" class="rel_words">
{wlst}
</div>
</div>
)
}
ReactDOM.render(<PaperLite paper={paper} />, document.getElementById('wrap'))
ReactDOM.render(<WordList words={words} />, document.getElementById('wordwrap'))
+3 -10
View File
@@ -20,6 +20,8 @@ const Paper = props => {
.then(response => console.log(response.text()));
const utags = p.utags.map((utxt, ix) => <UTag key={ix} tag={utxt} />);
const similar_url = "/?rank=pid&pid=" + p.id;
const inspect_url = "/inspect?pid=" + p.id;
return (
<div class='rel_paper'>
<div class="rel_score">{p.weight.toFixed(2)}</div>
@@ -34,6 +36,7 @@ const Paper = props => {
</div>
<div class='rel_abs'>{p.summary}</div>
<div class='rel_more'><a href={similar_url}>similar</a></div>
<div class='rel_inspect'><a href={inspect_url}>inspect</a></div>
</div>
)
}
@@ -77,15 +80,5 @@ const TagList = props => {
)
}
const Opts = props => {
const g = props.gvars;
return (
<div>
time filter (days): <input type="text" value={g.time_filter} />
</div>
)
}
// Mount the paper list into #wrap and the tag list into #tagwrap.
// NOTE(review): `papers` and `tags` are not defined in the visible code; presumably globals set by the page template — confirm.
ReactDOM.render(<PaperList papers={papers} />, document.getElementById('wrap'))
ReactDOM.render(<TagList tags={tags} />, document.getElementById('tagwrap'))
//ReactDOM.render(<Opts gvars={gvars} />, document.getElementById('cbox'))
+37 -2
View File
@@ -25,7 +25,10 @@ body {
margin: 10px 40px 0 40px;
font-size: 18px;
}
/* container that the token list is mounted into */
#wordwrap {
  font-size: 14px;
  margin: 10px 40px 0;
}
/* the paper title is laid out as an inline block rather than a full-width row */
.rel_title {
display: inline-block;
}
@@ -139,4 +142,36 @@ body {
/* links inside #header: white text, no underline */
#header a {
  color: white;
  text-decoration: none;
}
/* the "similar" and "inspect" links under a paper share one look and sit side by side */
.rel_more,
.rel_inspect {
  display: inline-block;
  margin-right: 10px;
  font-size: 14px;
}
/* vertical spacing around the token list */
#wordList {
  margin-top: 5px;
  margin-bottom: 10px;
}

/* one token chip: weight followed by the token text */
.rel_word {
  display: inline-block;
  margin: 2px;
  padding: 2px;
  background-color: #eeeeff;
}

/* the weight is shown in dark blue */
.rel_word_weight {
  display: inline-block;
  color: #000099;
}

/* idf value and token text share the same layout */
.rel_word_idf,
.rel_word_txt {
  display: inline-block;
  margin-left: 5px;
}
+41
View File
@@ -0,0 +1,41 @@
<!DOCTYPE HTML>
<html>
<head>
<!-- meta info -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>arxiv-sanity</title>
<!-- CSS -->
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
<!-- Favicon -->
<link rel="shortcut icon" type="image/png" href="{{ url_for('static', filename='favicon.png') }}" />
<!-- Data hand-off: the template variables paper and words are serialized to JSON and exposed as page globals for the script loaded at the bottom -->
<script>
var paper = {{ paper | tojson }};
var words = {{ words | tojson }};
</script>
</head>
<body>
<div id="header">
<a href="/">arxiv-sanity</a>
</div>
<!-- mount point for the paper summary -->
<div id="wrap">
</div>
<!-- mount point for the token list -->
<div id="wordwrap">
</div>
<!-- React -->
<script src="https://unpkg.com/react@16/umd/react.production.min.js" crossorigin></script>
<script src="https://unpkg.com/react-dom@16/umd/react-dom.production.min.js" crossorigin></script>
<!-- Babel for displaying JSX -->
<script src="https://unpkg.com/babel-standalone@6/babel.min.js"></script>
<!-- Load our React component -->
<script src="{{ url_for('static', filename='paper_detail.js') }}" type="text/babel"></script>
</body>
</html>