From cf1bef6f531244ecaaeac6b71d6c7b4ed5f38bfc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 21 Nov 2021 20:51:01 -0800 Subject: [PATCH] big new feature: ability to inspect any paper to see the raw tfidf tokens and their weights that summarize the paper, and which powers the SVM recommendation engine. basically a bit of a debugging / insight feature, but a really good sanity check that papers are being properly represented --- compute.py | 2 ++ serve.py | 36 ++++++++++++++++++++++++++++++++++- static/paper_detail.js | 43 ++++++++++++++++++++++++++++++++++++++++++ static/paper_list.js | 13 +++---------- static/style.css | 39 ++++++++++++++++++++++++++++++++++++-- templates/inspect.html | 41 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 161 insertions(+), 13 deletions(-) create mode 100644 static/paper_detail.js create mode 100644 templates/inspect.html diff --git a/compute.py b/compute.py index a583c0b..7e93c1a 100644 --- a/compute.py +++ b/compute.py @@ -49,5 +49,7 @@ if __name__ == '__main__': features = { 'pids': list(pdb.keys()), 'x': x, + 'vocab': v.vocabulary_, + 'idf': v._tfidf.idf_, } pickle.dump(features, open('features.p', 'wb' )) diff --git a/serve.py b/serve.py index 7891057..99fbf1a 100644 --- a/serve.py +++ b/serve.py @@ -89,7 +89,8 @@ def svm_rank(tags: str = '', pid: str = ''): assert tags or pid # load all of the features - features = pickle.load(open('features.p', 'rb')) + with open('features.p', 'rb') as f: + features = pickle.load(f) x, pids = features['x'], features['pids'] n, d = x.shape ptoi, itop = {}, {} @@ -231,6 +232,39 @@ def search(): context['gvars']['search_query'] = q return render_template('index.html', **context) +@app.route('/inspect', methods=['GET']) +def inspect(): + + # fetch the paper of interest based on the pid + pid = request.args.get('pid', '') + pdb = get_papers() + if pid not in pdb: + return "error, malformed pid" # todo: better error handling + + # load the tfidf vectors, the vocab, and the idf table + with open('features.p', 'rb') as f: + features = pickle.load(f) + x = features['x'] + idf = features['idf'] + ivocab = {v:k for k,v in features['vocab'].items()} + pix = features['pids'].index(pid) + wixs = np.flatnonzero(np.asarray(x[pix].todense())) + words = [] + for ix in wixs: + words.append({ + 'word': ivocab[ix], + 'weight': float(x[pix, ix]), + 'idf': float(idf[ix]), + }) + words.sort(key=lambda w: w['weight'], reverse=True) + + # package everything up and render + paper = render_pids([pid])[0] + context = dict( + paper = paper, + words = words, + ) + return render_template('inspect.html', **context) @app.route('/add//') def add(pid=None, tag=None): diff --git a/static/paper_detail.js b/static/paper_detail.js new file mode 100644 index 0000000..306c627 --- /dev/null +++ b/static/paper_detail.js @@ -0,0 +1,43 @@ +'use strict'; + +const PaperLite = props => { + const p = props.paper; + return ( +
+ +
{p.authors}
+
{p.time}
+
{p.tags}
+
{p.summary}
+
+ ) +} + + +const Word = props => { + const p = props.word; + // word, weight, idf + return ( +
+
{p.weight.toFixed(2)}
+ {/*
{p.idf.toFixed(2)}
*/} +
{p.word}
+
+ ) +} + +const WordList = props => { + const lst = props.words; + const wlst = lst.map((jword, ix) => ); + return ( +
+
The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!
+
+ {wlst} +
+
+ ) +} + +ReactDOM.render(, document.getElementById('wrap')) +ReactDOM.render(, document.getElementById('wordwrap')) diff --git a/static/paper_list.js b/static/paper_list.js index fe69ae9..a06121e 100644 --- a/static/paper_list.js +++ b/static/paper_list.js @@ -20,6 +20,8 @@ const Paper = props => { .then(response => console.log(response.text())); const utags = p.utags.map((utxt, ix) => ); const similar_url = "/?rank=pid&pid=" + p.id; + const inspect_url = "/inspect?pid=" + p.id; + return (
{p.weight.toFixed(2)}
@@ -34,6 +36,7 @@ const Paper = props => {
{p.summary}
+ ) } @@ -77,15 +80,5 @@ const TagList = props => { ) } -const Opts = props => { - const g = props.gvars; - return ( -
- time filter (days): -
- ) -} - ReactDOM.render(, document.getElementById('wrap')) ReactDOM.render(, document.getElementById('tagwrap')) -//ReactDOM.render(, document.getElementById('cbox')) \ No newline at end of file diff --git a/static/style.css b/static/style.css index 7e01e41..82470be 100644 --- a/static/style.css +++ b/static/style.css @@ -25,7 +25,10 @@ body { margin: 10px 40px 0 40px; font-size: 18px; } - +#wordwrap { + margin: 10px 40px 0 40px; + font-size: 14px; +} .rel_title { display: inline-block; } @@ -139,4 +142,36 @@ body { #header a{ color: white; text-decoration: none; -} \ No newline at end of file +} +.rel_more { + font-size: 14px; + display: inline-block; + margin-right: 10px; +} +.rel_inspect { + font-size: 14px; + display: inline-block; + margin-right: 10px; +} +#wordList { + margin-top: 5px; + margin-bottom: 10px; +} +.rel_word { + display: inline-block; + background-color: #eef; + padding: 2px; + margin: 2px; +} +.rel_word_weight { + display: inline-block; + color: #009; +} +.rel_word_idf { + display: inline-block; + margin-left: 5px; +} +.rel_word_txt { + display: inline-block; + margin-left: 5px; +} diff --git a/templates/inspect.html b/templates/inspect.html new file mode 100644 index 0000000..553ffed --- /dev/null +++ b/templates/inspect.html @@ -0,0 +1,41 @@ + + + + + + +arxiv-sanity + + + + + + + + + + + + + +
+
+ +
+
+ + + + + + + + + + +