Big new feature: the ability to inspect any paper and see the raw tfidf tokens, together with their weights, that summarize the paper and power the SVM recommendation engine. It is basically a debugging / insight feature, but also a really good sanity check that papers are being properly represented.

2021-11-21 20:51:01 -08:00
parent e5798ddb2f
commit cf1bef6f53
6 changed files with 161 additions and 13 deletions
@@ -49,5 +49,7 @@ if __name__ == '__main__':
    features = {
        'pids': list(pdb.keys()),
        'x': x,
+        'vocab': v.vocabulary_,
+        'idf': v._tfidf.idf_,
    }
    pickle.dump(features, open('features.p', 'wb' ))
@@ -89,7 +89,8 @@ def svm_rank(tags: str = '', pid: str = ''):
    assert tags or pid

    # load all of the features
-    features = pickle.load(open('features.p', 'rb'))
+    with open('features.p', 'rb') as f:
+        features = pickle.load(f)
    x, pids = features['x'], features['pids']
    n, d = x.shape
    ptoi, itop = {}, {}
@@ -231,6 +232,39 @@ def search():
    context['gvars']['search_query'] = q
    return render_template('index.html', **context)

+@app.route('/inspect', methods=['GET'])
+def inspect():
+
+    # fetch the paper of interest based on the pid
+    pid = request.args.get('pid', '')
+    pdb = get_papers()
+    if pid not in pdb:
+        return "error, malformed pid" # todo: better error handling
+
+    # load the tfidf vectors, the vocab, and the idf table
+    with open('features.p', 'rb') as f:
+        features = pickle.load(f)
+    x = features['x']
+    idf = features['idf']
+    ivocab = {v:k for k,v in features['vocab'].items()}
+    pix = features['pids'].index(pid)
+    wixs = np.flatnonzero(np.asarray(x[pix].todense()))
+    words = []
+    for ix in wixs:
+        words.append({
+            'word': ivocab[ix],
+            'weight': float(x[pix, ix]),
+            'idf': float(idf[ix]),
+        })
+    words.sort(key=lambda w: w['weight'], reverse=True)
+
+    # package everything up and render
+    paper = render_pids([pid])[0]
+    context = dict(
+        paper = paper,
+        words = words,
+    )
+    return render_template('inspect.html', **context)

@app.route('/add/<pid>/<tag>')
 def add(pid=None, tag=None):
@@ -0,0 +1,43 @@
+'use strict';
+
+const PaperLite = props => {
+    const p = props.paper;
+    return (
+    <div class='rel_paper'>
+        <div class='rel_title'><a href={'http://arxiv.org/abs/' + p.id}>{p.title}</a></div>
+        <div class='rel_authors'>{p.authors}</div>
+        <div class="rel_time">{p.time}</div>
+        <div class='rel_tags'>{p.tags}</div>
+        <div class='rel_abs'>{p.summary}</div>
+    </div>
+    )
+}
+
+
+const Word = props => {
+    const p = props.word;
+    // word, weight, idf
+    return (
+    <div class='rel_word'>
+        <div class='rel_word_weight'>{p.weight.toFixed(2)}</div>
+        {/* <div class='rel_word_idf'>{p.idf.toFixed(2)}</div> */}
+        <div class="rel_word_txt">{p.word}</div>
+    </div>
+    )
+}
+
+const WordList = props => {
+    const lst = props.words;
+    const wlst = lst.map((jword, ix) => <Word key={ix} word={jword} />);
+    return (
+        <div>
+            <div>The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!</div>
+            <div id="wordList" class="rel_words">
+                {wlst}
+            </div>
+        </div>
+    )
+}
+
+// Mount the paper card into #wrap and its token list into #wordwrap.
+// `paper` and `words` are globals set by an inline script in the page template.
+ReactDOM.render(<PaperLite paper={paper} />, document.getElementById('wrap'))
+ReactDOM.render(<WordList words={words} />, document.getElementById('wordwrap'))
@@ -20,6 +20,8 @@ const Paper = props => {
                        .then(response => console.log(response.text()));
    const utags = p.utags.map((utxt, ix) => <UTag key={ix} tag={utxt} />);
    const similar_url = "/?rank=pid&pid=" + p.id;
+    const inspect_url = "/inspect?pid=" + p.id;
+
    return (
    <div class='rel_paper'>
        <div class="rel_score">{p.weight.toFixed(2)}</div>
@@ -34,6 +36,7 @@ const Paper = props => {
        </div>
        <div class='rel_abs'>{p.summary}</div>
        <div class='rel_more'><a href={similar_url}>similar</a></div>
+        <div class='rel_inspect'><a href={inspect_url}>inspect</a></div>
    </div>
    )
 }
@@ -77,15 +80,5 @@ const TagList = props => {
    )
 }

-const Opts = props => {
-    const g = props.gvars;
-    return (
-        <div>
-             time filter (days): <input type="text" value={g.time_filter} />
-        </div>
-    )
-}
-
 ReactDOM.render(<PaperList papers={papers} />, document.getElementById('wrap'))
 ReactDOM.render(<TagList tags={tags} />, document.getElementById('tagwrap'))
-//ReactDOM.render(<Opts gvars={gvars} />, document.getElementById('cbox'))
@@ -25,7 +25,10 @@ body {
    margin: 10px 40px 0 40px;
    font-size: 18px;
 }
-
+/* container that the inspect page renders its token list into */
+#wordwrap {
+    margin: 10px 40px 0 40px;
+    font-size: 14px;
+}
 .rel_title {
    display: inline-block;
 }
@@ -139,4 +142,36 @@ body {
 #header a{
    color: white;
    text-decoration: none;
-}
+}
+/* small inline links shown under each paper */
+.rel_more,
+.rel_inspect {
+    display: inline-block;
+    margin-right: 10px;
+    font-size: 14px;
+}
+/* token list on the inspect page */
+#wordList {
+    margin-top: 5px;
+    margin-bottom: 10px;
+}
+/* one token chip */
+.rel_word {
+    display: inline-block;
+    margin: 2px;
+    padding: 2px;
+    background-color: #eef;
+}
+.rel_word_weight {
+    display: inline-block;
+    color: #009;
+}
+/* fields that follow the weight inside a chip */
+.rel_word_idf,
+.rel_word_txt {
+    display: inline-block;
+    margin-left: 5px;
+}
@@ -0,0 +1,41 @@
+<!DOCTYPE HTML>
+<html>
+
+<head>
+<!-- meta info -->
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>arxiv-sanity</title>
+<!-- CSS -->
+<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
+<!-- Favicon -->
+<link rel="shortcut icon" type="image/png" href="{{ url_for('static', filename='favicon.png') }}" />
+
+<!-- Template variables serialized to JSON and exposed as globals for the React script loaded below -->
+<script>
+var paper = {{ paper | tojson }};
+var words = {{ words | tojson }};
+</script>
+
+</head>
+
+<body>
+
+<div id="header">
+    <a href="/">arxiv-sanity</a>
+</div>
+
+<!-- mount point for the paper card -->
+<div id="wrap">
+</div>
+
+<!-- mount point for the token list -->
+<div id="wordwrap">
+</div>
+
+<!-- React -->
+<script src="https://unpkg.com/react@16/umd/react.production.min.js" crossorigin></script>
+<script src="https://unpkg.com/react-dom@16/umd/react-dom.production.min.js" crossorigin></script>
+<!-- Babel for displaying JSX -->
+<script src="https://unpkg.com/babel-standalone@6/babel.min.js"></script>
+<!-- Load our React component -->
+<script src="{{ url_for('static', filename='paper_detail.js') }}" type="text/babel"></script>
+
+</body>
+</html>