From 33b2b018abc1adf8019991f230790a3781d1fc43 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Wed, 8 Dec 2021 00:17:37 -0800
Subject: [PATCH] Kind of big changes here: we can now inspect and see the most
 positive and negative words for a trained SVM, to help tune the value of C.
 There is also a UI for setting the value of C in the SVM. Finally, I adjusted
 the default value of C to 0.01 (it was 0.1 before) because the results and
 the weights look more sensible based on manual inspection. We need some
 dataset of people's libraries in order to potentially cross-validate a good
 value of C automatically. For now there are not enough active users of the
 site for such a thing to be attempted and succeed. Doing my best for now just
 by eyeballing.

---
 serve.py               | 40 ++++++++++++++++++++++++++++------------
 static/paper_detail.js | 27 ---------------------------
 static/paper_list.js   |  4 ++++
 static/style.css       | 22 ++++++++++++++++++++++
 static/word_list.js    | 30 ++++++++++++++++++++++++++++++
 templates/index.html   | 14 +++++++++++++-
 templates/inspect.html |  2 ++
 7 files changed, 99 insertions(+), 40 deletions(-)
 create mode 100644 static/word_list.js

diff --git a/serve.py b/serve.py
index 5ec22df..4a9fca9 100644
--- a/serve.py
+++ b/serve.py
@@ -114,12 +114,12 @@ def time_rank():
     scores = [(tnow - v['_time'])/60/60/24 for k, v in ms] # time delta in days
     return pids, scores
 
-def svm_rank(tags: str = '', pid: str = '', svm_c: str = ''):
+def svm_rank(tags: str = '', pid: str = '', C: float = 0.01):
 
     # tag can be one tag or a few comma-separated tags or 'all' for all tags we have in db
     # pid can be a specific paper id to set as positive for a kind of nearest neighbor search
     if not (tags or pid):
-        return [], []
+        return [], [], []
 
     # load all of the features
     features = load_features()
@@ -143,15 +143,9 @@ def svm_rank(tags: str = '', pid: str = '', svm_c: str = ''):
                     y[ptoi[pid]] = 1.0
 
     if y.sum() == 0:
-        return [], [] # there are no positives?
+        return [], [], [] # there are no positives?
 
     # classify
-    C = 0.1
-    if svm_c: # if a desired C is provided attempt to use it as a float
-        try:
-            C = float(svm_c)
-        except ValueError:
-            C = 1.0
     clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=C)
     clf.fit(x, y)
     s = clf.decision_function(x)
@@ -159,7 +153,18 @@ def svm_rank(tags: str = '', pid: str = '', svm_c: str = ''):
     pids = [itop[ix] for ix in sortix]
     scores = [100*float(s[ix]) for ix in sortix]
 
-    return pids, scores
+    # get the words that score most positively and most negatively for the svm
+    ivocab = {v:k for k,v in features['vocab'].items()} # index to word mapping
+    weights = clf.coef_[0] # (n_features,) weights of the trained svm
+    sortix = np.argsort(-weights)
+    words = []
+    for ix in list(sortix[:40]) + list(sortix[-20:]):
+        words.append({
+            'word': ivocab[ix],
+            'weight': weights[ix],
+        })
+
+    return pids, scores, words
 
 def search_rank(q: str = ''):
     if not q:
@@ -209,13 +214,20 @@ def main():
     if opt_q:
         opt_rank = 'search'
 
+    # try to parse opt_svm_c into something sensible (a float)
+    try:
+        C = float(opt_svm_c)
+    except ValueError:
+        C = 0.01 # sensible default, i think
+
     # rank papers: by tags, by time, by random
+    words = [] # only populated in the case of svm rank
     if opt_rank == 'search':
         pids, scores = search_rank(q=opt_q)
     elif opt_rank == 'tags':
-        pids, scores = svm_rank(tags=opt_tags, svm_c=opt_svm_c)
+        pids, scores, words = svm_rank(tags=opt_tags, C=C)
     elif opt_rank == 'pid':
-        pids, scores = svm_rank(pid=opt_pid, svm_c=opt_svm_c)
+        pids, scores, words = svm_rank(pid=opt_pid, C=C)
     elif opt_rank == 'time':
         pids, scores = time_rank()
     elif opt_rank == 'random':
@@ -257,6 +269,8 @@ def main():
     context = default_context()
     context['papers'] = papers
     context['tags'] = rtags
+    context['words'] = words
+    context['words_desc'] = "Here are the top 40 most positive and bottom 20 most negative weights of the SVM. If they don't look great then try tuning the regularization strength hyperparameter of the SVM, svm_c, above. Lower C is higher regularization."
     context['gvars'] = {}
     context['gvars']['rank'] = opt_rank
     context['gvars']['tags'] = opt_tags
@@ -264,6 +278,7 @@ def main():
     context['gvars']['time_filter'] = opt_time_filter
     context['gvars']['skip_have'] = opt_skip_have
     context['gvars']['search_query'] = opt_q
+    context['gvars']['svm_c'] = str(C)
     return render_template('index.html', **context)
 
 @app.route('/inspect', methods=['GET'])
@@ -296,6 +311,7 @@ def inspect():
     context = default_context()
     context['paper'] = paper
     context['words'] = words
+    context['words_desc'] = "The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!"
     return render_template('inspect.html', **context)
 
 @app.route('/profile')
diff --git a/static/paper_detail.js b/static/paper_detail.js
index 306c627..44b0457 100644
--- a/static/paper_detail.js
+++ b/static/paper_detail.js
@@ -13,31 +13,4 @@ const PaperLite = props => {
     )
 }
 
-
-const Word = props => {
-    const p = props.word;
-    // word, weight, idf
-    return (
-    <div class='rel_word'>
-        <div class='rel_word_weight'>{p.weight.toFixed(2)}</div>
-        {/* <div class='rel_word_idf'>{p.idf.toFixed(2)}</div> */}
-        <div class="rel_word_txt">{p.word}</div>
-    </div>
-    )
-}
-
-const WordList = props => {
-    const lst = props.words;
-    const wlst = lst.map((jword, ix) => <Word key={ix} word={jword} />);
-    return (
-        <div>
-            <div>The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!</div>
-            <div id="wordList" class="rel_words">
-                {wlst}
-            </div>
-        </div>
-    )
-}
-
 ReactDOM.render(<PaperLite paper={paper} />, document.getElementById('wrap'))
-ReactDOM.render(<WordList words={words} />, document.getElementById('wordwrap'))
diff --git a/static/paper_list.js b/static/paper_list.js
index 61c8504..7434911 100644
--- a/static/paper_list.js
+++ b/static/paper_list.js
@@ -80,12 +80,16 @@ const TagList = props => {
     const tlst = lst.map((jtag, ix) => <Tag key={ix} tag={jtag} />);
     const deleter = () => fetch("/del/" + prompt("delete tag name:"))
                           .then(response => console.log(response.text()));
+    // show the #wordwrap element if the user clicks inspect
+    const show_inspect = () => { document.getElementById("wordwrap").style.display = "block"; };
+    const inspect_elt = words.length > 0 ? <div id="inspect_svm" onClick={show_inspect}>inspect</div> : null;
     return (
         <div>
             <div class="rel_tag" onClick={deleter}>-</div>
             <div id="tagList" class="rel_utags">
                 {tlst}
             </div>
+            {inspect_elt}
         </div>
     )
 }
diff --git a/static/style.css b/static/style.css
index e4cc133..dac83a2 100644
--- a/static/style.css
+++ b/static/style.css
@@ -253,3 +253,25 @@ body {
     margin: 10px 40px 0 40px;
     font-size: 16px;
 }
+#tags_field {
+    width: 100px;
+}
+#pid_field {
+    width: 60px;
+}
+#time_filter_field {
+    width: 20px;
+}
+#svm_c_field {
+    width: 30px;
+}
+#tagList {
+    display: inline-block;
+}
+#inspect_svm {
+    display: inline-block;
+    font-size: 16px;
+    cursor: pointer;
+    text-decoration: underline;
+    color: #009;
+}
\ No newline at end of file
diff --git a/static/word_list.js b/static/word_list.js
new file mode 100644
index 0000000..9ba9531
--- /dev/null
+++ b/static/word_list.js
@@ -0,0 +1,30 @@
+'use strict';
+
+const Word = props => {
+    const p = props.word;
+    // word, weight, idf
+    return (
+    <div class='rel_word'>
+        <div class='rel_word_weight'>{p.weight.toFixed(2)}</div>
+        {/* <div class='rel_word_idf'>{p.idf.toFixed(2)}</div> */}
+        <div class="rel_word_txt">{p.word}</div>
+    </div>
+    )
+}
+
+const WordList = props => {
+    const lst = props.words;
+    const words_desc = props.words_desc;
+    const wlst = lst.map((jword, ix) => <Word key={ix} word={jword} />);
+    return (
+        <div>
+            <div>{words_desc}</div>
+            <div id="wordList" class="rel_words">
+                {wlst}
+            </div>
+        </div>
+    )
+}
+
+ReactDOM.render(<WordList words={words} words_desc={words_desc} />,
+                document.getElementById('wordwrap'));
diff --git a/templates/index.html b/templates/index.html
index 0f4058c..1d726ba 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -4,6 +4,8 @@
 <script>
 var papers = {{ papers | tojson }};
 var tags = {{ tags | tojson }};
+var words = {{ words | tojson }};
+var words_desc = {{ words_desc | tojson }};
 var gvars = {{ gvars | tojson }};
 </script>
 {% endblock %}
@@ -43,9 +45,13 @@ var gvars = {{ gvars | tojson }};
                 <input name="pid" type="text" id="pid_field" value="{{ gvars.pid }}">
 
                 <!-- current time_filter, in a text field -->
-                <label for="time_filter">time_filter: </label>
+                <label for="time_filter">time_filter (days): </label>
                 <input name="time_filter" type="text" id="time_filter_field" value="{{ gvars.time_filter }}">
 
+                <!-- current svm_c, in a text field -->
+                <label for="svm_c">svm_c: </label>
+                <input name="svm_c" type="text" id="svm_c_field" value="{{ gvars.svm_c }}">
+
                 <!-- current skip_have: one of yes or no -->
                 <label for="skip_have">skip_have: </label>
                 <select name="skip_have" id="skip_have_select">
@@ -75,10 +81,16 @@ var gvars = {{ gvars | tojson }};
 </div>
 {% endif %}
 
+{% if user and words %}
+<div id="wordwrap" style="display:none;">
+</div>
+{% endif %}
+
 <div id="wrap">
 </div>
 {% endblock %}
 
 {% block elements %}
 <script src="{{ url_for('static', filename='paper_list.js') }}" type="text/babel"></script>
+<script src="{{ url_for('static', filename='word_list.js') }}" type="text/babel"></script>
 {% endblock %}
diff --git a/templates/inspect.html b/templates/inspect.html
index 38b9da4..79b35bf 100644
--- a/templates/inspect.html
+++ b/templates/inspect.html
@@ -4,6 +4,7 @@
 <script>
 var paper = {{ paper | tojson }};
 var words = {{ words | tojson }};
+var words_desc = {{ words_desc | tojson }};
 </script>
 {% endblock %}
 
@@ -16,4 +17,5 @@ var words = {{ words | tojson }};
 
 {% block elements %}
 <script src="{{ url_for('static', filename='paper_detail.js') }}" type="text/babel"></script>
+<script src="{{ url_for('static', filename='word_list.js') }}" type="text/babel"></script>
 {% endblock %}