add thumbnails for papers, which apparently ppl like

2022-02-13 18:30:14 -08:00
parent f980c7947a
commit d7a303b410
4 changed files with 111 additions and 2 deletions
@@ -87,6 +87,8 @@ def render_pid(pid):
    # render a single paper with just the information we need for the UI
    pdb = get_papers()
    tags = get_tags()
    thumb_path = 'static/thumb/' + pid + '.jpg'
    thumb_url = thumb_path if os.path.isfile(thumb_path) else ''
    d = pdb[pid]
    return dict(
        weight = 0.0,
@@ -97,6 +99,7 @@ def render_pid(pid):
        tags = ', '.join(t['term'] for t in d['tags']),
        utags = [t for t, pids in tags.items() if pid in pids],
        summary = d['summary'],
        thumb_url = thumb_url,
    )
 def random_rank():
@@ -22,7 +22,7 @@ const Paper = props => {
    const utags = p.utags.map((utxt, ix) => <UTag key={ix} tag={utxt} />);
    const similar_url = "/?rank=pid&pid=" + p.id;
    const inspect_url = "/inspect?pid=" + p.id;
-
+    const thumb_img = p.thumb_url === '' ? null : <div class='rel_img'><img src={p.thumb_url} /></div>;
    // if the user is logged in then we can show add/sub buttons
    let utag_controls = null;
    if(user) {
@@ -43,6 +43,7 @@ const Paper = props => {
        <div class="rel_time">{p.time}</div>
        <div class='rel_tags'>{p.tags}</div>
        {utag_controls}
        {thumb_img}
        <div class='rel_abs'>{p.summary}</div>
        <div class='rel_more'><a href={similar_url}>similar</a></div>
        <div class='rel_inspect'><a href={inspect_url}>inspect</a></div>
@@ -3,6 +3,7 @@ body {
    padding: 0;
    font-family: sans-serif;
    line-height: 1.2;
    background-color: #eee;
 }
 #header {
    height: 24px;
@@ -50,7 +51,7 @@ body {
 .rel_paper {
    margin-bottom: 10px;
    padding: 10px;
-    background-color: #eee;
+    background-color: white;
    border-radius: 5px;
 }
@@ -0,0 +1,104 @@
 """
 Iterates over the current database and makes best effort to download the papers,
 convert them to thumbnail images and save them to disk, for display in the UI.
 Atm only runs the most recent 5K papers. Intended to be run as a cron job daily
 or something like that.
 """
 import os
 import time
 import random
 import requests
 from subprocess import Popen
 from aslite.db import get_papers_db, get_metas_db
 # working locations: TMP_DIR holds temporary files, THUMB_DIR the finished thumbnails
 TMP_DIR = 'tmp'
 THUMB_DIR = os.path.join('static', 'thumb')
 # make sure both directories exist before any work starts
 if not os.path.exists(TMP_DIR):
    os.makedirs(TMP_DIR)
 if not os.path.exists(THUMB_DIR):
    os.makedirs(THUMB_DIR)
 # open the databases; n is the total number of entries in the papers database
 pdb = get_papers_db()
 n = len(pdb)
 mdb = get_metas_db()
 # order the meta entries by their '_time' field, most recent first
 metas = sorted(mdb.items(), key=lambda kv: kv[1]['_time'], reverse=True)
 # keep the keys of at most the 5000 most recent entries; these are the candidates for thumbnails
 keys = [k for k, _ in metas[:5000]]
 # Main loop. For every selected paper that has no thumbnail yet: download its
 # pdf, render up to the first 8 pages with ImageMagick `convert`, and join the
 # page images into one jpg with `montage`. Everything is best effort: a failure
 # is printed and the paper is skipped; because no thumbnail file is written in
 # that case, the paper is attempted again on the next run of this script.
 for i, key in enumerate(keys):
    time.sleep(0.01) # for safety
    # the path where we would store the thumbnail for this key
    thumb_path = os.path.join(THUMB_DIR, key + '.jpg')
    if os.path.exists(thumb_path):
        continue
    # fetch the paper
    p = pdb[key]
    print("%d/%d: paper to process: %s" % (i, n, key))
    # get the link to the pdf
    url = p['link'].replace('abs', 'pdf')
    # temporary paths, all inside TMP_DIR and reused for every paper
    tmp_pdf = os.path.join(TMP_DIR, 'paper.pdf')
    first_page = os.path.join(TMP_DIR, 'thumb-0.png')
    lone_page = os.path.join(TMP_DIR, 'thumb.png')
    # attempt to download the pdf
    print("attempting to download pdf from: ", url)
    try:
        x = requests.get(url, timeout=10, allow_redirects=True)
        # an HTTP error response is not a pdf, so never hand its body to convert
        downloaded = x.ok
        if downloaded:
            with open(tmp_pdf, 'wb') as f:
                f.write(x.content)
            print("OK")
        else:
            print("error downloading the pdf at url", url)
            print("HTTP status code:", x.status_code)
    except Exception as e:
        print("error downloading the pdf at url", url)
        print(e)
        continue
    # take a breather; this also applies after an HTTP error, since the server was contacted
    time.sleep(5 + random.uniform(0, 5))
    if not downloaded:
        continue
    # move away the previous temporary page images if they exist, so pages left
    # over from an earlier paper cannot end up in this paper's thumbnail
    if os.path.isfile(first_page):
        for page in range(8):
            f1 = os.path.join(TMP_DIR, 'thumb-%d.png' % (page,))
            f2 = os.path.join(TMP_DIR, 'thumbbuf-%d.png' % (page,))
            if os.path.isfile(f1):
                os.replace(f1, f2) # overwrites f2 if present, like mv
    # NOTE(review): for a pdf with a single page ImageMagick writes the one frame
    # to 'thumb.png' instead of 'thumb-0.png' - confirm against the installed
    # version. Drop any leftover so a stale file is never taken for a fresh render.
    if os.path.isfile(lone_page):
        os.remove(lone_page)
    # convert pdf to png images per page. spawn async because convert can unfortunately enter an infinite loop, have to handle this.
    # for a multi-page pdf this generates independent images thumb-0.png ... thumb-7.png
    print("converting the pdf to png images")
    pp = Popen(['convert', '%s[0-7]' % (tmp_pdf, ), '-thumbnail', 'x156', lone_page])
    t0 = time.time()
    while pp.poll() is None and time.time() - t0 < 20: # give it 20 seconds deadline
        time.sleep(0.1)
    if pp.poll() is None:
        print("convert command did not terminate in 20 seconds, terminating.")
        pp.terminate() # give up
        # allow a short grace period, then force-kill and reap the child so
        # that no runaway or zombie process is left behind
        t1 = time.time()
        while pp.poll() is None and time.time() - t1 < 2:
            time.sleep(0.1)
        if pp.poll() is None:
            pp.kill()
        pp.wait()
        continue
    # normalise the single-page output name to the numbered scheme used below
    if os.path.isfile(lone_page) and not os.path.isfile(first_page):
        os.replace(lone_page, first_page)
    if not os.path.isfile(first_page):
        # failed to render pdf; no placeholder image is written, so this paper
        # will be attempted again on the next run
        print("could not render pdf, skipping")
        continue
    # otherwise concatenate the page images (up to 8) into one, in page order
    pages = []
    for page in range(8):
        page_path = os.path.join(TMP_DIR, 'thumb-%d.png' % (page,))
        if os.path.isfile(page_path):
            pages.append(page_path)
    # argument list instead of a shell string: nothing is parsed by a shell
    cmd = ['montage', '-mode', 'concatenate', '-quality', '80', '-tile', 'x1'] + pages + [thumb_path]
    print(' '.join(cmd))
    Popen(cmd).wait()
    # remove the temporary paper.pdf file
    if os.path.isfile(tmp_pdf):
        os.remove(tmp_pdf)