diff --git a/send_emails.py b/send_emails.py index e4df822..62cfa30 100644 --- a/send_emails.py +++ b/send_emails.py @@ -50,6 +50,10 @@ body { color: #333; margin-bottom: 10px; } +.f { + color: #933; + display: inline-block; +} @@ -88,37 +92,62 @@ def calculate_recommendation( ptoi[p] = i itop[i] = p - # construct the positive set via simple union of all tags - y = np.zeros(n, dtype=np.float32) + # loop over all the tags + all_pids, all_scores = {}, {} for tag, pids in tags.items(): + + if len(pids) == 0: + continue + + # construct the positive set for this tag + y = np.zeros(n, dtype=np.float32) for pid in pids: y[ptoi[pid]] = 1.0 - # classify - clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1) - clf.fit(x, y) - s = clf.decision_function(x) - sortix = np.argsort(-s) - pids = [itop[ix] for ix in sortix] - scores = [100*float(s[ix]) for ix in sortix] + # classify + clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01) + clf.fit(x, y) + s = clf.decision_function(x) + sortix = np.argsort(-s) + pids = [itop[ix] for ix in sortix] + scores = [100*float(s[ix]) for ix in sortix] - # filter by time to only recent papers - deltat = time_delta*60*60*24 # allowed time delta in seconds - keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat] - pids, scores = [pids[i] for i in keep], [scores[i] for i in keep] + # filter by time to only recent papers + deltat = time_delta*60*60*24 # allowed time delta in seconds + keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat] + pids, scores = [pids[i] for i in keep], [scores[i] for i in keep] - # finally exclude the papers we already have tagged - have = set().union(*tags.values()) - keep = [i for i,pid in enumerate(pids) if pid not in have] - pids, scores = [pids[i] for i in keep], [scores[i] for i in keep] + # finally exclude the papers we already have tagged + have = set().union(*tags.values()) + keep = [i for i,pid in enumerate(pids) if pid not in have] + pids, scores = [pids[i] for i in keep], [scores[i] for i in keep] - return pids, scores + # store results + all_pids[tag] = pids + all_scores[tag] = scores + + + return all_pids, all_scores # ----------------------------------------------------------------------------- -def render_recommendations(user, tags, pids, scores): +def render_recommendations(user, tags, tag_pids, tag_scores): # render the paper recommendations into the html template + # first we are going to merge all of the papers / scores together using a MAX + max_score = {} + max_source_tag = {} + for tag in tag_pids: + for pid, score in zip(tag_pids[tag], tag_scores[tag]): + max_score[pid] = max(max_score.get(pid, -99999), score) # lol + if max_score[pid] == score: + max_source_tag[pid] = tag + + # now we have a dict of pid -> max score. sort by score + max_score_list = sorted(max_score.items(), key=lambda x: x[1], reverse=True) + pids, scores = zip(*max_score_list) + + # now render the html for each individual recommendation parts = [] n = min(len(scores), args.num_recommendations) for score, pid in zip(scores[:n], pids[:n]): @@ -134,12 +163,12 @@ def render_recommendations(user, tags, pids, scores):
%.2f
-%s +%s
(%s)
%s
%s
-""" % (score, p['link'], p['title'], authors, summary) +""" % (score, p['link'], p['title'], max_source_tag[pid], authors, summary) ) # render the final html @@ -239,6 +268,9 @@ if __name__ == "__main__": print("skipping user %s, no papers tagged" % (user, )) continue + # insert a fake entry in tags for the special "all" tag, which is the union of all papers + # tags['all'] = set().union(*tags.values()) + # calculate the recommendations pids, scores = calculate_recommendation(tags, time_delta=args.time_delta) print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta)) @@ -259,8 +291,8 @@ if __name__ == "__main__": send_email(email, html) num_sent += 1 - # zzz - time.sleep(1 + random.uniform(0, 2)) + # zzz? + # time.sleep(1 + random.uniform(0, 2)) print("done.") print("sent %d emails" % (num_sent, ))