David Blume's GitList
techcrunch.git / techcrunch.py
David Blume committed 297a766 at 2018-01-21 14:55:04: Moved SMTP credentials to smtp_creds
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Testing without affecting the yaml file and saving the updated one aside:
# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; \
# cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml

import feedparser
import yaml
import sys
import os
import time
import StringIO
import codecs
import traceback
import calendar
import pickle
import exceptions
import urllib
import urllib2
import httplib
import shutil
import glob
import smtplib
import analysis
import json
import xml
import texttime
import operator
from datetime import timedelta
import cgi
import smtp_creds  # Your own credentials, used in send_email()

debug = True
any_entry_added = False
tags_to_post = set(['apple', 'google'])
authors_to_post = ['michael arrington',]

# TODO 2018-01-18: Maybe combine fb_likes with fb_shares or something...
rhs_metric = 'fb_likes'
rhs_metric_times = 'comment_times'

localdir = ''

html_head = """
<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>
<HTML><HEAD>
  <title>TechCrunch Feed Filter</title>
  <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> -->
  <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" />
  <style type="text/css">
    body { font-family: "Arial", sans-serif; }
    .author { font-size: smaller; }
    .h3 { font-size: larger; }
    a { text-decoration: none; }
    /* table { border: none; border-collapse:collapse; font-size: large } */
    table { border-collapse: collapse; }
    table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separate; }
    table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; }
    table.legend td { border: 1px solid LightSlateGray; }
    tr.even { background:#%s; padding: 2em; }
    tr.odd { background:#%s; padding-bottom: 2em; }
  </style>
</HEAD>
<BODY>
<div align='center'><h3>TechCrunch Feed Filter</h3></div>
This page shows what analysis is done to filter the noise away from the Techcrunch feed into
<a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.
<a href="http://david.dlma.com/blog/my-techcrunch-feed-filter">Learn more about the Feed Filter</a>.<br /><br /> """ html_footer = """ </table> </div><br /> <div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>, <a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a><br />© 2011 <a href="http://david.dlma.com">David Blume</a></div><br /> </BODY> </HTML> """ img_width = 300 img_height = 50 series_1_color = "0000FF" series_2_color = "00AA00" threshold_color = "FF8C00" tag_color = "F01000" even_background = "F8F8F8" odd_background = "E8E8E8" even_watermark = "E0E0FF" odd_watermark = "D0D0F0" def asciiize(s): try: return s.encode('ascii') except UnicodeEncodeError as e: return s except exceptions.AttributeError as e: return s def send_email(subject, message, toaddrs, fromaddr='"%s" <%s>' % (os.path.basename(__file__), smtp_creds.user)): """ Sends Email """ smtp = smtplib.SMTP(smtp_creds.server, port=smtp_creds.port) smtp.login(smtp_creds.user, smtp_creds.passw) smtp.sendmail(fromaddr, \ toaddrs, \ "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \ (fromaddr, ", ".join(toaddrs), subject, message)) smtp.quit() def index_id(a_list, op, elem): try: return (index for index, item in enumerate(a_list) if op(item, elem)).next() except: return -1 def make_chart_url(time_posted, lhs_times, lhs_values, rhs_times, rhs_values, threshold_value, is_odd_row, tag_hit): # lhs_times, lhs_values = zip(*comments) # rhs_times, rhs_values = zip(*rhs) if not len(lhs_times): lhs_times = [time_posted,] if not len(lhs_values): lhs_values = [0,] if not len(rhs_times): rhs_times = [time_posted,] if not len(rhs_values): rhs_values = [0,] lhs_times = [(i - time_posted) / 1800 for i in lhs_times] rhs_times = [(i - time_posted) / 1800 for i in rhs_times] min_comment_time = min(lhs_times) max_comment_time = max(lhs_times) min_comment_value = min(lhs_values) max_comment_value = max(lhs_values) min_rhs_time = min(rhs_times) max_rhs_time = max(rhs_times) min_rhs_value = min(rhs_values) max_rhs_value = max(rhs_values) met_threshold_pt = -1 if threshold_value != -1: met_threshold_pt = index_id(rhs_values, operator.ge, threshold_value) if met_threshold_pt == -1 or tag_hit: # This can happen if threshold_value was set to a number # because the author or a tag was matched, but the article # was unpopular. We choose to put a marker at point index 0. 
            met_threshold_pt = 0

    if is_odd_row != 0:
        bg_color = even_background
        watermark_color = even_watermark
    else:
        bg_color = odd_background
        watermark_color = odd_watermark

    if len(lhs_values) < 8 and len(lhs_values) > 1:
#        max_comment_value *= 2
        pass
    elif len(lhs_values) == 1:
        min_comment_value = 0

    if len(rhs_values) < 8 and len(rhs_values) > 1:
#        max_rhs_value *= 2
        pass
    elif len(rhs_values) == 1:
        min_rhs_value = 0

    min_comment_value = 0
    min_rhs_value = 0

    chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \
                (series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color)
    chart_url += "&chd=t:%s|%s|%s|%s" % (','.join([str(n) for n in lhs_times]),
                                         ','.join([str(n) for n in lhs_values]),
                                         ','.join([str(n) for n in rhs_times]),
                                         ','.join([str(n) for n in rhs_values]))

    # TODO: Consider watermark levels, like:
    # chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5
    if max_rhs_value > 0:
        threshold_percent = max(0, min((float(threshold_value) / max_rhs_value) - 0.01, 1.0))
    else:
        threshold_percent = 1.0
    chart_url += "&chm=r,%s,0,0,%1.3f" % (watermark_color, threshold_percent)
    if met_threshold_pt != -1:
        if tag_hit:
            dot_color = tag_color
            dot_shape = 'd'
        else:
            dot_color = threshold_color
            dot_shape = 'o'
        chart_url += "|%s,%s,1,%d,10" % (dot_shape, dot_color, met_threshold_pt)
    chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
                 (min_comment_value, max_comment_value,
                  min_rhs_value, max_rhs_value,
                  0, max(7, max_comment_time),
                  min_comment_value, max_comment_value,
                  0, max(7, max_rhs_time),
                  min_comment_value, max_rhs_value)
    chart_url += "&chf=bg,s,%s&chdl=comments|shares" % (bg_color,)
    return chart_url


def process_feed(yaml_items):
    """Retrieve the url and process it.

    feed_info (in, out) A tuple that describes an individual feed, like its name and etag.
    """
    feed = feedparser.parse('http://feeds.feedburner.com/TechCrunch')
    if hasattr(feed, 'status'):
        if feed.status == 304:
            pass
        else:
            feed_is_modified = True
            if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
                if feed.status == 503:
                    print "the feed is temporarily unavailable."
                elif feed.status == 400:
                    print "the feed says we made a bad request."
                elif feed.status == 502:
                    print "the feed reported a bad gateway error."
                elif feed.status == 404:
                    print "the feed says the page was not found."
                elif feed.status == 500:
                    print "the feed had an internal server error."
                elif feed.status == 403:
                    print "Access to the feed was forbidden."
                else:
                    print "the feed returned feed.status %d." % ( feed.status, )
            else:
                # Save off this
                if hasattr(feed, 'bozo_exception') and isinstance(feed.bozo_exception, xml.sax._exceptions.SAXParseException):
                    print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception))
                else:
                    try:
                        with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f:
                            pickle.dump(feed, f)
                    except(pickle.PicklingError, exceptions.TypeError) as e:
                        print "An error occurred while pickling the feed: %s." % \
                              (# str(e.__class__),
                               str(e))
                        traceback.print_exc(3, file=sys.stdout)
                        feed_is_modified = False

                for i in reversed(feed.entries):
                    process_item(i, yaml_items)

                # If we have more than 200 items, remove the old ones.
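                # New items are inserted at the front of yaml_items, so pop()
                # below discards the oldest entries once the list exceeds 200.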
                while len(yaml_items) > 200:
                    yaml_items.pop()

                for i in yaml_items:
#                    i['title'] = asciiize(i['title'])
#                    i['tags'] = map(asciiize, i['tags'])
                    process_yaml_item(i)
    else:
        if hasattr(feed, 'bozo_exception'):
            e = feed.bozo_exception
            if isinstance(e, urllib2.URLError):
                print_last_line = True
                if hasattr(e, 'reason'):
                    if e.reason[0] == 110:
                        print "the feed's connection timed out."
                        print_last_line = False
                    elif e.reason[0] == 111:
                        print "the feed's connection was refused."
                        print_last_line = False
                    elif e.reason[0] == 104:
                        print "the feed reset the connection."
                        print_last_line = False
                    else:
                        print "the feed had a URLError with reason %s." % (str(e.reason),)
                        print_last_line = False
                if print_last_line:
                    print "the feed had a URLError %s" % (str(e),)
            elif isinstance(e, httplib.BadStatusLine):
                print "the feed gave a bad status line. (%s)" % (str(e),)
            else:
                if len(str(e)):
                    print "the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e))
                else:
                    print "the feed bozo_exception: %s %s" % (str(e.__class__), repr(e))
        else:
            print "the feed returned class %s, %s" % (str(feed.__class__), str(feed))


def process_item(feed_item, yaml_items):
    """Processes an RSS feed item, and converts it to a YAML item"""
    # Get the time
    global any_entry_added
    timecode_now = int(time.time())
    date_parsed = time.gmtime()
    if hasattr(feed_item, 'issued_parsed'):
        date_parsed = feed_item.issued_parsed
        date_set = True
    elif hasattr(feed_item, 'date_parsed'):
        date_parsed = feed_item.date_parsed
        date_set = True
    else:
        print "process_item found no timestamp for", asciiize(feed_item.link)
    timecode_parsed = calendar.timegm(date_parsed)

    link = feed_item.link
    if hasattr(feed_item, 'feedburner_origlink'):
        link = feed_item.feedburner_origlink

    # TODO 2018-01-18: Leave in the ncid for URL clicks, but remove during processing.
#    suffix_to_remove = '?ncid=rss'
#    if link.endswith(suffix_to_remove):
#        link = link[:-len(suffix_to_remove)]

    # Look for i.feedburner_origlink in yaml_items
    yaml_item = None
    for i in yaml_items:
        if link == i['link']:
            yaml_item = i
            break

    if yaml_item is None:
        author = ''
        if hasattr(feed_item, 'author'):
            author = asciiize(feed_item.author)

        # Make a new yaml_item
        yaml_item = {'title'               : asciiize(feed_item.title),
                     'link'                : asciiize(link),
                     'author'              : author,
                     'tags'                : [],
                     'orig_posted'         : timecode_parsed,
                     'qualified'           : -1,
                     'comment_times'       : [],
                     'fb_comments'         : [],
                     'fb_shares'           : [],
                     'fb_likes'            : [],
                     'slash_comment_times' : [],
                     'slash_comments'      : []
                    }
        if hasattr(feed_item, 'tags'):
            for i in feed_item.tags:
                yaml_item['tags'].append(asciiize(i.term))

        yaml_items.insert(0, yaml_item)
        any_entry_added = True

    # Maybe check to ensure that this item isn't too old.
    if timecode_parsed < timecode_now - 60 * 30 * 9:
        return

    # Now, add the new values
    if hasattr(feed_item, 'slash_comments') and len(yaml_item['slash_comments']) < 8:
        any_entry_added = True
        yaml_item['slash_comment_times'].append(timecode_now)
        yaml_item['slash_comments'].append(int(feed_item.slash_comments))


def process_yaml_item(yaml_item):
    global any_entry_added

    # Related to TODO 2018-01-18: Remove ncid only during processing.
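    # Strip feedburner's tracking suffix so the canonical article URL is what
    # gets sent to the Facebook Graph lookup below.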
    link = yaml_item['link']
    suffix_to_remove = '?ncid=rss'
    # Maybe we should find() it instead, in case feedburner adds other options
    if link.endswith(suffix_to_remove):
        link = link[:-len(suffix_to_remove)]

    timecode_now = int(time.time())
    if len(yaml_item['fb_comments']) < 8:
        num_shares, num_comments, num_likes = Get_fb_stats(link)
        if num_comments != -1:
            any_entry_added = True
            yaml_item['comment_times'].append(timecode_now)
            yaml_item['fb_shares'].append(num_shares)
            yaml_item['fb_comments'].append(num_comments)
            yaml_item['fb_likes'].append(num_likes)

#    if len(yaml_item['reddit_']) < 8:
#        num_ = Get_reddit_stats(link)
#        if num_ != -1:
#            any_entry_added = True
#            yaml_item['reddit_times'].append(timecode_now)
#            yaml_item['reddit_'].append(num_)


def Get_reddit_stats(url_string):
    """ Consider curl "https://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg" """
    return -1


def Get_fb_stats(url_string):
    """Use graph's "engagement" field to get reactions and shares."""
    shares = -1
    comments = -1
    likes = -1

    url_string = url_string.encode('utf-8')

    try:
        encoded = urllib.urlencode({'access_token': facebook_token})
        url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s'
        f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded))
        data = f.read()
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine) as e:
        if hasattr(e, 'reason'):  # URLError
            if hasattr(e, 'code'):
                print "Get_fb_stats got an error (1):", e.code, e.reason, url_string
            else:
                print "Get_fb_stats got an error (2):", e.reason, url_string
        elif hasattr(e, 'code'):  # URLError
            print "Get_fb_stats got an error. Code:", e.code, url_string
        else:
            print "Get_fb_stats got an error (3):", str(e)
        return shares, comments, likes
    except KeyError as e:
        print "Get_fb_stats got a key error 1e (%s)" % (str(e), )
        print "Get_fb_stats got a key error 2.2 enc (%s)" % (encoded, )
        print url_string.encode('utf-8')
        print u"Get_fb_stats got a key error 2.1 url (%s)" % (url_string, )
        print "Get_fb_stats got a key error 3q (%s)" % (urllib.quote_plus(url_string))
        print "Get_fb_stats got a key error 4 (%s)" % (url % (urllib.quote_plus(url_string), encoded))
        print "Get_fb_stats got a key error 5 (%s) for url %s:" % (str(e), url % (urllib.quote_plus(url_string), encoded))
        return shares, comments, likes

    if len(data) > 20:
        d = json.loads(data)['engagement']
        try:
            shares = d['share_count']
        except KeyError:
            shares = 0

        try:
            likes = d['reaction_count']
        except KeyError:
            likes = 0

        # TODO 2018-01-18: og_object metric was likes + shares + comments
        # Here we'll combine likes and shares, and comments with plugin_comments
        likes += shares

        try:
            comments = d['comment_plugin_count'] + d['comment_count']
        except KeyError:
            comments = 0
    else:
        print "Get_fb_stats got too little data for ", url_string

    return shares, comments, likes


def save_image(url_string, file_path):
    try:
        f = urllib2.urlopen(url_string)
        data = f.read()
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine) as e:
        if hasattr(e, 'reason'):  # URLError
            print "save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason
        elif hasattr(e, 'code'):  # URLError
            print "save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Code:", e.code
        else:
            print "save_image: Error from urlopen", e
        return url_string

    if len(data) > 50:
        with open(file_path, 'wb') as f:
            f.write(data)
        return 'cache/' + os.path.basename(file_path)
    return url_string


def make_index_html(yaml_items, weekend_stats, weekday_stats):
    """Writes a static index.html file from the YAML items."""
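    # Renders a chart for each of the newest 40 items, caches the chart images
    # under cache/, writes index.html_new, then swaps it into place as index.html.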
    cur_time = int(time.time())
    new_index_fullpath = os.path.join(localdir, 'index.html_new')
    index_fullpath = os.path.join(localdir, 'index.html')
    cache_path = os.path.join(localdir, 'cache')

    files_to_delete = glob.glob(os.path.join(cache_path, '*.png'))

    with codecs.open(new_index_fullpath, 'w', 'utf-8') as f:
        f.write(html_head % (even_background, odd_background))
        f.write('<div align="center">\n<table class="legend">\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n')
        f.write('<tr><th>Weekday</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' %
                (weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2]))
        f.write('<tr><th>Weekend</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' %
                (weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2]))
        f.write('</table></div>\n<br />\n')
        f.write('<div align="center">\n<table>\n')
        for image_index, image in enumerate(yaml_items[:40]):
            tag_hit = False
            if image['author'].lower() in authors_to_post:
                tag_hit = True
            elif len(set([j.lower() for j in image['tags']]) & tags_to_post) > 0:
                tag_hit = True
            chart_url = make_chart_url(image['orig_posted'],
                                       image['comment_times'],
                                       image['fb_comments'],
                                       image[rhs_metric_times],
                                       image[rhs_metric],
                                       image['qualified'],
                                       image_index % 2,
                                       tag_hit
                                      )
            image_url = save_image(chart_url, os.path.join(cache_path, '%d_%d.png' % (cur_time, image_index)))
            f.write('<tr valign="center" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
                    (image_index % 2 and "even" or "odd",
                     image['link'],
                     image['title'].encode('ascii', 'xmlcharrefreplace'),
                     image['author'].encode('ascii', 'xmlcharrefreplace'),
                    )
                   )
            f.write(' <td>%s<td>\n' % (image['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or ''))
            f.write(' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \
                    (image_url,
                     img_width,
                     img_height
                    )
                   )
        f.write(html_footer)

    if os.path.exists(index_fullpath):
        os.unlink(index_fullpath)
    shutil.move(new_index_fullpath, index_fullpath)
    for fname in files_to_delete:
        os.unlink(fname)


def make_feed_file(yaml_items):
    """Writes the RSS feed file with the YAML items."""
    with codecs.open(os.path.join(localdir, 'rss_feed.xml'), 'wb', 'utf-8') as f:
        f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
        f.write("<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % (time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
        count = 0
        for item in yaml_items:
            now = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(item['orig_posted']))
            if item['qualified'] != -1:
                escaped_title = cgi.escape(item['title']).encode('ascii', 'xmlcharrefreplace')
                escaped_author = cgi.escape(item['author']).encode('ascii', 'xmlcharrefreplace')
                f.write("<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \
                        (escaped_title, now, item['link'], item['link'], escaped_author))
                count += 1
                if count > 14:
                    break
        f.write("</channel></rss>")


if __name__=='__main__':
    start_time = time.time()
    progress_text = []

    old_stdout = sys.stdout
    old_stderr = sys.stderr
    sys.stdout = sys.stderr = StringIO.StringIO()

    try:
        localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199
        #     'tags'                : [ u'Google', u'privacy' ]
        #     'qualified'           : -1
        #     'comment_times'       : [ 1282197199, 1282197407 ]
        #     'fb_comments'         : [ 0, 5 ]
        #     'fb_shares'           : [ 0, 300 ]
        #     'fb_likes'            : [ 0, 19 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 5 ]
        #   },
        #   { ... }
        # ]
        #
        yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
        if os.path.exists(yaml_fullpath):
            with open(yaml_fullpath, 'rb') as f:
                items = yaml.load(f)

            if items is None:
                print yaml_fullpath, "exists, but was empty."
                items = []

            # Do any dictionary item updating that might be necessary
#            for item in items:
#                if not item.has_key('fb_shares'):
#                    item['fb_shares'] = []
        else:
            print "could not open", yaml_fullpath
            items = []

        with open(os.path.join(localdir, 'facebook-token.txt'), 'r') as f:
            facebook_token = f.read()

        progress_text = ["read techcrunch.yaml"]
        process_feed(items)

        #
        # If any work was done, then write files.
        #
        if any_entry_added:
            weekend_stats, weekday_stats = analysis.Process_feed(items, rhs_metric, rhs_metric_times)

            # We'll only look at the stats for the time 1:00 to 1:30 after posting.
            weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
            weekend_threshold = weekend_mean + weekend_sigma
            weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
            weekday_threshold = weekday_mean + weekday_sigma
            for item in items:
                wday = time.localtime(item['orig_posted']).tm_wday
                if wday == 5 or wday == 6:
                    threshold = weekend_threshold
                else:
                    threshold = weekday_threshold
                if item['qualified'] == -1:
                    for i in range(len(item[rhs_metric_times])):
                        r_time = item[rhs_metric_times][i]
                        if r_time - item['orig_posted'] < 5400:
                            if item[rhs_metric][i] >= threshold:
                                item['qualified'] = threshold
                            if r_time - item['orig_posted'] >= 3600:
                                break

            # Automatically add those items whose authors and tags I like
            for item in items:
                if item['qualified'] == -1 and len(item[rhs_metric_times]) > 0:
                    if item['author'].lower() in authors_to_post:
                        item['qualified'] = threshold
                    elif len(set([j.lower() for j in item['tags']]) & tags_to_post) > 0:
                        item['qualified'] = threshold

            #
            # Write out the updated yaml file.
            #

            # For the one file we really use, write to a file on the side, then move it.
            yaml_newfile_fullpath = os.path.join(localdir, 'techcrunch_temp_writable.yaml')
            with open(yaml_newfile_fullpath, 'wb') as f:
                yaml.dump(items, f, width=120)
            try:
                os.rename(yaml_newfile_fullpath, yaml_fullpath)
            except OSError as e:
                print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath)
            with open(os.path.join(localdir, 'techcrunch_text.yaml'), 'w') as f:
                yaml.dump(items, f, width=120)
            with codecs.open(os.path.join(localdir, 'techcrunch_unicode.yaml'), 'w', 'utf-8') as f:
                yaml.dump(items, f, encoding='utf-8', width=120)

            make_feed_file(items)

            make_index_html(items, weekend_stats, weekday_stats)
        else:
            print "No entries were added this time."

    except Exception as e:
        exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
        print exceptional_text, ' '.join(progress_text)
        traceback.print_exc(file=sys.stdout)
        try:
            send_email('Exception thrown in techcrunch.py',
                       exceptional_text + "\n" + traceback.format_exc(),
                       ('david.blume@gmail.com',))
        except Exception as e:
            print "Could not send email to notify you of the exception. :("
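    # Restore the real stdout/stderr, then record this run's captured output
    # (or "OK") in stats.txt.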
    message = sys.stdout.getvalue()
    sys.stdout = old_stdout
    sys.stderr = old_stderr
    if not debug:
        print message

    # Finally, let's save this to a statistics page
    if os.path.exists(os.path.join(localdir, 'stats.txt')):
        with open(os.path.join(localdir, 'stats.txt')) as f:
            lines = f.readlines()
    else:
        lines = []
    lines = lines[:672]  # Just keep the past week's worth
    # status = len(message.strip()) and message.strip().replace('\n', ' - ') or "OK"
    status = len(message.strip()) and '\n '.join(message.splitlines()) or "OK"
    lines.insert(0, "%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status))
    with open(os.path.join(localdir, 'stats.txt'), 'w') as f:
        f.writelines(lines)
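# Note (an assumption, not stated in the original source): keeping 672 lines of
# stats.txt corresponds to one week of runs at 15-minute intervals, so this
# script is presumably driven by a scheduler such as cron, e.g.:
#     */15 * * * * /path/to/techcrunch.py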