#!/usr/bin/env python
#
# Testing without affecting the yaml file and saving the updated one aside:
#   cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; \
#   cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml

import feedparser
import yaml
import sys
import os
import time
import codecs
import traceback
import calendar
import pickle
import exceptions
import urllib
import urllib2
import httplib
import shutil
import smtplib
import analysis
import json
import xml
import operator
import cgi
import cStringIO
import smtp_creds  # Your own credentials, used in send_email()

debug = True
any_entry_added = False
tags_to_post = {'apple', 'google', 'roku'}
authors_to_post = ['michael arrington', ]

# TODO 2018-01-18: Maybe combine fb_likes with bf_shares or something...
rhs_metric = 'fb_likes'
rhs_metric_times = 'comment_times'

localdir = ''

html_head = """<!DOCTYPE html>
<html>
<head>
<title>TechCrunch Feed Filter</title>
<style type="text/css">
tr.even { background-color: #%s; }
tr.odd { background-color: #%s; }
td div { width: %dpx; }
</style>
<script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
<script type="text/javascript">
google.charts.load('current', {'packages': ['corechart']});
google.charts.setOnLoadCallback(drawCharts);
function drawCharts() {
  var options = {legend: 'none', vAxes: [{}, {}]};
%s
}
</script>
</head>
<body>
<h1>TechCrunch Feed Filter</h1>
<p>This page shows what analysis is done to filter the noise away from the
TechCrunch feed into a more concise feed. Learn more about the Feed Filter.</p>
"""
""" html_footer = """
Thanks to The Universal Feed Parser module, PyYAML and Google Charts.
sourceraw datastatus
© 2011 David Blume

""" chart_data_header = """ var data = google.visualization.arrayToDataTable([ ['', 'Comments', 'Shares', {'type': 'string', 'role': 'style'}], """ chart_data_middle = """ ]); var chart = new google.visualization.LineChart(document.getElementById('chart%d')); options.backgroundColor = '#%s'; """ img_width = 240 img_height = 68 series_1_color = "0000FF" series_2_color = "00AA00" threshold_color = "FF8C00" tag_color = "F01000" even_background = "F8F8F8" odd_background = "E8E8E8" even_watermark = "E0E0FF" odd_watermark = "D0D0F0" def asciiize(s): try: return s.encode('ascii') except UnicodeEncodeError: return s except exceptions.AttributeError: return s def send_email(subject, message, toaddrs, fromaddr='"%s" <%s>' % (os.path.basename(__file__), smtp_creds.user)): """ Sends Email """ smtp = smtplib.SMTP(smtp_creds.server, port=smtp_creds.port) smtp.login(smtp_creds.user, smtp_creds.passw) smtp.sendmail(fromaddr, toaddrs, "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \ (fromaddr, ", ".join(toaddrs), subject, message)) smtp.quit() def index_id(a_list, op, elem): try: return (index for index, item in enumerate(a_list) if op(item, elem)).next() except: return -1 def write_chart_data(time_posted, lhs_times, lhs_values, rhs_times, rhs_values, threshold_value, image_index, tag_hit, chart_io): # lhs_times, lhs_values = zip(*comments) # rhs_times, rhs_values = zip(*rhs) is_odd_row = image_index % 2 if not len(lhs_times): lhs_times = [time_posted,] if not len(lhs_values): lhs_values = [0,] if not len(rhs_times): rhs_times = [time_posted,] if not len(rhs_values): rhs_values = [0,] lhs_times = [(i - time_posted) / 1800 for i in lhs_times] rhs_times = [(i - time_posted) / 1800 for i in rhs_times] met_threshold_pt = -1 if threshold_value != -1: met_threshold_pt = index_id(rhs_values, operator.ge, threshold_value) if met_threshold_pt == -1 or tag_hit: # This can happen if threshold_value was set to a number # because the author or a tag was matched, but the article # was unpopular. We choose to put a marker at point index 0. met_threshold_pt = 0 if is_odd_row != 0: bg_color = even_background else: bg_color = odd_background chart_io.write(chart_data_header) for i in range(8): if i == met_threshold_pt: if tag_hit: style = "'point { size: 5; fill-color: #FF0000; shape-type: diamond}'" else: style = "'point { size: 5; fill-color: #FF8C00; }'" else: style = "null" if i < len(lhs_values): lhs_value = str(lhs_values[i]) else: lhs_value = "null" if i < len(rhs_values): rhs_value = str(rhs_values[i]) else: rhs_value = "null" chart_io.write(" [%d, %s, %s, %s],\n" % (i, lhs_value, rhs_value, style)) chart_io.write(chart_data_middle % (image_index, bg_color)) if met_threshold_pt == -1 and not tag_hit: chart_io.write(" delete options.vAxes[1].baseline;\n") else: chart_io.write(" options.vAxes[1].baseline = %d;\n" % (threshold_value,)) chart_io.write(" chart.draw(data, options);\n\n") def process_feed(yaml_items): """Retrieve the url and process it. feed_info (in, out) A tuple that describes an individual feed, like its name and etag. """ feed = feedparser.parse('https://techcrunch.com/feed/') if hasattr(feed, 'status'): if feed.status == 304: pass else: if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: if feed.status == 503: print "the feed is temporarily unavailable." elif feed.status == 400: print "the feed says we made a bad request." elif feed.status == 502: print "the feed reported a bad gateway error." 
def process_feed(yaml_items):
    """Retrieve the TechCrunch feed and process each of its entries into
    yaml_items, which is updated in place."""
    feed = feedparser.parse('https://techcrunch.com/feed/')
    if hasattr(feed, 'status'):
        if feed.status == 304:
            pass
        else:
            if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
                if feed.status == 503:
                    print "the feed is temporarily unavailable."
                elif feed.status == 400:
                    print "the feed says we made a bad request."
                elif feed.status == 502:
                    print "the feed reported a bad gateway error."
                elif feed.status == 404:
                    print "the feed says the page was not found."
                elif feed.status == 500:
                    print "the feed had an internal server error."
                elif feed.status == 403:
                    print "Access to the feed was forbidden."
                else:
                    print "the feed returned feed.status %d." % (feed.status, )
            else:
                # Save off this feed so it can be inspected offline.
                if hasattr(feed, 'bozo_exception') and isinstance(feed.bozo_exception, xml.sax._exceptions.SAXParseException):
                    print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception),)
                else:
                    try:
                        with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f:
                            pickle.dump(feed, f)
                    except (pickle.PicklingError, exceptions.TypeError) as e:
                        # str(e.__class__) could be included here, too.
                        print "An error occurred while pickling the feed: %s." % (str(e),)
                        traceback.print_exc(3, file=sys.stdout)

                for i in reversed(feed.entries):
                    process_item(i, yaml_items)

                # If we have more than 200 items, remove the old ones.
                while len(yaml_items) > 200:
                    yaml_items.pop()

                for i in yaml_items:
                    # i['title'] = asciiize(i['title'])
                    # i['tags'] = map(asciiize, i['tags'])
                    process_yaml_item(i)
    else:
        if hasattr(feed, 'bozo_exception'):
            e = feed.bozo_exception
            if isinstance(e, urllib2.URLError):
                print_last_line = True
                if hasattr(e, 'reason'):
                    if e.reason[0] == 110:
                        print "the feed's connection timed out."
                        print_last_line = False
                    elif e.reason[0] == 111:
                        print "the feed's connection was refused."
                        print_last_line = False
                    elif e.reason[0] == 104:
                        print "the feed reset the connection."
                        print_last_line = False
                    else:
                        print "the feed had a URLError with reason %s." % (str(e.reason),)
                        print_last_line = False
                if print_last_line:
                    print "the feed had a URLError %s" % (str(e),)
            elif isinstance(e, httplib.BadStatusLine):
                print "the feed gave a bad status line. (%s)" % (str(e),)
            else:
                if len(str(e)):
                    print "the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e))
                else:
                    print "the feed bozo_exception: %s %s" % (str(e.__class__), repr(e))
        else:
            print "the feed returned class %s, %s" % (str(feed.__class__), str(feed))
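
# The pickle written by process_feed() makes offline debugging possible.
# A rough interactive session might reload it like this (a sketch; the
# variable names are illustrative):
#
#     import pickle
#     with open('techcrunch_feed.pickle', 'rb') as f:
#         feed = pickle.load(f)
#     print len(feed.entries), feed.status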
def process_item(feed_item, yaml_items):
    """Processes an RSS feed item, and converts it to a YAML item"""
    # Get the time
    global any_entry_added
    timecode_now = int(time.time())
    date_parsed = time.gmtime()
    if hasattr(feed_item, 'issued_parsed'):
        date_parsed = feed_item.issued_parsed
    elif hasattr(feed_item, 'date_parsed'):
        date_parsed = feed_item.date_parsed
    else:
        print "process_item found no timestamp for", asciiize(feed_item.link)
    timecode_parsed = calendar.timegm(date_parsed)

    link = feed_item.link
    if hasattr(feed_item, 'feedburner_origlink'):
        link = feed_item.feedburner_origlink

    # TODO 2018-01-18: Leave in the ncid for URL clicks, but remove during processing.
    # suffix_to_remove = '?ncid=rss'
    # if link.endswith(suffix_to_remove):
    #     link = link[:-len(suffix_to_remove)]

    # Look for i.feedburner_origlink in yaml_items
    yaml_item = None
    for i in yaml_items:
        if link == i['link']:
            yaml_item = i
            break

    if yaml_item is None:
        author = ''
        if hasattr(feed_item, 'author'):
            author = asciiize(feed_item.author)

        # Make a new yaml_item
        yaml_item = {'title': asciiize(feed_item.title),
                     'link': asciiize(link),
                     'author': author,
                     'tags': [],
                     'orig_posted': timecode_parsed,
                     'qualified': -1,
                     'comment_times': [],
                     'fb_comments': [],
                     'fb_shares': [],
                     'fb_likes': [],
                     'slash_comment_times': [],
                     'slash_comments': []}
        if hasattr(feed_item, 'tags'):
            for i in feed_item.tags:
                yaml_item['tags'].append(asciiize(i.term))
        yaml_items.insert(0, yaml_item)
        any_entry_added = True

    # Maybe check to ensure that this item isn't too old.
    if timecode_parsed < timecode_now - 60 * 30 * 9:
        return

    # Now, add the new values
    if hasattr(feed_item, 'slash_comments') and len(yaml_item['slash_comments']) < 8:
        any_entry_added = True
        yaml_item['slash_comment_times'].append(timecode_now)
        yaml_item['slash_comments'].append(int(feed_item.slash_comments))


def process_yaml_item(yaml_item):
    global any_entry_added

    # Related to TODO 2018-01-18: Remove ncid only during processing.
    link = yaml_item['link']
    suffix_to_remove = '?ncid=rss'
    # Maybe we should find() it instead, in case feedburner adds other options
    if link.endswith(suffix_to_remove):
        link = link[:-len(suffix_to_remove)]

    timecode_now = int(time.time())
    if len(yaml_item['fb_comments']) < 8:
        num_shares, num_comments, num_likes = Get_fb_stats(link)
        if num_comments != -1:
            any_entry_added = True
            yaml_item['comment_times'].append(timecode_now)
            yaml_item['fb_shares'].append(num_shares)
            yaml_item['fb_comments'].append(num_comments)
            yaml_item['fb_likes'].append(num_likes)

    # if len(yaml_item['reddit_']) < 8:
    #     num_ = get_reddit_stats(link)
    #     if num_ != -1:
    #         any_entry_added = True
    #         yaml_item['reddit_times'].append(timecode_now)
    #         yaml_item['reddit_'].append(num_)


def get_reddit_stats(url_string):
    """Consider curl "https://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg" """
    return -1
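
# A minimal sketch of what get_reddit_stats() could do, following the curl
# example in its docstring. It assumes reddit's public info.json endpoint
# returns a listing whose children carry a "num_comments" field; the function
# name and the choice of metric are illustrative, not part of the original flow.
def get_reddit_stats_sketch(url_string):
    """Return the total reddit comment count for url_string, or -1 on error."""
    try:
        request = urllib2.Request('https://www.reddit.com/api/info.json?%s' %
                                  urllib.urlencode({'url': url_string}),
                                  headers={'User-Agent': 'techcrunch-feed-filter'})
        f = urllib2.urlopen(request)
        data = json.load(f)
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine, ValueError) as e:
        print "get_reddit_stats_sketch got an error:", str(e)
        return -1
    children = data.get('data', {}).get('children', [])
    if not children:
        return -1
    return sum(child['data'].get('num_comments', 0) for child in children)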
def Get_fb_stats(url_string):
    """Use graph's "engagement" field to get reactions and shares."""
    shares = -1
    comments = -1
    likes = -1
    url_string = url_string.encode('utf-8')

    try:
        encoded = urllib.urlencode({'access_token': facebook_token})
        url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s'
        f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded))
        data = f.read()
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine) as e:
        if hasattr(e, 'reason'):  # URLError
            if hasattr(e, 'code'):
                print "Get_fb_stats got an error (1):", e.code, e.reason, url_string
            else:
                print "Get_fb_stats got an error (2):", e.reason, url_string
        elif hasattr(e, 'code'):  # URLError
            print "Get_fb_stats got an error. Code:", e.code, url_string
        else:
            print "Get_fb_stats got an error (3):", str(e)
        return shares, comments, likes

    if len(data) > 20:
        d = json.loads(data)['engagement']
        try:
            shares = d['share_count']
        except KeyError:
            shares = 0
        try:
            likes = d['reaction_count']
        except KeyError:
            likes = 0
        # TODO 2018-01-18: og_object metric was likes + shares + comments.
        # Here we'll combine likes and shares, and comments with plugin_comments.
        likes += shares
        try:
            comments = d['comment_plugin_count'] + d['comment_count']
        except KeyError:
            comments = 0
    else:
        print "Get_fb_stats got too little data for ", url_string
    return shares, comments, likes


def make_index_html(yaml_items, weekend_stats, weekday_stats):
    """Writes a static index.html file from the YAML items."""
    cur_time = int(time.time())
    new_index_fullpath = os.path.join(localdir, 'index.html_new')
    index_fullpath = os.path.join(localdir, 'index.html')

    chart_io = cStringIO.StringIO()
    for image_index, image in enumerate(yaml_items[:40]):
        tag_hit = False
        if image['author'].lower() in authors_to_post:
            tag_hit = True
        elif len(set([j.lower() for j in image['tags']]) & tags_to_post) > 0:
            tag_hit = True
        write_chart_data(image['orig_posted'],
                         image['comment_times'],
                         image['fb_comments'],
                         image[rhs_metric_times],
                         image[rhs_metric],
                         image['qualified'],
                         image_index,
                         tag_hit,
                         chart_io)

    with codecs.open(new_index_fullpath, 'w', 'utf-8') as f:
        f.write(html_head % (even_background, odd_background, img_width, chart_io.getvalue()))
        chart_io.close()
        f.write('<table>\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n')
        f.write('<tr><td>Weekday</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' %
                (weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2],
                 weekday_stats[2][1] + weekday_stats[2][2]))
        f.write('<tr><td>Weekend</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' %
                (weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2],
                 weekend_stats[2][1] + weekend_stats[2][2]))
        f.write('</table>\n<table>\n')
        for image_index, image in enumerate(yaml_items[:40]):
            f.write('<tr class="%s">\n  <td><a href="%s">%s</a> at %s by %s%s</td>\n' %
                    (image_index % 2 and "even" or "odd",
                     image['link'],
                     image['title'].encode('ascii', 'xmlcharrefreplace'),
                     time.strftime("%H:%M", time.localtime(image['orig_posted'])).encode('ascii', 'xmlcharrefreplace'),
                     image['author'].encode('ascii', 'xmlcharrefreplace'),
                     image['qualified'] != -1 and '' or ''))
            f.write('  <td><div id="chart%d"></div></td>\n</tr>\n' % (image_index, ))
        f.write(html_footer)
    if os.path.exists(index_fullpath):
        os.unlink(index_fullpath)
    shutil.move(new_index_fullpath, index_fullpath)


def make_feed_file(yaml_items):
    """Writes the RSS feed file with the YAML items."""
    with codecs.open(os.path.join(localdir, 'rss_feed.xml'), 'wb', 'utf-8') as f:
        f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
        f.write("<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" %
                (time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),))
        count = 0
        for item in yaml_items:
            now = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(item['orig_posted']))
            if item['qualified'] != -1:
                escaped_title = cgi.escape(item['title']).encode('ascii', 'xmlcharrefreplace')
                escaped_author = cgi.escape(item['author']).encode('ascii', 'xmlcharrefreplace')
                f.write("<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid>%s</guid><author>%s</author></item>\n" %
                        (escaped_title, now, item['link'], item['link'], escaped_author))
                count += 1
                if count > 14:
                    break
        f.write("</channel></rss>")


if __name__ == '__main__':
    start_time = time.time()
    progress_text = []

    old_stdout = sys.stdout
    old_stderr = sys.stderr
    sys.stdout = sys.stderr = cStringIO.StringIO()

    try:
        localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199,
        #     'tags'                : [ u'Google', u'privacy' ],
        #     'qualified'           : -1,
        #     'comment_times'       : [ 1282197199, 1282197407 ],
        #     'fb_comments'         : [ 0, 5 ],
        #     'fb_shares'           : [ 0, 300 ],
        #     'fb_likes'            : [ 0, 19 ],
        #     'slash_comment_times' : [ 1282197199, 1282197407 ],
        #     'slash_comments'      : [ 0, 5 ]
        #   },
        #   { ... }
        # ]
        #
        yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
        if os.path.exists(yaml_fullpath):
            with open(yaml_fullpath, 'rb') as f:
                items = yaml.load(f)
            if items is None:
                print yaml_fullpath, "exists, but was empty."
                items = []

            # Do any dictionary item updating that might be necessary
            # for item in items:
            #     if not item.has_key('fb_shares'):
            #         item['fb_shares'] = []
        else:
            print "could not open", yaml_fullpath
            items = []

        # facebook_token is set at module scope so Get_fb_stats() can use it.
        with open(os.path.join(localdir, 'facebook-token.txt'), 'r') as f:
            json_obj = json.load(f)
        facebook_token = json_obj['access_token']

        progress_text = ["read techcrunch.yaml"]
        process_feed(items)

        #
        # If any work was done, then write files.
        #
        if any_entry_added:
            weekend_stats, weekday_stats = analysis.process_feed(items, rhs_metric, rhs_metric_times)

            # We'll only look at the stats up to 2 hours after posting.
            weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
            weekend_threshold = weekend_mean + weekend_sigma
            weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
            weekday_threshold = weekday_mean + weekday_sigma
            for item in items:
                wday = time.localtime(item['orig_posted']).tm_wday
                if wday == 5 or wday == 6:
                    threshold = weekend_threshold
                else:
                    threshold = weekday_threshold
                if item['qualified'] == -1:
                    for i in range(len(item[rhs_metric_times])):
                        r_time = item[rhs_metric_times][i]
                        if r_time - item['orig_posted'] < 7200:
                            if item[rhs_metric][i] >= threshold:
                                # Comment out when graph.facebook.com engagement returns only 0s.
                                item['qualified'] = threshold
                                break
                        else:
                            break

            # Automatically add those items whose authors and tags I like
            for item in items:
                if item['qualified'] == -1 and len(item[rhs_metric_times]) > 0:
                    if item['author'].lower() in authors_to_post:
                        item['qualified'] = threshold
                    elif len(set([j.lower() for j in item['tags']]) & tags_to_post) > 0:
                        item['qualified'] = threshold

            #
            # Write out the updated yaml file.
            #
            # For the one file we really use, write to a file on the side, then move it.
            yaml_newfile_fullpath = os.path.join(localdir, 'techcrunch_temp_writable.yaml')
            with open(yaml_newfile_fullpath, 'wb') as f:
                yaml.dump(items, f, width=120)
            try:
                os.rename(yaml_newfile_fullpath, yaml_fullpath)
            except OSError as e:
                print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath)
            with open(os.path.join(localdir, 'techcrunch_text.yaml'), 'w') as f:
                yaml.dump(items, f, width=120)
            with codecs.open(os.path.join(localdir, 'techcrunch_unicode.yaml'), 'w', 'utf-8') as f:
                yaml.dump(items, f, encoding='utf-8', width=120)

            make_feed_file(items)
            make_index_html(items, weekend_stats, weekday_stats)
        else:
            print "No entries were added this time."

    except Exception as e:
        exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
        print exceptional_text, ' '.join(progress_text)
        traceback.print_exc(file=sys.stdout)
        try:
            send_email('Exception thrown in ' + os.path.basename(__file__),
                       exceptional_text + "\n" + traceback.format_exc(),
                       (smtp_creds.default_recipient,))
        except Exception as e:
            print "Could not send email to notify you of the exception. :("

    message = sys.stdout.getvalue()
    sys.stdout = old_stdout
    sys.stderr = old_stderr
    if not debug:
        print message

    # Finally, let's save this to a statistics page
    if os.path.exists(os.path.join(localdir, 'stats.txt')):
        with open(os.path.join(localdir, 'stats.txt')) as f:
            lines = f.readlines()
    else:
        lines = []
    lines = lines[:672]  # Just keep the past week's worth
    # status = len(message.strip()) and message.strip().replace('\n', ' - ') or "OK"
    status = len(message.strip()) and '\n '.join(message.splitlines()) or "OK"
    lines.insert(0, "%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()),
                                        time.time() - start_time,
                                        status))
    with open(os.path.join(localdir, 'stats.txt'), 'w') as f:
        f.writelines(lines)
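
# For reference, facebook-token.txt is read above with json.load() and only
# its "access_token" field is used, so a minimal file looks like this
# (the token value is an invented placeholder):
#
#     {"access_token": "EAAB..."}
#
# Each run then prepends one status line to stats.txt, for example:
#
#     2018-01-18, 06:25   4s OK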