techcrunch.git / techcrunch.py
Original 2010-09-03 version
David Blume committed 7d91adf at 2018-01-20 20:10:33
#!/usr/bin/python2.5
# -*- coding: utf-8 -*-
# chmod 755 me, and make sure I have UNIX style newlines.
#
# techcrunch.py
#
# http://feeds.feedburner.com/TechCrunch
# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments']
#
# TODO:
# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>'
#    link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"

import feedparser
import yaml
import sys
import os
import time
import StringIO
import codecs
import traceback
import calendar
import pickle
import exceptions
import urllib
import urllib2
import httplib
import shutil
import glob
import smtplib
import bisect
import analysis
import simplejson as json
import cookielib

debug = True
any_entry_added = False
localdir = ''

html_head = """<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>
<HTML><HEAD>
  <title>TechCrunch Feed Filter</title>
  <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> -->
  <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" />
  <style type="text/css">
    body { font-family: "Arial", sans-serif; }
    .author { font-size: smaller; }
    .h3 { font-size: larger; }
    a { text-decoration: none; }
    /* table { border: none; border-collapse: collapse; font-size: large } */
    table { border-collapse: collapse; }
    table.legend { border: 1px solid LightSlateGray; font-size: medium; border-collapse: separate; }
    table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; }
    table.legend td { border: 1px solid LightSlateGray; }
    tr.even { background: #%s; padding: 2em; }
    tr.odd { background: #%s; padding-bottom: 2em; }
  </style>
</HEAD>
<BODY>
<div align='center'><h3>TechCrunch Feed Filter</h3></div>
This page shows the analysis used to filter the noise out of the TechCrunch feed into
<a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
"""

html_footer = """
</table>
</div><br />
<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>,
<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br />
<a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a></div><br />
</BODY>
</HTML>
"""

img_width = 300
img_height = 50

series_1_color = "0000FF"
series_2_color = "00AA00"
threshold_color = "FF8C00"
even_background = "F8F8F8"
#even_background = "FFFFFF"
odd_background = "E8E8E8"


def asciiize( s ):
    """Return s encoded as plain ASCII, or unchanged if that isn't possible."""
    try:
        return s.encode( 'ascii' )
    except UnicodeEncodeError, e:
        return s
    except exceptions.AttributeError, e:
        return s


def sendEmail( subject, message, toaddrs,
               fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ):
    """Sends Email"""
    smtp = smtplib.SMTP( 'localhost' )
    # Note: a blank line (CRLF CRLF) must separate the headers from the body.
    smtp.sendmail( fromaddr,
                   toaddrs,
                   "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % \
                   ( fromaddr, ", ".join( toaddrs ), subject, message ) )
    smtp.quit()

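# A reader's note on the Google Chart API parameters make_chart_url assembles
# below (the API this script was written against, since retired by Google):
#   cht=lxy    an x/y line chart
#   chd=t:...  four data series: comment times, comment counts,
#              retweet times, retweet counts
#   chds       per-series axis scaling
#   chxt/chxl  left (comments) and right (retweets) y-axis labels
#   chm=o,...  a circle marker on the sample where the retweet threshold
#              was met, when there is one
#   chf/chdl   background fill matched to the table row, plus the legend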
def make_chart_url( time_posted, comment_times, comment_values, retweet_times,
                    retweet_values, met_threshold_pt, bg_color ):
#   comment_times, comment_values = zip( *comments )
#   retweet_times, retweet_values = zip( *retweets )
    # TODO handle failure cases, -1
    if not len( comment_times ):
        comment_times = [ time_posted, ]
    if not len( comment_values ):
        comment_values = [ 0, ]
    if not len( retweet_times ):
        retweet_times = [ time_posted, ]
    if not len( retweet_values ):
        retweet_values = [ 0, ]

#   comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ]
#   retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ]
    # Convert sample timestamps to half-hour intervals since the item posted.
    comment_times = [ (i - time_posted) / 1800 for i in comment_times ]
    retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ]

    min_comment_time = min( comment_times )
    max_comment_time = max( comment_times )
    min_comment_value = min( comment_values )
    max_comment_value = max( comment_values )
    min_retweet_time = min( retweet_times )
    max_retweet_time = max( retweet_times )
    min_retweet_value = min( retweet_values )
    max_retweet_value = max( retweet_values )

    if len( comment_values ) < 8 and len( comment_values ) > 1:
#       max_comment_value *= 2
        pass
    elif len( comment_values ) == 1:
        min_comment_value = 0
    if len( retweet_values ) < 8 and len( retweet_values ) > 1:
#       max_retweet_value *= 2
        pass
    elif len( retweet_values ) == 1:
        min_retweet_value = 0

    min_comment_value = 0
    min_retweet_value = 0

    chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \
                ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color )
    chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ),
                                          ','.join( [ str( n ) for n in comment_values ] ),
                                          ','.join( [ str( n ) for n in retweet_times ] ),
                                          ','.join( [ str( n ) for n in retweet_values ] ) )
    if met_threshold_pt != -1:
        chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt )
    # The last chds pair scales the retweet values.  (The original read
    # min_comment_value here, an apparent copy-paste slip; both minimums are
    # forced to 0 above, so the output is unchanged.)
    chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
                 ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
                   0, max( 7, max_comment_time ), min_comment_value, max_comment_value,
                   0, max( 7, max_retweet_time ), min_retweet_value, max_retweet_value )
    chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, )
    return chart_url

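# feedparser notes for the function below: when the HTTP fetch gets far enough
# to receive a response, feedparser exposes the status code as feed.status;
# when it fails at a lower level it records the exception in
# feed.bozo_exception instead (e.g. urllib2.URLError or
# httplib.BadStatusLine).  process_feed branches on those two cases to report
# what went wrong.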
def process_feed( yaml_items ):
    """Retrieve the TechCrunch feed and fold its entries into yaml_items.

    yaml_items (in, out) The list of tracked stories, newest first.
    """
    feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
    if hasattr( feed, 'status' ):
        if feed.status == 304:
            pass
        else:
            feed_is_modified = True
            if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
                if feed.status == 503:
                    print "the feed is temporarily unavailable."
                elif feed.status == 400:
                    print "the feed says we made a bad request."
                elif feed.status == 502:
                    print "the feed reported a bad gateway error."
                elif feed.status == 404:
                    print "the feed says the page was not found."
                elif feed.status == 500:
                    print "the feed had an internal server error."
                elif feed.status == 403:
                    print "Access to the feed was forbidden."
                else:
                    print "the feed returned feed.status %d." % ( feed.status, )
            else:
                # Save off this feed so it can be inspected offline.
                f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
                try:
                    pickle.dump( feed, f )
                except ( pickle.PicklingError, exceptions.TypeError ), e:
                    print "An error occurred while pickling the feed: %s." % \
                          ( # str(e.__class__),
                            str( e ) )
                    traceback.print_exc( file = sys.stdout )
                    feed_is_modified = False
                f.close()

                for i in reversed( feed.entries ):
                    process_item( i, yaml_items )

                # If we have more than 200 items, remove the old ones.
                while len( yaml_items ) > 200:
                    yaml_items.pop()

                cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )

                for i in yaml_items:
#                   i['title'] = asciiize( i['title'] )
#                   i['tags'] = map( asciiize, i['tags'] )
                    process_yaml_item( i, cookie )
    else:
        if hasattr( feed, 'bozo_exception' ):
            e = feed.bozo_exception
            if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110:
                print_last_line = True
                if hasattr( e, 'reason' ):
                    if e.reason[0] == 110:
                        print "the feed's connection timed out."
                        print_last_line = False
                    elif e.reason[0] == 111:
                        print "the feed's connection was refused."
                        print_last_line = False
                    elif e.reason[0] == 104:
                        print "the feed reset the connection."
                        print_last_line = False
                    else:
                        print "the feed had a URLError with reason %s." % ( str( e.reason ), )
                        print_last_line = False
                if print_last_line:
                    print "the feed had a URLError %s" % ( str( e ), )
            elif isinstance( e, httplib.BadStatusLine ):
                if hasattr( e, 'message' ):
                    print "the feed gave a bad status line %s." % ( str( e.message ), )
                else:
                    print "the feed gave a bad status line."
            else:
                if len( str( e ) ):
                    print "the feed bozo_exception: %s \"%s\"" % ( str( e.__class__ ), str( e ) )
                else:
                    print "the feed bozo_exception: %s %s" % ( str( e.__class__ ), repr( e ) )
        else:
            print "the feed returned class %s, %s" % ( str( feed.__class__ ), str( feed ) )

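# Sampling, as implemented below: each story keeps parallel lists of sample
# times and sample values per metric (Disqus comments, the feed's own
# slash_comments, and TweetMeme retweets).  A metric stops being sampled once
# it has 8 data points, and stories more than 4.5 hours old are not sampled
# at all, so each story is only tracked through its first few hours.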
def process_item( feed_item, yaml_items ):
    """Add feed_item to yaml_items if it's new, and record any slash_comments sample."""
    global any_entry_added
    timecode_now = int( time.time() )

    # Get the time the item was posted.
    date_parsed = time.gmtime()
    if hasattr( feed_item, 'issued_parsed' ):
        date_parsed = feed_item.issued_parsed
    elif hasattr( feed_item, 'date_parsed' ):
        date_parsed = feed_item.date_parsed
    else:
        print "process_item found no timestamp for", asciiize( feed_item.link )
    timecode_parsed = calendar.timegm( date_parsed )

    # Look for the item's permalink among the items already being tracked.
    # (getattr guards against entries that lack feedburner_origlink.)
    link = getattr( feed_item, 'feedburner_origlink', feed_item.link )
    yaml_item = None
    for i in yaml_items:
        if link == i['link']:
            yaml_item = i
            break

    if not yaml_item:
        author = ''
        if hasattr( feed_item, 'author' ):
            author = asciiize( feed_item.author )

        # Make a new yaml_item
        yaml_item = { 'title'               : asciiize( feed_item.title ),
                      'link'                : asciiize( link ),
                      'author'              : author,
                      'tags'                : [],
                      'orig_posted'         : timecode_parsed,
                      'qualified'           : -1,
                      'comment_times'       : [],
                      'comments'            : [],
                      'slash_comment_times' : [],
                      'slash_comments'      : [],
                      'retweet_times'       : [],
                      'retweets'            : []
                    }
        if hasattr( feed_item, 'tags' ):
            for i in feed_item.tags:
                yaml_item['tags'].append( asciiize( i.term ) )
        yaml_items.insert( 0, yaml_item )
        any_entry_added = True

    # Don't bother sampling items more than nine half-hours (4.5 hours) old.
    if timecode_parsed < timecode_now - 60 * 30 * 9:
        return

    # Now, add the new values
    if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8:
        any_entry_added = True
        yaml_item['slash_comment_times'].append( timecode_now )
        yaml_item['slash_comments'].append( int( feed_item.slash_comments ) )


def process_yaml_item( yaml_item, cookie ):
    """Sample the current comment and retweet counts for one tracked story."""
    global any_entry_added
    timecode_now = int( time.time() )

    if len( yaml_item['comments'] ) < 8:
        num_comments = Get_num_disqus_comments( yaml_item['link'], cookie )
        if num_comments != -1:
            any_entry_added = True
            yaml_item['comment_times'].append( timecode_now )
            yaml_item['comments'].append( num_comments )

    if len( yaml_item['retweets'] ) < 8:
        num_retweets = Get_num_retweets( yaml_item['link'] )
        if num_retweets != -1:
            any_entry_added = True
            yaml_item['retweet_times'].append( timecode_now )
            yaml_item['retweets'].append( num_retweets )


def Get_num_comments( url_string ):
    # Scrapes the comment count straight out of the article page.  Nothing in
    # this script calls it any more; apparently superseded by
    # Get_num_disqus_comments.
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_comments got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_comments got an error. Code:", e.code
        return -1

    tag_to_find = '<a href="#comments" rel="nofollow">'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = start_pos
        while str.isdigit( data[ end_pos ] ):
            end_pos += 1
        if end_pos > start_pos:
            return int( data[start_pos:end_pos] )
    return -1


def Get_cookie( cookie_request ):
    """Fetch cookie_request and return the CookieJar it populates, or None."""
    cookie = cookielib.CookieJar()
    try:
        cookie_response = urllib2.urlopen( cookie_request )
        cookie.extract_cookies( cookie_response, cookie_request )
        return cookie
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_cookie got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_cookie got an error. Code:", e.code
    return None

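# How the Disqus count is obtained, as far as the code below shows: the
# article page embeds a thread identifier in a
# <span class="dsq-postid" rel="..."> attribute; that identifier is passed to
# Disqus's count.js endpoint (along with the cookie obtained by Get_cookie),
# which responds with a displayCount(...) JavaScript payload whose JSON
# includes the comment count.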
def Get_num_disqus_comments( url_string, cookie ):
    if cookie == None:
        return -1
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_disqus_comments got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_disqus_comments got an error. Code:", e.code
        return -1

    tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
    disqus_tag_to_find = 'displayCount('
    offset = data.find( tag_to_find )
    if offset != -1:
        # Pull out the Disqus thread id embedded in the rel attribute.
        start_pos = offset + len( tag_to_find )
        end_pos = start_pos
        while data[ end_pos ] != '"' and end_pos < start_pos + 200:
            end_pos += 1
        if end_pos < start_pos + 200:
            opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) )
            url_GET_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' )
            request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + url_GET_data )
            try:
                response = opener.open( request )
                disqus_data = response.read()
            except urllib2.URLError, e:
                if hasattr( e, 'reason' ):
                    print "Get_num_disqus_comments got an error getting the count:", e.reason
                elif hasattr( e, 'code' ):
                    print "Get_num_disqus_comments got an error getting the count. Code:", e.code
                disqus_data = ""
            disqus_offset = disqus_data.find( disqus_tag_to_find )
            if disqus_offset != -1:
                start_pos = disqus_offset + len( disqus_tag_to_find )
                end_pos = disqus_data.find( '}]})', start_pos )
                if end_pos != -1:
                    return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] )
    return -1


def Get_num_retweets( url_string ):
    """Ask TweetMeme's button.js for the story's current retweet count."""
    try:
        f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string, ) )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_retweets got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_retweets got an error. Code:", e.code
        return -1

    tag_to_find = '<span class="c">'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = data.find( '<', start_pos )
        if end_pos != -1:
            return int( data[ start_pos:end_pos ] )
    return -1


def Save_image( url_string, file_path ):
    """Download url_string to file_path and return its cache-relative path,
    or the original URL if the download didn't pan out."""
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Save_image got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Save_image got an error. Code:", e.code
        return url_string

    if len( data ) > 50:
        f = open( file_path, 'wb' )
        f.write( data )
        f.close()
        return 'cache/' + os.path.basename( file_path )
    return url_string

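# A note on the page build below: each story's chart is downloaded from Google
# and served from the local cache/ directory (falling back to the remote chart
# URL if the download fails).  The new page is written to index.html_new and
# then moved over index.html, and only after that are the previous run's
# cached .png files deleted.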
def Make_index_html( yaml_items, stats ):
    cur_time = int( time.time() )
    new_index_fullpath = os.path.join( localdir, 'index.html_new' )
    index_fullpath = os.path.join( localdir, 'index.html' )
    cache_path = os.path.join( localdir, 'cache' )

    # (os.path.join added here; the original concatenated cache_path and
    # '*.png' directly, which globbed the wrong location.)
    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
#   shutil.rmtree( cache_path )
#   os.mkdir( cache_path )

    f = file( new_index_fullpath, 'w' )
    f.write( html_head % ( even_background, odd_background ) )

#   f.write( '<div align="center">\n<table cellpadding="4">' )
    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
    for median, mean, std_dev in stats:
        f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f</td>' % ( median, mean, std_dev ) )
    f.write( '</tr>\n</table></div>\n<br />\n' )

    f.write( '<div align="center">\n<table>\n' )
    image_index = 0
    for i in yaml_items[:40]:
        chart_url = make_chart_url( i['orig_posted'],
                                    i['comment_times'],
                                    i['comments'],
                                    i['retweet_times'],
                                    i['retweets'],
                                    i['qualified'],
                                    image_index % 2 and even_background or odd_background,
                                  )
        image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
        f.write( '<tr valign="middle" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
                 ( image_index % 2 and "even" or "odd",
                   i['link'],
                   i['title'].encode( 'ascii', 'xmlcharrefreplace' ),
                   i['author'].encode( 'ascii', 'xmlcharrefreplace' ),
                 ) )
        f.write( ' <td>%s</td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
        f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \
                 ( image_url, img_width, img_height ) )
        image_index += 1
    f.write( html_footer )
    f.close()

    if os.path.exists( index_fullpath ):
        os.unlink( index_fullpath )
    shutil.move( new_index_fullpath, index_fullpath )
    for fname in files_to_delete:
        os.unlink( fname )


def Make_feed_file( yaml_items ):
    """Write rss_feed.xml containing the most recent qualified stories."""
    f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' )
    f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" )
    f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) )
    count = 0
    for item in yaml_items:
        pub_date = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) )
        if item['qualified'] != -1:
            f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \
                     ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ),
                       pub_date,
                       item['link'],
                       item['link'],
                       item['author'].encode( 'ascii', 'xmlcharrefreplace' )
                     ) )
            count += 1
            if count > 14:
                break
    f.write( "</channel></rss>" )
    f.close()

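# Qualification, as implemented in the main block below: analysis (a local
# module, not shown here) yields a (median, mean, std_dev) triple per
# half-hour bucket.  The threshold is median + std_dev of the 1:00-1:30
# bucket, and a story qualifies when one of its early retweet samples (the
# scan ignores samples past 90 minutes and stops after the first sample an
# hour or more past posting) reaches that threshold.  item['qualified']
# records the index of the qualifying sample so the chart can circle it.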
if __name__ == '__main__':
    start_time = time.time()
    progress_text = []

    # Capture all output so it can be logged to stats.txt at the end.
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    sys.stdout = sys.stderr = StringIO.StringIO()

    try:
        localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) )

        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199,
        #     'tags'                : [ u'Google', u'privacy' ],
        #     'qualified'           : -1,
        #     'comment_times'       : [ 1282197199, 1282197407 ],
        #     'comments'            : [ 0, 15 ],
        #     'slash_comment_times' : [ 1282197199, 1282197407 ],
        #     'slash_comments'      : [ 0, 5 ],
        #     'retweet_times'       : [ 1282197199, 1282197407 ],
        #     'retweets'            : [ 0, 43 ]
        #   },
        #   { ... }
        # ]
        #
        yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' )
        if os.path.exists( yaml_fullpath ):
            f = file( yaml_fullpath, 'rb' )
            items = yaml.load( f )
            f.close()
        else:
            print "could not open", yaml_fullpath
            items = []

        progress_text = [ "read techcrunch.yaml" ]
        process_feed( items )

        #
        # If any work was done, then write files.
        # ("True or" forces a write even when nothing changed.)
        #
        if True or any_entry_added:
            stats = analysis.Process_retweets_for_feed( items )

            # We'll only look at the stats for the time 1:00 to 1:30 after posting.
            median, mean, sigma = stats[2]
            threshold = median + sigma
            for item in items:
                if item['qualified'] == -1:
                    for i in range( len( item['retweet_times'] ) ):
                        r_time = item['retweet_times'][i]
                        if r_time - item['orig_posted'] < 5400:
                            if item['retweets'][i] >= threshold:
                                item['qualified'] = i
                        if r_time - item['orig_posted'] >= 3600:
                            break

            #
            # Write out the updated yaml file.
            #
            f = file( yaml_fullpath, 'wb' )
            yaml.dump( items, f, width=120 )
            f.close()

            f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' )
            yaml.dump( items, f, width=120 )
            f.close()

            f = codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' )
            yaml.dump( items, f, encoding='utf-8', width=120 )
            f.close()

            Make_feed_file( items )
            Make_index_html( items, stats )
        else:
            print "No entries were added this time."

    except Exception, e:
        exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str( e )
        print exceptional_text, ' '.join( progress_text )
        traceback.print_exc( file = sys.stdout )
        try:
            sendEmail( 'Exception thrown in techcrunch.py',
                       exceptional_text,
                       ( 'david.blume@gmail.com', ) )
        except Exception, e:
            print "Could not send email to notify you of the exception. :("

    message = sys.stdout.getvalue()
    sys.stdout = old_stdout
    sys.stderr = old_stderr
    if not debug:
        print message

    # Finally, let's save this to a statistics page
    if os.path.exists( os.path.join( localdir, 'stats.txt' ) ):
        f = open( os.path.join( localdir, 'stats.txt' ) )
        try:
            lines = f.readlines()
        finally:
            f.close()
    else:
        lines = []
    lines = lines[:168]   # Just keep the past week's worth
    status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime( '%H:%M, %Y-%m-%d', time.localtime() ),
                                          time.time() - start_time,
                                          status ) )
    f = open( os.path.join( localdir, 'stats.txt' ), 'w' )
    f.writelines( lines )
    f.close()
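# Deployment sketch (an assumption, not part of the original file): the script
# takes no arguments and writes all output next to itself, so a crontab entry
# along these lines would keep index.html, rss_feed.xml and stats.txt current:
#
#   */30 * * * * /path/to/techcrunch.py
#
# The half-hour chart buckets suggest a 30-minute cadence, though the 168-line
# cap on stats.txt ("past week's worth") reads more like hourly runs.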