David Blume's GitList
Repositories
techcrunch.git
Code
Commits
Branches
Tags
Search
Tree:
ef6f5ca
Branches
Tags
main
techcrunch.git
techcrunch.py
2015-11-23: Resync svn with production site.
David Blume
commited
ef6f5ca
at 2018-01-20 20:30:31
techcrunch.py
Blame
History
Raw
#!/usr/bin/python # chmod 755 me, and make sure I have UNIX style newlines. # # techcrunch.py # # http://feeds.feedburner.com/TechCrunch # feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) # feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments'] # # TODO: # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>' # link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/" # 2. Add Reddit counts: curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg" # # This file was coverted from tabs to spaces with the vim command %retab # # cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml # import feedparser import yaml import sys import os import time import StringIO import codecs import traceback import calendar import pickle import exceptions import urllib import urllib2 import httplib import shutil import glob import smtplib import bisect import analysis import json import cookielib import xml import texttime import operator from datetime import timedelta import cgi debug = True any_entry_added = False tags_to_post = set([ 'apple', 'google']) authors_to_post = [ 'michael arrington', ] localdir = '' html_head = """ <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'> <HTML><HEAD> <title>TechCrunch Feed Filter</title> <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> --> <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" /> <style type="text/css"> body { font-family: "Arial", san-serif; } .author { font-size: smaller; } .h3 { font-size: larger; } a { text-decoration: none; } /* table { border: none; border-collapse:collapse; font-size: large } */ table { border-collapse: collapse; } table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separated; } table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; } table.legend td { border: 1px solid LightSlateGray; } tr.even { background:#%s; padding: 2em; } tr.odd { background:#%s; padding-bottom: 2em; } </style> </HEAD> <BODY> <div align='center'><h3>TechCrunch Feed Filter</h3></div> This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br /> """ html_footer = """ </table> </div><br /> <div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>, <a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a><br />© 2011 <a href="http://david.dlma.com">David Blume</a></div><br /> </BODY> </HTML> """ img_width = 300 img_height = 50 series_1_color = "0000FF" series_2_color = "00AA00" threshold_color = "FF8C00" tag_color = "F01000" even_background = "F8F8F8" odd_background = "E8E8E8" even_watermark = "E0E0FF" odd_watermark = "D0D0F0" def asciiize( s ): try: return s.encode( 'ascii' ) except UnicodeEncodeError, e: return s except exceptions.AttributeError, e: return s def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ): """Sends Email""" smtp = smtplib.SMTP( 'localhost', port=587 ) smtp.login( user, passw ) smtp.sendmail( fromaddr, \ toaddrs, \ "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \ ( fromaddr, ", ".join( toaddrs ), subject, message ) ) smtp.quit() def index_id( a_list, op, elem ): try: return (index for index, item in enumerate( a_list ) if op( item, elem ) ).next() except: return -1 def index_id_simple( a_list, elem ): index = 0 for item in a_list: if item == elem: return index index += 1 return -1 def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, threshold_value, is_odd_row, tag_hit ): # comment_times, comment_values = zip( *comments ) # retweet_times, retweet_values = zip( *retweets ) # TODO handle failure cases, -1 if not len( comment_times ): comment_times = [ time_posted, ] if not len( comment_values ): comment_values = [ 0, ] if not len( retweet_times ): retweet_times = [ time_posted, ] if not len( retweet_values ): retweet_values = [ 0, ] # comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ] # retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ] comment_times = [ (i - time_posted) / 1800 for i in comment_times ] retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ] min_comment_time = min( comment_times ) max_comment_time = max( comment_times ) min_comment_value = min( comment_values ) max_comment_value = max( comment_values ) min_retweet_time = min( retweet_times ) max_retweet_time = max( retweet_times ) min_retweet_value = min( retweet_values ) max_retweet_value = max( retweet_values ) met_threshold_pt = -1 if threshold_value != -1: met_threshold_pt = index_id( retweet_values, operator.ge, threshold_value ) if met_threshold_pt == -1 or tag_hit: # This can happen if threshold_value was set to a number # because the author or a tag was matched, but the article # was unpopular. We choose to put a marker at point index 0. met_threshold_pt = 0 if is_odd_row != 0: bg_color = even_background watermark_color = even_watermark else: bg_color = odd_background watermark_color = odd_watermark if len( comment_values ) < 8 and len( comment_values ) > 1: # max_comment_value *= 2 pass elif len( comment_values ) == 1: min_comment_value = 0 if len( retweet_values ) < 8 and len( retweet_values ) > 1: # max_retweet_value *= 2 pass elif len( retweet_values ) == 1: min_retweet_value = 0 min_comment_value = 0 min_retweet_value = 0 chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \ ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color ) chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ), ','.join( [ str( n ) for n in comment_values ] ), ','.join( [ str( n ) for n in retweet_times ] ), ','.join( [ str( n ) for n in retweet_values ] ) ) # TODO: Consider watermark levels, like: # chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5 if max_retweet_value > 0: threshold_percent = max( 0, min( (float(threshold_value) / max_retweet_value) - 0.01, 1.0 ) ) else: threshold_percent = 1.0 chart_url += "&chm=r,%s,0,0,%1.3f" % ( watermark_color, threshold_percent ) if met_threshold_pt != -1: if tag_hit: dot_color = tag_color dot_shape = 'd' else: dot_color = threshold_color dot_shape = 'o' chart_url += "|%s,%s,1,%d,10" % ( dot_shape, dot_color, met_threshold_pt ) chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \ ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value, 0, max( 7, max_comment_time ), min_comment_value, max_comment_value, 0, max( 7, max_retweet_time ), min_comment_value, max_retweet_value ) chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, ) return chart_url def process_feed( yaml_items ): """ Retrieve the url and process it. feed_info (in, out) A tuple that describes an individual feed, like its name and etag. """ feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) if hasattr( feed, 'status' ): if feed.status == 304: pass else: feed_is_modified = True if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: if feed.status == 503: print "the feed is temporarily unavailable." elif feed.status == 400: print "the feed says we made a bad request." elif feed.status == 502: print "the feed reported a bad gateway error." elif feed.status == 404: print "the feed says the page was not found." elif feed.status == 500: print "the feed had an internal server error." elif feed.status == 403: print "Access to the feed was forbidden." else: print "the feed returned feed.status %d." % ( feed.status, ) else: # Save off this if hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ): print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % ( str( feed.bozo_exception ) ) else: try: with open( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' ) as f: pickle.dump( feed, f ) except( pickle.PicklingError, exceptions.TypeError ), e: print "An error occurred while pickling the feed: %s." % \ ( # str(e.__class__), str(e) ) traceback.print_exc( 3, file = sys.stdout ) feed_is_modified = False for i in reversed( feed.entries ): process_item( i, yaml_items ) # If we have more than 200 items, remove the old ones. while len( yaml_items ) > 200: yaml_items.pop() # cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) ) for i in yaml_items: # i['title'] = asciiize( i['title'] ) # i['tags'] = map( asciiize, i['tags'] ) process_yaml_item( i ) else: if hasattr(feed, 'bozo_exception'): e = feed.bozo_exception if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110: print_last_line = True if hasattr(e, 'reason'): if e.reason[0] == 110: print "the feed's connection timed out." print_last_line = False elif e.reason[0] == 111: print "the feed's connection was refused." print_last_line = False elif e.reason[0] == 104: print "the feed reset the connection." print_last_line = False else: print "the feed had a URLError with reason %s." % ( str(e.reason), ) print_last_line = False if print_last_line: print "the feed had a URLError %s" % ( str(e), ) elif isinstance( e, httplib.BadStatusLine ): print "the feed gave a bad status line. (%s)" % ( str(e), ) else: if len( str(e) ): print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) ) else: print "the feed bozo_exception: %s %s" % ( str(e.__class__), repr(e) ) else: print "the feed returned class %s, %s" % ( str(feed.__class__), str(feed) ) def process_item( feed_item, yaml_items ): # Get the time global any_entry_added timecode_now = int( time.time() ) date_parsed = time.gmtime() if hasattr( feed_item, 'issued_parsed' ): date_parsed = feed_item.issued_parsed date_set = True elif hasattr( feed_item, 'date_parsed' ): date_parsed = feed_item.date_parsed date_set = True else: print "process_item found no timestamp for", asciiize( feed_item.link ) timecode_parsed = calendar.timegm( date_parsed ) # Look for i.feedburner_origlink in yaml_items yaml_item = None for i in yaml_items: if hasattr( feed_item, 'feedburner_origlink' ) and feed_item.feedburner_origlink == i['link']: yaml_item = i break elif feed_item.link == i['link']: yaml_item = i break if not yaml_item: author = '' link = feed_item.link if hasattr( feed_item, 'author' ): author = asciiize( feed_item.author ) if hasattr( feed_item, 'feedburner_origlink' ): link = feed_item.feedburner_origlink # Make a new yaml_item yaml_item = { 'title' : asciiize( feed_item.title ), 'link' : asciiize( link ), 'author' : author, 'tags' : [], 'orig_posted' : timecode_parsed, 'qualified' : -1, 'comment_times' : [], 'comments' : [], 'fb_shares' : [], 'slash_comment_times' : [], 'slash_comments' : [], 'retweet_times' : [], 'retweets' : [] } if hasattr( feed_item, 'tags' ): for i in feed_item.tags: yaml_item['tags'].append( asciiize( i.term ) ) yaml_items.insert( 0, yaml_item ) any_entry_added = True # Maybe check to ensure that this item isn't too old. if timecode_parsed < timecode_now - 60 * 30 * 9: return # Now, add the new values if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8: any_entry_added = True yaml_item['slash_comment_times'].append( timecode_now ) yaml_item['slash_comments'].append( int( feed_item.slash_comments ) ) def process_yaml_item( yaml_item ): global any_entry_added timecode_now = int( time.time() ) if len( yaml_item['comments'] ) < 8: num_shares, num_comments = Get_fb_stats( yaml_item['link'] ) # disqus_id = Get_disqus_id( yaml_item ) # num_comments = Get_num_disqus_comments( yaml_item['link'], disqus_id, cookie ) if num_comments != -1: any_entry_added = True yaml_item['comment_times'].append( timecode_now ) yaml_item['comments'].append( num_comments ) yaml_item['fb_shares'].append( num_shares ) if len( yaml_item['retweets'] ) < 8: num_retweets = Get_num_retweets( yaml_item ) if num_retweets != -1: any_entry_added = True yaml_item['retweet_times'].append( timecode_now ) yaml_item['retweets'].append( num_retweets ) def Get_num_comments( url_string ): try: f = urllib2.urlopen( url_string ) data = f.read() f.close() except urllib2.URLError, e: if hasattr( e, 'reason' ): print "Get_num_comments got an error:", e.reason elif hasattr( e, 'code' ): print "Get_num_comments got an error. Code:", e.code return -1 tag_to_find = '<a href="#comments" rel="nofollow">' offset = data.find( tag_to_find ) if offset != -1: start_pos = offset + len( tag_to_find ) end_pos = start_pos while str.isdigit( data[ end_pos ] ): end_pos += 1 if end_pos > start_pos: return int( data[start_pos:end_pos] ) return -1 def Get_cookie( cookie_request ): cookie = cookielib.CookieJar() error_string = "Get_cookie didn't." try: cookie_response = urllib2.urlopen( cookie_request ) cookie.extract_cookies( cookie_response, cookie_request ) return cookie except urllib2.URLError, e: if hasattr( e, 'reason' ): error_string = "Get_cookie got an error: %s" % ( str( e.reason ) ) elif hasattr( e, 'code' ): error_string = "Get_cookie got an error. Code: %s" % ( str( e.code ) ) print error_string return None def Get_disqus_id( yaml_item ): if 'disqus_id' in yaml_item: return yaml_item['disqus_id'] url_get_data = '' try: f = urllib2.urlopen( yaml_item['link'] ) data = f.read() f.close() except urllib2.URLError, e: if hasattr( e, 'reason' ): print "Get_disqus_id got an error:", e.reason elif hasattr( e, 'code' ): print "Get_disqus_id got an error. Code:", e.code, yaml_item['link'] return url_get_data except httplib.BadStatusLine, e: print "Get_discus_id got a BadStatusLine:", str( e ) return url_get_data tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="' offset = data.find( tag_to_find ) if offset != -1: start_pos = offset + len( tag_to_find ) end_pos = start_pos while data[ end_pos ] != '"' and end_pos < start_pos + 200: end_pos += 1 if end_pos < start_pos + 200: url_get_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' ) yaml_item['disqus_id'] = url_get_data # else: # print "Get_disqus_id could not find #comments anchor for", yaml_item['link'] return url_get_data def Get_num_disqus_comments( url_string, disqus_id, cookie ): if cookie == None or disqus_id == '': return -1 opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) ) request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + disqus_id ) try: response = opener.open( request ) disqus_data = response.read() except urllib2.URLError, e: if hasattr( e, 'reason' ): print "Get_num_disqus_comments got an error getting the count:", e.reason elif hasattr( e, 'code' ): print "Get_num_disqus_comments got an error getting the count. Code:", e.code return -1 disqus_tag_to_find = 'displayCount(' disqus_offset = disqus_data.find( disqus_tag_to_find ) if disqus_offset != -1: start_pos = disqus_offset + len( disqus_tag_to_find ) end_pos = disqus_data.find( '}]})', start_pos ) if end_pos != -1: return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] ) else: print "Get_num_disqus_comments found no disqus tag for", url_string return -1 def Get_num_retweets_unused( yaml_item ): """ TODO: Support for retweents has been removed. See: https://twittercommunity.com/t/a-new-design-for-tweet-and-follow-buttons/52791 So instead, use facebook. curl https://graph.facebook.com/fql?q=SELECT%20total_count,comment_count,like_count,share_count%20FROM%20link_stat%20WHERE%20url=%27http://techcrunch.com/2015/11/22/the-real-reason-on-demand-startups-are-reclassifying-workers/?ncid=rss%27 """ url_string = yaml_item['link'] try: f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) ) data = f.read() f.close() except urllib2.URLError, e: if hasattr( e, 'reason' ): print "Get_num_retweets got an error:", e.reason elif hasattr( e, 'code' ): print "Get_num_retweets got an error. Code:", e.code return -1 tag_to_find = '<span class="c">' offset = data.find( tag_to_find ) if offset != -1: start_pos = offset + len( tag_to_find ) end_pos = data.find( '<', start_pos ) if end_pos != -1: try: return int( data[ start_pos:end_pos ] ) except ValueError, e: if data[ start_pos:end_pos ] != '?': print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], ) else: print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \ ( yaml_item['title'][:20], texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) ) ) return -1 def Get_num_retweets( yaml_item ): """ TODO: Support for retweents has been removed. See: https://twittercommunity.com/t/a-new-design-for-tweet-and-follow-buttons/52791 So instead, use facebook. curl https://graph.facebook.com/fql?q=SELECT%20total_count,comment_count,like_count,share_count%20FROM%20link_stat%20WHERE%20url=%27http://techcrunch.com/2015/11/22/the-real-reason-on-demand-startups-are-reclassifying-workers/?ncid=rss%27 """ url_string = yaml_item['link'] try: f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % \ urllib.quote_plus( url_string ) ) data = f.read() f.close() except (urllib2.URLError, httplib.BadStatusLine), e: if hasattr( e, 'reason' ): print "Get_num_retweets got an error:", e.reason elif hasattr( e, 'code' ): print "Get_num_retweets got an error. Code:", e.code else: print "Get_num_retweets got an error:", str( e ) return -1 tag_to_find = '"count":' offset = data.find( tag_to_find ) if offset != -1: start_pos = offset + len( tag_to_find ) end_pos = data.find( ',', start_pos ) if end_pos != -1: try: return int( data[ start_pos:end_pos ] ) except ValueError, e: if data[ start_pos:end_pos ] != '?': print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], ) else: print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \ ( yaml_item['title'][:20], texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) ) ) return -1 def Get_fb_stats( url_string ): """ Returns shares and comments """ shares = -1 comments = -1 try: f = urllib2.urlopen( 'https://graph.facebook.com/?ids=' + url_string ) data = f.read() f.close() except (urllib2.URLError, httplib.BadStatusLine), e: if hasattr( e, 'reason' ): # URLError print "Get_fb_stats got an error:", e.reason, url_string elif hasattr( e, 'code' ): #URLError print "Get_fb_stats got an error. Code:", e.code, url_string else: print "Get_fb_stats got an error:", str( e ) return -1, -1 if len( data ) > len( url_string ): d = json.loads( data ).values()[0] if d.has_key( 'shares' ): shares = d['shares'] else: shares = 0 if d.has_key( 'comments' ): comments = d['comments'] else: comments = 0 else: print "Get_fb_stats got too little data for ", url_string return shares, comments def Save_image( url_string, file_path ): try: f = urllib2.urlopen( url_string ) data = f.read() f.close() except (urllib2.URLError, httplib.BadStatusLine), e: if hasattr( e, 'reason' ): # URLError print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason elif hasattr( e, 'code' ): # URLError print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Code:", e.code else: print "Save_image: Error from urlopen", e return url_string if len( data ) > 50: with open( file_path, 'wb' ) as f: f.write( data ) return 'cache/' + os.path.basename( file_path ) return url_string def Make_index_html( yaml_items, weekend_stats, weekday_stats ): cur_time = int( time.time() ) new_index_fullpath = os.path.join( localdir, 'index.html_new' ) index_fullpath = os.path.join( localdir, 'index.html' ) cache_path = os.path.join( localdir, 'cache' ) files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) ) f = file( new_index_fullpath, 'w' ) f.write( html_head % ( even_background, odd_background ) ) f.write( '<div align="center">\n<table class="legend">\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n' ) f.write( '<tr><th>Weekday</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2] ) ) f.write( '<tr><th>Weekend</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2] ) ) f.write( '</table></div>\n<br />\n' ) f.write( '<div align="center">\n<table>\n' ) for image_index, image in enumerate(yaml_items[:40]): tag_hit = False if image['author'].lower() in authors_to_post: tag_hit = True elif len( set([j.lower() for j in image['tags']]) & tags_to_post ) > 0: tag_hit = True chart_url = make_chart_url( image['orig_posted'], image['comment_times'], image['comments'], image['retweet_times'], image['retweets'], image['qualified'], image_index % 2, tag_hit ) # if image['title'].startswith( 'Too ' ): # print image['title'], image['qualified'], image['retweet_times'] # print chart_url image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) ) f.write( '<tr valign="center" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \ ( image_index % 2 and "even" or "odd", image['link'], image['title'].encode( 'ascii', 'xmlcharrefreplace' ), image['author'].encode( 'ascii', 'xmlcharrefreplace' ), ) ) f.write( ' <td>%s<td>\n' % ( image['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) ) f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \ ( image_url, img_width, img_height ) ) f.write( html_footer ) f.close() if os.path.exists( index_fullpath ): os.unlink( index_fullpath ) shutil.move( new_index_fullpath, index_fullpath ) for fname in files_to_delete: os.unlink( fname ) def Make_feed_file( yaml_items ): with open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' ) as f: f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" ) f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) ) count = 0 for item in yaml_items: now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) ) if item['qualified'] != -1: escaped_title = cgi.escape( item['title'] ).encode( 'ascii', 'xmlcharrefreplace' ) escaped_author = cgi.escape( item['author'] ).encode( 'ascii', 'xmlcharrefreplace' ) f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ ( escaped_title, now, item['link'], item['link'], escaped_author ) ) count += 1 if count > 14: break f.write( "</channel></rss>" ) if __name__=='__main__': start_time = time.time() progress_text = [] old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = sys.stderr = StringIO.StringIO() try: localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) ) # # Read in techcrunch.yaml # # [ { 'title' : 'Title Text', # 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/', # 'author' : u'MG Siegler', # 'orig_posted' : 1282197199 # 'tags' : [ u'Google', u'privacy' ] # 'qualified' : -1 # 'comment_times' : [ 1282197199, 1282197407 ] # 'comments' : [ 0, 15 ] # 'fb_shares' : [ 0, 3 ] # 'slash_comment_times' : [ 1282197199, 1282197407 ] # 'slash_comments' : [ 0, 5 ] # 'retweet_times' : [ 1282197199, 1282197407 ] # 'retweets' : [ 0, 43 ] # }, # { ... } # ] # yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' ) if os.path.exists( yaml_fullpath ): with open( yaml_fullpath, 'rb' ) as f: items = yaml.load( f ) # Do any dictionary item updating that might be necessary # for item in items: # if not item.has_key( 'fb_shares' ): # item['fb_shares'] = [] else: print "could not open", yaml_fullpath items = [] progress_text = [ "read techcrunch.yaml" ] process_feed( items ) # # If any work was done, then write files. # if any_entry_added: weekend_stats, weekday_stats = analysis.Process_retweets_for_feed( items ) # We'll only look at the stats for the time 1:00 to 1:30 after posting. weekend_median, weekend_mean, weekend_sigma = weekend_stats[2] weekend_threshold = weekend_mean + weekend_sigma weekday_median, weekday_mean, weekday_sigma = weekday_stats[2] weekday_threshold = weekday_mean + weekday_sigma for item in items: wday = time.localtime( item['orig_posted'] ).tm_wday if wday == 5 or wday == 6: threshold = weekend_threshold else: threshold = weekday_threshold if item['qualified'] == -1: for i in range( len( item['retweet_times'] ) ): r_time = item['retweet_times'][i] if r_time - item['orig_posted'] < 5400: if item['retweets'][i] >= threshold: item['qualified'] = threshold if r_time - item['orig_posted'] >= 3600: break # Automatically add those items whose authors and tags I like for item in items: if item['qualified'] == -1 and len( item['retweet_times'] ) > 0: if item['author'].lower() in authors_to_post: item['qualified'] = threshold elif len( set([j.lower() for j in item['tags']]) & tags_to_post ) > 0: item['qualified'] = threshold # # Write out the updated yaml file. # # For the one file we really use, write to a file on the side, then move it. yaml_newfile_fullpath = os.path.join( localdir, 'techcrunch_temp_writable.yaml' ) with open( yaml_newfile_fullpath, 'wb' ) as f: yaml.dump( items, f, width=120 ) try: os.rename( yaml_newfile_fullpath, yaml_fullpath ) except OSError as e: print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath) with open( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' ) as f: yaml.dump( items, f, width=120 ) with codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' ) as f: yaml.dump( items, f, encoding='utf-8', width=120 ) Make_feed_file( items ) Make_index_html( items, weekend_stats, weekday_stats ) else: print "No entries were added this time." except Exception, e: exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str(e) print exceptional_text, ' '.join( progress_text ) traceback.print_exc( file = sys.stdout ) try: sendEmail( 'Exception thrown in techcrunch.py', exceptional_text + "\n" + traceback.format_exc(), ( 'david.blume@gmail.com', ) ) except Exception, e: print "Could not send email to notify you of the exception. :(" message = sys.stdout.getvalue() sys.stdout = old_stdout sys.stderr = old_stderr if not debug: print message # Finally, let's save this to a statistics page if os.path.exists( os.path.join( localdir, 'stats.txt' ) ): with open( os.path.join( localdir, 'stats.txt' )) as f: lines = f.readlines() else: lines = [] lines = lines[:168] # Just keep the past week's worth # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK" status = len( message.strip() ) and '\n '.join( message.splitlines() ) or "OK" lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status )) with open( os.path.join( localdir,'stats.txt' ), 'w' ) as f: f.writelines( lines )