techcrunch.git / techcrunch.py
Original 2010-09-03 version
David Blume committed 7d91adf at 2018-01-20 20:10:33
#!/usr/bin/python2.5
# -*- coding: utf-8 -*-
# chmod 755 me, and make sure I have UNIX style newlines.
#
# techcrunch.py
#
# http://feeds.feedburner.com/TechCrunch
# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments']
#
# TODO:
# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>'
#    link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"

import feedparser
import yaml
import sys
import os
import time
import StringIO
import codecs
import traceback
import calendar
import pickle
import exceptions
import urllib
import urllib2
import httplib
import shutil
import glob
import smtplib
import bisect
import analysis
import simplejson as json
import cookielib

debug = True
any_entry_added = False
localdir = ''

html_head = """<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>
<HTML><HEAD>
  <title>TechCrunch Feed Filter</title>
  <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> -->
  <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" />
  <style type="text/css">
    body { font-family: "Arial", sans-serif; }
    .author { font-size: smaller; }
    .h3 { font-size: larger; }
    a { text-decoration: none; }
    /* table { border: none; border-collapse: collapse; font-size: large } */
    table { border-collapse: collapse; }
    table.legend { border: 1px solid LightSlateGray; font-size: medium; border-collapse: separate; }
    table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; }
    table.legend td { border: 1px solid LightSlateGray; }
    tr.even { background: #%s; padding: 2em; }
    tr.odd { background: #%s; padding-bottom: 2em; }
  </style>
</HEAD>
<BODY>
<div align='center'><h3>TechCrunch Feed Filter</h3></div>
This page shows the analysis used to filter the noise out of the TechCrunch feed into
<a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
"""

html_footer = """
</table>
</div><br />
<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>,
<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br />
<a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a></div><br />
</BODY>
</HTML>
"""

img_width = 300
img_height = 50

series_1_color = "0000FF"
series_2_color = "00AA00"
threshold_color = "FF8C00"
even_background = "F8F8F8"
#even_background = "FFFFFF"
odd_background = "E8E8E8"


def asciiize( s ):
    """Return s encoded as plain ASCII, or unchanged if that isn't possible."""
    try:
        return s.encode( 'ascii' )
    except UnicodeEncodeError, e:
        return s
    except exceptions.AttributeError, e:
        return s


def sendEmail( subject, message, toaddrs,
               fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ):
    """Sends Email"""
    smtp = smtplib.SMTP( 'localhost' )
    # Note: a blank line (CRLF CRLF) must separate the headers from the body.
    smtp.sendmail( fromaddr,
                   toaddrs,
                   "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % \
                   ( fromaddr, ", ".join( toaddrs ), subject, message ) )
    smtp.quit()

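# A reader's note on the Google Chart API parameters make_chart_url assembles
# below (the API this script was written against, since retired by Google):
#   cht=lxy    an x/y line chart
#   chd=t:...  four data series: comment times, comment counts,
#              retweet times, retweet counts
#   chds       per-series axis scaling
#   chxt/chxl  left (comments) and right (retweets) y-axis labels
#   chm=o,...  a circle marker on the sample where the retweet threshold
#              was met, when there is one
#   chf/chdl   background fill matched to the table row, plus the legend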
def make_chart_url( time_posted, comment_times, comment_values, retweet_times,
                    retweet_values, met_threshold_pt, bg_color ):
#   comment_times, comment_values = zip( *comments )
#   retweet_times, retweet_values = zip( *retweets )
    # TODO handle failure cases, -1
    if not len( comment_times ):
        comment_times = [ time_posted, ]
    if not len( comment_values ):
        comment_values = [ 0, ]
    if not len( retweet_times ):
        retweet_times = [ time_posted, ]
    if not len( retweet_values ):
        retweet_values = [ 0, ]

#   comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ]
#   retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ]
    # Convert sample timestamps to half-hour intervals since the item posted.
    comment_times = [ (i - time_posted) / 1800 for i in comment_times ]
    retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ]

    min_comment_time = min( comment_times )
    max_comment_time = max( comment_times )
    min_comment_value = min( comment_values )
    max_comment_value = max( comment_values )
    min_retweet_time = min( retweet_times )
    max_retweet_time = max( retweet_times )
    min_retweet_value = min( retweet_values )
    max_retweet_value = max( retweet_values )

    if len( comment_values ) < 8 and len( comment_values ) > 1:
#       max_comment_value *= 2
        pass
    elif len( comment_values ) == 1:
        min_comment_value = 0
    if len( retweet_values ) < 8 and len( retweet_values ) > 1:
#       max_retweet_value *= 2
        pass
    elif len( retweet_values ) == 1:
        min_retweet_value = 0

    min_comment_value = 0
    min_retweet_value = 0

    chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \
                ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color )
    chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ),
                                          ','.join( [ str( n ) for n in comment_values ] ),
                                          ','.join( [ str( n ) for n in retweet_times ] ),
                                          ','.join( [ str( n ) for n in retweet_values ] ) )
    if met_threshold_pt != -1:
        chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt )
    # The last chds pair scales the retweet values.  (The original read
    # min_comment_value here, an apparent copy-paste slip; both minimums are
    # forced to 0 above, so the output is unchanged.)
    chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
                 ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
                   0, max( 7, max_comment_time ), min_comment_value, max_comment_value,
                   0, max( 7, max_retweet_time ), min_retweet_value, max_retweet_value )
    chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, )
    return chart_url

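# feedparser notes for the function below: when the HTTP fetch gets far enough
# to receive a response, feedparser exposes the status code as feed.status;
# when it fails at a lower level it records the exception in
# feed.bozo_exception instead (e.g. urllib2.URLError or
# httplib.BadStatusLine).  process_feed branches on those two cases to report
# what went wrong.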
def process_feed( yaml_items ):
    """Retrieve the TechCrunch feed and fold its entries into yaml_items.

    yaml_items (in, out) The list of tracked stories, newest first.
    """
    feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
    if hasattr( feed, 'status' ):
        if feed.status == 304:
            pass
        else:
            feed_is_modified = True
            if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
                if feed.status == 503:
                    print "the feed is temporarily unavailable."
                elif feed.status == 400:
                    print "the feed says we made a bad request."
                elif feed.status == 502:
                    print "the feed reported a bad gateway error."
                elif feed.status == 404:
                    print "the feed says the page was not found."
                elif feed.status == 500:
                    print "the feed had an internal server error."
                elif feed.status == 403:
                    print "Access to the feed was forbidden."
                else:
                    print "the feed returned feed.status %d." % ( feed.status, )
            else:
                # Save off this feed so it can be inspected offline.
                f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
                try:
                    pickle.dump( feed, f )
                except ( pickle.PicklingError, exceptions.TypeError ), e:
                    print "An error occurred while pickling the feed: %s." % \
                          ( # str(e.__class__),
                            str( e ) )
                    traceback.print_exc( file = sys.stdout )
                    feed_is_modified = False
                f.close()

                for i in reversed( feed.entries ):
                    process_item( i, yaml_items )

                # If we have more than 200 items, remove the old ones.
                while len( yaml_items ) > 200:
                    yaml_items.pop()

                cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )

                for i in yaml_items:
#                   i['title'] = asciiize( i['title'] )
#                   i['tags'] = map( asciiize, i['tags'] )
                    process_yaml_item( i, cookie )
    else:
        if hasattr( feed, 'bozo_exception' ):
            e = feed.bozo_exception
            if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110:
                print_last_line = True
                if hasattr( e, 'reason' ):
                    if e.reason[0] == 110:
                        print "the feed's connection timed out."
                        print_last_line = False
                    elif e.reason[0] == 111:
                        print "the feed's connection was refused."
                        print_last_line = False
                    elif e.reason[0] == 104:
                        print "the feed reset the connection."
                        print_last_line = False
                    else:
                        print "the feed had a URLError with reason %s." % ( str( e.reason ), )
                        print_last_line = False
                if print_last_line:
                    print "the feed had a URLError %s" % ( str( e ), )
            elif isinstance( e, httplib.BadStatusLine ):
                if hasattr( e, 'message' ):
                    print "the feed gave a bad status line %s." % ( str( e.message ), )
                else:
                    print "the feed gave a bad status line."
            else:
                if len( str( e ) ):
                    print "the feed bozo_exception: %s \"%s\"" % ( str( e.__class__ ), str( e ) )
                else:
                    print "the feed bozo_exception: %s %s" % ( str( e.__class__ ), repr( e ) )
        else:
            print "the feed returned class %s, %s" % ( str( feed.__class__ ), str( feed ) )

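# Sampling, as implemented below: each story keeps parallel lists of sample
# times and sample values per metric (Disqus comments, the feed's own
# slash_comments, and TweetMeme retweets).  A metric stops being sampled once
# it has 8 data points, and stories more than 4.5 hours old are not sampled
# at all, so each story is only tracked through its first few hours.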
def process_item( feed_item, yaml_items ):
    """Add feed_item to yaml_items if it's new, and record any slash_comments sample."""
    global any_entry_added
    timecode_now = int( time.time() )

    # Get the time the item was posted.
    date_parsed = time.gmtime()
    if hasattr( feed_item, 'issued_parsed' ):
        date_parsed = feed_item.issued_parsed
    elif hasattr( feed_item, 'date_parsed' ):
        date_parsed = feed_item.date_parsed
    else:
        print "process_item found no timestamp for", asciiize( feed_item.link )
    timecode_parsed = calendar.timegm( date_parsed )

    # Look for the item's permalink among the items already being tracked.
    # (getattr guards against entries that lack feedburner_origlink.)
    link = getattr( feed_item, 'feedburner_origlink', feed_item.link )
    yaml_item = None
    for i in yaml_items:
        if link == i['link']:
            yaml_item = i
            break

    if not yaml_item:
        author = ''
        if hasattr( feed_item, 'author' ):
            author = asciiize( feed_item.author )

        # Make a new yaml_item
        yaml_item = { 'title'               : asciiize( feed_item.title ),
                      'link'                : asciiize( link ),
                      'author'              : author,
                      'tags'                : [],
                      'orig_posted'         : timecode_parsed,
                      'qualified'           : -1,
                      'comment_times'       : [],
                      'comments'            : [],
                      'slash_comment_times' : [],
                      'slash_comments'      : [],
                      'retweet_times'       : [],
                      'retweets'            : []
                    }
        if hasattr( feed_item, 'tags' ):
            for i in feed_item.tags:
                yaml_item['tags'].append( asciiize( i.term ) )
        yaml_items.insert( 0, yaml_item )
        any_entry_added = True

    # Don't bother sampling items more than nine half-hours (4.5 hours) old.
    if timecode_parsed < timecode_now - 60 * 30 * 9:
        return

    # Now, add the new values
    if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8:
        any_entry_added = True
        yaml_item['slash_comment_times'].append( timecode_now )
        yaml_item['slash_comments'].append( int( feed_item.slash_comments ) )


def process_yaml_item( yaml_item, cookie ):
    """Sample the current comment and retweet counts for one tracked story."""
    global any_entry_added
    timecode_now = int( time.time() )

    if len( yaml_item['comments'] ) < 8:
        num_comments = Get_num_disqus_comments( yaml_item['link'], cookie )
        if num_comments != -1:
            any_entry_added = True
            yaml_item['comment_times'].append( timecode_now )
            yaml_item['comments'].append( num_comments )

    if len( yaml_item['retweets'] ) < 8:
        num_retweets = Get_num_retweets( yaml_item['link'] )
        if num_retweets != -1:
            any_entry_added = True
            yaml_item['retweet_times'].append( timecode_now )
            yaml_item['retweets'].append( num_retweets )


def Get_num_comments( url_string ):
    # Scrapes the comment count straight out of the article page.  Nothing in
    # this script calls it any more; apparently superseded by
    # Get_num_disqus_comments.
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_comments got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_comments got an error. Code:", e.code
        return -1

    tag_to_find = '<a href="#comments" rel="nofollow">'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = start_pos
        while str.isdigit( data[ end_pos ] ):
            end_pos += 1
        if end_pos > start_pos:
            return int( data[start_pos:end_pos] )
    return -1


def Get_cookie( cookie_request ):
    """Fetch cookie_request and return the CookieJar it populates, or None."""
    cookie = cookielib.CookieJar()
    try:
        cookie_response = urllib2.urlopen( cookie_request )
        cookie.extract_cookies( cookie_response, cookie_request )
        return cookie
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_cookie got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_cookie got an error. Code:", e.code
    return None

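# How the Disqus count is obtained, as far as the code below shows: the
# article page embeds a thread identifier in a
# <span class="dsq-postid" rel="..."> attribute; that identifier is passed to
# Disqus's count.js endpoint (along with the cookie obtained by Get_cookie),
# which responds with a displayCount(...) JavaScript payload whose JSON
# includes the comment count.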
def Get_num_disqus_comments( url_string, cookie ):
    if cookie == None:
        return -1
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_disqus_comments got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_disqus_comments got an error. Code:", e.code
        return -1

    tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
    disqus_tag_to_find = 'displayCount('
    offset = data.find( tag_to_find )
    if offset != -1:
        # Pull out the Disqus thread id embedded in the rel attribute.
        start_pos = offset + len( tag_to_find )
        end_pos = start_pos
        while data[ end_pos ] != '"' and end_pos < start_pos + 200:
            end_pos += 1
        if end_pos < start_pos + 200:
            opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) )
            url_GET_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' )
            request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + url_GET_data )
            try:
                response = opener.open( request )
                disqus_data = response.read()
            except urllib2.URLError, e:
                if hasattr( e, 'reason' ):
                    print "Get_num_disqus_comments got an error getting the count:", e.reason
                elif hasattr( e, 'code' ):
                    print "Get_num_disqus_comments got an error getting the count. Code:", e.code
                disqus_data = ""
            disqus_offset = disqus_data.find( disqus_tag_to_find )
            if disqus_offset != -1:
                start_pos = disqus_offset + len( disqus_tag_to_find )
                end_pos = disqus_data.find( '}]})', start_pos )
                if end_pos != -1:
                    return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] )
    return -1


def Get_num_retweets( url_string ):
    """Ask TweetMeme's button.js for the story's current retweet count."""
    try:
        f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string, ) )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_retweets got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_retweets got an error. Code:", e.code
        return -1

    tag_to_find = '<span class="c">'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = data.find( '<', start_pos )
        if end_pos != -1:
            return int( data[ start_pos:end_pos ] )
    return -1


def Save_image( url_string, file_path ):
    """Download url_string to file_path and return its cache-relative path,
    or the original URL if the download didn't pan out."""
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Save_image got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Save_image got an error. Code:", e.code
        return url_string

    if len( data ) > 50:
        f = open( file_path, 'wb' )
        f.write( data )
        f.close()
        return 'cache/' + os.path.basename( file_path )
    return url_string

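# A note on the page build below: each story's chart is downloaded from Google
# and served from the local cache/ directory (falling back to the remote chart
# URL if the download fails).  The new page is written to index.html_new and
# then moved over index.html, and only after that are the previous run's
# cached .png files deleted.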
def Make_index_html( yaml_items, stats ):
    cur_time = int( time.time() )
    new_index_fullpath = os.path.join( localdir, 'index.html_new' )
    index_fullpath = os.path.join( localdir, 'index.html' )
    cache_path = os.path.join( localdir, 'cache' )

    # (os.path.join added here; the original concatenated cache_path and
    # '*.png' directly, which globbed the wrong location.)
    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
#   shutil.rmtree( cache_path )
#   os.mkdir( cache_path )

    f = file( new_index_fullpath, 'w' )
    f.write( html_head % ( even_background, odd_background ) )

#   f.write( '<div align="center">\n<table cellpadding="4">' )
    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
    for median, mean, std_dev in stats:
        f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f</td>' % ( median, mean, std_dev ) )
    f.write( '</tr>\n</table></div>\n<br />\n' )

    f.write( '<div align="center">\n<table>\n' )
    image_index = 0
    for i in yaml_items[:40]:
        chart_url = make_chart_url( i['orig_posted'],
                                    i['comment_times'],
                                    i['comments'],
                                    i['retweet_times'],
                                    i['retweets'],
                                    i['qualified'],
                                    image_index % 2 and even_background or odd_background,
                                  )
        image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
        f.write( '<tr valign="middle" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
                 ( image_index % 2 and "even" or "odd",
                   i['link'],
                   i['title'].encode( 'ascii', 'xmlcharrefreplace' ),
                   i['author'].encode( 'ascii', 'xmlcharrefreplace' ),
                 ) )
        f.write( ' <td>%s</td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
        f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \
                 ( image_url, img_width, img_height ) )
        image_index += 1
    f.write( html_footer )
    f.close()

    if os.path.exists( index_fullpath ):
        os.unlink( index_fullpath )
    shutil.move( new_index_fullpath, index_fullpath )
    for fname in files_to_delete:
        os.unlink( fname )


def Make_feed_file( yaml_items ):
    """Write rss_feed.xml containing the most recent qualified stories."""
    f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' )
    f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" )
    f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) )
    count = 0
    for item in yaml_items:
        pub_date = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) )
        if item['qualified'] != -1:
            f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \
                     ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ),
                       pub_date,
                       item['link'],
                       item['link'],
                       item['author'].encode( 'ascii', 'xmlcharrefreplace' )
                     ) )
            count += 1
            if count > 14:
                break
    f.write( "</channel></rss>" )
    f.close()

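# Qualification, as implemented in the main block below: analysis (a local
# module, not shown here) yields a (median, mean, std_dev) triple per
# half-hour bucket.  The threshold is median + std_dev of the 1:00-1:30
# bucket, and a story qualifies when one of its early retweet samples (the
# scan ignores samples past 90 minutes and stops after the first sample an
# hour or more past posting) reaches that threshold.  item['qualified']
# records the index of the qualifying sample so the chart can circle it.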
if __name__ == '__main__':
    start_time = time.time()
    progress_text = []

    # Capture all output so it can be logged to stats.txt at the end.
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    sys.stdout = sys.stderr = StringIO.StringIO()

    try:
        localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) )

        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199,
        #     'tags'                : [ u'Google', u'privacy' ],
        #     'qualified'           : -1,
        #     'comment_times'       : [ 1282197199, 1282197407 ],
        #     'comments'            : [ 0, 15 ],
        #     'slash_comment_times' : [ 1282197199, 1282197407 ],
        #     'slash_comments'      : [ 0, 5 ],
        #     'retweet_times'       : [ 1282197199, 1282197407 ],
        #     'retweets'            : [ 0, 43 ]
        #   },
        #   { ... }
        # ]
        #
        yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' )
        if os.path.exists( yaml_fullpath ):
            f = file( yaml_fullpath, 'rb' )
            items = yaml.load( f )
            f.close()
        else:
            print "could not open", yaml_fullpath
            items = []

        progress_text = [ "read techcrunch.yaml" ]
        process_feed( items )

        #
        # If any work was done, then write files.
        # ("True or" forces a write even when nothing changed.)
        #
        if True or any_entry_added:
            stats = analysis.Process_retweets_for_feed( items )

            # We'll only look at the stats for the time 1:00 to 1:30 after posting.
            median, mean, sigma = stats[2]
            threshold = median + sigma
            for item in items:
                if item['qualified'] == -1:
                    for i in range( len( item['retweet_times'] ) ):
                        r_time = item['retweet_times'][i]
                        if r_time - item['orig_posted'] < 5400:
                            if item['retweets'][i] >= threshold:
                                item['qualified'] = i
                        if r_time - item['orig_posted'] >= 3600:
                            break

            #
            # Write out the updated yaml file.
            #
            f = file( yaml_fullpath, 'wb' )
            yaml.dump( items, f, width=120 )
            f.close()

            f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' )
            yaml.dump( items, f, width=120 )
            f.close()

            f = codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' )
            yaml.dump( items, f, encoding='utf-8', width=120 )
            f.close()

            Make_feed_file( items )
            Make_index_html( items, stats )
        else:
            print "No entries were added this time."

    except Exception, e:
        exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str( e )
        print exceptional_text, ' '.join( progress_text )
        traceback.print_exc( file = sys.stdout )
        try:
            sendEmail( 'Exception thrown in techcrunch.py',
                       exceptional_text,
                       ( 'david.blume@gmail.com', ) )
        except Exception, e:
            print "Could not send email to notify you of the exception. :("

    message = sys.stdout.getvalue()
    sys.stdout = old_stdout
    sys.stderr = old_stderr
    if not debug:
        print message

    # Finally, let's save this to a statistics page
    if os.path.exists( os.path.join( localdir, 'stats.txt' ) ):
        f = open( os.path.join( localdir, 'stats.txt' ) )
        try:
            lines = f.readlines()
        finally:
            f.close()
    else:
        lines = []
    lines = lines[:168]   # Just keep the past week's worth
    status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime( '%H:%M, %Y-%m-%d', time.localtime() ),
                                          time.time() - start_time,
                                          status ) )
    f = open( os.path.join( localdir, 'stats.txt' ), 'w' )
    f.writelines( lines )
    f.close()
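# Deployment sketch (an assumption, not part of the original file): the script
# takes no arguments and writes all output next to itself, so a crontab entry
# along these lines would keep index.html, rss_feed.xml and stats.txt current:
#
#   */30 * * * * /path/to/techcrunch.py
#
# The half-hour chart buckets suggest a 30-minute cadence, though the 168-line
# cap on stats.txt ("past week's worth") reads more like hourly runs.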