techcrunch.py (ef6f5ca) - techcrunch.git

techcrunch.py

#!/usr/bin/python
# chmod 755 me, and make sure I have UNIX style newlines.
#
# techcrunch.py
#
# http://feeds.feedburner.com/TechCrunch
# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments']
#
# TODO:
# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry &raquo;</a>'
#   link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
# 2. Add Reddit counts: curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
#
# This file was coverted from tabs to spaces with the vim command %retab
#
# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml
#

import feedparser
import yaml
import sys
import os
import time
import StringIO
import codecs
import traceback
import calendar
import pickle
import exceptions
import urllib
import urllib2
import httplib
import shutil
import glob
import smtplib
import bisect
import analysis
import json
import cookielib
import xml
import texttime
import operator
from datetime import timedelta
import cgi

debug = True
any_entry_added = False
tags_to_post = set([ 'apple', 'google'])
authors_to_post = [ 'michael arrington', ]

localdir = ''

html_head = """
<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>
<HTML><HEAD>
  <title>TechCrunch Feed Filter</title>
  
  <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" />
  <style type="text/css">
    body { font-family: "Arial", san-serif; }
    .author { font-size: smaller; }
    .h3 { font-size: larger; }
    a { text-decoration: none; }
    /* table { border: none; border-collapse:collapse; font-size: large } */
    table { border-collapse: collapse; }
    table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separated; }
    table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; }
    table.legend td { border: 1px solid LightSlateGray; }
    tr.even { background:#%s; padding: 2em; }
    tr.odd { background:#%s; padding-bottom: 2em; }
  </style>
</HEAD>
<BODY>
<div align='center'><h3>TechCrunch Feed Filter</h3></div>
This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
"""

html_footer = """
</table>
</div><br />
<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>,
<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> &bull; <a href="stats.txt">status</a><br />&copy; 2011 <a href="http://david.dlma.com">David Blume</a></div><br />
</BODY>
</HTML>
"""

img_width = 300
img_height = 50

series_1_color = "0000FF"
series_2_color = "00AA00"
threshold_color = "FF8C00"
tag_color = "F01000"

even_background = "F8F8F8"
odd_background = "E8E8E8"

even_watermark = "E0E0FF"
odd_watermark = "D0D0F0"

def asciiize( s ):
    try:
        return s.encode( 'ascii' )
    except UnicodeEncodeError, e:
        return s
    except exceptions.AttributeError, e:
        return s

def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ):
    """Sends Email"""
    smtp = smtplib.SMTP( 'localhost', port=587 )
    smtp.login( user, passw )
    smtp.sendmail( fromaddr, \
                   toaddrs, \
                   "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \
                   ( fromaddr, ", ".join( toaddrs ), subject, message ) )
    smtp.quit()

def index_id( a_list, op, elem ):
    try:
        return (index for index, item in enumerate( a_list ) if op( item, elem ) ).next()
    except:
        return -1

def index_id_simple( a_list, elem ):
    index = 0
    for item in a_list:
        if item == elem:
            return index
        index += 1
    return -1

def make_chart_url( time_posted, comment_times, comment_values, retweet_times,
                    retweet_values, threshold_value, is_odd_row, tag_hit ):
#    comment_times, comment_values = zip( *comments )
#    retweet_times, retweet_values = zip( *retweets )

# TODO handle failure cases, -1

if not len( comment_times ):
        comment_times = [ time_posted, ]
    if not len( comment_values ):
        comment_values = [ 0, ]
    if not len( retweet_times ):
        retweet_times = [ time_posted, ]
    if not len( retweet_values ):
        retweet_values = [ 0, ]

#    comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ]
#    retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ]
    comment_times = [ (i - time_posted) / 1800 for i in comment_times ]
    retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ]

min_comment_time = min( comment_times )
    max_comment_time = max( comment_times )
    min_comment_value = min( comment_values )
    max_comment_value = max( comment_values )
    min_retweet_time = min( retweet_times )
    max_retweet_time = max( retweet_times )
    min_retweet_value = min( retweet_values )
    max_retweet_value = max( retweet_values )

met_threshold_pt = -1
    if threshold_value != -1:
        met_threshold_pt = index_id( retweet_values, operator.ge, threshold_value )
        if met_threshold_pt == -1 or tag_hit:
            # This can happen if threshold_value was set to a number
            # because the author or a tag was matched, but the article
            # was unpopular. We choose to put a marker at point index 0.
            met_threshold_pt = 0

if is_odd_row != 0:
        bg_color = even_background
        watermark_color = even_watermark
    else:
        bg_color = odd_background
        watermark_color = odd_watermark

if len( comment_values ) < 8 and len( comment_values ) > 1:
        # max_comment_value *= 2
        pass
    elif len( comment_values ) == 1:
        min_comment_value = 0
    if len( retweet_values ) < 8 and len( retweet_values ) > 1:
        # max_retweet_value *= 2
        pass
    elif len( retweet_values ) == 1:
        min_retweet_value = 0

min_comment_value = 0
    min_retweet_value = 0

chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \
                ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color )
    chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ),
                                          ','.join( [ str( n ) for n in comment_values ] ),
                                          ','.join( [ str( n ) for n in retweet_times ] ),
                                          ','.join( [ str( n ) for n in retweet_values ] ) )
    # TODO: Consider watermark levels, like:
    # chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5
    if max_retweet_value > 0:
        threshold_percent = max( 0, min( (float(threshold_value) / max_retweet_value) - 0.01, 1.0  ) )
    else:
        threshold_percent = 1.0
    chart_url += "&chm=r,%s,0,0,%1.3f" % ( watermark_color, threshold_percent )
    if met_threshold_pt != -1:
        if tag_hit:
            dot_color = tag_color
            dot_shape = 'd'
        else:
            dot_color = threshold_color
            dot_shape = 'o'
        chart_url += "|%s,%s,1,%d,10" % ( dot_shape, dot_color, met_threshold_pt )
    chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
                 ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
                   0, max( 7, max_comment_time ),
                   min_comment_value, max_comment_value,
                   0, max( 7, max_retweet_time ),
                   min_comment_value, max_retweet_value )
    chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, )
    return chart_url

def process_feed( yaml_items ):
    """
    Retrieve the url and process it.
    feed_info (in, out) A tuple that describes an individual feed, like its name and etag.
    """

feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
    if hasattr( feed, 'status' ):
        if feed.status == 304:
            pass
        else:
            feed_is_modified = True
            if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
                if feed.status == 503:
                    print "the feed is temporarily unavailable."
                elif feed.status == 400:
                    print "the feed says we made a bad request."
                elif feed.status == 502:
                    print "the feed reported a bad gateway error."
                elif feed.status == 404:
                    print "the feed says the page was not found."
                elif feed.status == 500:
                    print "the feed had an internal server error."
                elif feed.status == 403:
                    print "Access to the feed was forbidden."
                else:
                    print "the feed returned feed.status %d." % ( feed.status, )
            else:
                # Save off this
                if hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
                    print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % ( str( feed.bozo_exception ) )
                else:
                    try:
                        with open( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' ) as f:
                            pickle.dump( feed, f )
                    except( pickle.PicklingError, exceptions.TypeError ), e:
                        print "An error occurred while pickling the feed: %s." % \
                              ( # str(e.__class__),
                                str(e) )
                        traceback.print_exc( 3, file = sys.stdout )
                        feed_is_modified = False

for i in reversed( feed.entries ):
                process_item( i, yaml_items )

# If we have more than 200 items, remove the old ones.
            while len( yaml_items ) > 200:
                yaml_items.pop()

#            cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )

for i in yaml_items:
                # i['title'] = asciiize( i['title'] )
                # i['tags'] = map( asciiize, i['tags'] )
                process_yaml_item( i )

else:
        if hasattr(feed, 'bozo_exception'):
            e = feed.bozo_exception
            if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110:
                print_last_line = True
                if hasattr(e, 'reason'):
                    if e.reason[0] == 110:
                        print "the feed's connection timed out."
                        print_last_line = False
                    elif e.reason[0] == 111:
                        print "the feed's connection was refused."
                        print_last_line = False
                    elif e.reason[0] == 104:
                        print "the feed reset the connection."
                        print_last_line = False
                    else:
                        print "the feed had a URLError with reason %s." % ( str(e.reason), )
                        print_last_line = False
                if print_last_line:
                    print "the feed had a URLError %s" % ( str(e), )
            elif isinstance( e, httplib.BadStatusLine ):
                print "the feed gave a bad status line. (%s)" % ( str(e), )
            else:
                if len( str(e) ):
                    print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) )
                else:
                    print "the feed bozo_exception: %s %s" % ( str(e.__class__), repr(e) )
        else:
            print "the feed returned class %s, %s" % ( str(feed.__class__), str(feed) )

def process_item( feed_item, yaml_items ):
    # Get the time
    global any_entry_added
    timecode_now = int( time.time() )
    date_parsed = time.gmtime()
    if hasattr( feed_item, 'issued_parsed' ):
        date_parsed = feed_item.issued_parsed
        date_set = True
    elif hasattr( feed_item, 'date_parsed' ):
        date_parsed = feed_item.date_parsed
        date_set = True
    else:
        print "process_item found no timestamp for", asciiize( feed_item.link )
    timecode_parsed = calendar.timegm( date_parsed )

# Look for i.feedburner_origlink in yaml_items
    yaml_item = None
    for i in yaml_items:
        if hasattr( feed_item, 'feedburner_origlink' ) and feed_item.feedburner_origlink == i['link']:
            yaml_item = i
            break
        elif feed_item.link == i['link']:
            yaml_item = i
            break
    if not yaml_item:
        author = ''
        link = feed_item.link
        if hasattr( feed_item, 'author' ):
            author = asciiize( feed_item.author )
        if hasattr( feed_item, 'feedburner_origlink' ):
            link = feed_item.feedburner_origlink

# Make a new yaml_item
        yaml_item = { 'title'               : asciiize( feed_item.title ),
                      'link'                : asciiize( link ),
                      'author'              : author,
                      'tags'                : [],
                      'orig_posted'         : timecode_parsed,
                      'qualified'           : -1,
                      'comment_times'       : [],
                      'comments'            : [],
                      'fb_shares'           : [],
                      'slash_comment_times' : [],
                      'slash_comments'      : [],
                      'retweet_times'       : [],
                      'retweets'            : []
                    }
        if hasattr( feed_item, 'tags' ):
            for i in feed_item.tags:
                yaml_item['tags'].append( asciiize( i.term ) )

yaml_items.insert( 0, yaml_item )
        any_entry_added = True

# Maybe check to ensure that this item isn't too old.
    if timecode_parsed < timecode_now - 60 * 30 * 9:
        return

# Now, add the new values
    if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8:
        any_entry_added = True
        yaml_item['slash_comment_times'].append( timecode_now )
        yaml_item['slash_comments'].append( int( feed_item.slash_comments ) )

def process_yaml_item( yaml_item ):
    global any_entry_added

timecode_now = int( time.time() )
    if len( yaml_item['comments'] ) < 8:
        num_shares, num_comments = Get_fb_stats( yaml_item['link'] )
#        disqus_id = Get_disqus_id( yaml_item )
#        num_comments = Get_num_disqus_comments( yaml_item['link'], disqus_id, cookie )
        if num_comments != -1:
            any_entry_added = True
            yaml_item['comment_times'].append( timecode_now )
            yaml_item['comments'].append( num_comments )
            yaml_item['fb_shares'].append( num_shares )

if len( yaml_item['retweets'] ) < 8:
        num_retweets = Get_num_retweets( yaml_item )
        if num_retweets != -1:
            any_entry_added = True
            yaml_item['retweet_times'].append( timecode_now )
            yaml_item['retweets'].append( num_retweets )

def Get_num_comments( url_string ):
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_comments got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_comments got an error. Code:", e.code
        return -1
    tag_to_find = '<a href="#comments" rel="nofollow">'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = start_pos
        while str.isdigit( data[ end_pos ] ):
            end_pos += 1
        if end_pos > start_pos:
            return int( data[start_pos:end_pos] )
    return -1

def Get_cookie( cookie_request ):
    cookie = cookielib.CookieJar()
    error_string = "Get_cookie didn't."
    try:
        cookie_response = urllib2.urlopen( cookie_request )
        cookie.extract_cookies( cookie_response, cookie_request )
        return cookie
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            error_string = "Get_cookie got an error: %s" % ( str( e.reason ) )
        elif hasattr( e, 'code' ):
            error_string = "Get_cookie got an error. Code: %s" % ( str( e.code ) )
    print error_string
    return None

def Get_disqus_id( yaml_item ):
    if 'disqus_id' in yaml_item:
        return yaml_item['disqus_id']
    url_get_data = ''
    try:
        f = urllib2.urlopen( yaml_item['link'] )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_disqus_id got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_disqus_id got an error. Code:", e.code, yaml_item['link']
        return url_get_data
    except httplib.BadStatusLine, e:
        print "Get_discus_id got a BadStatusLine:", str( e )
        return url_get_data

tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = start_pos
        while data[ end_pos ] != '"' and end_pos < start_pos + 200:
            end_pos += 1
        if end_pos < start_pos + 200:
            url_get_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' )
            yaml_item['disqus_id'] = url_get_data
#    else:
#        print "Get_disqus_id could not find #comments anchor for", yaml_item['link']
    return url_get_data

def Get_num_disqus_comments( url_string, disqus_id, cookie ):

if cookie == None or disqus_id == '':
        return -1

opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) )
    request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + disqus_id )
    try:
        response = opener.open( request )
        disqus_data = response.read()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_disqus_comments got an error getting the count:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_disqus_comments got an error getting the count. Code:", e.code
        return -1
    disqus_tag_to_find = 'displayCount('
    disqus_offset = disqus_data.find( disqus_tag_to_find )
    if disqus_offset != -1:
        start_pos = disqus_offset + len( disqus_tag_to_find )
        end_pos = disqus_data.find( '}]})', start_pos )
        if end_pos != -1:
            return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] )
    else:
        print "Get_num_disqus_comments found no disqus tag for", url_string
    return -1

def Get_num_retweets_unused( yaml_item ):
    """ TODO: Support for retweents has been removed.
    See: https://twittercommunity.com/t/a-new-design-for-tweet-and-follow-buttons/52791
    So instead, use facebook.
    curl https://graph.facebook.com/fql?q=SELECT%20total_count,comment_count,like_count,share_count%20FROM%20link_stat%20WHERE%20url=%27http://techcrunch.com/2015/11/22/the-real-reason-on-demand-startups-are-reclassifying-workers/?ncid=rss%27
    """
    url_string = yaml_item['link']
    try:
        f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) )
        data = f.read()
        f.close()
    except urllib2.URLError, e:
        if hasattr( e, 'reason' ):
            print "Get_num_retweets got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_retweets got an error. Code:", e.code
        return -1
    tag_to_find = '<span class="c">'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = data.find( '<', start_pos )
        if end_pos != -1:
            try:
                return int( data[ start_pos:end_pos ] )
            except ValueError, e:
                if data[ start_pos:end_pos ] != '?':
                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
                else:
                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
                          ( yaml_item['title'][:20],
                            texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
                          )
    return -1

def Get_num_retweets( yaml_item ):
    """ TODO: Support for retweents has been removed.
    See: https://twittercommunity.com/t/a-new-design-for-tweet-and-follow-buttons/52791
    So instead, use facebook.
    curl https://graph.facebook.com/fql?q=SELECT%20total_count,comment_count,like_count,share_count%20FROM%20link_stat%20WHERE%20url=%27http://techcrunch.com/2015/11/22/the-real-reason-on-demand-startups-are-reclassifying-workers/?ncid=rss%27
    """
    url_string = yaml_item['link']
    try:
        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % \
                             urllib.quote_plus( url_string ) )
        data = f.read()
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine), e:
        if hasattr( e, 'reason' ):
            print "Get_num_retweets got an error:", e.reason
        elif hasattr( e, 'code' ):
            print "Get_num_retweets got an error. Code:", e.code
        else:
            print "Get_num_retweets got an error:", str( e )
        return -1
    tag_to_find = '"count":'
    offset = data.find( tag_to_find )
    if offset != -1:
        start_pos = offset + len( tag_to_find )
        end_pos = data.find( ',', start_pos )
        if end_pos != -1:
            try:
                return int( data[ start_pos:end_pos ] )
            except ValueError, e:
                if data[ start_pos:end_pos ] != '?':
                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
                else:
                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
                          ( yaml_item['title'][:20],
                            texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
                          )
    return -1

def Get_fb_stats( url_string ):
    """ Returns shares and comments """
    shares = -1
    comments = -1
    try:
        f = urllib2.urlopen( 'https://graph.facebook.com/?ids=' + url_string )
        data = f.read()
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine), e:
        if hasattr( e, 'reason' ): # URLError
            print "Get_fb_stats got an error:", e.reason, url_string
        elif hasattr( e, 'code' ): #URLError
            print "Get_fb_stats got an error. Code:", e.code, url_string
        else:
            print "Get_fb_stats got an error:", str( e )
        return -1, -1
    if len( data ) > len( url_string ):
        d = json.loads( data ).values()[0]
        if d.has_key( 'shares' ):
            shares = d['shares']
        else:
            shares = 0
        if d.has_key( 'comments' ):
            comments = d['comments']
        else:
            comments = 0
    else:
        print "Get_fb_stats got too little data for ",  url_string
    return shares, comments

def Save_image( url_string, file_path ):
    try:
        f = urllib2.urlopen( url_string )
        data = f.read()
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine), e:
        if hasattr( e, 'reason' ): # URLError
            print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason
        elif hasattr( e, 'code' ): # URLError
            print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Code:", e.code
        else:
            print "Save_image: Error from urlopen", e
        return url_string

if len( data ) > 50:
        with open( file_path, 'wb' ) as f:
            f.write( data )
        return 'cache/' + os.path.basename( file_path )
    return url_string

def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
    cur_time = int( time.time() )
    new_index_fullpath = os.path.join( localdir, 'index.html_new' )
    index_fullpath = os.path.join( localdir, 'index.html' )
    cache_path = os.path.join( localdir, 'cache' )

files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )

f = file( new_index_fullpath, 'w' )
    f.write( html_head % ( even_background, odd_background ) )

f.write( '<div align="center">\n<table class="legend">\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n' )
    f.write( '<tr><th>Weekday</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2] ) )
    f.write( '<tr><th>Weekend</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2] ) )
    f.write( '</table></div>\n<br />\n' )

f.write( '<div align="center">\n<table>\n' )
    for image_index, image in enumerate(yaml_items[:40]):
        tag_hit = False
        if image['author'].lower() in authors_to_post:
            tag_hit = True
        elif len( set([j.lower() for j in image['tags']]) & tags_to_post ) > 0:
            tag_hit = True
        chart_url = make_chart_url( image['orig_posted'],
                                    image['comment_times'],
                                    image['comments'],
                                    image['retweet_times'],
                                    image['retweets'],
                                    image['qualified'],
                                    image_index % 2,
                                    tag_hit
                                  )
#        if image['title'].startswith( 'Too ' ):
#            print image['title'], image['qualified'], image['retweet_times']
#            print chart_url
        image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
        f.write( '<tr valign="center" class="%s">\n  <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
                 ( image_index % 2 and "even" or "odd",
                   image['link'],
                   image['title'].encode( 'ascii', 'xmlcharrefreplace' ),
                   image['author'].encode( 'ascii', 'xmlcharrefreplace' ),
                 )
               )
        f.write( '  <td>%s<td>\n' % ( image['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
        f.write( '  <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \
                 ( image_url,
                   img_width,
                   img_height
                 )
               )
    f.write( html_footer )
    f.close()
    if os.path.exists( index_fullpath ):
        os.unlink( index_fullpath )
    shutil.move( new_index_fullpath, index_fullpath )
    for fname in files_to_delete:
        os.unlink( fname )

def Make_feed_file( yaml_items ):
    with open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' ) as f:
        f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" )
        f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) )
        count = 0
        for item in yaml_items:
            now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) )
            if item['qualified'] != -1:
                escaped_title = cgi.escape( item['title'] ).encode( 'ascii', 'xmlcharrefreplace' )
                escaped_author = cgi.escape( item['author'] ).encode( 'ascii', 'xmlcharrefreplace' )
                f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \
                         ( escaped_title, now, item['link'], item['link'], escaped_author ) )
                count += 1
                if count > 14:
                    break
        f.write( "</channel></rss>" )

if __name__=='__main__':
    start_time = time.time()
    progress_text = []

old_stdout = sys.stdout
    old_stderr = sys.stderr
    sys.stdout = sys.stderr = StringIO.StringIO()

try:
        localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) )
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199
        #     'tags'                : [ u'Google', u'privacy' ]
        #     'qualified'           : -1
        #     'comment_times'       : [ 1282197199, 1282197407 ]
        #     'comments'            : [ 0, 15 ]
        #     'fb_shares'           : [ 0, 3 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 5 ]
        #     'retweet_times'       : [ 1282197199, 1282197407 ]
        #     'retweets'            : [ 0, 43 ]
        #    },
        #    { ... }
        #  ]
        #
        yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' )
        if os.path.exists( yaml_fullpath ):
            with open( yaml_fullpath, 'rb' ) as f:
                items = yaml.load( f )

# Do any dictionary item updating that might be necessary
#                for item in items:
#                    if not item.has_key( 'fb_shares' ):
#                        item['fb_shares'] = []
        else:
            print "could not open", yaml_fullpath
            items = []

progress_text = [ "read techcrunch.yaml" ]
        process_feed( items )

#
        # If any work was done, then write files.
        #
        if any_entry_added:

weekend_stats, weekday_stats = analysis.Process_retweets_for_feed( items )

# We'll only look at the stats for the time 1:00 to 1:30 after posting.
            weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
            weekend_threshold = weekend_mean + weekend_sigma
            weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
            weekday_threshold = weekday_mean + weekday_sigma
            for item in items:
                wday = time.localtime( item['orig_posted'] ).tm_wday
                if wday == 5 or wday == 6:
                    threshold = weekend_threshold
                else:
                    threshold = weekday_threshold
                if item['qualified'] == -1:
                    for i in range( len( item['retweet_times'] ) ):
                        r_time = item['retweet_times'][i]
                        if r_time - item['orig_posted'] < 5400:
                            if item['retweets'][i] >= threshold:
                                item['qualified'] = threshold
                            if r_time - item['orig_posted'] >= 3600:
                                break

# Automatically add those items whose authors and tags I like
            for item in items:
                if item['qualified'] == -1 and len( item['retweet_times'] ) > 0:
                    if item['author'].lower() in authors_to_post:
                        item['qualified'] = threshold
                    elif len( set([j.lower() for j in item['tags']]) & tags_to_post ) > 0:
                        item['qualified'] = threshold

#
            # Write out the updated yaml file.
            #

# For the one file we really use, write to a file on the side, then move it.
            yaml_newfile_fullpath = os.path.join( localdir, 'techcrunch_temp_writable.yaml' )
            with open( yaml_newfile_fullpath, 'wb' ) as f:
                yaml.dump( items, f, width=120 )
            try:
                os.rename( yaml_newfile_fullpath, yaml_fullpath )
            except OSError as e:
                print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath)
            with open( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' ) as f:
                yaml.dump( items, f, width=120 )
            with codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' ) as f:
                yaml.dump( items, f, encoding='utf-8', width=120 )

Make_feed_file( items )

Make_index_html( items, weekend_stats, weekday_stats )
        else:
            print "No entries were added this time."

except Exception, e:
        exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str(e)
        print exceptional_text, ' '.join( progress_text )
        traceback.print_exc( file = sys.stdout )
        try:
            sendEmail( 'Exception thrown in techcrunch.py',
                       exceptional_text + "\n" + traceback.format_exc(),
                       ( 'david.blume@gmail.com', ) )
        except Exception, e:
            print "Could not send email to notify you of the exception. :("

message = sys.stdout.getvalue()
    sys.stdout = old_stdout
    sys.stderr = old_stderr
    if not debug:
        print message

# Finally, let's save this to a statistics page
    if os.path.exists( os.path.join( localdir, 'stats.txt' ) ):
        with open( os.path.join( localdir, 'stats.txt' )) as f:
            lines = f.readlines()
    else:
        lines = []
    lines = lines[:168] # Just keep the past week's worth
    # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
    status = len( message.strip() ) and '\n                       '.join( message.splitlines() ) or "OK"
    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status ))
    with open( os.path.join( localdir,'stats.txt' ), 'w' ) as f:
        f.writelines( lines )

techcrunch.git

2015-11-23: Resync svn with production site.