#!/usr/bin/env python
#
# TODO:
# 1. Deep links: 'Read the rest of this entry »'
# link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
# 2. Add Reddit counts: curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
#
# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml
#
import feedparser
import yaml
import sys
import os
import time
import StringIO
import codecs
import traceback
import calendar
import pickle
import exceptions
import urllib
import urllib2
import httplib
import shutil
import glob
import smtplib
import analysis
import json
import xml.sax
import texttime
import operator
from datetime import timedelta
import cgi
debug = True
any_entry_added = False
tags_to_post = set(['apple', 'google'])
authors_to_post = ['michael arrington',]
# TODO 2018-01-18: Maybe combine fb_likes with fb_shares or something...
rhs_metric = 'fb_likes'
rhs_metric_times = 'comment_times'
localdir = ''
html_head = """
TechCrunch Feed Filter
TechCrunch Feed Filter
This page shows what analysis is done to filter the noise away from the Techcrunch feed into a more concise feed. Learn more about the Feed Filter.
"""
html_footer = """
"""
img_width = 300
img_height = 50
series_1_color = "0000FF"
series_2_color = "00AA00"
threshold_color = "FF8C00"
tag_color = "F01000"
even_background = "F8F8F8"
odd_background = "E8E8E8"
even_watermark = "E0E0FF"
odd_watermark = "D0D0F0"
def asciiize(s):
try:
return s.encode('ascii')
except UnicodeEncodeError as e:
return s
except exceptions.AttributeError as e:
return s
def sendEmail(subject, message, toaddrs, fromaddr='"techcrunch.py" '):
    """Send a plain-text notification email to toaddrs."""
    # user and passw are not defined in this file; they must be supplied elsewhere.
    smtp = smtplib.SMTP('mail.dlma.com', port=587)
    smtp.login(user, passw)
    smtp.sendmail(fromaddr,
                  toaddrs,
                  "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" %
                  (fromaddr, ", ".join(toaddrs), subject, message))
    smtp.quit()
def index_id(a_list, op, elem):
    """Return the index of the first item in a_list for which op(item, elem) is true, or -1."""
    try:
        return (index for index, item in enumerate(a_list) if op(item, elem)).next()
    except StopIteration:
        return -1
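# For example, index_id([3, 5, 9], operator.ge, 5) returns 1, the index of the
# first element >= 5, and index_id([3, 5, 9], operator.ge, 10) returns -1.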
def make_chart_url(time_posted, lhs_times, lhs_values, rhs_times,
rhs_values, threshold_value, is_odd_row, tag_hit):
# lhs_times, lhs_values = zip(*comments)
# rhs_times, rhs_values = zip(*rhs)
# TODO handle failure cases, -1
if not len(lhs_times):
lhs_times = [time_posted,]
if not len(lhs_values):
lhs_values = [0,]
if not len(rhs_times):
rhs_times = [time_posted,]
if not len(rhs_values):
rhs_values = [0,]
# lhs_times = [(i - time_posted + 900) / 1800 for i in lhs_times]
# rhs_times = [(i - time_posted + 900) / 1800 for i in rhs_times]
lhs_times = [(i - time_posted) / 1800 for i in lhs_times]
rhs_times = [(i - time_posted) / 1800 for i in rhs_times]
min_comment_time = min(lhs_times)
max_comment_time = max(lhs_times)
min_comment_value = min(lhs_values)
max_comment_value = max(lhs_values)
min_rhs_time = min(rhs_times)
max_rhs_time = max(rhs_times)
min_rhs_value = min(rhs_values)
max_rhs_value = max(rhs_values)
met_threshold_pt = -1
if threshold_value != -1:
met_threshold_pt = index_id(rhs_values, operator.ge, threshold_value)
if met_threshold_pt == -1 or tag_hit:
# This can happen if threshold_value was set to a number
# because the author or a tag was matched, but the article
# was unpopular. We choose to put a marker at point index 0.
met_threshold_pt = 0
if is_odd_row != 0:
bg_color = even_background
watermark_color = even_watermark
else:
bg_color = odd_background
watermark_color = odd_watermark
if len(lhs_values) < 8 and len(lhs_values) > 1:
# max_comment_value *= 2
pass
elif len(lhs_values) == 1:
min_comment_value = 0
if len(rhs_values) < 8 and len(rhs_values) > 1:
# max_rhs_value *= 2
pass
elif len(rhs_values) == 1:
min_rhs_value = 0
min_comment_value = 0
min_rhs_value = 0
chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \
(series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color)
chart_url += "&chd=t:%s|%s|%s|%s" % (','.join([str(n) for n in lhs_times]),
','.join([str(n) for n in lhs_values]),
','.join([str(n) for n in rhs_times]),
','.join([str(n) for n in rhs_values]))
# TODO: Consider watermark levels, like:
# chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5
if max_rhs_value > 0:
threshold_percent = max(0, min((float(threshold_value) / max_rhs_value) - 0.01, 1.0))
else:
threshold_percent = 1.0
chart_url += "&chm=r,%s,0,0,%1.3f" % (watermark_color, threshold_percent)
if met_threshold_pt != -1:
if tag_hit:
dot_color = tag_color
dot_shape = 'd'
else:
dot_color = threshold_color
dot_shape = 'o'
chart_url += "|%s,%s,1,%d,10" % (dot_shape, dot_color, met_threshold_pt)
chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
(min_comment_value, max_comment_value, min_rhs_value, max_rhs_value,
0, max(7, max_comment_time),
min_comment_value, max_comment_value,
0, max(7, max_rhs_time),
min_comment_value, max_rhs_value)
chart_url += "&chf=bg,s,%s&chdl=comments|shares" % (bg_color,)
return chart_url
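# A hypothetical example (values invented for illustration): for an article
# posted at time t0,
#   make_chart_url(t0, [t0, t0 + 1800, t0 + 3600], [0, 3, 9],
#                  [t0, t0 + 1800, t0 + 3600], [0, 40, 90],
#                  75, 0, False)
# maps both series onto half-hour ticks (0, 1, 2), shades the watermark band
# from the bottom of the chart up to roughly the threshold of 75 on the
# right-hand axis, and drops an orange dot at the first right-hand sample that
# met the threshold (index 2 here).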
def process_feed(yaml_items):
"""
Retrieve the url and process it.
feed_info (in, out) A tuple that describes an individual feed, like its name and etag.
"""
feed = feedparser.parse('http://feeds.feedburner.com/TechCrunch')
if hasattr(feed, 'status'):
if feed.status == 304:
pass
else:
feed_is_modified = True
if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
if feed.status == 503:
print "the feed is temporarily unavailable."
elif feed.status == 400:
print "the feed says we made a bad request."
elif feed.status == 502:
print "the feed reported a bad gateway error."
elif feed.status == 404:
print "the feed says the page was not found."
elif feed.status == 500:
print "the feed had an internal server error."
elif feed.status == 403:
print "Access to the feed was forbidden."
else:
print "the feed returned feed.status %d." % ( feed.status, )
else:
            # Save off this feed to a pickle file.
if hasattr(feed, 'bozo_exception') and isinstance(feed.bozo_exception, xml.sax._exceptions.SAXParseException):
print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception))
else:
try:
with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f:
pickle.dump(feed, f)
except(pickle.PicklingError, exceptions.TypeError) as e:
print "An error occurred while pickling the feed: %s." % \
(# str(e.__class__),
str(e))
traceback.print_exc(3, file=sys.stdout)
feed_is_modified = False
for i in reversed(feed.entries):
process_item(i, yaml_items)
# If we have more than 200 items, remove the old ones.
while len(yaml_items) > 200:
yaml_items.pop()
for i in yaml_items:
# i['title'] = asciiize(i['title'])
# i['tags'] = map(asciiize, i['tags'])
process_yaml_item(i)
else:
if hasattr(feed, 'bozo_exception'):
e = feed.bozo_exception
if isinstance(e, urllib2.URLError):
print_last_line = True
if hasattr(e, 'reason'):
if e.reason[0] == 110:
print "the feed's connection timed out."
print_last_line = False
elif e.reason[0] == 111:
print "the feed's connection was refused."
print_last_line = False
elif e.reason[0] == 104:
print "the feed reset the connection."
print_last_line = False
else:
print "the feed had a URLError with reason %s." % (str(e.reason),)
print_last_line = False
if print_last_line:
print "the feed had a URLError %s" % (str(e),)
elif isinstance(e, httplib.BadStatusLine):
print "the feed gave a bad status line. (%s)" % (str(e),)
else:
if len(str(e)):
print "the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e))
else:
print "the feed bozo_exception: %s %s" % (str(e.__class__), repr(e))
else:
print "the feed returned class %s, %s" % (str(feed.__class__), str(feed))
def process_item(feed_item, yaml_items):
"""Processes an RSS feed item, and converts it to a YAML item"""
# Get the time
global any_entry_added
timecode_now = int(time.time())
date_parsed = time.gmtime()
if hasattr(feed_item, 'issued_parsed'):
date_parsed = feed_item.issued_parsed
date_set = True
elif hasattr(feed_item, 'date_parsed'):
date_parsed = feed_item.date_parsed
date_set = True
else:
print "process_item found no timestamp for", asciiize(feed_item.link)
timecode_parsed = calendar.timegm(date_parsed)
link = feed_item.link
if hasattr(feed_item, 'feedburner_origlink'):
link = feed_item.feedburner_origlink
# TODO 2018-01-18: Leave in the ncid for URL clicks, but remove during processing.
# suffix_to_remove = '?ncid=rss'
# if link.endswith(suffix_to_remove):
# link = link[:-len(suffix_to_remove)]
# Look for i.feedburner_origlink in yaml_items
yaml_item = None
for i in yaml_items:
if link == i['link']:
yaml_item = i
break
if yaml_item is None:
author = ''
if hasattr(feed_item, 'author'):
author = asciiize(feed_item.author)
# Make a new yaml_item
yaml_item = {'title' : asciiize(feed_item.title),
'link' : asciiize(link),
'author' : author,
'tags' : [],
'orig_posted' : timecode_parsed,
'qualified' : -1,
'comment_times' : [],
'fb_comments' : [],
'fb_shares' : [],
'fb_likes' : [],
'slash_comment_times' : [],
'slash_comments' : []
}
if hasattr(feed_item, 'tags'):
for i in feed_item.tags:
yaml_item['tags'].append(asciiize(i.term))
yaml_items.insert(0, yaml_item)
any_entry_added = True
    # Don't record new data points for items that are more than 4.5 hours old.
if timecode_parsed < timecode_now - 60 * 30 * 9:
return
# Now, add the new values
if hasattr(feed_item, 'slash_comments') and len(yaml_item['slash_comments']) < 8:
any_entry_added = True
yaml_item['slash_comment_times'].append(timecode_now)
yaml_item['slash_comments'].append(int(feed_item.slash_comments))
def process_yaml_item(yaml_item):
global any_entry_added
# Related to TODO 2018-01-18: Remove ncid only during processing.
link = yaml_item['link']
suffix_to_remove = '?ncid=rss'
# Maybe we should find() it instead, in case feedburner adds other options
if link.endswith(suffix_to_remove):
link = link[:-len(suffix_to_remove)]
timecode_now = int(time.time())
if len(yaml_item['fb_comments']) < 8:
num_shares, num_comments, num_likes = Get_fb_stats(link)
if num_comments != -1:
any_entry_added = True
yaml_item['comment_times'].append(timecode_now)
yaml_item['fb_shares'].append(num_shares)
yaml_item['fb_comments'].append(num_comments)
yaml_item['fb_likes'].append(num_likes)
# if len(yaml_item['reddit_']) < 8:
# num_ = Get_reddit_stats(link)
# if num_ != -1:
# any_entry_added = True
# yaml_item['reddit_times'].append(timecode_now)
# yaml_item['reddit_'].append(num_)
def Get_reddit_stats(url_string):
""" Consider curl "https://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
"""
return -1
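# A minimal, untested sketch of what a Reddit lookup could look like, using the
# info.json endpoint mentioned above. The helper name and the (-1, -1) error
# sentinel are assumptions; the response fields (data, children, score,
# num_comments) are what that endpoint returns.
def Get_reddit_stats_sketch(url_string):
    """Return (score, num_comments) for the first reddit submission of url_string, or (-1, -1)."""
    try:
        req = urllib2.Request('https://www.reddit.com/api/info.json?%s' % (urllib.urlencode({'url': url_string}),),
                              headers={'User-Agent': 'techcrunch.py feed filter'})
        f = urllib2.urlopen(req)
        data = json.loads(f.read())
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine, ValueError) as e:
        print "Get_reddit_stats_sketch got an error:", str(e)
        return -1, -1
    children = data.get('data', {}).get('children', [])
    if not children:
        return -1, -1
    post = children[0]['data']
    return post.get('score', -1), post.get('num_comments', -1)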
def Get_fb_stats(url_string):
""" There are apparently two pretty good ways to do this. One, with FQL, querying for the parameters you want,
and two, with URL id. They go like this:
FQL:
u = urllib.quote_plus(url_string)
urllib2.urlopen('https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27' % (u))
URL ID:
u = urllib.quote_plus(url_string)
with open('facebook-token.txt', 'r') as f:
token = f.read()
encoded = urllib.urlencode({'access_token': token})
        urllib2.urlopen('https://graph.facebook.com/vX.Y/?id=%s&%s' % (u, encoded))
"""
shares = -1
comments = -1
likes = -1
url_string = url_string.encode('utf-8')
try:
encoded = urllib.urlencode({'access_token': facebook_token})
# url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27&%s'
# This stopped working 2018-01-13, 11:25, after I told Facebook the app would use v2.11
# https://developers.facebook.com/docs/graph-api/changelog/version2.9#gapi-deprecate
# url = 'https://graph.facebook.com/v2.8/?id=%s&fields=og_object{engagement},share&%s'
# Consider the following for a different engagement field:
# "engagement": {
# "reaction_count": 115,
# "comment_count": 0,
# "share_count": 102,
# "comment_plugin_count": 0
# },
# Where reaction_count + share_count = og_object.engagement.count
url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s'
f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded))
data = f.read()
f.close()
except (urllib2.URLError, httplib.BadStatusLine) as e:
if hasattr(e, 'reason'): # URLError
if hasattr(e, 'code'):
print "Get_fb_stats got an error (1):", e.code, e.reason, url_string
else:
print "Get_fb_stats got an error (2):", e.reason, url_string
elif hasattr(e, 'code'): #URLError
print "Get_fb_stats got an error. Code:", e.code, url_string
else:
print "Get_fb_stats got an error (3):", str(e)
return shares, comments, likes
except KeyError as e:
print "Get_fb_stats got a key error 1e (%s)" % (str(e), )
print "Get_fb_stats got a key error 2.2 enc (%s)" % (encoded, )
print url_string.encode('utf-8')
print u"Get_fb_stats got a key error 2.1 url (%s)" % (url_string, )
print "Get_fb_stats got a key error 3q (%s)" % (urllib.quote_plus(url_string))
print "Get_fb_stats got a key error 4 (%s)" % (url % (urllib.quote_plus(url_string), encoded))
print "Get_fb_stats got a key error 5 (%s) for url %s:" % (str(e), url % (urllib.quote_plus(url_string), encoded))
return shares, comments, likes
if len(data) > 20:
d = json.loads(data)['engagement']
try:
shares = d['share_count']
except KeyError:
shares = 0
try:
likes = d['reaction_count']
except KeyError:
likes = 0
# TODO 2018-01-18: og_object metric was likes + shares + comments
# Here we'll combine likes and shares, and comments with plugin_comments
likes += shares
try:
comments = d['comment_plugin_count'] + d['comment_count']
except KeyError:
comments = 0
else:
print "Get_fb_stats got too little data for ", url_string
return shares, comments, likes
def Save_image(url_string, file_path):
try:
f = urllib2.urlopen(url_string)
data = f.read()
f.close()
except (urllib2.URLError, httplib.BadStatusLine) as e:
if hasattr(e, 'reason'): # URLError
print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason
elif hasattr(e, 'code'): # URLError
print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Code:", e.code
else:
print "Save_image: Error from urlopen", e
return url_string
if len(data) > 50:
with open(file_path, 'wb') as f:
f.write(data)
return 'cache/' + os.path.basename(file_path)
return url_string
def Make_index_html(yaml_items, weekend_stats, weekday_stats):
cur_time = int(time.time())
new_index_fullpath = os.path.join(localdir, 'index.html_new')
index_fullpath = os.path.join(localdir, 'index.html')
cache_path = os.path.join(localdir, 'cache')
files_to_delete = glob.glob(os.path.join(cache_path, '*.png'))
with codecs.open(new_index_fullpath, 'w', 'utf-8') as f:
f.write(html_head % (even_background, odd_background))
        f.write('<table>\n')
        f.write('<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n')
        f.write('<tr><td>Weekday</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % \
                (weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2]))
        f.write('<tr><td>Weekend</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % \
                (weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2]))
        f.write('</table>\n')
        f.write('<table>\n')
for image_index, image in enumerate(yaml_items[:40]):
tag_hit = False
if image['author'].lower() in authors_to_post:
tag_hit = True
elif len(set([j.lower() for j in image['tags']]) & tags_to_post) > 0:
tag_hit = True
chart_url = make_chart_url(image['orig_posted'],
image['comment_times'],
image['fb_comments'],
image[rhs_metric_times],
image[rhs_metric],
image['qualified'],
image_index % 2,
tag_hit
)
# if image['title'].startswith( 'Too ' ):
# print image['title'], image['qualified'], image['rhs_times']
# print chart_url
image_url = Save_image(chart_url, os.path.join(cache_path, '%d_%d.png' % (cur_time, image_index)))
            f.write('<tr class="%s">\n  <td><a href="%s">%s</a> by %s</td>\n' % \
                    (image_index % 2 and "even" or "odd",
                     image['link'],
                     image['title'].encode('ascii', 'xmlcharrefreplace'),
                     image['author'].encode('ascii', 'xmlcharrefreplace'),
                    )
                   )
            f.write('  <td>%s</td>\n' % (image['qualified'] != -1 and ' ' or ''))
            f.write('  <td><img src="%s" width="%d" height="%d" /></td>\n</tr>\n' % \
                    (image_url,
                     img_width,
                     img_height
                    )
                   )
f.write(html_footer)
if os.path.exists(index_fullpath):
os.unlink(index_fullpath)
shutil.move(new_index_fullpath, index_fullpath)
for fname in files_to_delete:
os.unlink(fname)
def Make_feed_file(yaml_items):
with codecs.open(os.path.join(localdir, 'rss_feed.xml'), 'wb', 'utf-8') as f:
f.write("\n\n\nTrending at TechCrunchhttp://techcrunch.dlma.com")
f.write("%sAutomatically Generated Feeden-us\n" % (time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
count = 0
for item in yaml_items:
now = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(item['orig_posted']))
if item['qualified'] != -1:
escaped_title = cgi.escape(item['title']).encode('ascii', 'xmlcharrefreplace')
escaped_author = cgi.escape(item['author']).encode('ascii', 'xmlcharrefreplace')
f.write("- %s%s%s%s
\n" % \
(escaped_title, now, item['link'], item['link'], escaped_author))
count += 1
if count > 14:
break
f.write("")
if __name__=='__main__':
start_time = time.time()
progress_text = []
old_stdout = sys.stdout
old_stderr = sys.stderr
sys.stdout = sys.stderr = StringIO.StringIO()
try:
localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
#
# Read in techcrunch.yaml
#
# [ { 'title' : 'Title Text',
# 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
# 'author' : u'MG Siegler',
# 'orig_posted' : 1282197199
# 'tags' : [ u'Google', u'privacy' ]
# 'qualified' : -1
# 'comment_times' : [ 1282197199, 1282197407 ]
# 'fb_comments' : [ 0, 5 ]
# 'fb_shares' : [ 0, 300 ]
# 'fb_likes' : [ 0, 19 ]
# 'slash_comment_times' : [ 1282197199, 1282197407 ]
# 'slash_comments' : [ 0, 5 ]
# },
# { ... }
# ]
#
yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
if os.path.exists(yaml_fullpath):
with open(yaml_fullpath, 'rb') as f:
items = yaml.load(f)
if items is None:
print yaml_fullpath, "exists, but was empty."
items = []
# Do any dictionary item updating that might be necessary
# for item in items:
# if not item.has_key('fb_shares'):
# item['fb_shares'] = []
else:
print "could not open", yaml_fullpath
items = []
with open('facebook-token.txt', 'r') as f:
facebook_token = f.read()
progress_text = ["read techcrunch.yaml"]
process_feed(items)
#
# If any work was done, then write files.
#
if any_entry_added:
weekend_stats, weekday_stats = analysis.Process_feed(items, rhs_metric, rhs_metric_times)
# We'll only look at the stats for the time 1:00 to 1:30 after posting.
weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
weekend_threshold = weekend_mean + weekend_sigma
weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
weekday_threshold = weekday_mean + weekday_sigma
for item in items:
wday = time.localtime(item['orig_posted']).tm_wday
if wday == 5 or wday == 6:
threshold = weekend_threshold
else:
threshold = weekday_threshold
if item['qualified'] == -1:
for i in range(len(item[rhs_metric_times])):
r_time = item[rhs_metric_times][i]
if r_time - item['orig_posted'] < 5400:
if item[rhs_metric][i] >= threshold:
item['qualified'] = threshold
if r_time - item['orig_posted'] >= 3600:
break
            # Automatically add those items whose authors and tags I like
            for item in items:
                if item['qualified'] == -1 and len(item[rhs_metric_times]) > 0:
                    wday = time.localtime(item['orig_posted']).tm_wday
                    threshold = weekend_threshold if wday in (5, 6) else weekday_threshold
                    if item['author'].lower() in authors_to_post:
                        item['qualified'] = threshold
                    elif len(set([j.lower() for j in item['tags']]) & tags_to_post) > 0:
                        item['qualified'] = threshold
#
# Write out the updated yaml file.
#
# For the one file we really use, write to a file on the side, then move it.
yaml_newfile_fullpath = os.path.join(localdir, 'techcrunch_temp_writable.yaml')
with open(yaml_newfile_fullpath, 'wb') as f:
yaml.dump(items, f, width=120)
try:
os.rename(yaml_newfile_fullpath, yaml_fullpath)
except OSError as e:
print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath)
with open(os.path.join(localdir, 'techcrunch_text.yaml'), 'w') as f:
yaml.dump(items, f, width=120)
with codecs.open(os.path.join(localdir, 'techcrunch_unicode.yaml'), 'w', 'utf-8') as f:
yaml.dump(items, f, encoding='utf-8', width=120)
Make_feed_file(items)
Make_index_html(items, weekend_stats, weekday_stats)
else:
print "No entries were added this time."
except Exception as e:
exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
print exceptional_text, ' '.join(progress_text)
traceback.print_exc(file=sys.stdout)
try:
sendEmail('Exception thrown in techcrunch.py',
exceptional_text + "\n" + traceback.format_exc(),
('david.blume@gmail.com',))
except Exception as e:
print "Could not send email to notify you of the exception. :("
message = sys.stdout.getvalue()
sys.stdout = old_stdout
sys.stderr = old_stderr
if not debug:
print message
# Finally, let's save this to a statistics page
if os.path.exists(os.path.join(localdir, 'stats.txt')):
with open(os.path.join(localdir, 'stats.txt')) as f:
lines = f.readlines()
else:
lines = []
lines = lines[:672] # Just keep the past week's worth
# status = len(message.strip()) and message.strip().replace('\n', ' - ') or "OK"
status = len(message.strip()) and '\n '.join( message.splitlines()) or "OK"
lines.insert(0, "%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status))
with open(os.path.join(localdir,'stats.txt' ), 'w') as f:
f.writelines(lines)