David Blume committed on 2018-01-20 20:24:48
Showing 1 changed file with 88 additions and 48 deletions.
@@ -1,4 +1,4 @@
-#!/usr/bin/python2.5
+#!/usr/bin/python
 # chmod 755 me, and make sure I have UNIX style newlines.
 #
 # techcrunch.py
@@ -10,10 +10,11 @@
 # TODO:
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>'
 #    link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
+# 2. Add Reddit counts: curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
 #
 # This file was converted from tabs to spaces with the vim command %retab
 #
-# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; mv techcrunch.yaml_back techcrunch.yaml
+# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml
 #
 
 import feedparser
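A minimal sketch of what TODO item 2 might grow into, in the script's own style. It assumes the `urllib`, `urllib2`, `httplib`, and `json` modules the script already imports, and assumes reddit's usual Listing response shape (`data.children[*].data.score` / `.num_comments`); nothing here is shipped by this commit:

```python
# Hypothetical sketch for TODO 2, not part of this commit.
# Assumes reddit's info.json returns its usual Listing shape.
def Get_reddit_stats( url_string ):
    """ Returns (total score, total comments) for reddit posts of the URL. """
    try:
        f = urllib2.urlopen( 'http://www.reddit.com/api/info.json?url=%s' % \
                             urllib.quote_plus( url_string ) )
        data = json.load( f )
        f.close()
    except (urllib2.URLError, httplib.BadStatusLine), e:
        print "Get_reddit_stats got an error:", str( e )
        return -1, -1
    score = comments = 0
    for child in data['data']['children']:
        score += child['data']['score']
        comments += child['data']['num_comments']
    return score, comments
```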
@@ -35,12 +36,13 @@ import glob
 import smtplib
 import bisect
 import analysis
-import simplejson as json
+import json
 import cookielib
 import xml
 import texttime
 import operator
 from datetime import timedelta
+import cgi
 
 debug = True
 any_entry_added = False
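The `simplejson` to stdlib `json` switch pairs with the shebang change above: `json` entered the standard library in Python 2.6, so the script no longer targets python2.5. If it ever had to run under 2.5 again, the conventional shim is:

```python
try:
    import json                  # standard library since Python 2.6
except ImportError:
    import simplejson as json    # third-party fallback for Python 2.5
```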
@@ -78,7 +80,7 @@ html_footer = """
 </table>
 </div><br />
 <div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>,
-<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> &bull; <a href="stats.txt">status</a></div><br />
+<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> &bull; <a href="stats.txt">status</a><br />© 2011 <a href="http://david.dlma.com">David Blume</a></div><br />
 </BODY>
 </HTML>
 """
@@ -109,6 +111,7 @@ def asciiize( s ):
 def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ):
     """Sends Email"""
     smtp = smtplib.SMTP( 'localhost' )
+    smtp.login( user, passw )
     smtp.sendmail( fromaddr, \
                    toaddrs, \
                    "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \
@@ -248,10 +251,8 @@ def process_feed( yaml_items ): |
248 | 251 |
print "the feed returned feed.status %d." % ( feed.status, ) |
249 | 252 |
else: |
250 | 253 |
# Save off this |
251 |
- if hasattr( feed, 'bozo_exception' ) and type( feed.bozo_exception ) == xml.sax._exceptions.SAXParseException: |
|
252 |
- print "Didn't pickle because of bozo_exception %s." % ( str( feed.bozo_exception ) ) |
|
253 |
- elif hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ): |
|
254 |
- print "Didn't pickle because of bozo_exception instance %s." % ( str( feed.bozo_exception ) ) |
|
254 |
+ if hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ): |
|
255 |
+ print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % ( str( feed.bozo_exception ) ) |
|
255 | 256 |
else: |
256 | 257 |
f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' ) |
257 | 258 |
try: |
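Collapsing the two branches is safe because the first shadowed the second: whenever `type(x) == C` holds, `isinstance(x, C)` holds too, so the old `elif` could never fire. A quick illustration:

```python
# isinstance() is True for exact types and subclasses alike,
# so the old type()-equality branch made the elif unreachable.
class Base( object ): pass
class Derived( Base ): pass

d = Derived()
print type( d ) == Base        # False: type() ignores inheritance
print isinstance( d, Base )    # True
print isinstance( d, Derived ) # True: covers the exact-type case too
```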
@@ -271,12 +272,12 @@ def process_feed( yaml_items ):
             while len( yaml_items ) > 200:
                 yaml_items.pop()
 
-            cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )
+#            cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )
 
             for i in yaml_items:
 #                i['title'] = asciiize( i['title'] )
 #                i['tags'] = map( asciiize, i['tags'] )
-                process_yaml_item( i, cookie )
+                process_yaml_item( i )
 
     else:
         if hasattr(feed, 'bozo_exception'):
@@ -299,10 +300,7 @@ def process_feed( yaml_items ):
             if print_last_line:
                 print "the feed had a URLError %s" % ( str(e), )
         elif isinstance( e, httplib.BadStatusLine ):
-            if hasattr(e, 'message'):
-                print "the feed gave a bad status line %s." % ( str(e.message ), )
-            else:
-                print "the feed gave a bad status line."
+            print "the feed gave a bad status line. (%s)" % ( str(e), )
         else:
             if len( str(e) ):
                 print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) )
@@ -352,6 +350,7 @@ def process_item( feed_item, yaml_items ):
                   'qualified' : -1,
                   'comment_times' : [],
                   'comments' : [],
+                  'fb_shares' : [],
                   'slash_comment_times' : [],
                   'slash_comments' : [],
                   'retweet_times' : [],
@@ -374,17 +373,19 @@ def process_item( feed_item, yaml_items ):
         yaml_item['slash_comment_times'].append( timecode_now )
         yaml_item['slash_comments'].append( int( feed_item.slash_comments ) )
 
-def process_yaml_item( yaml_item, cookie ):
+def process_yaml_item( yaml_item ):
     global any_entry_added
 
     timecode_now = int( time.time() )
     if len( yaml_item['comments'] ) < 8:
-        disqus_id = Get_disqus_id( yaml_item )
-        num_comments = Get_num_disqus_comments( yaml_item['link'], disqus_id, cookie )
+        num_shares, num_comments = Get_fb_stats( yaml_item['link'] )
+#        disqus_id = Get_disqus_id( yaml_item )
+#        num_comments = Get_num_disqus_comments( yaml_item['link'], disqus_id, cookie )
         if num_comments != -1:
             any_entry_added = True
             yaml_item['comment_times'].append( timecode_now )
             yaml_item['comments'].append( num_comments )
+            yaml_item['fb_shares'].append( num_shares )
 
     if len( yaml_item['retweets'] ) < 8:
         num_retweets = Get_num_retweets( yaml_item )
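Because `fb_shares` is appended only in the same branch as `comments`, the lists stay parallel: `comment_times[i]` timestamps both `comments[i]` and the new `fb_shares[i]`. Reading them back together might look like:

```python
# Sketch: walk the parallel sample lists of one yaml_item.
for t, c, s in zip( yaml_item['comment_times'],
                    yaml_item['comments'],
                    yaml_item['fb_shares'] ):
    print time.strftime( '%H:%M', time.localtime( t ) ), c, s
```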
@@ -522,14 +523,17 @@ def Get_num_retweets_unused( yaml_item ):
 def Get_num_retweets( yaml_item ):
     url_string = yaml_item['link']
     try:
-        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % urllib.quote_plus( url_string ) )
+        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % \
+                             urllib.quote_plus( url_string ) )
         data = f.read()
         f.close()
-    except urllib2.URLError, e:
+    except (urllib2.URLError, httplib.BadStatusLine), e:
         if hasattr( e, 'reason' ):
             print "Get_num_retweets got an error:", e.reason
         elif hasattr( e, 'code' ):
             print "Get_num_retweets got an error. Code:", e.code
+        else:
+            print "Get_num_retweets got an error:", str( e )
         return -1
     tag_to_find = '"count":'
     offset = data.find( tag_to_find )
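Since the request passes `callback=twttr.receiveCount`, the response is JSONP rather than bare JSON, which is why the code goes on to scan for `"count":` by hand. A sketch of the alternative, stripping the wrapper and parsing the payload:

```python
# Sketch: turn 'twttr.receiveCount({"count":123,...})' into a dict
# instead of string-searching for the '"count":' tag.
def parse_retweet_count( data ):
    start = data.find( '(' )
    end = data.rfind( ')' )
    if start == -1 or end == -1:
        return -1
    return int( json.loads( data[start + 1:end] ).get( 'count', -1 ) )
```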
@@ -549,6 +553,35 @@ def Get_num_retweets( yaml_item ):
                 )
     return -1
 
+def Get_fb_stats( url_string ):
+    """ Returns shares and comments """
+    shares = -1
+    comments = -1
+    try:
+        f = urllib2.urlopen( 'https://graph.facebook.com/?ids=' + url_string )
+        data = f.read()
+        f.close()
+    except (urllib2.URLError, httplib.BadStatusLine), e:
+        if hasattr( e, 'reason' ): # URLError
+            print "Get_fb_stats got an error:", e.reason, url_string
+        elif hasattr( e, 'code' ): # URLError
+            print "Get_fb_stats got an error. Code:", e.code, url_string
+        else:
+            print "Get_fb_stats got an error:", str( e )
+        return -1, -1
+    if len( data ) > len( url_string ):
+        d = json.loads( data ).values()[0]
+        if d.has_key( 'shares' ):
+            shares = d['shares']
+        else:
+            shares = 0
+        if d.has_key( 'comments' ):
+            comments = d['comments']
+        else:
+            comments = 0
+    else:
+        print "Get_fb_stats got too little data for ", url_string
+    return shares, comments
 
 
 def Save_image( url_string, file_path ):
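For context: judging from the parsing code, `Get_fb_stats` expects the 2011-era Graph API response keyed by the queried URL, roughly `{"<url>": {"id": "...", "shares": N, "comments": M}}`. That shape is inferred here, not documented by the commit, and the endpoint has long since changed. Usage:

```python
# Usage sketch: (-1, -1) on network error, (0, 0) when the Graph API
# reports no share or comment activity for the URL.
shares, comments = Get_fb_stats( 'http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/' )
print shares, comments
```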
@@ -556,12 +589,15 @@ def Save_image( url_string, file_path ):
         f = urllib2.urlopen( url_string )
         data = f.read()
         f.close()
-    except urllib2.URLError, e:
-        if hasattr( e, 'reason' ):
-            print "Save_image got an error:", e.reason
-        elif hasattr( e, 'code' ):
-            print "Save_image got an error. Code:", e.code
+    except (urllib2.URLError, httplib.BadStatusLine), e:
+        if hasattr( e, 'reason' ): # URLError
+            print "Save_image got an error attempting to create", file_path, "Reason:", e.reason
+        elif hasattr( e, 'code' ): # URLError
+            print "Save_image got an error attempting to create", file_path, "Code:", e.code
+        else:
+            print "Save_image got an error from urlopen", e
         return url_string
+
     if len( data ) > 50:
         f = open( file_path, 'wb' )
         f.write( data )
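The `hasattr` ladder exists because the caught exceptions expose different fields: a plain `urllib2.URLError` carries `.reason`, its subclass `urllib2.HTTPError` carries `.code` (despite the `# URLError` comments), and `httplib.BadStatusLine` has neither, which is what the new `else` branch covers:

```python
# The attribute that exists tells you what actually went wrong:
e = urllib2.URLError( 'connection refused' )
print hasattr( e, 'reason' )                        # True
e = httplib.BadStatusLine( '' )
print hasattr( e, 'reason' ), hasattr( e, 'code' )  # False False
```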
@@ -575,7 +611,7 @@ def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
     index_fullpath = os.path.join( localdir, 'index.html' )
     cache_path = os.path.join( localdir, 'cache' )
 
-    files_to_delete = glob.glob( cache_path + '*.png' )
+    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
 
     f = file( new_index_fullpath, 'w' )
     f.write( html_head % ( even_background, odd_background ) )
@@ -588,41 +624,39 @@ def Make_index_html( yaml_items, weekend_stats, weekday_stats ): |
588 | 624 |
|
589 | 625 |
|
590 | 626 |
f.write( '<div align="center">\n<table>\n' ) |
591 |
- image_index = 0 |
|
592 |
- for i in yaml_items[:40]: |
|
627 |
+ for image_index, image in enumerate(yaml_items[:40]): |
|
593 | 628 |
tag_hit = False |
594 |
- if i['author'].lower() in authors_to_post: |
|
629 |
+ if image['author'].lower() in authors_to_post: |
|
595 | 630 |
tag_hit = True |
596 |
- elif len( set([j.lower() for j in i['tags']]) & tags_to_post ) > 0: |
|
631 |
+ elif len( set([j.lower() for j in image['tags']]) & tags_to_post ) > 0: |
|
597 | 632 |
tag_hit = True |
598 |
- chart_url = make_chart_url( i['orig_posted'], |
|
599 |
- i['comment_times'], |
|
600 |
- i['comments'], |
|
601 |
- i['retweet_times'], |
|
602 |
- i['retweets'], |
|
603 |
- i['qualified'], |
|
633 |
+ chart_url = make_chart_url( image['orig_posted'], |
|
634 |
+ image['comment_times'], |
|
635 |
+ image['comments'], |
|
636 |
+ image['retweet_times'], |
|
637 |
+ image['retweets'], |
|
638 |
+ image['qualified'], |
|
604 | 639 |
image_index % 2, |
605 | 640 |
tag_hit |
606 | 641 |
) |
607 |
-# if i['title'].startswith( 'Too ' ): |
|
608 |
-# print i['title'], i['qualified'], i['retweet_times'] |
|
642 |
+# if image['title'].startswith( 'Too ' ): |
|
643 |
+# print image['title'], image['qualified'], image['retweet_times'] |
|
609 | 644 |
# print chart_url |
610 | 645 |
image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) ) |
611 | 646 |
f.write( '<tr valign="center" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \ |
612 | 647 |
( image_index % 2 and "even" or "odd", |
613 |
- i['link'], |
|
614 |
- i['title'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
615 |
- i['author'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
648 |
+ image['link'], |
|
649 |
+ image['title'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
650 |
+ image['author'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
616 | 651 |
) |
617 | 652 |
) |
618 |
- f.write( ' <td>%s<td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) ) |
|
653 |
+ f.write( ' <td>%s<td>\n' % ( image['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) ) |
|
619 | 654 |
f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \ |
620 | 655 |
( image_url, |
621 | 656 |
img_width, |
622 | 657 |
img_height |
623 | 658 |
) |
624 | 659 |
) |
625 |
- image_index += 1 |
|
626 | 660 |
f.write( html_footer ) |
627 | 661 |
f.close() |
628 | 662 |
if os.path.exists( index_fullpath ): |
... | ... |
@@ -634,13 +668,15 @@ def Make_index_html( yaml_items, weekend_stats, weekday_stats ): |
634 | 668 |
def Make_feed_file( yaml_items ): |
635 | 669 |
f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' ) |
636 | 670 |
f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" ) |
637 |
- f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) ) |
|
671 |
+ f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) ) |
|
638 | 672 |
count = 0 |
639 | 673 |
for item in yaml_items: |
640 | 674 |
now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) ) |
641 | 675 |
if item['qualified'] != -1: |
676 |
+ escaped_title = cgi.escape( item['title'] ).encode( 'ascii', 'xmlcharrefreplace' ) |
|
677 |
+ escaped_author = cgi.escape( item['author'] ).encode( 'ascii', 'xmlcharrefreplace' ) |
|
642 | 678 |
f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ |
643 |
- ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ), now, item['link'], item['link'], item['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) ) |
|
679 |
+ ( escaped_title, now, item['link'], item['link'], escaped_author ) ) |
|
644 | 680 |
count += 1 |
645 | 681 |
if count > 14: |
646 | 682 |
break |
... | ... |
@@ -668,10 +704,9 @@ if __name__=='__main__': |
668 | 704 |
# 'qualified' : -1 |
669 | 705 |
# 'comment_times' : [ 1282197199, 1282197407 ] |
670 | 706 |
# 'comments' : [ 0, 15 ] |
707 |
+ # 'fb_shares' : [ 0, 3 ] |
|
671 | 708 |
# 'slash_comment_times' : [ 1282197199, 1282197407 ] |
672 | 709 |
# 'slash_comments' : [ 0, 5 ] |
673 |
- # 'slash_comment_times' : [ 1282197199, 1282197407 ] |
|
674 |
- # 'slash_comments' : [ 0, 3 ] |
|
675 | 710 |
# 'retweet_times' : [ 1282197199, 1282197407 ] |
676 | 711 |
# 'retweets' : [ 0, 43 ] |
677 | 712 |
# }, |
@@ -682,6 +717,11 @@ if __name__=='__main__':
     if os.path.exists( yaml_fullpath ):
         f = file( yaml_fullpath, 'rb' )
         items = yaml.load( f )
+
+        # Do any dictionary item updating that might be necessary
+#        for item in items:
+#            if not item.has_key( 'fb_shares' ):
+#                item['fb_shares'] = []
         f.close()
     else:
         print "could not open", yaml_fullpath
@@ -754,7 +794,7 @@ if __name__=='__main__':
         traceback.print_exc( file = sys.stdout )
         try:
             sendEmail( 'Exception thrown in techcrunch.py',
-                       exceptional_text,
+                       exceptional_text + "\n" + traceback.format_exc(),
                        ( 'david.blume@gmail.com', ) )
         except Exception, e:
             print "Could not send email to notify you of the exception. :("
@@ -777,7 +817,7 @@ if __name__=='__main__':
     lines = lines[:168] # Just keep the past week's worth
     # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
     status = len( message.strip() ) and '\n '.join( message.splitlines() ) or "OK"
-    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ))
+    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status ))
     f = open( os.path.join( localdir,'stats.txt' ), 'w' )
     f.writelines( lines )
     f.close()
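Putting the date fields first means new lines in stats.txt sort chronologically as plain strings:

```python
print time.strftime( '%Y-%m-%d, %H:%M', time.localtime() )   # e.g. 2018-01-20, 20:24
```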