2013-08-04: Miscellaneous changes to techcrunch.py
David Blume

David Blume committed on 2018-01-20 20:24:48
Showing 1 changed file, with 88 additions and 48 deletions.

... ...
@@ -1,4 +1,4 @@
1
-#!/usr/bin/python2.5
1
+#!/usr/bin/python
2 2
 # chmod 755 me, and make sure I have UNIX style newlines.
3 3
 #
4 4
 # techcrunch.py
... ...
@@ -10,10 +10,11 @@
10 10
 # TODO:
11 11
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry &raquo;</a>'
12 12
 #   link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
13
+# 2. Add Reddit counts: curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
13 14
 #
14 15
 # This file was coverted from tabs to spaces with the vim command %retab
15 16
 #
16
-# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; mv techcrunch.yaml_back techcrunch.yaml
17
+# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; cp techcrunch.yaml techcrunch_tmp.yaml; cp techcrunch.yaml_back techcrunch.yaml
17 18
 #
18 19
 
19 20
 import feedparser
... ...
@@ -35,12 +36,13 @@ import glob
35 36
 import smtplib
36 37
 import bisect
37 38
 import analysis
38
-import simplejson as json
39
+import json
39 40
 import cookielib
40 41
 import xml
41 42
 import texttime
42 43
 import operator
43 44
 from datetime import timedelta
45
+import cgi
44 46
 
45 47
 debug = True
46 48
 any_entry_added = False
... ...
@@ -78,7 +80,7 @@ html_footer = """
78 80
 </table>
79 81
 </div><br />
80 82
 <div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>,
81
-<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> &bull; <a href="stats.txt">status</a></div><br />
83
+<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> &bull; <a href="stats.txt">status</a><br />&copy; 2011 <a href="http://david.dlma.com">David Blume</a></div><br />
82 84
 </BODY>
83 85
 </HTML>
84 86
 """
... ...
@@ -109,6 +111,7 @@ def asciiize( s ):
109 111
 def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ):
110 112
     """Sends Email"""
111 113
     smtp = smtplib.SMTP( 'localhost' )
114
+    smtp.login( user, passw )
112 115
     smtp.sendmail( fromaddr, \
113 116
                    toaddrs, \
114 117
                    "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \
... ...
@@ -248,10 +251,8 @@ def process_feed( yaml_items ):
248 251
                     print "the feed returned feed.status %d." % ( feed.status, )
249 252
             else:
250 253
                 # Save off this
251
-                if hasattr( feed, 'bozo_exception' ) and type( feed.bozo_exception ) == xml.sax._exceptions.SAXParseException:
252
-                    print "Didn't pickle because of bozo_exception %s." % ( str( feed.bozo_exception ) )
253
-                elif hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
254
-                    print "Didn't pickle because of bozo_exception instance %s." % ( str( feed.bozo_exception ) )
254
+                if hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
255
+                    print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % ( str( feed.bozo_exception ) )
255 256
                 else:
256 257
                     f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
257 258
                     try:
... ...
@@ -271,12 +272,12 @@ def process_feed( yaml_items ):
271 272
             while len( yaml_items ) > 200:
272 273
                 yaml_items.pop()
273 274
 
274
-            cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )
275
+#            cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )
275 276
 
276 277
             for i in yaml_items:
277 278
                 # i['title'] = asciiize( i['title'] )
278 279
                 # i['tags'] = map( asciiize, i['tags'] )
279
-                process_yaml_item( i, cookie )
280
+                process_yaml_item( i )
280 281
 
281 282
     else:
282 283
         if hasattr(feed, 'bozo_exception'):
... ...
@@ -299,10 +300,7 @@ def process_feed( yaml_items ):
299 300
                 if print_last_line:
300 301
                     print "the feed had a URLError %s" % ( str(e), )
301 302
             elif isinstance( e, httplib.BadStatusLine ):
302
-                if hasattr(e, 'message'):
303
-                    print "the feed gave a bad status line %s." % ( str(e.message ), )
304
-                else:
305
-                    print "the feed gave a bad status line."
303
+                print "the feed gave a bad status line. (%s)" % ( str(e), )
306 304
             else:
307 305
                 if len( str(e) ):
308 306
                     print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) )
... ...
@@ -352,6 +350,7 @@ def process_item( feed_item, yaml_items ):
352 350
                       'qualified'           : -1,
353 351
                       'comment_times'       : [],
354 352
                       'comments'            : [],
353
+                      'fb_shares'           : [],
355 354
                       'slash_comment_times' : [],
356 355
                       'slash_comments'      : [],
357 356
                       'retweet_times'       : [],
... ...
@@ -374,17 +373,19 @@ def process_item( feed_item, yaml_items ):
374 373
         yaml_item['slash_comment_times'].append( timecode_now )
375 374
         yaml_item['slash_comments'].append( int( feed_item.slash_comments ) )
376 375
 
377
-def process_yaml_item( yaml_item, cookie ):
376
+def process_yaml_item( yaml_item ):
378 377
     global any_entry_added
379 378
 
380 379
     timecode_now = int( time.time() )
381 380
     if len( yaml_item['comments'] ) < 8:
382
-        disqus_id = Get_disqus_id( yaml_item )
383
-        num_comments = Get_num_disqus_comments( yaml_item['link'], disqus_id, cookie )
381
+        num_shares, num_comments = Get_fb_stats( yaml_item['link'] )
382
+#        disqus_id = Get_disqus_id( yaml_item )
383
+#        num_comments = Get_num_disqus_comments( yaml_item['link'], disqus_id, cookie )
384 384
         if num_comments != -1:
385 385
             any_entry_added = True
386 386
             yaml_item['comment_times'].append( timecode_now )
387 387
             yaml_item['comments'].append( num_comments )
388
+            yaml_item['fb_shares'].append( num_shares )
388 389
 
389 390
     if len( yaml_item['retweets'] ) < 8:
390 391
         num_retweets = Get_num_retweets( yaml_item )
... ...
@@ -522,14 +523,17 @@ def Get_num_retweets_unused( yaml_item ):
522 523
 def Get_num_retweets( yaml_item ):
523 524
     url_string = yaml_item['link']
524 525
     try:
525
-        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % urllib.quote_plus( url_string ) )
526
+        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % \
527
+                             urllib.quote_plus( url_string ) )
526 528
         data = f.read()
527 529
         f.close()
528
-    except urllib2.URLError, e:
530
+    except (urllib2.URLError, httplib.BadStatusLine), e:
529 531
         if hasattr( e, 'reason' ):
530 532
             print "Get_num_retweets got an error:", e.reason
531 533
         elif hasattr( e, 'code' ):
532 534
             print "Get_num_retweets got an error. Code:", e.code
535
+        else:
536
+            print "Get_num_retweets got an error:", str( e )
533 537
         return -1
534 538
     tag_to_find = '"count":'
535 539
     offset = data.find( tag_to_find )
... ...
@@ -549,6 +553,35 @@ def Get_num_retweets( yaml_item ):
549 553
                           )
550 554
     return -1
551 555
 
556
+def Get_fb_stats( url_string ):
557
+    """ Returns shares and comments """
558
+    shares = -1
559
+    comments = -1
560
+    try:
561
+        f = urllib2.urlopen( 'https://graph.facebook.com/?ids=' + url_string )
562
+        data = f.read()
563
+        f.close()
564
+    except (urllib2.URLError, httplib.BadStatusLine), e:
565
+        if hasattr( e, 'reason' ): # URLError
566
+            print "Get_fb_stats got an error:", e.reason, url_string
567
+        elif hasattr( e, 'code' ): #URLError
568
+            print "Get_fb_stats got an error. Code:", e.code, url_string
569
+        else:
570
+            print "Get_fb_stats got an error:", str( e )
571
+        return -1, -1
572
+    if len( data ) > len( url_string ):
573
+        d = json.loads( data ).values()[0]
574
+        if d.has_key( 'shares' ):
575
+            shares = d['shares']
576
+        else:
577
+            shares = 0
578
+        if d.has_key( 'comments' ):
579
+            comments = d['comments']
580
+        else:
581
+            comments = 0
582
+    else:
583
+        print "Get_fb_stats got too little data for ",  url_string
584
+    return shares, comments
552 585
 
553 586
 
554 587
 def Save_image( url_string, file_path ):
... ...
@@ -556,12 +589,15 @@ def Save_image( url_string, file_path ):
556 589
         f = urllib2.urlopen( url_string )
557 590
         data = f.read()
558 591
         f.close()
559
-    except urllib2.URLError, e:
560
-        if hasattr( e, 'reason' ):
561
-            print "Save_image got an error:", e.reason
562
-        elif hasattr( e, 'code' ):
563
-            print "Save_image got an error. Code:", e.code
592
+    except (urllib2.URLError, httplib.BadStatusLine), e:
593
+        if hasattr( e, 'reason' ): # URLError
594
+            print "Save_image got an error attempting to create", file_path, "Reason:", e.reason
595
+        elif hasattr( e, 'code' ): # URLError
596
+            print "Save_image got an error attempting to create", file_path, "Code:", e.code
597
+        else:
598
+            print "Save_image got an error from urlopen", e
564 599
         return url_string
600
+
565 601
     if len( data ) > 50:
566 602
         f = open( file_path, 'wb' )
567 603
         f.write( data )
... ...
@@ -575,7 +611,7 @@ def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
575 611
     index_fullpath = os.path.join( localdir, 'index.html' )
576 612
     cache_path = os.path.join( localdir, 'cache' )
577 613
 
578
-    files_to_delete = glob.glob( cache_path + '*.png' )
614
+    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
579 615
 
580 616
     f = file( new_index_fullpath, 'w' )
581 617
     f.write( html_head % ( even_background, odd_background ) )
... ...
@@ -588,41 +624,39 @@ def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
588 624
 
589 625
 
590 626
     f.write( '<div align="center">\n<table>\n' )
591
-    image_index = 0
592
-    for i in yaml_items[:40]:
627
+    for image_index, image in enumerate(yaml_items[:40]):
593 628
         tag_hit = False
594
-        if i['author'].lower() in authors_to_post:
629
+        if image['author'].lower() in authors_to_post:
595 630
             tag_hit = True
596
-        elif len( set([j.lower() for j in i['tags']]) & tags_to_post ) > 0:
631
+        elif len( set([j.lower() for j in image['tags']]) & tags_to_post ) > 0:
597 632
             tag_hit = True
598
-        chart_url = make_chart_url( i['orig_posted'],
599
-                                    i['comment_times'],
600
-                                    i['comments'],
601
-                                    i['retweet_times'],
602
-                                    i['retweets'],
603
-                                    i['qualified'],
633
+        chart_url = make_chart_url( image['orig_posted'],
634
+                                    image['comment_times'],
635
+                                    image['comments'],
636
+                                    image['retweet_times'],
637
+                                    image['retweets'],
638
+                                    image['qualified'],
604 639
                                     image_index % 2,
605 640
                                     tag_hit
606 641
                                   )
607
-#        if i['title'].startswith( 'Too ' ):
608
-#            print i['title'], i['qualified'], i['retweet_times']
642
+#        if image['title'].startswith( 'Too ' ):
643
+#            print image['title'], image['qualified'], image['retweet_times']
609 644
 #            print chart_url
610 645
         image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
611 646
         f.write( '<tr valign="center" class="%s">\n  <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
612 647
                  ( image_index % 2 and "even" or "odd",
613
-                   i['link'],
614
-                   i['title'].encode( 'ascii', 'xmlcharrefreplace' ),
615
-                   i['author'].encode( 'ascii', 'xmlcharrefreplace' ),
648
+                   image['link'],
649
+                   image['title'].encode( 'ascii', 'xmlcharrefreplace' ),
650
+                   image['author'].encode( 'ascii', 'xmlcharrefreplace' ),
616 651
                  )
617 652
                )
618
-        f.write( '  <td>%s<td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
653
+        f.write( '  <td>%s<td>\n' % ( image['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
619 654
         f.write( '  <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \
620 655
                  ( image_url,
621 656
                    img_width,
622 657
                    img_height
623 658
                  )
624 659
                )
625
-        image_index += 1
626 660
     f.write( html_footer )
627 661
     f.close()
628 662
     if os.path.exists( index_fullpath ):
... ...
@@ -634,13 +668,15 @@ def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
634 668
 def Make_feed_file( yaml_items ):
635 669
     f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' )
636 670
     f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" )
637
-    f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) )
671
+    f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) )
638 672
     count = 0
639 673
     for item in yaml_items:
640 674
         now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) )
641 675
         if item['qualified'] != -1:
676
+            escaped_title = cgi.escape( item['title'] ).encode( 'ascii', 'xmlcharrefreplace' )
677
+            escaped_author = cgi.escape( item['author'] ).encode( 'ascii', 'xmlcharrefreplace' )
642 678
             f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \
643
-                     ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ), now, item['link'], item['link'], item['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) )
679
+                     ( escaped_title, now, item['link'], item['link'], escaped_author ) )
644 680
             count += 1
645 681
             if count > 14:
646 682
                 break
... ...
@@ -668,10 +704,9 @@ if __name__=='__main__':
668 704
         #     'qualified'           : -1
669 705
         #     'comment_times'       : [ 1282197199, 1282197407 ]
670 706
         #     'comments'            : [ 0, 15 ]
707
+        #     'fb_shares'           : [ 0, 3 ]
671 708
         #     'slash_comment_times' : [ 1282197199, 1282197407 ]
672 709
         #     'slash_comments'      : [ 0, 5 ]
673
-        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
674
-        #     'slash_comments'      : [ 0, 3 ]
675 710
         #     'retweet_times'       : [ 1282197199, 1282197407 ]
676 711
         #     'retweets'            : [ 0, 43 ]
677 712
         #    },
... ...
@@ -682,6 +717,11 @@ if __name__=='__main__':
682 717
         if os.path.exists( yaml_fullpath ):
683 718
             f = file( yaml_fullpath, 'rb' )
684 719
             items = yaml.load( f )
720
+
721
+            # Do any dictionary item updating that might be necessary
722
+#            for item in items:
723
+#                if not item.has_key( 'fb_shares' ):
724
+#                    item['fb_shares'] = []
685 725
             f.close()
686 726
         else:
687 727
             print "could not open", yaml_fullpath
... ...
@@ -754,7 +794,7 @@ if __name__=='__main__':
754 794
         traceback.print_exc( file = sys.stdout )
755 795
         try:
756 796
             sendEmail( 'Exception thrown in techcrunch.py',
757
-                       exceptional_text,
797
+                       exceptional_text + "\n" + traceback.format_exc(),
758 798
                        ( 'david.blume@gmail.com', ) )
759 799
         except Exception, e:
760 800
             print "Could not send email to notify you of the exception. :("
... ...
@@ -777,7 +817,7 @@ if __name__=='__main__':
777 817
     lines = lines[:168] # Just keep the past week's worth
778 818
     # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
779 819
     status = len( message.strip() ) and '\n                       '.join( message.splitlines() ) or "OK"
780
-    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ))
820
+    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status ))
781 821
     f = open( os.path.join( localdir,'stats.txt' ), 'w' )
782 822
     f.writelines( lines )
783 823
     f.close()
784 824