2011-02-04: Algorithm changes (tags and author checked), new chart drawing, spaces used instead of tabs.
David Blume

David Blume committed on 2018-01-20 20:16:24
Showing 1 changed file, with 141 additions and 19 deletions.

... ...
@@ -10,6 +10,11 @@
10 10
 # TODO:
11 11
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry &raquo;</a>'
12 12
 #   link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
13
+#
14
+# This file was converted from tabs to spaces with the vim command %retab
15
+#
16
+# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; mv techcrunch.yaml_back techcrunch.yaml
17
+#
13 18
 
14 19
 import feedparser
15 20
 import yaml
... ...
@@ -32,9 +37,14 @@ import bisect
32 37
 import analysis
33 38
 import simplejson as json
34 39
 import cookielib
40
+import xml
41
+import texttime
42
+from datetime import timedelta
35 43
 
36 44
 debug = True
37 45
 any_entry_added = False
46
+tags_to_post = set([ 'apple', 'google'])
47
+authors_to_post = [ 'michael arrington', ]
38 48
 
39 49
 localdir = ''
40 50
 
... ...
@@ -78,6 +88,7 @@ img_height = 50
78 88
 series_1_color = "0000FF"
79 89
 series_2_color = "00AA00"
80 90
 threshold_color = "FF8C00"
91
+tag_color = "F01000"
81 92
 
82 93
 even_background = "F8F8F8"
83 94
 #even_background = "FFFFFF"
... ...
@@ -100,7 +111,21 @@ def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@
100 111
                    ( fromaddr, ", ".join( toaddrs ), subject, message ) )
101 112
     smtp.quit()
102 113
 
103
-def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ):
114
+def index_id(a_list, elem):
115
+    try:
116
+        return (index for index, item in enumerate( a_list ) if item == elem).next()
117
+    except:
118
+        return -1
119
+
120
+def index_id_simple(a_list, elem):
121
+    index = 0
122
+    for item in a_list:
123
+        if item == elem:
124
+            return index
125
+        index += 1
126
+    return -1
127
+
128
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color, tag_hit ):
104 129
 #    comment_times, comment_values = zip( *comments )
105 130
 #    retweet_times, retweet_values = zip( *retweets )
106 131
 
... ...
@@ -149,8 +174,16 @@ def make_chart_url( time_posted, comment_times, comment_values, retweet_times, r
149 174
                                           ','.join( [ str( n ) for n in comment_values ] ),
150 175
                                           ','.join( [ str( n ) for n in retweet_times ] ),
151 176
                                           ','.join( [ str( n ) for n in retweet_values ] ) )
177
+    # TODO: Consider watermark levels, like:
178
+    # chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5
152 179
     if met_threshold_pt != -1:
153
-        chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt )
180
+        if tag_hit:
181
+            dot_color = tag_color
182
+            dot_shape = 'd'
183
+        else:
184
+            dot_color = threshold_color
185
+            dot_shape = 'o'
186
+        chart_url += "&chm=%s,%s,1,%d,10" % ( dot_shape, dot_color, met_threshold_pt )
154 187
     chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
155 188
                  ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
156 189
                    0, max( 7, max_comment_time ),
... ...
@@ -189,6 +222,11 @@ def process_feed( yaml_items ):
189 222
                     print "the feed returned feed.status %d." % ( feed.status, )
190 223
             else:
191 224
                 # Save off this
225
+                if hasattr( feed, 'bozo_exception' ) and type( feed.bozo_exception ) == xml.sax._exceptions.SAXParseException:
226
+                    print "Didn't pickle because of bozo_exception %s." % ( str( feed.bozo_exception ) )
227
+                elif hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
228
+                    print "Didn't pickle because of bozo_exception instance %s." % ( str( feed.bozo_exception ) )
229
+                else:
192 230
                     f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
193 231
                     try:
194 232
                         pickle.dump( feed, f )
... ...
@@ -196,7 +234,7 @@ def process_feed( yaml_items ):
196 234
                         print "An error occurred while pickling the feed: %s." % \
197 235
                               ( # str(e.__class__),
198 236
                                 str(e) )
199
-                    traceback.print_exc( file = sys.stdout )
237
+                        traceback.print_exc( 3, file = sys.stdout )
200 238
                         feed_is_modified = False
201 239
                     f.close()
202 240
 
... ...
@@ -265,7 +303,10 @@ def process_item( feed_item, yaml_items ):
265 303
     # Look for i.feedburner_origlink in yaml_items
266 304
     yaml_item = None
267 305
     for i in yaml_items:
268
-        if feed_item.feedburner_origlink == i['link']:
306
+        if hasattr( feed_item, 'feedburner_origlink' ) and feed_item.feedburner_origlink == i['link']:
307
+            yaml_item = i
308
+            break
309
+        elif feed_item.link == i['link']:
269 310
             yaml_item = i
270 311
             break
271 312
     if not yaml_item:
... ...
@@ -320,7 +361,7 @@ def process_yaml_item( yaml_item, cookie ):
320 361
             yaml_item['comments'].append( num_comments )
321 362
 
322 363
     if len( yaml_item['retweets'] ) < 8:
323
-        num_retweets = Get_num_retweets( yaml_item['link'] )
364
+        num_retweets = Get_num_retweets( yaml_item )
324 365
         if num_retweets != -1:
325 366
             any_entry_added = True
326 367
             yaml_item['retweet_times'].append( timecode_now )
... ...
@@ -375,7 +416,10 @@ def Get_disqus_id( yaml_item ):
375 416
         if hasattr( e, 'reason' ):
376 417
             print "Get_disqus_id got an error:", e.reason
377 418
         elif hasattr( e, 'code' ):
378
-            print "Get_disqus_id got an error. Code:", e.code
419
+            print "Get_disqus_id got an error. Code:", e.code, yaml_item['link']
420
+        return url_get_data
421
+    except httplib.BadStatusLine, e:
422
+        print "Get_discus_id got a BadStatusLine:", str( e )
379 423
         return url_get_data
380 424
 
381 425
     tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
... ...
@@ -407,6 +451,7 @@ def Get_num_disqus_comments( url_string, disqus_id, cookie ):
407 451
             print "Get_num_disqus_comments got an error getting the count:", e.reason
408 452
         elif hasattr( e, 'code' ):
409 453
             print "Get_num_disqus_comments got an error getting the count. Code:", e.code
454
+        return -1
410 455
     disqus_tag_to_find = 'displayCount('
411 456
     disqus_offset = disqus_data.find( disqus_tag_to_find )
412 457
     if disqus_offset != -1:
... ...
@@ -418,7 +463,8 @@ def Get_num_disqus_comments( url_string, disqus_id, cookie ):
418 463
         print "Get_num_disqus_comments found no disqus tag for", url_string
419 464
     return -1
420 465
 
421
-def Get_num_retweets( url_string ):
466
+def Get_num_retweets_unused( yaml_item ):
467
+    url_string = yaml_item['link']
422 468
     try:
423 469
         f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) )
424 470
         data = f.read()
... ...
@@ -435,9 +481,50 @@ def Get_num_retweets( url_string ):
435 481
         start_pos = offset + len( tag_to_find )
436 482
         end_pos = data.find( '<', start_pos )
437 483
         if end_pos != -1:
484
+            try:
438 485
                 return int( data[ start_pos:end_pos ] )
486
+            except ValueError, e:
487
+                if data[ start_pos:end_pos ] != '?':
488
+                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
489
+                else:
490
+                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
491
+                          ( yaml_item['title'][:20],
492
+                            texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
493
+                          )
439 494
     return -1
440 495
 
496
+def Get_num_retweets( yaml_item ):
497
+    url_string = yaml_item['link']
498
+    try:
499
+        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % urllib.quote_plus( url_string ) )
500
+        data = f.read()
501
+        f.close()
502
+    except urllib2.URLError, e:
503
+        if hasattr( e, 'reason' ):
504
+            print "Get_num_retweets got an error:", e.reason
505
+        elif hasattr( e, 'code' ):
506
+            print "Get_num_retweets got an error. Code:", e.code
507
+        return -1
508
+    tag_to_find = '"count":'
509
+    offset = data.find( tag_to_find )
510
+    if offset != -1:
511
+        start_pos = offset + len( tag_to_find )
512
+        end_pos = data.find( ',', start_pos )
513
+        if end_pos != -1:
514
+            try:
515
+                return int( data[ start_pos:end_pos ] )
516
+            except ValueError, e:
517
+                if data[ start_pos:end_pos ] != '?':
518
+                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
519
+                else:
520
+                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
521
+                          ( yaml_item['title'][:20],
522
+                            texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
523
+                          )
524
+    return -1
525
+
526
+
527
+
441 528
 def Save_image( url_string, file_path ):
442 529
     try:
443 530
         f = urllib2.urlopen( url_string )
... ...
@@ -456,7 +543,7 @@ def Save_image( url_string, file_path ):
456 543
         return 'cache/' + os.path.basename( file_path )
457 544
     return url_string
458 545
 
459
-def Make_index_html( yaml_items, stats ):
546
+def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
460 547
     cur_time = int( time.time() )
461 548
     new_index_fullpath = os.path.join( localdir, 'index.html_new' )
462 549
     index_fullpath = os.path.join( localdir, 'index.html' )
... ...
@@ -470,14 +557,25 @@ def Make_index_html( yaml_items, stats ):
470 557
     f.write( html_head % ( even_background, odd_background ) )
471 558
 #    f.write( '<div align="center">\n<table cellpadding="4">' )
472 559
 
473
-    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
474
-    for median, mean, std_dev in stats:
475
-    f.write( '<td>med=%1.1f &#956;=%1.1f &#963;=%1.1f&nbsp;</td> ' % ( median, mean, std_dev ) )
476
-    f.write( '</tr>\n</table></div>\n<br />\n' )
560
+#    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
561
+#    for median, mean, std_dev in weekday_stats:
562
+#        f.write( '<td>med=%1.1f &#956;=%1.1f &#963;=%1.1f&nbsp;</td> ' % ( median, mean, std_dev ) )
563
+#    f.write( '</tr>\n</table></div>\n<br />\n' )
564
+
565
+    f.write( '<div align="center">\n<table class="legend">\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n' )
566
+    f.write( '<tr><th>Weekday</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2] ) )
567
+    f.write( '<tr><th>Weekend</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2] ) )
568
+    f.write( '</table></div>\n<br />\n' )
569
+
477 570
 
478 571
     f.write( '<div align="center">\n<table>\n' )
479 572
     image_index = 0
480 573
     for i in yaml_items[:40]:
574
+        tag_hit = False
575
+        if i['author'].lower() in authors_to_post:
576
+            tag_hit = True
577
+        elif len( set([j.lower() for j in i['tags']]) & tags_to_post ) > 0:
578
+            tag_hit = True
481 579
         chart_url = make_chart_url( i['orig_posted'],
482 580
                                     i['comment_times'],
483 581
                                     i['comments'],
... ...
@@ -485,7 +583,11 @@ def Make_index_html( yaml_items, stats ):
485 583
                                     i['retweets'],
486 584
                                     i['qualified'],
487 585
                                     image_index % 2 and even_background or odd_background,
586
+                                    tag_hit
488 587
                                   )
588
+#        if i['title'].startswith( 'Verizon To' ):
589
+#            print i['title'], i['qualified'], i['retweet_times']
590
+#            print chart_url
489 591
         image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
490 592
         f.write( '<tr valign="center" class="%s">\n  <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
491 593
                  ( image_index % 2 and "even" or "odd",
... ...
@@ -572,14 +674,21 @@ if __name__=='__main__':
572 674
         #
573 675
         # If any work was done, then write files.
574 676
         #
575
-        if True or any_entry_added:
677
+        if any_entry_added:
576 678
 
577
-            stats = analysis.Process_retweets_for_feed( items )
679
+            weekend_stats, weekday_stats = analysis.Process_retweets_for_feed( items )
578 680
 
579 681
             # We'll only look at the stats for the time 1:00 to 1:30 after posting.
580
-        median, mean, sigma = stats[2]
581
-        threshold = median + sigma
682
+            weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
683
+            weekend_threshold = weekend_mean + weekend_sigma
684
+            weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
685
+            weekday_threshold = weekday_mean + weekday_sigma
582 686
             for item in items:
687
+                wday = time.localtime( item['orig_posted'] ).tm_wday
688
+                if wday == 5 or wday == 6:
689
+                    threshold = weekend_threshold
690
+                else:
691
+                    threshold = weekday_threshold
583 692
                 if item['qualified'] == -1:
584 693
                     for i in range( len( item['retweet_times'] ) ):
585 694
                         r_time = item['retweet_times'][i]
... ...
@@ -589,12 +698,24 @@ if __name__=='__main__':
589 698
                             if r_time - item['orig_posted'] >= 3600:
590 699
                                 break
591 700
 
701
+            # Automatically add those items whose authors and tags I like
702
+            for item in items:
703
+                if item['qualified'] == -1 and len( item['retweet_times'] ) > 0:
704
+                    if item['author'].lower() in authors_to_post:
705
+                        item['qualified'] = 0
706
+                    elif len( set([j.lower() for j in item['tags']]) & tags_to_post ) > 0:
707
+                        item['qualified'] = 0
708
+
592 709
             #
593 710
             # Write out the updated yaml file.
594 711
             #
595
-            f = file( yaml_fullpath, 'wb' )
712
+
713
+            # For the one file we really use, write to a file on the side, then move it.
714
+            yaml_newfile_fullpath = os.path.join( localdir, 'techcrunch_temp_writable.yaml' )
715
+            f = file( yaml_newfile_fullpath, 'wb' )
596 716
             yaml.dump( items, f, width=120 )
597 717
             f.close()
718
+            os.rename( yaml_newfile_fullpath, yaml_fullpath )
598 719
             f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' )
599 720
             yaml.dump( items, f, width=120 )
600 721
             f.close()
... ...
@@ -604,7 +725,7 @@ if __name__=='__main__':
604 725
 
605 726
             Make_feed_file( items )
606 727
 
607
-            Make_index_html( items, stats )
728
+            Make_index_html( items, weekend_stats, weekday_stats )
608 729
         else:
609 730
             print "No entries were added this time."
610 731
 
... ...
@@ -635,7 +756,8 @@ if __name__=='__main__':
635 756
     else:
636 757
         lines = []
637 758
     lines = lines[:168] # Just keep the past week's worth
638
-    status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
759
+    # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
760
+    status = len( message.strip() ) and '\n                       '.join( message.splitlines() ) or "OK"
639 761
     lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ))
640 762
     f = open( os.path.join( localdir,'stats.txt' ), 'w' )
641 763
     f.writelines( lines )
642 764