David Blume committed on 2018-01-20 20:16:24
Showing 1 changed file, with 141 additions and 19 deletions.
@@ -10,6 +10,11 @@
 # TODO:
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>'
 #    link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
+#
+# This file was converted from tabs to spaces with the vim command %retab
+#
+# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; mv techcrunch.yaml_back techcrunch.yaml
+#

 import feedparser
 import yaml
@@ -32,9 +37,14 @@ import bisect
 import analysis
 import simplejson as json
 import cookielib
+import xml
+import texttime
+from datetime import timedelta

 debug = True
 any_entry_added = False
+tags_to_post = set([ 'apple', 'google'])
+authors_to_post = [ 'michael arrington', ]

 localdir = ''

@@ -78,6 +88,7 @@ img_height = 50
 series_1_color = "0000FF"
 series_2_color = "00AA00"
 threshold_color = "FF8C00"
+tag_color = "F01000"

 even_background = "F8F8F8"
 #even_background = "FFFFFF"
@@ -100,7 +111,21 @@ def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@
                 ( fromaddr, ", ".join( toaddrs ), subject, message ) )
     smtp.quit()

-def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ):
+def index_id(a_list, elem):
+    try:
+        return (index for index, item in enumerate( a_list ) if item == elem).next()
+    except:
+        return -1
+
+def index_id_simple(a_list, elem):
+    index = 0
+    for item in a_list:
+        if item == elem:
+            return index
+        index += 1
+    return -1
+
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color, tag_hit ):
 #    comment_times, comment_values = zip( *comments )
 #    retweet_times, retweet_values = zip( *retweets )

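Note on the new helpers: index_id pulls the index of the first matching element out of a lazy generator expression (.next() is the Python 2 spelling), and index_id_simple is the equivalent explicit loop. A minimal standalone sketch of the idiom, assuming Python 2 and catching StopIteration instead of the bare except used above:

    # First-match index via a generator expression (Python 2).
    def first_index(a_list, elem):
        try:
            return (i for i, item in enumerate(a_list) if item == elem).next()
        except StopIteration:  # no element matched
            return -1

    print first_index(['apple', 'google'], 'google')  # 1
    print first_index(['apple', 'google'], 'yahoo')   # -1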
@@ -149,8 +174,16 @@ def make_chart_url( time_posted, comment_times, comment_values, retweet_times, r
                   ','.join( [ str( n ) for n in comment_values ] ),
                   ','.join( [ str( n ) for n in retweet_times ] ),
                   ','.join( [ str( n ) for n in retweet_values ] ) )
+    # TODO: Consider watermark levels, like:
+    # chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5
     if met_threshold_pt != -1:
-        chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt )
+        if tag_hit:
+            dot_color = tag_color
+            dot_shape = 'd'
+        else:
+            dot_color = threshold_color
+            dot_shape = 'o'
+        chart_url += "&chm=%s,%s,1,%d,10" % ( dot_shape, dot_color, met_threshold_pt )
     chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
         ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
           0, max( 7, max_comment_time ),
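Note on the marker change: in the Google Image Charts chm parameter the fields are shape, color, series index, point index, and size, so this hunk draws a red diamond ('d') on posts that hit the tag/author watch list and keeps the orange circle ('o') for plain threshold hits. A small sketch of how the fragment is assembled (made-up point index, and note Google has since retired this chart API):

    tag_color = "F01000"
    threshold_color = "FF8C00"
    met_threshold_pt = 5  # hypothetical index of the first point past the threshold
    for tag_hit in ( True, False ):
        if tag_hit:
            dot_shape, dot_color = 'd', tag_color
        else:
            dot_shape, dot_color = 'o', threshold_color
        print "&chm=%s,%s,1,%d,10" % ( dot_shape, dot_color, met_threshold_pt )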
@@ -189,6 +222,11 @@ def process_feed( yaml_items ):
             print "the feed returned feed.status %d." % ( feed.status, )
         else:
             # Save off this
+            if hasattr( feed, 'bozo_exception' ) and type( feed.bozo_exception ) == xml.sax._exceptions.SAXParseException:
+                print "Didn't pickle because of bozo_exception %s." % ( str( feed.bozo_exception ) )
+            elif hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
+                print "Didn't pickle because of bozo_exception instance %s." % ( str( feed.bozo_exception ) )
+            else:
                 f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
                 try:
                     pickle.dump( feed, f )
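Two review notes on the bozo check: feedparser does not raise on malformed feeds; it sets feed.bozo and stashes the parser error in feed.bozo_exception, which is what gets tested here before pickling. Also, the elif branch only fires for subclasses of SAXParseException (the type() comparison already catches exact matches), so the isinstance form alone would cover both cases. A minimal sketch of the same guard:

    import xml.sax
    import feedparser

    feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
    if getattr( feed, 'bozo_exception', None ) is not None and \
       isinstance( feed.bozo_exception, xml.sax.SAXParseException ):
        print "Feed is malformed:", str( feed.bozo_exception )
    else:
        print "Parsed %d entries." % len( feed.entries )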
@@ -196,7 +234,7 @@ def process_feed( yaml_items ):
                 print "An error occurred while pickling the feed: %s." % \
                     ( # str(e.__class__),
                       str(e) )
-                traceback.print_exc( file = sys.stdout )
+                traceback.print_exc( 3, file = sys.stdout )
                 feed_is_modified = False
             f.close()

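Note: traceback.print_exc() takes an optional limit as its first positional argument, so the change above caps the printed traceback at three stack frames while still writing to stdout. A runnable one-off to see the effect:

    import sys
    import traceback

    try:
        1 / 0
    except ZeroDivisionError:
        traceback.print_exc( 3, file = sys.stdout )  # at most 3 frames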
@@ -265,7 +303,10 @@ def process_item( feed_item, yaml_items ):
     # Look for i.feedburner_origlink in yaml_items
     yaml_item = None
     for i in yaml_items:
-        if feed_item.feedburner_origlink == i['link']:
+        if hasattr( feed_item, 'feedburner_origlink' ) and feed_item.feedburner_origlink == i['link']:
+            yaml_item = i
+            break
+        elif feed_item.link == i['link']:
             yaml_item = i
             break
     if not yaml_item:
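Note: the hasattr guard is needed because not every feed entry carries feedburner_origlink, so entries fall back to their plain link. A hedged alternative (find_yaml_item is my name, not the script's) that collapses the two checks with getattr's default argument, assuming the orig link should win when present; it is a simplification, not a drop-in, since the code above also tries the plain link when an orig link exists but does not match:

    def find_yaml_item( feed_item, yaml_items ):
        link = getattr( feed_item, 'feedburner_origlink', feed_item.link )
        for i in yaml_items:
            if i['link'] == link:
                return i
        return None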
@@ -320,7 +361,7 @@ def process_yaml_item( yaml_item, cookie ):
         yaml_item['comments'].append( num_comments )

     if len( yaml_item['retweets'] ) < 8:
-        num_retweets = Get_num_retweets( yaml_item['link'] )
+        num_retweets = Get_num_retweets( yaml_item )
         if num_retweets != -1:
             any_entry_added = True
             yaml_item['retweet_times'].append( timecode_now )
@@ -375,7 +416,10 @@ def Get_disqus_id( yaml_item ):
         if hasattr( e, 'reason' ):
             print "Get_disqus_id got an error:", e.reason
         elif hasattr( e, 'code' ):
-            print "Get_disqus_id got an error. Code:", e.code
+            print "Get_disqus_id got an error. Code:", e.code, yaml_item['link']
+            return url_get_data
+    except httplib.BadStatusLine, e:
+        print "Get_discus_id got a BadStatusLine:", str( e )
         return url_get_data

     tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
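Note: in Python 2, urllib2.urlopen() can let httplib.BadStatusLine escape unwrapped when the server closes the connection without a well-formed HTTP status line; it is not a subclass of urllib2.URLError, hence the second except clause. A self-contained sketch of the pattern (fetch is my name for it):

    import httplib
    import urllib2

    def fetch( url ):
        try:
            f = urllib2.urlopen( url )
            data = f.read()
            f.close()
            return data
        except urllib2.URLError, e:
            print "fetch got an error:", getattr( e, 'reason', None ) or getattr( e, 'code', '' )
        except httplib.BadStatusLine, e:
            print "fetch got a BadStatusLine:", str( e )
        return None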
@@ -407,6 +451,7 @@ def Get_num_disqus_comments( url_string, disqus_id, cookie ):
             print "Get_num_disqus_comments got an error getting the count:", e.reason
         elif hasattr( e, 'code' ):
             print "Get_num_disqus_comments got an error getting the count. Code:", e.code
+            return -1
     disqus_tag_to_find = 'displayCount('
     disqus_offset = disqus_data.find( disqus_tag_to_find )
     if disqus_offset != -1:
@@ -418,7 +463,8 @@ def Get_num_disqus_comments( url_string, disqus_id, cookie ):
         print "Get_num_disqus_comments found no disqus tag for", url_string
         return -1

-def Get_num_retweets( url_string ):
+def Get_num_retweets_unused( yaml_item ):
+    url_string = yaml_item['link']
     try:
         f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) )
         data = f.read()
@@ -435,9 +481,50 @@ def Get_num_retweets( url_string ):
         start_pos = offset + len( tag_to_find )
         end_pos = data.find( '<', start_pos )
         if end_pos != -1:
+            try:
                 return int( data[ start_pos:end_pos ] )
+            except ValueError, e:
+                if data[ start_pos:end_pos ] != '?':
+                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
+                else:
+                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
+                        ( yaml_item['title'][:20],
+                          texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
+                        )
     return -1

+def Get_num_retweets( yaml_item ):
+    url_string = yaml_item['link']
+    try:
+        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % urllib.quote_plus( url_string ) )
+        data = f.read()
+        f.close()
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Get_num_retweets got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Get_num_retweets got an error. Code:", e.code
+        return -1
+    tag_to_find = '"count":'
+    offset = data.find( tag_to_find )
+    if offset != -1:
+        start_pos = offset + len( tag_to_find )
+        end_pos = data.find( ',', start_pos )
+        if end_pos != -1:
+            try:
+                return int( data[ start_pos:end_pos ] )
+            except ValueError, e:
+                if data[ start_pos:end_pos ] != '?':
+                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
+                else:
+                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
+                        ( yaml_item['title'][:20],
+                          texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
+                        )
+    return -1
+
+
+
 def Save_image( url_string, file_path ):
     try:
         f = urllib2.urlopen( url_string )
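Note on the new Get_num_retweets: it swaps the dead TweetMeme button for Twitter's urls/count endpoint (itself since retired) and scrapes the '"count":' field out of the JSONP response with string find. Since the script already imports simplejson as json, a hedged alternative is to strip the callback wrapper and parse properly; parse_count is my name for the sketch:

    import simplejson as json

    def parse_count( jsonp ):
        # e.g. 'twttr.receiveCount({"count":42,"url":"http://..."})'
        start = jsonp.find( '(' )
        end = jsonp.rfind( ')' )
        if start == -1 or end == -1:
            return -1
        try:
            return int( json.loads( jsonp[ start + 1:end ] ).get( 'count', -1 ) )
        except ( ValueError, TypeError ):
            return -1

    print parse_count( 'twttr.receiveCount({"count":42,"url":"x"})' )  # prints 42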
@@ -456,7 +543,7 @@ def Save_image( url_string, file_path ):
             return 'cache/' + os.path.basename( file_path )
     return url_string

-def Make_index_html( yaml_items, stats ):
+def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
     cur_time = int( time.time() )
     new_index_fullpath = os.path.join( localdir, 'index.html_new' )
     index_fullpath = os.path.join( localdir, 'index.html' )
@@ -470,14 +557,25 @@ def Make_index_html( yaml_items, stats ):
     f.write( html_head % ( even_background, odd_background ) )
 #    f.write( '<div align="center">\n<table cellpadding="4">' )

-    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
-    for median, mean, std_dev in stats:
-        f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) )
-    f.write( '</tr>\n</table></div>\n<br />\n' )
+#    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
+#    for median, mean, std_dev in weekday_stats:
+#        f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) )
+#    f.write( '</tr>\n</table></div>\n<br />\n' )
+
+    f.write( '<div align="center">\n<table class="legend">\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n' )
+    f.write( '<tr><th>Weekday</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2] ) )
+    f.write( '<tr><th>Weekend</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2] ) )
+    f.write( '</table></div>\n<br />\n' )
+

     f.write( '<div align="center">\n<table>\n' )
     image_index = 0
     for i in yaml_items[:40]:
+        tag_hit = False
+        if i['author'].lower() in authors_to_post:
+            tag_hit = True
+        elif len( set([j.lower() for j in i['tags']]) & tags_to_post ) > 0:
+            tag_hit = True
         chart_url = make_chart_url( i['orig_posted'],
                                     i['comment_times'],
                                     i['comments'],
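Note: tag_hit lowercases the author and tags before matching, then uses a set intersection against the watch lists added at the top of the file. A standalone sketch with a made-up item:

    tags_to_post = set( [ 'apple', 'google' ] )
    authors_to_post = [ 'michael arrington', ]

    item = { 'author': 'Michael Arrington', 'tags': [ 'Google', 'android' ] }
    tag_hit = item['author'].lower() in authors_to_post or \
              len( set( [ j.lower() for j in item['tags'] ] ) & tags_to_post ) > 0
    print tag_hit  # True: both the author and the 'google' tag match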
@@ -485,7 +583,11 @@ def Make_index_html( yaml_items, stats ):
                                     i['retweets'],
                                     i['qualified'],
                                     image_index % 2 and even_background or odd_background,
+                                    tag_hit
                                   )
+#        if i['title'].startswith( 'Verizon To' ):
+#            print i['title'], i['qualified'], i['retweet_times']
+#            print chart_url
         image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
         f.write( '<tr valign="center" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
             ( image_index % 2 and "even" or "odd",
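Side note: `image_index % 2 and even_background or odd_background` is the pre-Python-2.5 and/or conditional idiom, which only works because the middle value is always truthy. The 2.5+ conditional expression avoids that trap:

    image_index = 1
    row_class = image_index % 2 and "even" or "odd"    # idiom used above
    row_class = "even" if image_index % 2 else "odd"   # equivalent, safer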
@@ -572,14 +674,21 @@ if __name__=='__main__':
     #
     # If any work was done, then write files.
     #
-    if True or any_entry_added:
+    if any_entry_added:

-        stats = analysis.Process_retweets_for_feed( items )
+        weekend_stats, weekday_stats = analysis.Process_retweets_for_feed( items )

         # We'll only look at the stats for the time 1:00 to 1:30 after posting.
-        median, mean, sigma = stats[2]
-        threshold = median + sigma
+        weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
+        weekend_threshold = weekend_mean + weekend_sigma
+        weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
+        weekday_threshold = weekday_mean + weekday_sigma
         for item in items:
+            wday = time.localtime( item['orig_posted'] ).tm_wday
+            if wday == 5 or wday == 6:
+                threshold = weekend_threshold
+            else:
+                threshold = weekday_threshold
             if item['qualified'] == -1:
                 for i in range( len( item['retweet_times'] ) ):
                     r_time = item['retweet_times'][i]
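Note: tm_wday in a time.struct_time runs 0 (Monday) through 6 (Sunday), so the test above routes Saturday and Sunday posts to the weekend threshold. Each threshold is the mean plus one standard deviation of the 1:00-1:30 bucket (index 2 of the stats list); this commit also quietly switches the base from the median to the mean. A small sketch with made-up stats tuples:

    import time

    weekday_bucket = ( 14.0, 15.0, 4.0 )  # hypothetical (median, mean, std_dev)
    weekend_bucket = ( 8.0, 9.0, 3.0 )

    wday = time.localtime( time.time() ).tm_wday
    if wday == 5 or wday == 6:
        threshold = weekend_bucket[1] + weekend_bucket[2]
    else:
        threshold = weekday_bucket[1] + weekday_bucket[2]
    print threshold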
@@ -589,12 +698,24 @@ if __name__=='__main__':
                     if r_time - item['orig_posted'] >= 3600:
                         break

+        # Automatically add those items whose authors and tags I like
+        for item in items:
+            if item['qualified'] == -1 and len( item['retweet_times'] ) > 0:
+                if item['author'].lower() in authors_to_post:
+                    item['qualified'] = 0
+                elif len( set([j.lower() for j in item['tags']]) & tags_to_post ) > 0:
+                    item['qualified'] = 0
+
         #
         # Write out the updated yaml file.
         #
-        f = file( yaml_fullpath, 'wb' )
+
+        # For the one file we really use, write to a file on the side, then move it.
+        yaml_newfile_fullpath = os.path.join( localdir, 'techcrunch_temp_writable.yaml' )
+        f = file( yaml_newfile_fullpath, 'wb' )
         yaml.dump( items, f, width=120 )
         f.close()
+        os.rename( yaml_newfile_fullpath, yaml_fullpath )
         f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' )
         yaml.dump( items, f, width=120 )
         f.close()
@@ -604,7 +725,7 @@ if __name__=='__main__': |
| 604 | 725 |
|
| 605 | 726 |
Make_feed_file( items ) |
| 606 | 727 |
|
| 607 |
- Make_index_html( items, stats ) |
|
| 728 |
+ Make_index_html( items, weekend_stats, weekday_stats ) |
|
| 608 | 729 |
else: |
| 609 | 730 |
print "No entries were added this time." |
| 610 | 731 |
|
| ... | ... |
@@ -635,7 +756,8 @@ if __name__=='__main__': |
| 635 | 756 |
else: |
| 636 | 757 |
lines = [] |
| 637 | 758 |
lines = lines[:168] # Just keep the past week's worth |
| 638 |
- status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK" |
|
| 759 |
+ # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK" |
|
| 760 |
+ status = len( message.strip() ) and '\n '.join( message.splitlines() ) or "OK" |
|
| 639 | 761 |
lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ))
|
| 640 | 762 |
f = open( os.path.join( localdir,'stats.txt' ), 'w' ) |
| 641 | 763 |
f.writelines( lines ) |
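Note on the status line change: instead of flattening a multi-line message with ' - ', the new code keeps each line and indents the continuations, so stats.txt stays readable; the 168-line cap is a week of entries if the script runs hourly (24 * 7). A toy run of the formatting, assuming that cadence and a made-up elapsed time:

    import time

    message = "step one failed\nstep two skipped"
    status = len( message.strip() ) and '\n '.join( message.splitlines() ) or "OK"
    lines = [ "older entry\n" ] * 200
    lines = lines[:168]  # keep one week of hourly entries
    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime( '%H:%M, %Y-%m-%d', time.localtime() ), 42.0, status ) )
    print lines[0]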