David Blume committed on 2018-01-20 20:16:24
Showing 1 changed file, with 141 additions and 19 deletions.
@@ -10,6 +10,11 @@
 # TODO:
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>'
 #    link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
+#
+# This file was converted from tabs to spaces with the vim command %retab
+#
+# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; mv techcrunch.yaml_back techcrunch.yaml
+#
 
 import feedparser
 import yaml
@@ -32,9 +37,14 @@ import bisect
 import analysis
 import simplejson as json
 import cookielib
+import xml.sax
+import texttime
+from datetime import timedelta
 
 debug = True
 any_entry_added = False
+tags_to_post = set([ 'apple', 'google' ])
+authors_to_post = [ 'michael arrington', ]
 
 localdir = ''
 
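A note on the two new globals: matching is case-insensitive, so the set intersection used later in the script only works because both sides are lowercased first. A minimal sketch of that test, with a made-up item:

    # Hypothetical item tags; tags_to_post is the set defined above.
    item_tags = [ 'Apple', 'iPhone' ]
    if set( [ t.lower() for t in item_tags ] ) & tags_to_post:
        print "this item would be auto-qualified"
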
@@ -78,6 +88,7 @@ img_height = 50
 series_1_color = "0000FF"
 series_2_color = "00AA00"
 threshold_color = "FF8C00"
+tag_color = "F01000"
 
 even_background = "F8F8F8"
 #even_background = "FFFFFF"
@@ -100,7 +111,21 @@ def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@
                ( fromaddr, ", ".join( toaddrs ), subject, message ) )
     smtp.quit()
 
-def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ):
+def index_id( a_list, elem ):
+    try:
+        return ( index for index, item in enumerate( a_list ) if item == elem ).next()
+    except StopIteration:
+        return -1
+
+def index_id_simple( a_list, elem ):
+    index = 0
+    for item in a_list:
+        if item == elem:
+            return index
+        index += 1
+    return -1
+
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color, tag_hit ):
     # comment_times, comment_values = zip( *comments )
     # retweet_times, retweet_values = zip( *retweets )
 
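Both new helpers do the same job: return the first index of elem in a_list, or -1 instead of the ValueError that list.index() raises. A quick usage sketch:

    print index_id( [ 'a', 'b', 'c' ], 'b' )         # 1
    print index_id( [ 'a', 'b', 'c' ], 'z' )         # -1
    print index_id_simple( [ 'a', 'b', 'c' ], 'z' )  # -1
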
@@ -149,8 +174,16 @@ def make_chart_url( time_posted, comment_times, comment_values, retweet_times, r
                   ','.join( [ str( n ) for n in comment_values ] ),
                   ','.join( [ str( n ) for n in retweet_times ] ),
                   ','.join( [ str( n ) for n in retweet_values ] ) )
+    # TODO: Consider watermark levels, like:
+    # chm=h,B0B0B0,1,0.3,1|r,E0E0E0,0,0,0.5
     if met_threshold_pt != -1:
-        chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt )
+        if tag_hit:
+            dot_color = tag_color
+            dot_shape = 'd'
+        else:
+            dot_color = threshold_color
+            dot_shape = 'o'
+        chart_url += "&chm=%s,%s,1,%d,10" % ( dot_shape, dot_color, met_threshold_pt )
     chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
         ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
           0, max( 7, max_comment_time ),
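For readers unfamiliar with the (since retired) Google Image Charts API used here: chm takes <shape>,<color>,<series_index>,<point_index>,<size>, so the change above swaps the orange circle ('o') for a red diamond ('d') when a favorite author or tag matched. A minimal sketch of the kind of URL this might emit (hypothetical values, not taken from the commit):

    # A diamond marker on series 1, point 3, 10 pixels wide.
    chart_url = "http://chart.apis.google.com/chart?chs=200x50&cht=lc"
    chart_url += "&chm=%s,%s,1,%d,10" % ( 'd', "F01000", 3 )
    print chart_url
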
@@ -189,6 +222,11 @@ def process_feed( yaml_items ):
             print "the feed returned feed.status %d." % ( feed.status, )
     else:
         # Save off this
+        if hasattr( feed, 'bozo_exception' ) and type( feed.bozo_exception ) == xml.sax._exceptions.SAXParseException:
+            print "Didn't pickle because of bozo_exception %s." % ( str( feed.bozo_exception ) )
+        elif hasattr( feed, 'bozo_exception' ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
+            print "Didn't pickle because of bozo_exception instance %s." % ( str( feed.bozo_exception ) )
+        else:
             f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
             try:
                 pickle.dump( feed, f )
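Context for the new guard: feedparser does not raise on malformed XML; it sets the "bozo bit" and stashes the parse error on the result, which is why the code probes for bozo_exception before pickling. A minimal sketch of the same check in isolation (assuming feedparser and xml.sax are imported, as in this script):

    feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
    if getattr( feed, 'bozo', 0 ) and isinstance( feed.bozo_exception, xml.sax._exceptions.SAXParseException ):
        print "malformed feed, skipping the pickle:", str( feed.bozo_exception )
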
@@ -196,7 +234,7 @@ def process_feed( yaml_items ):
                 print "An error occurred while pickling the feed: %s." % \
                     ( # str(e.__class__),
                       str(e) )
-                traceback.print_exc( file = sys.stdout )
+                traceback.print_exc( 3, file = sys.stdout )
                 feed_is_modified = False
             f.close()
 
@@ -265,7 +303,10 @@ def process_item( feed_item, yaml_items ):
     # Look for i.feedburner_origlink in yaml_items
     yaml_item = None
     for i in yaml_items:
-        if feed_item.feedburner_origlink == i['link']:
+        if hasattr( feed_item, 'feedburner_origlink' ) and feed_item.feedburner_origlink == i['link']:
+            yaml_item = i
+            break
+        elif feed_item.link == i['link']:
             yaml_item = i
             break
     if not yaml_item:
@@ -320,7 +361,7 @@ def process_yaml_item( yaml_item, cookie ):
         yaml_item['comments'].append( num_comments )
 
     if len( yaml_item['retweets'] ) < 8:
-        num_retweets = Get_num_retweets( yaml_item['link'] )
+        num_retweets = Get_num_retweets( yaml_item )
         if num_retweets != -1:
             any_entry_added = True
             yaml_item['retweet_times'].append( timecode_now )
@@ -375,7 +416,10 @@ def Get_disqus_id( yaml_item ):
         if hasattr( e, 'reason' ):
             print "Get_disqus_id got an error:", e.reason
         elif hasattr( e, 'code' ):
-            print "Get_disqus_id got an error. Code:", e.code
+            print "Get_disqus_id got an error. Code:", e.code, yaml_item['link']
+        return url_get_data
+    except httplib.BadStatusLine, e:
+        print "Get_disqus_id got a BadStatusLine:", str( e )
         return url_get_data
 
     tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
@@ -407,6 +451,7 @@ def Get_num_disqus_comments( url_string, disqus_id, cookie ):
             print "Get_num_disqus_comments got an error getting the count:", e.reason
         elif hasattr( e, 'code' ):
             print "Get_num_disqus_comments got an error getting the count. Code:", e.code
+        return -1
     disqus_tag_to_find = 'displayCount('
     disqus_offset = disqus_data.find( disqus_tag_to_find )
     if disqus_offset != -1:
@@ -418,7 +463,8 @@ def Get_num_disqus_comments( url_string, disqus_id, cookie ):
         print "Get_num_disqus_comments found no disqus tag for", url_string
     return -1
 
-def Get_num_retweets( url_string ):
+def Get_num_retweets_unused( yaml_item ):
+    url_string = yaml_item['link']
     try:
         f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) )
         data = f.read()
@@ -435,9 +481,50 @@ def Get_num_retweets( url_string ):
         start_pos = offset + len( tag_to_find )
         end_pos = data.find( '<', start_pos )
         if end_pos != -1:
+            try:
                 return int( data[ start_pos:end_pos ] )
+            except ValueError, e:
+                if data[ start_pos:end_pos ] != '?':
+                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
+                else:
+                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
+                        ( yaml_item['title'][:20],
+                          texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
+                        )
     return -1
 
+def Get_num_retweets( yaml_item ):
+    url_string = yaml_item['link']
+    try:
+        f = urllib2.urlopen( 'http://urls.api.twitter.com/1/urls/count.json?url=%s&callback=twttr.receiveCount' % urllib.quote_plus( url_string ) )
+        data = f.read()
+        f.close()
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Get_num_retweets got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Get_num_retweets got an error. Code:", e.code
+        return -1
+    tag_to_find = '"count":'
+    offset = data.find( tag_to_find )
+    if offset != -1:
+        start_pos = offset + len( tag_to_find )
+        end_pos = data.find( ',', start_pos )
+        if end_pos != -1:
+            try:
+                return int( data[ start_pos:end_pos ] )
+            except ValueError, e:
+                if data[ start_pos:end_pos ] != '?':
+                    print "Get_num_retweets expected a number but got \"%s\"" % ( data[ start_pos:end_pos ], )
+                else:
+                    print "Get_num_retweets got '?' for \"%s...\", posted %s ago." % \
+                        ( yaml_item['title'][:20],
+                          texttime.stringify( timedelta( seconds = time.time() - yaml_item['orig_posted'] ) )
+                        )
+    return -1
+
+
+
 def Save_image( url_string, file_path ):
     try:
         f = urllib2.urlopen( url_string )
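Since the script already imports simplejson as json, the '"count":' string scan could be replaced with real JSON parsing of the JSONP payload. A sketch, assuming the response looks like twttr.receiveCount({"count":1234,"url":"..."}):

    def parse_retweet_count( data ):
        # Strip the JSONP wrapper, then parse, instead of scanning for '"count":'.
        start = data.find( '(' )
        end = data.rfind( ')' )
        if start == -1 or end == -1:
            return -1
        try:
            return int( json.loads( data[ start + 1 : end ] ).get( 'count', -1 ) )
        except ( ValueError, TypeError ):
            return -1
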
@@ -456,7 +543,7 @@ def Save_image( url_string, file_path ):
             return 'cache/' + os.path.basename( file_path )
     return url_string
 
-def Make_index_html( yaml_items, stats ):
+def Make_index_html( yaml_items, weekend_stats, weekday_stats ):
     cur_time = int( time.time() )
     new_index_fullpath = os.path.join( localdir, 'index.html_new' )
     index_fullpath = os.path.join( localdir, 'index.html' )
@@ -470,14 +557,25 @@ def Make_index_html( yaml_items, stats ):
     f.write( html_head % ( even_background, odd_background ) )
 #   f.write( '<div align="center">\n<table cellpadding="4">' )
 
-    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
-    for median, mean, std_dev in stats:
-        f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) )
-    f.write( '</tr>\n</table></div>\n<br />\n' )
+#   f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
+#   for median, mean, std_dev in weekday_stats:
+#       f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) )
+#   f.write( '</tr>\n</table></div>\n<br />\n' )
+
+    f.write( '<div align="center">\n<table class="legend">\n<tr><th></th><th>Median</th><th>Mean</th><th>Std. Dev</th><th>Threshold</th></tr>\n' )
+    f.write( '<tr><th>Weekday</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekday_stats[2][0], weekday_stats[2][1], weekday_stats[2][2], weekday_stats[2][1] + weekday_stats[2][2] ) )
+    f.write( '<tr><th>Weekend</th><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td><td>%1.1f</td></tr>\n' % ( weekend_stats[2][0], weekend_stats[2][1], weekend_stats[2][2], weekend_stats[2][1] + weekend_stats[2][2] ) )
+    f.write( '</table></div>\n<br />\n' )
+
 
     f.write( '<div align="center">\n<table>\n' )
     image_index = 0
     for i in yaml_items[:40]:
+        tag_hit = False
+        if i['author'].lower() in authors_to_post:
+            tag_hit = True
+        elif len( set([j.lower() for j in i['tags']]) & tags_to_post ) > 0:
+            tag_hit = True
         chart_url = make_chart_url( i['orig_posted'],
                                     i['comment_times'],
                                     i['comments'],
@@ -485,7 +583,11 @@ def Make_index_html( yaml_items, stats ):
                                     i['retweets'],
                                     i['qualified'],
                                     image_index % 2 and even_background or odd_background,
+                                    tag_hit
                                   )
+#       if i['title'].startswith( 'Verizon To' ):
+#           print i['title'], i['qualified'], i['retweet_times']
+#           print chart_url
         image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
         f.write( '<tr valign="center" class="%s">\n    <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
                  ( image_index % 2 and "even" or "odd",
@@ -572,14 +674,21 @@ if __name__=='__main__':
     #
     # If any work was done, then write files.
     #
-    if True or any_entry_added:
+    if any_entry_added:
 
-        stats = analysis.Process_retweets_for_feed( items )
+        weekend_stats, weekday_stats = analysis.Process_retweets_for_feed( items )
 
         # We'll only look at the stats for the time 1:00 to 1:30 after posting.
-        median, mean, sigma = stats[2]
-        threshold = median + sigma
+        weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
+        weekend_threshold = weekend_mean + weekend_sigma
+        weekday_median, weekday_mean, weekday_sigma = weekday_stats[2]
+        weekday_threshold = weekday_mean + weekday_sigma
         for item in items:
+            wday = time.localtime( item['orig_posted'] ).tm_wday
+            if wday == 5 or wday == 6:
+                threshold = weekend_threshold
+            else:
+                threshold = weekday_threshold
             if item['qualified'] == -1:
                 for i in range( len( item['retweet_times'] ) ):
                     r_time = item['retweet_times'][i]
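Note that time.localtime() numbers weekdays from Monday == 0, so the wday test above treats 5 and 6 (Saturday and Sunday) as the weekend. A quick sanity check of that test:

    import time
    for day in range( 7 ):
        # 2018-01-15 was a Monday, so tm_wday runs 0..6 across this week.
        t = time.strptime( '2018-01-%02d' % ( 15 + day ), '%Y-%m-%d' )
        print t.tm_wday, t.tm_wday in ( 5, 6 )
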
@@ -589,12 +698,24 @@ if __name__=='__main__':
                     if r_time - item['orig_posted'] >= 3600:
                         break
 
+        # Automatically add those items whose authors and tags I like
+        for item in items:
+            if item['qualified'] == -1 and len( item['retweet_times'] ) > 0:
+                if item['author'].lower() in authors_to_post:
+                    item['qualified'] = 0
+                elif len( set([j.lower() for j in item['tags']]) & tags_to_post ) > 0:
+                    item['qualified'] = 0
+
         #
         # Write out the updated yaml file.
         #
-        f = file( yaml_fullpath, 'wb' )
+
+        # For the one file we really use, write to a file on the side, then move it.
+        yaml_newfile_fullpath = os.path.join( localdir, 'techcrunch_temp_writable.yaml' )
+        f = file( yaml_newfile_fullpath, 'wb' )
         yaml.dump( items, f, width=120 )
         f.close()
+        os.rename( yaml_newfile_fullpath, yaml_fullpath )
         f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' )
         yaml.dump( items, f, width=120 )
         f.close()
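The write-then-rename dance matters because os.rename() onto an existing file is atomic on POSIX, so a reader of techcrunch.yaml never sees a half-written file. The same pattern as a small reusable helper (a sketch, not from the commit; the side-file suffix is hypothetical):

    def atomic_yaml_dump( items, final_path ):
        temp_path = final_path + '_tmp'  # hypothetical side-file name
        f = file( temp_path, 'wb' )
        try:
            yaml.dump( items, f, width=120 )
        finally:
            f.close()
        os.rename( temp_path, final_path )
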
@@ -604,7 +725,7 @@ if __name__=='__main__':
 
         Make_feed_file( items )
 
-        Make_index_html( items, stats )
+        Make_index_html( items, weekend_stats, weekday_stats )
     else:
         print "No entries were added this time."
 
@@ -635,7 +756,8 @@ if __name__=='__main__':
     else:
         lines = []
     lines = lines[:168]  # Just keep the past week's worth
-    status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
+    # status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
+    status = len( message.strip() ) and '\n '.join( message.splitlines() ) or "OK"
     lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ))
    f = open( os.path.join( localdir,'stats.txt' ), 'w' )
    f.writelines( lines )
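The revised status line keeps multi-line messages legible in stats.txt by indenting continuation lines rather than flattening them with " - ". For example:

    message = "first error\nsecond error"
    print len( message.strip() ) and '\n '.join( message.splitlines() ) or "OK"
    # first error
    #  second error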