David Blume committed on 2018-01-20 20:38:45
Showing 1 changed file with 97 additions and 52 deletions.
@@ -1,12 +1,4 @@
-#!/usr/bin/python
-#
-# techcrunch.py
-#
-# For reference: See the SVN history of this file to see how I implemented
-# 1. retweet counts
-# 2. slash comment counts
-# 3. disqus comment counts, and the cookie for them
-# http://websvn.dlma.com/filedetails.php?repname=private&path=%2Fwww%2Ftechcrunch.dlma.com%2Ftrunk%2Ftechcrunch.py
+#!/usr/bin/env python
 #
 # TODO:
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>'
@@ -46,7 +38,8 @@ any_entry_added = False
 tags_to_post = set(['apple', 'google'])
 authors_to_post = ['michael arrington',]
 
-rhs_metric = 'fb_shares'
+# TODO 2018-01-18: Maybe combine fb_likes with fb_shares or something...
+rhs_metric = 'fb_likes'
 rhs_metric_times = 'comment_times'
 
 localdir = ''
@@ -73,7 +66,7 @@ html_head = """
 </HEAD>
 <BODY>
 <div align='center'><h3>TechCrunch Feed Filter</h3></div>
-This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
+This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>. <a href="http://david.dlma.com/blog/my-techcrunch-feed-filter">Learn more about the Feed Filter</a>.<br /><br />
 """
 
 html_footer = """
@@ -103,15 +96,15 @@ odd_watermark = "D0D0F0"
 def asciiize(s):
     try:
         return s.encode('ascii')
-    except UnicodeEncodeError, e:
+    except UnicodeEncodeError as e:
         return s
-    except exceptions.AttributeError, e:
+    except exceptions.AttributeError as e:
         return s
 
 
 def sendEmail(subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>'):
     """Sends Email"""
-    smtp = smtplib.SMTP('localhost', port=587)
+    smtp = smtplib.SMTP('mail.dlma.com', port=587)
     smtp.login(user, passw)
     smtp.sendmail(fromaddr, \
                   toaddrs, \
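
Most of the churn in this commit is mechanical: Python 2's comma form of exception binding becomes the `as` form, which Python 2.6+ and Python 3 both accept. A minimal sketch (not from this file) of the change in isolation:

    try:
        u'\u2019'.encode('ascii')
    except UnicodeEncodeError as e:  # formerly: except UnicodeEncodeError, e:
        print "could not asciiize:", e
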
@@ -253,7 +246,7 @@ def process_feed(yaml_items):
     try:
         with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f:
             pickle.dump(feed, f)
-    except(pickle.PicklingError, exceptions.TypeError), e:
+    except(pickle.PicklingError, exceptions.TypeError) as e:
         print "An error occurred while pickling the feed: %s." % \
             (# str(e.__class__),
              str(e))
@@ -304,6 +297,7 @@ def process_feed(yaml_items):
 
 
 def process_item(feed_item, yaml_items):
+    """Processes an RSS feed item, and converts it to a YAML item"""
     # Get the time
     global any_entry_added
     timecode_now = int(time.time())
@@ -318,22 +312,25 @@ def process_item(feed_item, yaml_items):
         print "process_item found no timestamp for", asciiize(feed_item.link)
     timecode_parsed = calendar.timegm(date_parsed)
 
+    link = feed_item.link
+    if hasattr(feed_item, 'feedburner_origlink'):
+        link = feed_item.feedburner_origlink
+
+    # TODO 2018-01-18: Leave in the ncid for URL clicks, but remove during processing.
+#    suffix_to_remove = '?ncid=rss'
+#    if link.endswith(suffix_to_remove):
+#        link = link[:-len(suffix_to_remove)]
+
     # Look for i.feedburner_origlink in yaml_items
     yaml_item = None
     for i in yaml_items:
-        if hasattr(feed_item, 'feedburner_origlink') and feed_item.feedburner_origlink == i['link']:
-            yaml_item = i
-            break
-        elif feed_item.link == i['link']:
+        if link == i['link']:
             yaml_item = i
             break
-    if not yaml_item:
+    if yaml_item is None:
         author = ''
-        link = feed_item.link
         if hasattr(feed_item, 'author'):
             author = asciiize(feed_item.author)
-        if hasattr(feed_item, 'feedburner_origlink'):
-            link = feed_item.feedburner_origlink
 
         # Make a new yaml_item
         yaml_item = {'title' : asciiize(feed_item.title),
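
The three added lines that pick the canonical link could also be written with getattr, since feedparser exposes a <feedburner:origLink> element as the entry attribute feedburner_origlink. A one-line equivalent sketch, not part of the commit:

    link = getattr(feed_item, 'feedburner_origlink', feed_item.link)
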
@@ -370,9 +367,16 @@ def process_item(feed_item, yaml_items):
 def process_yaml_item(yaml_item):
     global any_entry_added
 
+    # Related to TODO 2018-01-18: Remove ncid only during processing.
+    link = yaml_item['link']
+    suffix_to_remove = '?ncid=rss'
+    # Maybe we should find() it instead, in case feedburner adds other options
+    if link.endswith(suffix_to_remove):
+        link = link[:-len(suffix_to_remove)]
+
     timecode_now = int(time.time())
     if len(yaml_item['fb_comments']) < 8:
-        num_shares, num_comments, num_likes = Get_fb_stats(yaml_item['link'])
+        num_shares, num_comments, num_likes = Get_fb_stats(link)
         if num_comments != -1:
             any_entry_added = True
             yaml_item['comment_times'].append(timecode_now)
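
The commit's own comment wonders about using find() in case FeedBurner appends other query options. A more robust sketch that drops only the ncid parameter, assuming Python 2's urlparse module (the helper name strip_ncid is hypothetical, not in the file):

    import urllib
    import urlparse

    def strip_ncid(link):
        # Keep every query parameter except the ncid tracking key.
        parts = urlparse.urlsplit(link)
        params = [(k, v) for k, v in urlparse.parse_qsl(parts.query) if k != 'ncid']
        return urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path,
                                    urllib.urlencode(params), parts.fragment))
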
@@ -381,7 +385,7 @@ def process_yaml_item(yaml_item):
             yaml_item['fb_likes'].append(num_likes)
 
 #    if len(yaml_item['reddit_']) < 8:
-#        num_ = Get_reddit_stats(yaml_item['link'])
+#        num_ = Get_reddit_stats(link)
 #        if num_ != -1:
 #            any_entry_added = True
 #            yaml_item['reddit_times'].append(timecode_now)
@@ -389,7 +393,7 @@ def process_yaml_item(yaml_item):
 
 
 def Get_reddit_stats(url_string):
-    """ Consider curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
+    """ Consider curl "https://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
     """
     return -1
 
@@ -406,42 +410,77 @@ def Get_fb_stats(url_string):
     URL ID:
 
     u = urllib.quote_plus(url_string)
-    with open('/home/dblume/oauth.dlma.com/facebook-token.txt', 'r') as f:
+    with open('facebook-token.txt', 'r') as f:
         token = f.read()
     encoded = urllib.urlencode({'access_token': token})
-    urllib2.urlopen('https://graph.facebook.com/v2.5/?id=%s&%s' % (u, encoded)
+    urllib2.urlopen('https://graph.facebook.com/vX.Y/?id=%s&%s' % (u, encoded)
     """
     shares = -1
     comments = -1
     likes = -1
 
+    url_string = url_string.encode('utf-8')
+
     try:
-        url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27'
-        f = urllib2.urlopen(url % (urllib.quote_plus(url_string)))
+        encoded = urllib.urlencode({'access_token': facebook_token})
+#        url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27&%s'
+        # This stopped working 2018-01-13, 11:25, after I told Facebook the app would use v2.11
+        # https://developers.facebook.com/docs/graph-api/changelog/version2.9#gapi-deprecate
+        # url = 'https://graph.facebook.com/v2.8/?id=%s&fields=og_object{engagement},share&%s'
+
+        # Consider the following for a different engagement field:
+        # "engagement": {
+        #     "reaction_count": 115,
+        #     "comment_count": 0,
+        #     "share_count": 102,
+        #     "comment_plugin_count": 0
+        # },
+        # Where reaction_count + share_count = og_object.engagement.count
+        url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s'
+
+        f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded))
         data = f.read()
         f.close()
-    except (urllib2.URLError, httplib.BadStatusLine), e:
+    except (urllib2.URLError, httplib.BadStatusLine) as e:
         if hasattr(e, 'reason'): # URLError
-            print "Get_fb_stats got an error:", e.reason, url_string
+            if hasattr(e, 'code'):
+                print "Get_fb_stats got an error (1):", e.code, e.reason, url_string
+            else:
+                print "Get_fb_stats got an error (2):", e.reason, url_string
         elif hasattr(e, 'code'): # URLError
             print "Get_fb_stats got an error. Code:", e.code, url_string
         else:
-            print "Get_fb_stats got an error:", str(e)
+            print "Get_fb_stats got an error (3):", str(e)
+        return shares, comments, likes
+    except KeyError as e:
+        print "Get_fb_stats got a key error 1e (%s)" % (str(e), )
+        print "Get_fb_stats got a key error 2.2 enc (%s)" % (encoded, )
+        print url_string.encode('utf-8')
+        print u"Get_fb_stats got a key error 2.1 url (%s)" % (url_string, )
+        print "Get_fb_stats got a key error 3q (%s)" % (urllib.quote_plus(url_string))
+        print "Get_fb_stats got a key error 4 (%s)" % (url % (urllib.quote_plus(url_string), encoded))
+        print "Get_fb_stats got a key error 5 (%s) for url %s:" % (str(e), url % (urllib.quote_plus(url_string), encoded))
         return shares, comments, likes
     if len(data) > 20:
-        d = json.loads(data)['data'][0]
-        if 'like_count' in d:
-            likes = d['like_count']
-        else:
-            likes = 0
-        if 'comment_count' in d:
-            comments = d['comment_count']
-        else:
-            comments = 0
-        if 'share_count' in d:
+        d = json.loads(data)['engagement']
+        try:
             shares = d['share_count']
-        else:
+        except KeyError:
             shares = 0
+
+        try:
+            likes = d['reaction_count']
+        except KeyError:
+            likes = 0
+
+        # TODO 2018-01-18: og_object metric was likes + shares + comments
+        # Here we'll combine likes and shares, and comments with plugin_comments
+        likes += shares
+
+        try:
+            comments = d['comment_plugin_count'] + d['comment_count']
+        except KeyError:
+            comments = 0
     else:
         print "Get_fb_stats got too little data for ", url_string
     return shares, comments, likes
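
Put together, the new Graph API path boils down to one GET for the engagement field. A minimal standalone sketch under the same assumptions as the commit (Python 2, a valid token in facebook_token); the function name get_engagement is hypothetical:

    import json
    import urllib
    import urllib2

    def get_engagement(link, facebook_token):
        # One call to the v2.11 Graph API, asking only for engagement counts.
        query = urllib.urlencode({'access_token': facebook_token})
        url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s' % (
            urllib.quote_plus(link), query)
        d = json.loads(urllib2.urlopen(url).read()).get('engagement', {})
        shares = d.get('share_count', 0)
        likes = d.get('reaction_count', 0) + shares  # combined, as in the hunk above
        comments = d.get('comment_count', 0) + d.get('comment_plugin_count', 0)
        return shares, comments, likes
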
@@ -452,7 +491,7 @@ def Save_image(url_string, file_path):
         f = urllib2.urlopen(url_string)
         data = f.read()
         f.close()
-    except (urllib2.URLError, httplib.BadStatusLine), e:
+    except (urllib2.URLError, httplib.BadStatusLine) as e:
         if hasattr(e, 'reason'): # URLError
             print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason
         elif hasattr(e, 'code'): # URLError
@@ -476,7 +515,7 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats):
 
     files_to_delete = glob.glob(os.path.join(cache_path, '*.png'))
 
-    f = file(new_index_fullpath, 'w')
+    with codecs.open(new_index_fullpath, 'w', 'utf-8') as f:
         f.write(html_head % (even_background, odd_background))
 
 
@@ -521,7 +560,7 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats):
                 )
             )
         f.write(html_footer)
-        f.close()
+
     if os.path.exists(index_fullpath):
         os.unlink(index_fullpath)
     shutil.move(new_index_fullpath, index_fullpath)
@@ -530,8 +569,8 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats):
 
 
 def Make_feed_file(yaml_items):
-    with open(os.path.join(localdir, 'rss_feed.xml'), 'wb') as f:
-        f.write("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
+    with codecs.open(os.path.join(localdir, 'rss_feed.xml'), 'wb', 'utf-8') as f:
+        f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
         f.write("<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % (time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
         count = 0
         for item in yaml_items:
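
The switch to codecs.open matters because writing a unicode object through a plain file object implicitly encodes it as ASCII and raises UnicodeEncodeError on non-ASCII titles, while codecs.open encodes to UTF-8 on the way out. A tiny illustration (the filename is a placeholder):

    import codecs
    with codecs.open('rss_feed_test.xml', 'wb', 'utf-8') as f:
        f.write(u"<title>Caf\u00e9s trending at TechCrunch</title>\n")
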
@@ -580,6 +619,9 @@ if __name__=='__main__':
         if os.path.exists(yaml_fullpath):
             with open(yaml_fullpath, 'rb') as f:
                 items = yaml.load(f)
+                if items is None:
+                    print yaml_fullpath, "exists, but was empty."
+                    items = []
 
             # Do any dictionary item updating that might be necessary
 #            for item in items:
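
The new guard exists because PyYAML returns None, not an empty list, when the document is empty:

    import yaml
    items = yaml.load('')  # an empty file yields None
    if items is None:
        items = []
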
@@ -589,6 +631,9 @@ if __name__=='__main__':
             print "could not open", yaml_fullpath
             items = []
 
+        with open('facebook-token.txt', 'r') as f:
+            facebook_token = f.read()
+
         progress_text = ["read techcrunch.yaml"]
         process_feed(items)
 
@@ -649,7 +694,7 @@ if __name__=='__main__':
         else:
             print "No entries were added this time."
 
-    except Exception, e:
+    except Exception as e:
         exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
         print exceptional_text, ' '.join(progress_text)
         traceback.print_exc(file=sys.stdout)
@@ -657,7 +702,7 @@ if __name__=='__main__':
             sendEmail('Exception thrown in techcrunch.py',
                       exceptional_text + "\n" + traceback.format_exc(),
                       ('david.blume@gmail.com',))
-        except Exception, e:
+        except Exception as e:
             print "Could not send email to notify you of the exception. :("
 
     message = sys.stdout.getvalue()
@@ -672,7 +717,7 @@ if __name__=='__main__':
             lines = f.readlines()
     else:
         lines = []
-    lines = lines[:168] # Just keep the past week's worth
+    lines = lines[:672] # Just keep the past week's worth
 #    status = len(message.strip()) and message.strip().replace('\n', ' - ') or "OK"
     status = len(message.strip()) and '\n '.join( message.splitlines()) or "OK"
     lines.insert(0, "%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status))
679 | 724 |