David Blume committed on 2018-01-20 20:38:45
Showing 1 changed file, with 97 additions and 52 deletions.
| ... | ... |
@@ -1,12 +1,4 @@ |
| 1 |
-#!/usr/bin/python |
|
| 2 |
-# |
|
| 3 |
-# techcrunch.py |
|
| 4 |
-# |
|
| 5 |
-# For reference: See the SVN history of this file to see how I implemented |
|
| 6 |
-# 1. retweet counts |
|
| 7 |
-# 2. slash comment counts |
|
| 8 |
-# 3. disqus comment counts, and the cookie for them |
|
| 9 |
-# http://websvn.dlma.com/filedetails.php?repname=private&path=%2Fwww%2Ftechcrunch.dlma.com%2Ftrunk%2Ftechcrunch.py |
|
| 1 |
+#!/usr/bin/env python |
|
| 10 | 2 |
# |
| 11 | 3 |
# TODO: |
| 12 | 4 |
# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>' |
| ... | ... |
@@ -46,7 +38,8 @@ any_entry_added = False |
| 46 | 38 |
tags_to_post = set(['apple', 'google']) |
| 47 | 39 |
authors_to_post = ['michael arrington',] |
| 48 | 40 |
|
| 49 |
-rhs_metric = 'fb_shares' |
|
| 41 |
+# TODO 2018-01-18: Maybe combine fb_likes with fb_shares or something... |
|
| 42 |
+rhs_metric = 'fb_likes' |
|
| 50 | 43 |
rhs_metric_times = 'comment_times' |
| 51 | 44 |
|
| 52 | 45 |
localdir = '' |
| ... | ... |
@@ -73,7 +66,7 @@ html_head = """ |
| 73 | 66 |
</HEAD> |
| 74 | 67 |
<BODY> |
| 75 | 68 |
<div align='center'><h3>TechCrunch Feed Filter</h3></div> |
| 76 |
-This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br /> |
|
| 69 |
+This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>. <a href="http://david.dlma.com/blog/my-techcrunch-feed-filter">Learn more about the Feed Filter</a>.<br /><br /> |
|
| 77 | 70 |
""" |
| 78 | 71 |
|
| 79 | 72 |
html_footer = """ |
| ... | ... |
@@ -103,15 +96,15 @@ odd_watermark = "D0D0F0" |
| 103 | 96 |
def asciiize(s): |
| 104 | 97 |
try: |
| 105 | 98 |
return s.encode('ascii')
|
| 106 |
- except UnicodeEncodeError, e: |
|
| 99 |
+ except UnicodeEncodeError as e: |
|
| 107 | 100 |
return s |
| 108 |
- except exceptions.AttributeError, e: |
|
| 101 |
+ except exceptions.AttributeError as e: |
|
| 109 | 102 |
return s |
| 110 | 103 |
|
| 111 | 104 |
|
| 112 | 105 |
def sendEmail(subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>'): |
| 113 | 106 |
"""Sends Email""" |
| 114 |
- smtp = smtplib.SMTP('localhost', port=587)
|
|
| 107 |
+ smtp = smtplib.SMTP('mail.dlma.com', port=587)
|
|
| 115 | 108 |
smtp.login(user, passw) |
| 116 | 109 |
smtp.sendmail(fromaddr, \ |
| 117 | 110 |
toaddrs, \ |
| ... | ... |
@@ -253,7 +246,7 @@ def process_feed(yaml_items): |
| 253 | 246 |
try: |
| 254 | 247 |
with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f: |
| 255 | 248 |
pickle.dump(feed, f) |
| 256 |
- except(pickle.PicklingError, exceptions.TypeError), e: |
|
| 249 |
+ except(pickle.PicklingError, exceptions.TypeError) as e: |
|
| 257 | 250 |
print "An error occurred while pickling the feed: %s." % \ |
| 258 | 251 |
(# str(e.__class__), |
| 259 | 252 |
str(e)) |
| ... | ... |
@@ -304,6 +297,7 @@ def process_feed(yaml_items): |
| 304 | 297 |
|
| 305 | 298 |
|
| 306 | 299 |
def process_item(feed_item, yaml_items): |
| 300 |
+ """Processes an RSS feed item, and converts it to a YAML item""" |
|
| 307 | 301 |
# Get the time |
| 308 | 302 |
global any_entry_added |
| 309 | 303 |
timecode_now = int(time.time()) |
| ... | ... |
@@ -318,22 +312,25 @@ def process_item(feed_item, yaml_items): |
| 318 | 312 |
print "process_item found no timestamp for", asciiize(feed_item.link) |
| 319 | 313 |
timecode_parsed = calendar.timegm(date_parsed) |
| 320 | 314 |
|
| 315 |
+ link = feed_item.link |
|
| 316 |
+ if hasattr(feed_item, 'feedburner_origlink'): |
|
| 317 |
+ link = feed_item.feedburner_origlink |
|
| 318 |
+ |
|
| 319 |
+ # TODO 2018-01-18: Leave in the ncid for URL clicks, but remove during processing. |
|
| 320 |
+# suffix_to_remove = '?ncid=rss' |
|
| 321 |
+# if link.endswith(suffix_to_remove): |
|
| 322 |
+# link = link[:-len(suffix_to_remove)] |
|
| 323 |
+ |
|
| 321 | 324 |
# Look for i.feedburner_origlink in yaml_items |
| 322 | 325 |
yaml_item = None |
| 323 | 326 |
for i in yaml_items: |
| 324 |
- if hasattr(feed_item, 'feedburner_origlink') and feed_item.feedburner_origlink == i['link']: |
|
| 325 |
- yaml_item = i |
|
| 326 |
- break |
|
| 327 |
- elif feed_item.link == i['link']: |
|
| 327 |
+ if link == i['link']: |
|
| 328 | 328 |
yaml_item = i |
| 329 | 329 |
break |
| 330 |
- if not yaml_item: |
|
| 330 |
+ if yaml_item is None: |
|
| 331 | 331 |
author = '' |
| 332 |
- link = feed_item.link |
|
| 333 | 332 |
if hasattr(feed_item, 'author'): |
| 334 | 333 |
author = asciiize(feed_item.author) |
| 335 |
- if hasattr(feed_item, 'feedburner_origlink'): |
|
| 336 |
- link = feed_item.feedburner_origlink |
|
| 337 | 334 |
|
| 338 | 335 |
# Make a new yaml_item |
| 339 | 336 |
yaml_item = {'title' : asciiize(feed_item.title),
|
| ... | ... |
@@ -370,9 +367,16 @@ def process_item(feed_item, yaml_items): |
| 370 | 367 |
def process_yaml_item(yaml_item): |
| 371 | 368 |
global any_entry_added |
| 372 | 369 |
|
| 370 |
+ # Related to TODO 2018-01-18: Remove ncid only during processing. |
|
| 371 |
+ link = yaml_item['link'] |
|
| 372 |
+ suffix_to_remove = '?ncid=rss' |
|
| 373 |
+ # Maybe we should find() it instead, in case feedburner adds other options |
|
| 374 |
+ if link.endswith(suffix_to_remove): |
|
| 375 |
+ link = link[:-len(suffix_to_remove)] |
|
| 376 |
+ |
|
| 373 | 377 |
timecode_now = int(time.time()) |
| 374 | 378 |
if len(yaml_item['fb_comments']) < 8: |
| 375 |
- num_shares, num_comments, num_likes = Get_fb_stats(yaml_item['link']) |
|
| 379 |
+ num_shares, num_comments, num_likes = Get_fb_stats(link) |
|
| 376 | 380 |
if num_comments != -1: |
| 377 | 381 |
any_entry_added = True |
| 378 | 382 |
yaml_item['comment_times'].append(timecode_now) |
| ... | ... |
@@ -381,7 +385,7 @@ def process_yaml_item(yaml_item): |
| 381 | 385 |
yaml_item['fb_likes'].append(num_likes) |
| 382 | 386 |
|
| 383 | 387 |
# if len(yaml_item['reddit_']) < 8: |
| 384 |
-# num_ = Get_reddit_stats(yaml_item['link']) |
|
| 388 |
+# num_ = Get_reddit_stats(link) |
|
| 385 | 389 |
# if num_ != -1: |
| 386 | 390 |
# any_entry_added = True |
| 387 | 391 |
# yaml_item['reddit_times'].append(timecode_now) |
| ... | ... |
@@ -389,7 +393,7 @@ def process_yaml_item(yaml_item): |
| 389 | 393 |
|
| 390 | 394 |
|
| 391 | 395 |
def Get_reddit_stats(url_string): |
| 392 |
- """ Consider curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg" |
|
| 396 |
+ """ Consider curl "https://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg" |
|
| 393 | 397 |
""" |
| 394 | 398 |
return -1 |
| 395 | 399 |
|
| ... | ... |
@@ -406,42 +410,77 @@ def Get_fb_stats(url_string): |
| 406 | 410 |
URL ID: |
| 407 | 411 |
|
| 408 | 412 |
u = urllib.quote_plus(url_string) |
| 409 |
- with open('/home/dblume/oauth.dlma.com/facebook-token.txt', 'r') as f:
|
|
| 413 |
+ with open('facebook-token.txt', 'r') as f:
|
|
| 410 | 414 |
token = f.read() |
| 411 | 415 |
encoded = urllib.urlencode({'access_token': token})
|
| 412 |
- urllib2.urlopen('https://graph.facebook.com/v2.5/?id=%s&%s' % (u, encoded)
|
|
| 416 |
+ urllib2.urlopen('https://graph.facebook.com/vX.Y/?id=%s&%s' % (u, encoded)
|
|
| 413 | 417 |
""" |
| 414 | 418 |
shares = -1 |
| 415 | 419 |
comments = -1 |
| 416 | 420 |
likes = -1 |
| 417 | 421 |
|
| 422 |
+ url_string = url_string.encode('utf-8')
|
|
| 423 |
+ |
|
| 418 | 424 |
try: |
| 419 |
- url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27' |
|
| 420 |
- f = urllib2.urlopen(url % (urllib.quote_plus(url_string))) |
|
| 425 |
+ encoded = urllib.urlencode({'access_token': facebook_token})
|
|
| 426 |
+# url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27&%s' |
|
| 427 |
+ # This stopped working 2018-01-13, 11:25, after I told Facebook the app would use v2.11 |
|
| 428 |
+ # https://developers.facebook.com/docs/graph-api/changelog/version2.9#gapi-deprecate |
|
| 429 |
+ # url = 'https://graph.facebook.com/v2.8/?id=%s&fields=og_object{engagement},share&%s'
|
|
| 430 |
+ |
|
| 431 |
+ # Consider the following for a different engagement field: |
|
| 432 |
+ # "engagement": {
|
|
| 433 |
+ # "reaction_count": 115, |
|
| 434 |
+ # "comment_count": 0, |
|
| 435 |
+ # "share_count": 102, |
|
| 436 |
+ # "comment_plugin_count": 0 |
|
| 437 |
+ # }, |
|
| 438 |
+ # Where reaction_count + share_count = og_object.engagement.count |
|
| 439 |
+ url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s' |
|
| 440 |
+ |
|
| 441 |
+ f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded)) |
|
| 421 | 442 |
data = f.read() |
| 422 | 443 |
f.close() |
| 423 |
- except (urllib2.URLError, httplib.BadStatusLine), e: |
|
| 444 |
+ except (urllib2.URLError, httplib.BadStatusLine) as e: |
|
| 424 | 445 |
if hasattr(e, 'reason'): # URLError |
| 425 |
- print "Get_fb_stats got an error:", e.reason, url_string |
|
| 446 |
+ if hasattr(e, 'code'): |
|
| 447 |
+ print "Get_fb_stats got an error (1):", e.code, e.reason, url_string |
|
| 448 |
+ else: |
|
| 449 |
+ print "Get_fb_stats got an error (2):", e.reason, url_string |
|
| 426 | 450 |
elif hasattr(e, 'code'): #URLError |
| 427 | 451 |
print "Get_fb_stats got an error. Code:", e.code, url_string |
| 428 | 452 |
else: |
| 429 |
- print "Get_fb_stats got an error:", str(e) |
|
| 453 |
+ print "Get_fb_stats got an error (3):", str(e) |
|
| 454 |
+ return shares, comments, likes |
|
| 455 |
+ except KeyError as e: |
|
| 456 |
+ print "Get_fb_stats got a key error 1e (%s)" % (str(e), ) |
|
| 457 |
+ print "Get_fb_stats got a key error 2.2 enc (%s)" % (encoded, ) |
|
| 458 |
+ print url_string.encode('utf-8')
|
|
| 459 |
+ print u"Get_fb_stats got a key error 2.1 url (%s)" % (url_string, ) |
|
| 460 |
+ print "Get_fb_stats got a key error 3q (%s)" % (urllib.quote_plus(url_string)) |
|
| 461 |
+ print "Get_fb_stats got a key error 4 (%s)" % (url % (urllib.quote_plus(url_string), encoded)) |
|
| 462 |
+ print "Get_fb_stats got a key error 5 (%s) for url %s:" % (str(e), url % (urllib.quote_plus(url_string), encoded)) |
|
| 430 | 463 |
return shares, comments, likes |
| 431 | 464 |
if len(data) > 20: |
| 432 |
- d = json.loads(data)['data'][0] |
|
| 433 |
- if 'like_count' in d: |
|
| 434 |
- likes = d['like_count'] |
|
| 435 |
- else: |
|
| 436 |
- likes = 0 |
|
| 437 |
- if 'comment_count' in d: |
|
| 438 |
- comments = d['comment_count'] |
|
| 439 |
- else: |
|
| 440 |
- comments = 0 |
|
| 441 |
- if 'share_count' in d: |
|
| 465 |
+ d = json.loads(data)['engagement'] |
|
| 466 |
+ try: |
|
| 442 | 467 |
shares = d['share_count'] |
| 443 |
- else: |
|
| 468 |
+ except KeyError: |
|
| 444 | 469 |
shares = 0 |
| 470 |
+ |
|
| 471 |
+ try: |
|
| 472 |
+ likes = d['reaction_count'] |
|
| 473 |
+ except KeyError: |
|
| 474 |
+ likes = 0 |
|
| 475 |
+ |
|
| 476 |
+ # TODO 2018-01-18: og_object metric was likes + shares + comments |
|
| 477 |
+ # Here we'll combine likes and shares, and comments with plugin_comments |
|
| 478 |
+ likes += shares |
|
| 479 |
+ |
|
| 480 |
+ try: |
|
| 481 |
+ comments = d['comment_plugin_count'] + d['comment_count'] |
|
| 482 |
+ except KeyError: |
|
| 483 |
+ comments = 0 |
|
| 445 | 484 |
else: |
| 446 | 485 |
print "Get_fb_stats got too little data for ", url_string |
| 447 | 486 |
return shares, comments, likes |
| ... | ... |
@@ -452,7 +491,7 @@ def Save_image(url_string, file_path): |
| 452 | 491 |
f = urllib2.urlopen(url_string) |
| 453 | 492 |
data = f.read() |
| 454 | 493 |
f.close() |
| 455 |
- except (urllib2.URLError, httplib.BadStatusLine), e: |
|
| 494 |
+ except (urllib2.URLError, httplib.BadStatusLine) as e: |
|
| 456 | 495 |
if hasattr(e, 'reason'): # URLError |
| 457 | 496 |
print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason
|
| 458 | 497 |
elif hasattr(e, 'code'): # URLError |
| ... | ... |
@@ -476,7 +515,7 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats): |
| 476 | 515 |
|
| 477 | 516 |
files_to_delete = glob.glob(os.path.join(cache_path, '*.png')) |
| 478 | 517 |
|
| 479 |
- f = file(new_index_fullpath, 'w') |
|
| 518 |
+ with codecs.open(new_index_fullpath, 'w', 'utf-8') as f: |
|
| 480 | 519 |
f.write(html_head % (even_background, odd_background)) |
| 481 | 520 |
|
| 482 | 521 |
|
| ... | ... |
@@ -521,7 +560,7 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats): |
| 521 | 560 |
) |
| 522 | 561 |
) |
| 523 | 562 |
f.write(html_footer) |
| 524 |
- f.close() |
|
| 563 |
+ |
|
| 525 | 564 |
if os.path.exists(index_fullpath): |
| 526 | 565 |
os.unlink(index_fullpath) |
| 527 | 566 |
shutil.move(new_index_fullpath, index_fullpath) |
| ... | ... |
@@ -530,8 +569,8 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats): |
| 530 | 569 |
|
| 531 | 570 |
|
| 532 | 571 |
def Make_feed_file(yaml_items): |
| 533 |
- with open(os.path.join(localdir, 'rss_feed.xml'), 'wb') as f: |
|
| 534 |
- f.write("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
|
|
| 572 |
+ with codecs.open(os.path.join(localdir, 'rss_feed.xml'), 'wb', 'utf-8') as f: |
|
| 573 |
+ f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
|
|
| 535 | 574 |
f.write("<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % (time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
|
| 536 | 575 |
count = 0 |
| 537 | 576 |
for item in yaml_items: |
| ... | ... |
@@ -580,6 +619,9 @@ if __name__=='__main__': |
| 580 | 619 |
if os.path.exists(yaml_fullpath): |
| 581 | 620 |
with open(yaml_fullpath, 'rb') as f: |
| 582 | 621 |
items = yaml.load(f) |
| 622 |
+ if items is None: |
|
| 623 |
+ print yaml_fullpath, "exists, but was empty." |
|
| 624 |
+ items = [] |
|
| 583 | 625 |
|
| 584 | 626 |
# Do any dictionary item updating that might be necessary |
| 585 | 627 |
# for item in items: |
| ... | ... |
@@ -589,6 +631,9 @@ if __name__=='__main__': |
| 589 | 631 |
print "could not open", yaml_fullpath |
| 590 | 632 |
items = [] |
| 591 | 633 |
|
| 634 |
+ with open('facebook-token.txt', 'r') as f:
|
|
| 635 |
+ facebook_token = f.read() |
|
| 636 |
+ |
|
| 592 | 637 |
progress_text = ["read techcrunch.yaml"] |
| 593 | 638 |
process_feed(items) |
| 594 | 639 |
|
| ... | ... |
@@ -649,7 +694,7 @@ if __name__=='__main__': |
| 649 | 694 |
else: |
| 650 | 695 |
print "No entries were added this time." |
| 651 | 696 |
|
| 652 |
- except Exception, e: |
|
| 697 |
+ except Exception as e: |
|
| 653 | 698 |
exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e) |
| 654 | 699 |
print exceptional_text, ' '.join(progress_text) |
| 655 | 700 |
traceback.print_exc(file=sys.stdout) |
| ... | ... |
@@ -657,7 +702,7 @@ if __name__=='__main__': |
| 657 | 702 |
sendEmail('Exception thrown in techcrunch.py',
|
| 658 | 703 |
exceptional_text + "\n" + traceback.format_exc(), |
| 659 | 704 |
('david.blume@gmail.com',))
|
| 660 |
- except Exception, e: |
|
| 705 |
+ except Exception as e: |
|
| 661 | 706 |
print "Could not send email to notify you of the exception. :("
|
| 662 | 707 |
|
| 663 | 708 |
message = sys.stdout.getvalue() |
| ... | ... |
@@ -672,7 +717,7 @@ if __name__=='__main__': |
| 672 | 717 |
lines = f.readlines() |
| 673 | 718 |
else: |
| 674 | 719 |
lines = [] |
| 675 |
- lines = lines[:168] # Just keep the past week's worth |
|
| 720 |
+ lines = lines[:672] # Just keep the past week's worth |
|
| 676 | 721 |
# status = len(message.strip()) and message.strip().replace('\n', ' - ') or "OK"
|
| 677 | 722 |
status = len(message.strip()) and '\n '.join( message.splitlines()) or "OK" |
| 678 | 723 |
lines.insert(0, "%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status))
|
| 679 | 724 |