Catch up to production again. Need to clean up.
David Blume committed on 2018-01-20 20:38:45
Showing 1 changed file, with 97 additions and 52 deletions.

... ...
@@ -1,12 +1,4 @@
-#!/usr/bin/python
-#
-# techcrunch.py
-#
-# For reference: See the SVN history of this file to see how I implemented
-# 1. retweet counts
-# 2. slash comment counts
-# 3. disqus comment counts, and the cookie for them
-# http://websvn.dlma.com/filedetails.php?repname=private&path=%2Fwww%2Ftechcrunch.dlma.com%2Ftrunk%2Ftechcrunch.py
+#!/usr/bin/env python
 #
 # TODO:
 # 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry &raquo;</a>'
... ...
@@ -46,7 +38,8 @@ any_entry_added = False
 tags_to_post = set(['apple', 'google'])
 authors_to_post = ['michael arrington',]

-rhs_metric = 'fb_shares'
+# TODO 2018-01-18: Maybe combine fb_likes with fb_shares or something...
+rhs_metric = 'fb_likes'
 rhs_metric_times = 'comment_times'

 localdir = ''
... ...
@@ -73,7 +66,7 @@ html_head = """
 </HEAD>
 <BODY>
 <div align='center'><h3>TechCrunch Feed Filter</h3></div>
-This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
+This page shows what analysis is done to filter the noise away from the Techcrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>. <a href="http://david.dlma.com/blog/my-techcrunch-feed-filter">Learn more about the Feed Filter</a>.<br /><br />
 """

 html_footer = """
... ...
@@ -103,15 +96,15 @@ odd_watermark = "D0D0F0"
 def asciiize(s):
     try:
         return s.encode('ascii')
-    except UnicodeEncodeError, e:
+    except UnicodeEncodeError as e:
         return s
-    except exceptions.AttributeError, e:
+    except exceptions.AttributeError as e:
         return s


 def sendEmail(subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>'):
     """Sends Email"""
-    smtp = smtplib.SMTP('localhost', port=587)
+    smtp = smtplib.SMTP('mail.dlma.com', port=587)
     smtp.login(user, passw)
     smtp.sendmail(fromaddr, \
                   toaddrs, \
... ...
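A note on this hunk: port 587 is the mail submission port, which ordinarily expects STARTTLS before credentials are sent, while sendEmail logs in directly. A minimal sketch of the fuller handshake, assuming mail.dlma.com supports STARTTLS and reusing the script's user/passw globals:

    import smtplib

    def send_email_tls(subject, body, toaddrs, fromaddr):
        smtp = smtplib.SMTP('mail.dlma.com', port=587)
        smtp.ehlo()
        smtp.starttls()  # upgrade the connection before logging in
        smtp.login(user, passw)
        headers = 'From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n' % (
                  fromaddr, ', '.join(toaddrs), subject)
        smtp.sendmail(fromaddr, toaddrs, headers + body)
        smtp.quit()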
@@ -253,7 +246,7 @@ def process_feed(yaml_items):
                     try:
                         with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f:
                             pickle.dump(feed, f)
-                    except(pickle.PicklingError, exceptions.TypeError), e:
+                    except(pickle.PicklingError, exceptions.TypeError) as e:
                         print "An error occurred while pickling the feed: %s." % \
                               (# str(e.__class__),
                                str(e))
... ...
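The `except X, e:` to `except X as e:` rewrites here and throughout this commit adopt the syntax Python 2.6 introduced; the comma form is a SyntaxError under Python 3, so this is the forward-compatible spelling. A tiny self-contained illustration of the pattern this hunk uses (a lambda is unpicklable, so it trips the same handler):

    import pickle

    try:
        pickle.dumps(lambda x: x)  # functions can't be pickled
    except (pickle.PicklingError, TypeError) as e:
        print "An error occurred while pickling: %s." % (str(e),)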
@@ -304,6 +297,7 @@ def process_feed(yaml_items):


 def process_item(feed_item, yaml_items):
+    """Processes an RSS feed item, and converts it to a YAML item"""
     # Get the time
     global any_entry_added
     timecode_now = int(time.time())
... ...
@@ -318,22 +312,25 @@ def process_item(feed_item, yaml_items):
         print "process_item found no timestamp for", asciiize(feed_item.link)
     timecode_parsed = calendar.timegm(date_parsed)

+    link = feed_item.link
+    if hasattr(feed_item, 'feedburner_origlink'):
+        link = feed_item.feedburner_origlink
+
+    # TODO 2018-01-18: Leave in the ncid for URL clicks, but remove during processing.
+#    suffix_to_remove = '?ncid=rss'
+#    if link.endswith(suffix_to_remove):
+#        link = link[:-len(suffix_to_remove)]
+
     # Look for i.feedburner_origlink in yaml_items
     yaml_item = None
     for i in yaml_items:
-        if hasattr(feed_item, 'feedburner_origlink') and feed_item.feedburner_origlink == i['link']:
-            yaml_item = i
-            break
-        elif feed_item.link == i['link']:
+        if link == i['link']:
             yaml_item = i
             break
-    if not yaml_item:
+    if yaml_item is None:
         author = ''
-        link = feed_item.link
         if hasattr(feed_item, 'author'):
             author = asciiize(feed_item.author)
-        if hasattr(feed_item, 'feedburner_origlink'):
-            link = feed_item.feedburner_origlink

         # Make a new yaml_item
         yaml_item = {'title'               : asciiize(feed_item.title),
... ...
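Hoisting the feedburner_origlink resolution above the lookup loop removes the duplicated hasattr branches. As a stylistic aside (not part of this commit), feedparser entries support getattr with a default, so the three hoisted lines could also be written as one:

    link = getattr(feed_item, 'feedburner_origlink', feed_item.link)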
@@ -370,9 +367,16 @@ def process_item(feed_item, yaml_items):
 def process_yaml_item(yaml_item):
     global any_entry_added

+    # Related to TODO 2018-01-18: Remove ncid only during processing.
+    link = yaml_item['link']
+    suffix_to_remove = '?ncid=rss'
+    # Maybe we should find() it instead, in case feedburner adds other options
+    if link.endswith(suffix_to_remove):
+        link = link[:-len(suffix_to_remove)]
+
     timecode_now = int(time.time())
     if len(yaml_item['fb_comments']) < 8:
-        num_shares, num_comments, num_likes = Get_fb_stats(yaml_item['link'])
+        num_shares, num_comments, num_likes = Get_fb_stats(link)
         if num_comments != -1:
             any_entry_added = True
             yaml_item['comment_times'].append(timecode_now)
... ...
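The endswith() check only strips ncid when it is the entire query string. As the inline comment anticipates, feedburner may append other parameters; a sketch of a more surgical removal using only the Python 2 stdlib (a hypothetical helper, not in the commit):

    from urlparse import urlparse, parse_qsl, urlunparse
    from urllib import urlencode

    def strip_ncid(link):
        """Drop only the ncid tracking parameter, preserving any others."""
        parts = urlparse(link)
        query = [(k, v) for k, v in parse_qsl(parts.query) if k != 'ncid']
        return urlunparse(parts._replace(query=urlencode(query)))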
@@ -381,7 +385,7 @@ def process_yaml_item(yaml_item):
             yaml_item['fb_likes'].append(num_likes)

 #    if len(yaml_item['reddit_']) < 8:
-#        num_ = Get_reddit_stats(yaml_item['link'])
+#        num_ = Get_reddit_stats(link)
 #        if num_ != -1:
 #            any_entry_added = True
 #            yaml_item['reddit_times'].append(timecode_now)
... ...
@@ -389,7 +393,7 @@ def process_yaml_item(yaml_item):


 def Get_reddit_stats(url_string):
-    """ Consider curl "http://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
+    """ Consider curl "https://www.reddit.com/api/info.json?url=http://i.imgur.com/HG9dJ.jpg"
     """
     return -1

... ...
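Get_reddit_stats remains a stub; the docstring's info.json endpoint does return per-submission scores. If it were ever filled in, a minimal sketch might look like this (field names follow reddit's public Listing JSON, and reddit expects a descriptive User-Agent; untested here):

    import json
    import urllib
    import urllib2

    def get_reddit_score(url_string):
        request = urllib2.Request(
            'https://www.reddit.com/api/info.json?url=%s' % urllib.quote_plus(url_string),
            headers={'User-Agent': 'techcrunch.py feed filter'})
        listing = json.load(urllib2.urlopen(request))
        children = listing['data']['children']
        if not children:
            return -1
        return max(child['data']['score'] for child in children)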
@@ -406,42 +410,77 @@ def Get_fb_stats(url_string):
     URL ID:

     u = urllib.quote_plus(url_string)
-    with open('/home/dblume/oauth.dlma.com/facebook-token.txt', 'r') as f:
+    with open('facebook-token.txt', 'r') as f:
         token = f.read()
     encoded = urllib.urlencode({'access_token': token})
-    urllib2.urlopen('https://graph.facebook.com/v2.5/?id=%s&%s' % (u, encoded)
+    urllib2.urlopen('https://graph.facebook.com/vX.Y/?id=%s&%s' % (u, encoded)
     """
     shares = -1
     comments = -1
     likes = -1

+    url_string = url_string.encode('utf-8')
+
     try:
-        url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27'
-        f = urllib2.urlopen(url % (urllib.quote_plus(url_string)))
+        encoded = urllib.urlencode({'access_token': facebook_token})
+#        url = 'https://graph.facebook.com/fql?q=SELECT%%20total_count,comment_count,like_count,share_count%%20FROM%%20link_stat%%20WHERE%%20url=%%27%s%%27&%s'
+        # This stopped working 2018-01-13, 11:25, after I told Facebook the app would use v2.11
+        # https://developers.facebook.com/docs/graph-api/changelog/version2.9#gapi-deprecate
+        # url = 'https://graph.facebook.com/v2.8/?id=%s&fields=og_object{engagement},share&%s'
+
+        # Consider the following for a different engagement field:
+        #   "engagement": {
+        #     "reaction_count": 115,
+        #     "comment_count": 0,
+        #     "share_count": 102,
+        #     "comment_plugin_count": 0
+        #   },
+        # Where reaction_count + share_count = og_object.engagement.count
+        url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s'
+
+        f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded))
         data = f.read()
         f.close()
-    except (urllib2.URLError, httplib.BadStatusLine), e:
+    except (urllib2.URLError, httplib.BadStatusLine) as e:
         if hasattr(e, 'reason'): # URLError
-            print "Get_fb_stats got an error:", e.reason, url_string
+            if hasattr(e, 'code'):
+                print "Get_fb_stats got an error (1):", e.code, e.reason, url_string
+            else:
+                print "Get_fb_stats got an error (2):", e.reason, url_string
         elif hasattr(e, 'code'): #URLError
             print "Get_fb_stats got an error. Code:", e.code, url_string
         else:
-            print "Get_fb_stats got an error:", str(e)
+            print "Get_fb_stats got an error (3):", str(e)
+        return shares, comments, likes
+    except KeyError as e:
+        print "Get_fb_stats got a key error 1e (%s)" % (str(e), )
+        print "Get_fb_stats got a key error 2.2 enc (%s)" % (encoded, )
+        print url_string.encode('utf-8')
+        print u"Get_fb_stats got a key error 2.1 url (%s)" % (url_string, )
+        print "Get_fb_stats got a key error 3q (%s)" % (urllib.quote_plus(url_string))
+        print "Get_fb_stats got a key error 4 (%s)" % (url % (urllib.quote_plus(url_string), encoded))
+        print "Get_fb_stats got a key error 5 (%s) for url %s:" % (str(e), url % (urllib.quote_plus(url_string), encoded))
         return shares, comments, likes
     if len(data) > 20:
-        d = json.loads(data)['data'][0]
-        if 'like_count' in d:
-            likes = d['like_count']
-        else:
-            likes = 0
-        if 'comment_count' in d:
-            comments = d['comment_count']
-        else:
-            comments = 0
-        if 'share_count' in d:
+        d = json.loads(data)['engagement']
+        try:
             shares = d['share_count']
-        else:
+        except KeyError:
             shares = 0
+
+        try:
+            likes = d['reaction_count']
+        except KeyError:
+            likes = 0
+
+        # TODO 2018-01-18: og_object metric was likes + shares + comments
+        # Here we'll combine likes and shares, and comments with plugin_comments
+        likes += shares
+
+        try:
+            comments = d['comment_plugin_count'] + d['comment_count']
+        except KeyError:
+            comments = 0
     else:
         print "Get_fb_stats got too little data for ",  url_string
     return shares, comments, likes
... ...
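The three try/except KeyError blocks above could be collapsed with dict.get defaults; a sketch of equivalent parsing for the same v2.11 engagement payload (one subtle difference: a missing key becomes 0 instead of zeroing the whole comments sum):

        d = json.loads(data)['engagement']
        shares = d.get('share_count', 0)
        likes = d.get('reaction_count', 0) + shares  # combined, as the TODO describes
        comments = d.get('comment_count', 0) + d.get('comment_plugin_count', 0)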
@@ -452,7 +491,7 @@ def Save_image(url_string, file_path):
         f = urllib2.urlopen(url_string)
         data = f.read()
         f.close()
-    except (urllib2.URLError, httplib.BadStatusLine), e:
+    except (urllib2.URLError, httplib.BadStatusLine) as e:
         if hasattr(e, 'reason'): # URLError
             print "Save_image: Error attempting to create", file_path[file_path.rfind('/')+1:], "Reason:", e.reason
         elif hasattr(e, 'code'): # URLError
... ...
@@ -476,7 +515,7 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats):

     files_to_delete = glob.glob(os.path.join(cache_path, '*.png'))

-    f = file(new_index_fullpath, 'w')
+    with codecs.open(new_index_fullpath, 'w', 'utf-8') as f:
         f.write(html_head % (even_background, odd_background))


... ...
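Replacing file() with codecs.open addresses two things at once: file() doesn't exist in Python 3, and the codecs handle encodes unicode to utf-8 on write while acting as a context manager (which is why a later hunk drops f.close()). The pattern in miniature:

    import codecs

    with codecs.open('index.html', 'w', 'utf-8') as f:
        f.write(u"<div align='center'><h3>TechCrunch Feed Filter</h3></div>\n")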
@@ -521,7 +560,7 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats):
                      )
                    )
         f.write(html_footer)
-    f.close()
+
     if os.path.exists(index_fullpath):
         os.unlink(index_fullpath)
     shutil.move(new_index_fullpath, index_fullpath)
... ...
@@ -530,8 +569,8 @@ def Make_index_html(yaml_items, weekend_stats, weekday_stats):


 def Make_feed_file(yaml_items):
-    with open(os.path.join(localdir, 'rss_feed.xml'), 'wb') as f:
-        f.write("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
+    with codecs.open(os.path.join(localdir, 'rss_feed.xml'), 'wb', 'utf-8') as f:
+        f.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>")
         f.write("<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>\n" % (time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
         count = 0
         for item in yaml_items:
... ...
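A side note on the unchanged pubDate line: RSS 2.0 expects an RFC 822 date, and pairing time.gmtime() with the hard-coded '+0000' offset is self-consistent. A quick check, using a timestamp that corresponds to this commit's date read as UTC:

    >>> import time
    >>> time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(1516480725))
    'Sat, 20 Jan 2018 20:38:45 +0000'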
@@ -580,6 +619,9 @@ if __name__=='__main__':
         if os.path.exists(yaml_fullpath):
             with open(yaml_fullpath, 'rb') as f:
                 items = yaml.load(f)
+                if items is None:
+                    print yaml_fullpath, "exists, but was empty."
+                    items = []

                 # Do any dictionary item updating that might be necessary
 #                for item in items:
... ...
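The new guard matters because yaml.load returns None for an empty document rather than raising, which would otherwise crash the later iteration over items. Illustrated with PyYAML:

    >>> import yaml
    >>> yaml.load('') is None
    True
    >>> yaml.load('- apple\n- google')
    ['apple', 'google']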
@@ -589,6 +631,9 @@ if __name__=='__main__':
             print "could not open", yaml_fullpath
             items = []

+        with open('facebook-token.txt', 'r') as f:
+            facebook_token = f.read()
+
         progress_text = ["read techcrunch.yaml"]
         process_feed(items)

... ...
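One caution about the new token read: f.read() preserves any trailing newline, which would end up embedded in the access_token query parameter that Get_fb_stats builds. If the file holds just the token, a strip() is safer (a suggested tweak, not in the commit):

        with open('facebook-token.txt', 'r') as f:
            facebook_token = f.read().strip()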
@@ -649,7 +694,7 @@ if __name__=='__main__':
         else:
             print "No entries were added this time."

-    except Exception, e:
+    except Exception as e:
         exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
         print exceptional_text, ' '.join(progress_text)
         traceback.print_exc(file=sys.stdout)
... ...
@@ -657,7 +702,7 @@ if __name__=='__main__':
             sendEmail('Exception thrown in techcrunch.py',
                       exceptional_text + "\n" + traceback.format_exc(),
                       ('david.blume@gmail.com',))
-        except Exception, e:
+        except Exception as e:
             print "Could not send email to notify you of the exception. :("

     message = sys.stdout.getvalue()
... ...
@@ -672,7 +717,7 @@ if __name__=='__main__':
             lines = f.readlines()
     else:
         lines = []
-    lines = lines[:168] # Just keep the past week's worth
+    lines = lines[:672] # Just keep the past week's worth
     # status = len(message.strip()) and message.strip().replace('\n', ' - ') or "OK"
     status = len(message.strip()) and '\n                       '.join( message.splitlines()) or "OK"
     lines.insert(0, "%s %3.0fs %s\n" % (time.strftime('%Y-%m-%d, %H:%M', time.localtime()), time.time() - start_time, status))
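On the 168 to 672 change: 24 × 7 = 168 caps the log at a week of hourly entries, while 4 × 24 × 7 = 672 caps it at a week of quarter-hourly entries. The unchanged "past week's worth" comment therefore only holds if the script now runs every 15 minutes; the cron cadence isn't part of this diff, so that's an inference.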