analysis.py
89fce464
 #!/usr/bin/env python
 
 import yaml
 import sys
 import os
 import time
 import traceback
 import exceptions
 import math
 import bisect
 
 debug = True
 
 
 def get_standard_deviation(l):
     """Return the population standard deviation of the numbers in l.

     l may be any iterable of numbers; it is materialized once so that
     generators work as well as lists.  The divisor is the number of values
     (population form, not the n-1 sample form).

     An empty input yields 0.0, the same convention
     calculate_median_mean_stddev uses for an empty block, instead of
     raising ZeroDivisionError.
     """
     values = list(l)
     if not values:
         return 0.0
     # Divide by a float so integer inputs are not truncated by Python 2's
     # integer division.
     count = float(len(values))
     mean = sum(values) / count
     mean_of_squares = sum((x - mean) ** 2 for x in values) / count
     return math.sqrt(mean_of_squares)
 
 
 def unique(seq, idfun=None):
     """Return the items of seq in first-seen order without duplicates.

     Two items are duplicates when idfun maps them to the same marker; when
     idfun is omitted the item itself is the marker.  Markers must be
     hashable.  The first item seen for each marker is the one kept.
     """
     key_of = idfun if idfun is not None else (lambda value: value)
     markers = set()
     kept = []
     for element in seq:
         tag = key_of(element)
         if tag not in markers:
             markers.add(tag)
             kept.append(element)
     return kept
 
 
ea0e1e38
 def process_comments_for_feed(yaml_items):
     """Return (mean, std_dev) of comment counts per half hour after posting.

     yaml_items is an iterable of mappings with the keys 'orig_posted',
     'comment_times' and 'comments'.  Each comment count is assigned to one
     of eight 1800-second blocks according to how long after 'orig_posted'
     its matching entry in 'comment_times' was taken.  Samples taken before
     the post time or beyond the eighth block are ignored.

     Returns a list of eight (mean, std_dev) tuples, one per block, using
     the population standard deviation.  A block that received no samples
     yields (0.0, 0.0) rather than raising ZeroDivisionError.
     """
     time_blocks = [[] for _ in range(8)]
     for item in yaml_items:
         time_posted = item['orig_posted']
         # zip pairs each count with its sample time and stops at the
         # shorter list, so mismatched lengths cannot raise IndexError.
         for sample_time, count in zip(item['comment_times'], item['comments']):
             # Explicit floor division keeps the index an integer for both
             # int and float timestamps.
             index = int((sample_time - time_posted) // 1800)
             if 0 <= index <= 7:
                 time_blocks[index].append(count)

     stats = []
     for time_block in time_blocks:
         if not time_block:
             stats.append((0.0, 0.0))
             continue
         # Float divisor: integer counts must not truncate the mean.
         size = float(len(time_block))
         mean = sum(time_block) / size
         mean_of_squares = sum((x - mean) ** 2 for x in time_block) / size
         stats.append((mean, math.sqrt(mean_of_squares)))
     return stats
 
 
ea0e1e38
 def remove_outliers(time_blocks):
     """Trim the lowest and highest values from each block, in place.

     Every block is treated as already ordered: values are dropped from its
     two ends.  Blocks longer than 66 lose int(3% of their length) values
     from each end, blocks of 20 to 66 values lose one from each end, and
     shorter blocks are left untouched.  Nothing is returned.
     """
     for values in time_blocks:
         size = len(values)
         if size > 66:
             trim = int(size * 0.03)
         elif size > 19:
             trim = 1
         else:
             trim = 0
         # Guard the slices: with trim == 0, values[-0:] would be the whole list.
         if trim > 0:
             del values[-trim:]
             del values[:trim]
 
 
ea0e1e38
 def calculate_median_mean_stddev(time_blocks):
     """Return a (median, mean, std_dev) tuple for every block.

     time_blocks is an iterable of lists of numbers.  The median is taken
     from a sorted copy of each block, so it is correct whether or not the
     caller kept the block ordered; the blocks themselves are not modified.
     The standard deviation is the population form (divisor is the count).

     An empty block yields (0.0, 0.0, 0.0).
     """
     stats = []
     for block in time_blocks:
         ordered = sorted(block)
         count = len(ordered)
         if count == 0:
             stats.append((0.0, 0.0, 0.0))
             continue

         # Explicit floor division: the middle position must be an integer
         # index regardless of the interpreter's "/" semantics.
         middle = count // 2
         if count % 2:
             median = float(ordered[middle])
         else:
             median = (ordered[middle - 1] + ordered[middle]) / 2.0

         mean = sum(ordered) / float(count)
         mean_of_squares = sum((x - mean) ** 2 for x in ordered) / count
         stats.append((median, mean, math.sqrt(mean_of_squares)))
     return stats
 
 
ea0e1e38
 def process_feed(yaml_items, metric, metric_times):
     """Compute per-half-hour statistics of a metric, split weekend/weekday.

     yaml_items is an iterable of mappings.  For each one, item[metric] is a
     list of values and item[metric_times] the list of times at which they
     were sampled; item['orig_posted'] is the posting time.  Every value is
     placed in one of eight 1800-second blocks by how long after posting it
     was sampled (samples before posting or past the eighth block are
     ignored), in the weekend or the weekday set depending on the local-time
     day of week of the posting time.

     Each block is kept sorted, trimmed with remove_outliers() and then
     summarized with calculate_median_mean_stddev().

     Returns (weekend_stats, weekday_stats), each a list of eight
     (median, mean, std_dev) tuples.
     """
     weekend_time_blocks = [[] for _ in range(8)]
     weekday_time_blocks = [[] for _ in range(8)]
     for item in yaml_items:
         time_posted = item['orig_posted']
         # tm_wday counts from Monday == 0, so 5 and 6 are Saturday and
         # Sunday.  localtime() makes the split depend on the machine's
         # time zone.
         is_weekend = time.localtime(time_posted).tm_wday in (5, 6)
         target_blocks = weekend_time_blocks if is_weekend else weekday_time_blocks
         # zip stops at the shorter list, so a value without a matching
         # sample time is skipped instead of raising IndexError.
         for sample_time, value in zip(item[metric_times], item[metric]):
             # Explicit floor division keeps the index usable as a list
             # index for both int and float timestamps.
             index = int((sample_time - time_posted) // 1800)
             if 0 <= index <= 7:
                 # insort keeps each block ordered, which remove_outliers
                 # relies on when it trims both ends.
                 bisect.insort(target_blocks[index], value)

     remove_outliers(weekend_time_blocks)
     remove_outliers(weekday_time_blocks)

     weekend_stats = calculate_median_mean_stddev(weekend_time_blocks)
     weekday_stats = calculate_median_mean_stddev(weekday_time_blocks)

     return weekend_stats, weekday_stats
 
 
 if __name__=='__main__':
     start_time = time.time()
     progress_text = []

     try:
         localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
         #
         # Read in techcrunch.yaml, a list with one mapping per post:
         #
         # [ { 'title'               : 'Title Text',
         #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
         #     'author'              : u'MG Siegler',
         #     'orig_posted'         : 1282197199
         #     'tags'                : [ u'Google', u'privacy' ]
         #     'qualified'           : -1
         #     'comment_times'       : [ 1282197199, 1282197407 ]
         #     'comments'            : [ 0, 15 ]
         #     'slash_comment_times' : [ 1282197199, 1282197407 ]
         #     'slash_comments'      : [ 0, 5 ]
         #     'slash_comment_times' : [ 1282197199, 1282197407 ]
         #     'slash_comments'      : [ 0, 3 ]
         #     'retweet_times'       : [ 1282197199, 1282197407 ]
         #     'retweets'            : [ 0, 43 ]
         #    },
         #    { ... }
         #  ]
         #
         # NOTE(review): the 'slash_*' pair appears twice in this sample and
         # 'fb_shares' (read below) is not listed -- presumably one pair was
         # meant to describe the Facebook share counts; confirm against the
         # real data file.
         #
         yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
         if os.path.exists(yaml_fullpath):
             # NOTE: yaml.load can construct arbitrary Python objects; only
             # run this against a data file you trust.
             # The with-block closes the file even if parsing fails; an
             # empty file parses to None, which is treated as "no items".
             with open(yaml_fullpath, 'rb') as f:
                 items = yaml.load(f) or []
         else:
             print("could not open %s" % yaml_fullpath)
             items = []

         # The function is defined as process_feed; the previous spelling
         # "Process_feed" raised NameError.
         # NOTE(review): the statistics are computed from 'fb_shares' sampled
         # at 'comment_times', but the threshold is compared against
         # 'retweets' below -- confirm which metric is intended.
         weekend_stats, weekday_stats = process_feed(items, 'fb_shares', 'comment_times')

         # We'll only look at the stats for the time 1:00 to 1:30 after posting.
         weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
         weekend_threshold = weekend_median + (weekend_sigma)
         median, mean, sigma = weekday_stats[2]
         threshold = median + (sigma)
         print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
         print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold))
         # NOTE(review): every item is checked against the weekday threshold;
         # weekend_threshold is only printed.  Confirm whether weekend posts
         # should use it.
         for item in items:
             if item['qualified'] != -1:
                 continue
             print("Processing %s" % item['title'].encode('ascii', 'replace'))
             # The per-sample reports for one item share a single output
             # line, separated by spaces.
             report = []
             for i, r_time in enumerate(item['retweet_times']):
                 elapsed = r_time - item['orig_posted']
                 if elapsed < 5400:
                     report.append("Time %1.1f = %d" % (elapsed / 1800.0, item['retweets'][i]))
                     if item['retweets'][i] >= threshold:
                         item['qualified'] = i
                         report.append("NOW QUALIFIES")
                     # Stop after the first sample in the 1:00-1:30 window.
                     if elapsed >= 3600:
                         break
             print(' '.join(report))

     except Exception as e:
         # Top-level boundary: report the failure and the traceback on
         # stdout instead of letting the script die silently.
         exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
         print("%s %s" % (exceptional_text, ' '.join(progress_text)))
         traceback.print_exc(file=sys.stdout)