analysis.py
 89fce464 ``` #!/usr/bin/env python import yaml import sys import os import time import traceback import exceptions import math import bisect debug = True def get_standard_deviation(l): """ returns the standard deviation of the iterable l """ mean = sum(l) / len(l) squares_of_diffs = map(lambda x: pow(x - mean, 2), l) mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs) return math.sqrt(mean_of_squares) def unique(seq, idfun=None): if idfun is None: def idfun(x): return x seen = {} result = [] for item in seq: marker = idfun(item) if marker in seen: continue seen[marker] = 1 result.append(item) return result ``` ea0e1e38 ``` def process_comments_for_feed(yaml_items): ``` 89fce464 ``` time_blocks = [[], [], [], [], [], [], [], []] for i in yaml_items: time_posted = i['orig_posted'] comment_times = i['comment_times'] comments = i['comments'] comment_times_indices = [(t - time_posted) / 1800 for t in comment_times] for j in range(len(comments)): if comment_times_indices[j] > 7 or comment_times_indices[j] < 0: continue time_blocks[comment_times_indices[j]].append(comments[j]) stats = [] for time_block in time_blocks: mean = sum(time_block) / len(time_block) squares_of_diffs = map(lambda x: pow(x - mean, 2), time_block) mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs) std_dev = math.sqrt(mean_of_squares) stats.append((mean, std_dev)) return stats ``` ea0e1e38 ``` def remove_outliers(time_blocks): ``` 89fce464 ``` """remove 6% of the values as outliers (3% from each side).""" for block in time_blocks: pairs_to_remove = 0 if len(block) > 66: pairs_to_remove = int(len(block) * 0.03) elif len(block) > 19: pairs_to_remove = 1 while pairs_to_remove > 0: block.pop() block.pop(0) pairs_to_remove -= 1 ``` ea0e1e38 ``` def calculate_median_mean_stddev(time_blocks): ``` 89fce464 ``` stats = [] for block in time_blocks: # Calculate the median count = len(block) median = 0.0 if count % 2: median = float(block[count/2]) elif count > 0: median = (block[count / 2 - 1] + block[count / 2]) / 2.0 # Calculate the mean and standard deviation if count > 0: mean = sum(block) / float(len(block)) squares_of_diffs = map(lambda x: pow(x - mean, 2), block) mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs) else: mean = 0 mean_of_squares = 0 std_dev = math.sqrt(mean_of_squares) stats.append((median, mean, std_dev)) return stats ``` ea0e1e38 ``` def process_feed(yaml_items, metric, metric_times): ``` 89fce464 ``` weekend_time_blocks = [[], [], [], [], [], [], [], []] weekday_time_blocks = [[], [], [], [], [], [], [], []] for i in yaml_items: time_posted = i['orig_posted'] wday = time.localtime(time_posted).tm_wday value_times = i[metric_times] values = i[metric] value_times_indices = [(t - time_posted) / 1800 for t in value_times] for j in range(len(values)): if value_times_indices[j] > 7 or value_times_indices[j] < 0: continue if wday == 5 or wday == 6: bisect.insort(weekend_time_blocks[value_times_indices[j]], values[j]) else: bisect.insort(weekday_time_blocks[value_times_indices[j]], values[j]) ``` ea0e1e38 ``` remove_outliers(weekend_time_blocks) remove_outliers(weekday_time_blocks) ``` 89fce464 ``` ``` ea0e1e38 ``` weekend_stats = calculate_median_mean_stddev(weekend_time_blocks) weekday_stats = calculate_median_mean_stddev(weekday_time_blocks) ``` 89fce464 ``` return weekend_stats, weekday_stats if __name__=='__main__': start_time = time.time() progress_text = [] try: localdir = os.path.abspath(os.path.dirname(sys.argv)) # # Read in techcrunch.yaml # # [ { 'title' : 'Title Text', # 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/', # 'author' : u'MG Siegler', # 'orig_posted' : 1282197199 # 'tags' : [ u'Google', u'privacy' ] # 'qualified' : -1 # 'comment_times' : [ 1282197199, 1282197407 ] # 'comments' : [ 0, 15 ] # 'slash_comment_times' : [ 1282197199, 1282197407 ] # 'slash_comments' : [ 0, 5 ] # 'slash_comment_times' : [ 1282197199, 1282197407 ] # 'slash_comments' : [ 0, 3 ] # 'retweet_times' : [ 1282197199, 1282197407 ] # 'retweets' : [ 0, 43 ] # }, # { ... } # ] # yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml') if os.path.exists(yaml_fullpath): f = file(yaml_fullpath, 'rb') items = yaml.load(f) f.close() else: print "could not open", yaml_fullpath items = [] weekend_stats, weekday_stats = Process_feed(items, 'fb_shares', 'comment_times') # We'll only look at the stats for the time 1:00 to 1:30 after posting. weekend_median, weekend_mean, weekend_sigma = weekend_stats weekend_threshold = weekend_median + (weekend_sigma) median, mean, sigma = weekday_stats threshold = median + (sigma) print "Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold) print "Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold) for item in items: if item['qualified'] == -1: print "Processing", item['title'].encode('ascii', 'replace') for i in range(len(item['retweet_times'])): r_time = item['retweet_times'][i] if r_time - item['orig_posted'] < 5400: print "Time %1.1f = %d" % ((r_time - item['orig_posted']) / 1800.0, item['retweets'][i]), if item['retweets'][i] >= threshold: item['qualified'] = i print "NOW QUALIFIES", if r_time - item['orig_posted'] >= 3600: break print except Exception as e: exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e) print exceptional_text, ' '.join(progress_text) traceback.print_exc(file=sys.stdout) ```