analysis.py (ea0e1e3) - techcrunch.git

analysis.py

#!/usr/bin/env python

import yaml
import sys
import os
import time
import traceback
import exceptions
import math
import bisect

# Module-level debug switch.
# NOTE(review): nothing in the visible part of this file reads this flag.
debug = True

def get_standard_deviation(l):
    """Return the population standard deviation of the numbers in l.

    l may be any iterable of numbers; it is materialized once so that
    generators work as well as lists.

    The mean is computed in floating point, so integer input is not
    truncated. An empty input returns 0.0, matching how
    calculate_median_mean_stddev treats an empty block.
    """
    values = [float(x) for x in l]
    if not values:
        return 0.0
    mean = sum(values) / len(values)
    # Population (divide by N) variance, not the sample (N - 1) variant.
    mean_of_squares = sum((x - mean) ** 2 for x in values) / len(values)
    return math.sqrt(mean_of_squares)

def unique(seq, idfun=None):
    """Return the items of seq with duplicates dropped, keeping first occurrences.

    Two items count as duplicates when idfun(item) gives equal (hashable)
    markers. When idfun is None the item itself is used as the marker.
    The original order of the surviving items is preserved.
    """
    marker_of = idfun if idfun is not None else (lambda x: x)
    already_seen = set()
    kept = []
    for item in seq:
        marker = marker_of(item)
        if marker not in already_seen:
            already_seen.add(marker)
            kept.append(item)
    return kept

def process_comments_for_feed(yaml_items):
    """Return per-half-hour comment statistics for a list of feed items.

    Each item is a dict providing 'orig_posted' (posting timestamp),
    'comment_times' (sample timestamps) and 'comments' (the comment count
    recorded at each sample time). Every sample is assigned to one of eight
    1800-second blocks counted from the posting time; samples taken before
    posting or four hours or more after it are ignored.

    Returns a list of eight (mean, std_dev) tuples, one per block, using the
    population standard deviation. A block with no samples yields
    (0.0, 0.0) instead of dividing by zero.

    If 'comments' and 'comment_times' differ in length, only the pairs
    present in both are used.
    """
    block_seconds = 1800
    block_count = 8
    time_blocks = [[] for _ in range(block_count)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        for sample_time, comment_count in zip(item['comment_times'],
                                              item['comments']):
            # Floor division keeps the index an integer regardless of the
            # interpreter's division rules and sends pre-posting samples to
            # a negative index, which is then rejected below.
            index = int((sample_time - time_posted) // block_seconds)
            if 0 <= index < block_count:
                time_blocks[index].append(comment_count)

    stats = []
    for time_block in time_blocks:
        if time_block:
            mean = sum(time_block) / float(len(time_block))
            mean_of_squares = (sum((x - mean) ** 2 for x in time_block)
                               / len(time_block))
        else:
            mean = 0.0
            mean_of_squares = 0.0
        stats.append((mean, math.sqrt(mean_of_squares)))
    return stats

def remove_outliers(time_blocks):
    """Trim the extreme values from both ends of every block, in place.

    time_blocks is a list of lists. The same number of values is removed
    from the front and from the back of each list:

    * more than 66 values: int(len * 0.03) from each end (about 6% total),
    * 20 to 66 values: one from each end,
    * fewer than 20 values: nothing is removed.

    NOTE: trimming the ends only removes the smallest and largest values if
    each block is already sorted ascending; process_feed builds its blocks
    with bisect.insort, which keeps them sorted.

    Returns None; the lists inside time_blocks are modified.
    """
    for block in time_blocks:
        size = len(block)
        if size > 66:
            pairs_to_remove = int(size * 0.03)
        elif size > 19:
            pairs_to_remove = 1
        else:
            pairs_to_remove = 0

        # Guard is required: block[-0:] would address the whole list.
        if pairs_to_remove > 0:
            del block[-pairs_to_remove:]
            del block[:pairs_to_remove]

def calculate_median_mean_stddev(time_blocks):
    """Return a (median, mean, std_dev) tuple for every block in time_blocks.

    time_blocks is a list of numeric lists. The median is read from the
    middle position(s) of each list, so every block must already be sorted
    for the median to be meaningful. The standard deviation is the
    population (divide by N) form.

    An empty block yields (0.0, 0.0, 0.0).
    """
    stats = []
    for block in time_blocks:
        count = len(block)
        if count == 0:
            stats.append((0.0, 0.0, 0.0))
            continue

        # Floor division keeps the index an integer under both Python 2
        # and Python 3 division rules.
        middle = count // 2
        if count % 2:
            median = float(block[middle])
        else:
            median = (block[middle - 1] + block[middle]) / 2.0

        mean = sum(block) / float(count)
        mean_of_squares = sum((x - mean) ** 2 for x in block) / count
        stats.append((median, mean, math.sqrt(mean_of_squares)))
    return stats

def process_feed(yaml_items, metric, metric_times):
    """Compute per-half-hour statistics of one metric, split weekend/weekday.

    yaml_items   -- list of item dicts; each must provide 'orig_posted' plus
                    the two keys named by metric and metric_times.
    metric       -- key of the list of sampled values (e.g. 'retweets').
    metric_times -- key of the list of timestamps at which those values
                    were sampled.

    Every sample is assigned to one of eight 1800-second blocks counted
    from the item's posting time; samples taken before posting or four
    hours or more after it are ignored. Items posted on a Saturday or
    Sunday (tm_wday 5 or 6 in the machine's local time zone) feed the
    weekend blocks, all others feed the weekday blocks.

    Values are inserted with bisect.insort so each block stays sorted,
    which remove_outliers and the median calculation both rely on.

    If the value and timestamp lists differ in length, only the pairs
    present in both are used.

    Returns (weekend_stats, weekday_stats), each a list of eight
    (median, mean, std_dev) tuples from calculate_median_mean_stddev.
    """
    block_seconds = 1800
    block_count = 8
    weekend_time_blocks = [[] for _ in range(block_count)]
    weekday_time_blocks = [[] for _ in range(block_count)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        wday = time.localtime(time_posted).tm_wday
        # The weekday depends only on the item, so pick the target once.
        if wday in (5, 6):
            target_blocks = weekend_time_blocks
        else:
            target_blocks = weekday_time_blocks

        for sample_time, value in zip(item[metric_times], item[metric]):
            # Floor division keeps the index an integer regardless of the
            # interpreter's division rules.
            index = int((sample_time - time_posted) // block_seconds)
            if 0 <= index < block_count:
                bisect.insort(target_blocks[index], value)

    remove_outliers(weekend_time_blocks)
    remove_outliers(weekday_time_blocks)

    weekend_stats = calculate_median_mean_stddev(weekend_time_blocks)
    weekday_stats = calculate_median_mean_stddev(weekday_time_blocks)

    return weekend_stats, weekday_stats

if __name__ == '__main__':
    # Script entry point: load techcrunch.yaml from the script's directory,
    # derive the weekend/weekday thresholds for the 1:00-1:30 block, and
    # report which not-yet-qualified items reach the threshold.
    start_time = time.time()
    progress_text = []

    try:
        localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199
        #     'tags'                : [ u'Google', u'privacy' ]
        #     'qualified'           : -1
        #     'comment_times'       : [ 1282197199, 1282197407 ]
        #     'comments'            : [ 0, 15 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 5 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 3 ]
        #     'retweet_times'       : [ 1282197199, 1282197407 ]
        #     'retweets'            : [ 0, 43 ]
        #    },
        #    { ... }
        #  ]
        #
        # NOTE(review): the 'slash_comment*' pair is listed twice above, and
        # the 'fb_shares' key used below is not listed at all -- confirm the
        # real schema.
        #
        yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
        if os.path.exists(yaml_fullpath):
            # NOTE(review): yaml.load can construct arbitrary Python objects.
            # That is acceptable only because this is a local, trusted file;
            # consider yaml.safe_load if the data contains no Python-specific
            # tags.
            with open(yaml_fullpath, 'rb') as f:
                # An empty document loads as None; treat it as "no items".
                items = yaml.load(f) or []
        else:
            print("could not open %s" % yaml_fullpath)
            items = []

        # NOTE(review): the statistics are computed from 'fb_shares' sampled
        # at 'comment_times', but the threshold is compared against
        # 'retweets' below -- confirm this pairing is intended.
        weekend_stats, weekday_stats = process_feed(items, 'fb_shares', 'comment_times')

        # We'll only look at the stats for the time 1:00 to 1:30 after posting.
        weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
        weekend_threshold = weekend_median + (weekend_sigma)
        median, mean, sigma = weekday_stats[2]
        threshold = median + (sigma)
        print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f"
              % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
        print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f"
              % (median, mean, sigma, threshold))

        # NOTE(review): only the weekday threshold is applied below;
        # weekend_threshold is computed and printed but never used to
        # qualify an item.
        for item in items:
            if item['qualified'] == -1:
                title = item['title'].encode('ascii', 'replace').decode('ascii')
                print("Processing %s" % title)
                # Collect the per-sample report and emit it as one
                # space-separated line.
                line_parts = []
                for i, r_time in enumerate(item['retweet_times']):
                    elapsed = r_time - item['orig_posted']
                    if elapsed < 5400:
                        line_parts.append("Time %1.1f = %d"
                                          % (elapsed / 1800.0, item['retweets'][i]))
                        if item['retweets'][i] >= threshold:
                            item['qualified'] = i
                            line_parts.append("NOW QUALIFIES")
                        # Stop after the first sample at or past the one-hour mark.
                        if elapsed >= 3600:
                            break
                print(" ".join(line_parts))

    except Exception as e:
        # Top-level boundary: report the failure and the traceback on stdout.
        exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
        print("%s %s" % (exceptional_text, ' '.join(progress_text)))
        traceback.print_exc(file=sys.stdout)

techcrunch.git

Better conformance to PEP-8. Long ways to go.