Adding file analysis.py (89fce46) - techcrunch.git

analysis.py

...	...	@@ -0,0 +1,186 @@
	1	+#!/usr/bin/env python
	2	+
	3	+import yaml
	4	+import sys
	5	+import os
	6	+import time
	7	+import traceback
	8	+import exceptions
	9	+import math
	10	+import bisect
	11	+
# Module-level debug switch.  Nothing in the visible part of this file reads
# it -- NOTE(review): confirm whether other code depends on it.
debug = True
	13	+
	14	+
	15	+def get_standard_deviation(l):
	16	+ """ returns the standard deviation of the iterable l """
	17	+ mean = sum(l) / len(l)
	18	+ squares_of_diffs = map(lambda x: pow(x - mean, 2), l)
	19	+ mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
	20	+ return math.sqrt(mean_of_squares)
	21	+
	22	+
	23	+def unique(seq, idfun=None):
	24	+ if idfun is None:
	25	+ def idfun(x): return x
	26	+ seen = {}
	27	+ result = []
	28	+ for item in seq:
	29	+ marker = idfun(item)
	30	+ if marker in seen: continue
	31	+ seen[marker] = 1
	32	+ result.append(item)
	33	+ return result
	34	+
	35	+
	36	+def Process_comments_for_feed(yaml_items):
	37	+ time_blocks = [[], [], [], [], [], [], [], []]
	38	+ for i in yaml_items:
	39	+ time_posted = i['orig_posted']
	40	+ comment_times = i['comment_times']
	41	+ comments = i['comments']
	42	+ comment_times_indices = [(t - time_posted) / 1800 for t in comment_times]
	43	+ for j in range(len(comments)):
	44	+ if comment_times_indices[j] > 7 or comment_times_indices[j] < 0:
	45	+ continue
	46	+ time_blocks[comment_times_indices[j]].append(comments[j])
	47	+
	48	+ stats = []
	49	+ for time_block in time_blocks:
	50	+ mean = sum(time_block) / len(time_block)
	51	+ squares_of_diffs = map(lambda x: pow(x - mean, 2), time_block)
	52	+ mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
	53	+ std_dev = math.sqrt(mean_of_squares)
	54	+ stats.append((mean, std_dev))
	55	+ return stats
	56	+
	57	+
	58	+def Remove_outliers(time_blocks):
	59	+ """remove 6% of the values as outliers (3% from each side)."""
	60	+ for block in time_blocks:
	61	+ pairs_to_remove = 0
	62	+ if len(block) > 66:
	63	+ pairs_to_remove = int(len(block) * 0.03)
	64	+ elif len(block) > 19:
	65	+ pairs_to_remove = 1
	66	+
	67	+ while pairs_to_remove > 0:
	68	+ block.pop()
	69	+ block.pop(0)
	70	+ pairs_to_remove -= 1
	71	+
	72	+
	73	+def Calculate_median_mean_stddev(time_blocks):
	74	+ stats = []
	75	+ for block in time_blocks:
	76	+ # Calculate the median
	77	+ count = len(block)
	78	+ median = 0.0
	79	+ if count % 2:
	80	+ median = float(block[count/2])
	81	+ elif count > 0:
	82	+ median = (block[count / 2 - 1] + block[count / 2]) / 2.0
	83	+
	84	+ # Calculate the mean and standard deviation
	85	+ if count > 0:
	86	+ mean = sum(block) / float(len(block))
	87	+ squares_of_diffs = map(lambda x: pow(x - mean, 2), block)
	88	+ mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
	89	+ else:
	90	+ mean = 0
	91	+ mean_of_squares = 0
	92	+ std_dev = math.sqrt(mean_of_squares)
	93	+ stats.append((median, mean, std_dev))
	94	+ return stats
	95	+
	96	+
	97	+def Process_feed(yaml_items, metric, metric_times):
	98	+ weekend_time_blocks = [[], [], [], [], [], [], [], []]
	99	+ weekday_time_blocks = [[], [], [], [], [], [], [], []]
	100	+ for i in yaml_items:
	101	+ time_posted = i['orig_posted']
	102	+ wday = time.localtime(time_posted).tm_wday
	103	+ value_times = i[metric_times]
	104	+ values = i[metric]
	105	+ value_times_indices = [(t - time_posted) / 1800 for t in value_times]
	106	+ for j in range(len(values)):
	107	+ if value_times_indices[j] > 7 or value_times_indices[j] < 0:
	108	+ continue
	109	+ if wday == 5 or wday == 6:
	110	+ bisect.insort(weekend_time_blocks[value_times_indices[j]], values[j])
	111	+ else:
	112	+ bisect.insort(weekday_time_blocks[value_times_indices[j]], values[j])
	113	+
	114	+ Remove_outliers(weekend_time_blocks)
	115	+ Remove_outliers(weekday_time_blocks)
	116	+
	117	+ weekend_stats = Calculate_median_mean_stddev(weekend_time_blocks)
	118	+ weekday_stats = Calculate_median_mean_stddev(weekday_time_blocks)
	119	+
	120	+ return weekend_stats, weekday_stats
	121	+
	122	+
	123	+if __name__=='__main__':
	124	+ start_time = time.time()
	125	+ progress_text = []
	126	+
	127	+ try:
	128	+ localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
	129	+ #
	130	+ # Read in techcrunch.yaml
	131	+ #
	132	+ # [ { 'title' : 'Title Text',
	133	+ # 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
	134	+ # 'author' : u'MG Siegler',
	135	+ # 'orig_posted' : 1282197199
	136	+ # 'tags' : [ u'Google', u'privacy' ]
	137	+ # 'qualified' : -1
	138	+ # 'comment_times' : [ 1282197199, 1282197407 ]
	139	+ # 'comments' : [ 0, 15 ]
	140	+ # 'slash_comment_times' : [ 1282197199, 1282197407 ]
	141	+ # 'slash_comments' : [ 0, 5 ]
	142	+ # 'slash_comment_times' : [ 1282197199, 1282197407 ]
	143	+ # 'slash_comments' : [ 0, 3 ]
	144	+ # 'retweet_times' : [ 1282197199, 1282197407 ]
	145	+ # 'retweets' : [ 0, 43 ]
	146	+ # },
	147	+ # { ... }
	148	+ # ]
	149	+ #
	150	+ yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
	151	+ if os.path.exists(yaml_fullpath):
	152	+ f = file(yaml_fullpath, 'rb')
	153	+ items = yaml.load(f)
	154	+ f.close()
	155	+ else:
	156	+ print "could not open", yaml_fullpath
	157	+ items = []
	158	+
	159	+ weekend_stats, weekday_stats = Process_feed(items, 'fb_shares', 'comment_times')
	160	+
	161	+ # We'll only look at the stats for the time 1:00 to 1:30 after posting.
	162	+ weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
	163	+ weekend_threshold = weekend_median + (weekend_sigma)
	164	+ median, mean, sigma = weekday_stats[2]
	165	+ threshold = median + (sigma)
	166	+ print "Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold)
	167	+ print "Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold)
	168	+ for item in items:
	169	+ if item['qualified'] == -1:
	170	+ print "Processing", item['title'].encode('ascii', 'replace')
	171	+ for i in range(len(item['retweet_times'])):
	172	+ r_time = item['retweet_times'][i]
	173	+ if r_time - item['orig_posted'] < 5400:
	174	+ print "Time %1.1f = %d" % ((r_time - item['orig_posted']) / 1800.0, item['retweets'][i]),
	175	+ if item['retweets'][i] >= threshold:
	176	+ item['qualified'] = i
	177	+ print "NOW QUALIFIES",
	178	+ if r_time - item['orig_posted'] >= 3600:
	179	+ break
	180	+ print
	181	+
	182	+ except Exception as e:
	183	+ exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
	184	+ print exceptional_text, ' '.join(progress_text)
	185	+ traceback.print_exc(file=sys.stdout)
	186	+
0	187