analysis.py (9179b4b) - techcrunch.git

9179b4b6a9f34c3d241400ca691a88c9b4ed30f9

Upgrade to Python3 dblume authored 5 months ago	`1) #!/home/dblume/opt/python-3.9.6/bin/python3`
import bisect
import math
import os
import sys
import time
import traceback

import yaml

debug = True


def get_standard_deviation(l):
    """Return the population standard deviation of the numbers in l.

    The deviation is computed with a divisor of N (not N - 1).

    Args:
        l: Any iterable of numbers.  It is materialised once, so generators
            are accepted as well as sequences.

    Returns:
        The standard deviation as a float, or 0.0 when l is empty (an empty
        sample has no spread, and this avoids a ZeroDivisionError).
    """
    values = list(l)
    if not values:
        return 0.0
    mean = sum(values) / len(values)
    squares_of_diffs = [pow(x - mean, 2) for x in values]
    mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
    return math.sqrt(mean_of_squares)


def unique(seq, idfun=None):
    """Return the items of seq with duplicates dropped, keeping first-seen order.

    Args:
        seq: Iterable of items to filter.
        idfun: Optional callable mapping an item to the hashable marker used
            to decide whether it was already seen.  Defaults to the item
            itself.

    Returns:
        A new list holding the first item seen for each distinct marker.
    """
    if idfun is None:
        def idfun(x): return x
    seen = set()
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result

def process_comments_for_feed(yaml_items):
    """Return (mean, std_dev) of comment counts for each half-hour after posting.

    Every sample in an item's 'comments' list is paired with the timestamp at
    the same position in 'comment_times' and dropped into one of eight
    1800-second buckets, counted from the item's 'orig_posted' time.  Samples
    taken before posting or four hours or more after it are ignored.

    Args:
        yaml_items: Iterable of dicts with the keys 'orig_posted',
            'comment_times' and 'comments'.

    Returns:
        A list of eight (mean, population standard deviation) tuples, one per
        bucket.  A bucket that received no samples yields (0.0, 0.0).
    """
    time_blocks = [[] for _ in range(8)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        comment_times = item['comment_times']
        comments = item['comments']
        for j, comment_count in enumerate(comments):
            # Floor division plus int() keeps the bucket usable as a list
            # index; Python 3's "/" would produce a float here.
            bucket = int((comment_times[j] - time_posted) // 1800)
            if bucket > 7 or bucket < 0:
                continue
            time_blocks[bucket].append(comment_count)

    stats = []
    for time_block in time_blocks:
        if not time_block:
            # No samples landed in this bucket; avoid dividing by zero.
            stats.append((0.0, 0.0))
            continue
        mean = sum(time_block) / len(time_block)
        squares_of_diffs = [pow(x - mean, 2) for x in time_block]
        mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
        std_dev = math.sqrt(mean_of_squares)
        stats.append((mean, std_dev))
    return stats

def remove_outliers(time_blocks):
    """Remove about 6% of each block's values as outliers (3% from each end).

    Each block is trimmed in place by dropping the same number of entries
    from its front and its back:

    * more than 66 entries: int(len * 0.03) entries per end,
    * 20 to 66 entries: one entry per end,
    * fewer than 20 entries: left untouched.

    The entries removed are simply the first and last ones, so this only
    discards the smallest and largest values when each block is already
    sorted (process_feed builds its blocks with bisect.insort).

    Args:
        time_blocks: Iterable of mutable lists; modified in place.

    Returns:
        None.
    """
    for block in time_blocks:
        size = len(block)
        if size > 66:
            pairs_to_remove = int(size * 0.03)
        elif size > 19:
            pairs_to_remove = 1
        else:
            pairs_to_remove = 0

        if pairs_to_remove > 0:
            # Two slice deletions instead of repeated pop()/pop(0): the
            # result is the same without shifting the list once per pair.
            del block[-pairs_to_remove:]
            del block[:pairs_to_remove]

def calculate_median_mean_stddev(time_blocks):
    """Return (median, mean, std_dev) for every block in time_blocks.

    The median is read directly from the middle position(s) of each block, so
    it is only a true median when the block is already sorted.  The standard
    deviation is the population form (divisor N).

    Args:
        time_blocks: Iterable of lists of numbers.

    Returns:
        A list with one (median, mean, std_dev) tuple per block.  An empty
        block yields (0.0, 0, 0.0).
    """
    stats = []
    for block in time_blocks:
        # Calculate the median
        count = len(block)
        middle = count // 2  # floor division: "/" gives a float in Python 3
        median = 0.0
        if count % 2:
            median = float(block[middle])
        elif count > 0:
            median = (block[middle - 1] + block[middle]) / 2.0

        # Calculate the mean and standard deviation
        if count > 0:
            mean = sum(block) / float(count)
            squares_of_diffs = [pow(x - mean, 2) for x in block]
            mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
        else:
            mean = 0
            mean_of_squares = 0
        std_dev = math.sqrt(mean_of_squares)
        stats.append((median, mean, std_dev))
    return stats

def process_feed(yaml_items, metric, metric_times):
    """Compute per-half-hour statistics of one metric, split weekend/weekday.

    Each sample in item[metric] is paired with the timestamp at the same
    position in item[metric_times] and inserted (kept sorted) into one of
    eight 1800-second buckets counted from the item's 'orig_posted' time.
    Items posted on a Saturday or Sunday (local time) go to the weekend
    buckets, all others to the weekday buckets.  Samples taken before posting
    or four hours or more after it are ignored.  Outliers are then trimmed
    with remove_outliers() and the statistics are produced by
    calculate_median_mean_stddev().

    Args:
        yaml_items: Iterable of dicts holding 'orig_posted' plus the two keys
            named by metric and metric_times.
        metric: Key of the list of sampled values.
        metric_times: Key of the list of sample timestamps.

    Returns:
        A (weekend_stats, weekday_stats) tuple; each is a list of eight
        (median, mean, std_dev) tuples.
    """
    weekend_time_blocks = [[] for _ in range(8)]
    weekday_time_blocks = [[] for _ in range(8)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        wday = time.localtime(time_posted).tm_wday
        # tm_wday: Monday is 0, so 5 and 6 are Saturday and Sunday.
        if wday == 5 or wday == 6:
            blocks = weekend_time_blocks
        else:
            blocks = weekday_time_blocks
        value_times = item[metric_times]
        values = item[metric]
        for j, value in enumerate(values):
            # Floor division plus int() keeps the bucket usable as a list
            # index; Python 3's "/" would produce a float here.
            bucket = int((value_times[j] - time_posted) // 1800)
            if bucket > 7 or bucket < 0:
                continue
            bisect.insort(blocks[bucket], value)

    remove_outliers(weekend_time_blocks)
    remove_outliers(weekday_time_blocks)

    weekend_stats = calculate_median_mean_stddev(weekend_time_blocks)
    weekday_stats = calculate_median_mean_stddev(weekday_time_blocks)

    return weekend_stats, weekday_stats


if __name__ == '__main__':
    start_time = time.time()
    progress_text = []

    try:
        localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199
        #     'tags'                : [ u'Google', u'privacy' ]
        #     'qualified'           : -1
        #     'comment_times'       : [ 1282197199, 1282197407 ]
        #     'comments'            : [ 0, 15 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 5 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 3 ]
        #     'retweet_times'       : [ 1282197199, 1282197407 ]
        #     'retweets'            : [ 0, 43 ]
        #    },
        #    { ... }
        #  ]
        #
        yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
        if os.path.exists(yaml_fullpath):
            # file() no longer exists in Python 3; the with-block also closes
            # the handle if parsing raises.
            with open(yaml_fullpath, 'rb') as f:
                # PyYAML >= 6 requires an explicit Loader.  FullLoader is what
                # a bare yaml.load() used on PyYAML 5.x.
                # NOTE(review): if the file only holds plain YAML types,
                # yaml.safe_load() would be the safer choice - confirm.
                items = yaml.load(f, Loader=yaml.FullLoader) or []
        else:
            print("could not open", yaml_fullpath)
            items = []

        # NOTE(review): 'fb_shares' values are bucketed by 'comment_times'
        # and the resulting threshold is later compared against 'retweets';
        # confirm that this pairing is intended.
        weekend_stats, weekday_stats = process_feed(items, 'fb_shares', 'comment_times')

        # We'll only look at the stats for the time 1:00 to 1:30 after posting.
        weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
        weekend_threshold = weekend_median + (weekend_sigma)
        median, mean, sigma = weekday_stats[2]
        threshold = median + (sigma)
        print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
        print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold))
        for item in items:
            if item['qualified'] == -1:
                # Round-trip through ASCII so the title prints as text with
                # '?' placeholders rather than as a b'...' bytes repr.
                print("Processing", item['title'].encode('ascii', 'replace').decode('ascii'))
                for i, r_time in enumerate(item['retweet_times']):
                    elapsed = r_time - item['orig_posted']
                    if elapsed < 5400:
                        print("Time %1.1f = %d" % (elapsed / 1800.0, item['retweets'][i]), end=' ')
                        if item['retweets'][i] >= threshold:
                            item['qualified'] = i
                            print("NOW QUALIFIES", end=' ')
                        # Stop after the first sample taken an hour or more
                        # after posting.
                        if elapsed >= 3600:
                            break
                print()

    except Exception as e:
        exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
        print(exceptional_text, ' '.join(progress_text))