master
David Blume Adding file analysis.py

David Blume authored 6 years ago

1) #!/usr/bin/env python
2) 
3) import yaml
4) import sys
5) import os
6) import time
7) import traceback
8) import exceptions
9) import math
10) import bisect
11) 
12) debug = True
13) 
14) 
def get_standard_deviation(l):
    """Return the population standard deviation of the numbers in l.

    l may be any iterable of numbers.  It is copied into a list first so
    that one-shot iterables (generators, iterators) can be measured and
    walked twice.

    Raises ZeroDivisionError if l is empty.
    """
    values = list(l)
    # Divide by a float so integer input does not truncate the mean
    # under Python 2's integer division.
    count = float(len(values))
    mean = sum(values) / count
    mean_of_squares = sum([(x - mean) ** 2 for x in values]) / count
    return math.sqrt(mean_of_squares)
21) 
22) 
def unique(seq, idfun=None):
    """Return the items of seq as a list, keeping only first occurrences.

    Two items are treated as duplicates when idfun maps them to the same
    marker.  When idfun is None the item itself is used as the marker.
    Markers must be hashable.  The original order of seq is preserved.
    """
    key_of = idfun if idfun is not None else (lambda x: x)
    seen_markers = set()
    kept = []
    for element in seq:
        tag = key_of(element)
        if tag not in seen_markers:
            seen_markers.add(tag)
            kept.append(element)
    return kept
34) 
35) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def process_comments_for_feed(yaml_items):
    """Return (mean, std_dev) of the comment values in each time block.

    Each item in yaml_items is a mapping with the keys 'orig_posted',
    'comment_times' and 'comments'.  Every entry of 'comments' is paired
    with the entry of 'comment_times' at the same position and filed into
    one of eight blocks according to how many whole 1800-unit intervals
    separate that time from 'orig_posted' (presumably half hours, if the
    timestamps are in seconds -- confirm against the data producer).
    Samples that fall before the post time or beyond the eighth block are
    ignored.  If the two lists differ in length, the unpaired tail is
    ignored.

    Returns a list of eight (mean, std_dev) tuples, one per block.  A
    block that received no samples yields (0.0, 0.0) instead of raising
    ZeroDivisionError.
    """
    block_count = 8
    block_length = 1800
    time_blocks = [[] for _ in range(block_count)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        for t, comment in zip(item['comment_times'], item['comments']):
            # Floor-divide and coerce to int so the result is always a
            # valid list index, whatever the numeric type of the times.
            index = int((t - time_posted) // block_length)
            if 0 <= index < block_count:
                time_blocks[index].append(comment)

    stats = []
    for time_block in time_blocks:
        if not time_block:
            stats.append((0.0, 0.0))
            continue
        # Float divisor: avoids truncating the mean for integer samples.
        count = float(len(time_block))
        mean = sum(time_block) / count
        mean_of_squares = sum([(x - mean) ** 2 for x in time_block]) / count
        stats.append((mean, math.sqrt(mean_of_squares)))
    return stats
56) 
57) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def remove_outliers(time_blocks):
    """Trim values from both ends of each block, in place.

    Roughly 6% of the values are removed as outliers (3% from each end):

    * more than 66 values: int(len(block) * 0.03) values from each end
    * 20 to 66 values:     one value from each end
    * 19 values or fewer:  the block is left untouched

    Values are removed by position, so each block is expected to be
    sorted already for the removed values to be the extremes.

    time_blocks is an iterable of lists; the lists are modified in place
    and nothing is returned.
    """
    for block in time_blocks:
        size = len(block)
        if size > 66:
            pairs_to_remove = int(size * 0.03)
        elif size > 19:
            pairs_to_remove = 1
        else:
            # Too few samples to trim.  Skipping here also guarantees the
            # slices below never see 0 (block[-0:] would be the whole list).
            continue
        # Two slice deletions instead of repeated pop()/pop(0): the same
        # result without shifting the whole list once per removed value.
        del block[-pairs_to_remove:]
        del block[:pairs_to_remove]
71) 
72) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def calculate_median_mean_stddev(time_blocks):
    """Return a (median, mean, std_dev) tuple for every block.

    time_blocks is an iterable of sequences of numbers.  The median is
    read by position (the middle element, or the average of the two
    middle elements), so each block must already be sorted for it to be
    meaningful.  The standard deviation is the population form (divides
    by the number of values).

    An empty block yields (0.0, 0, 0.0).
    """
    stats = []
    for block in time_blocks:
        count = len(block)
        if count == 0:
            stats.append((0.0, 0, 0.0))
            continue

        # Floor division keeps the index an int under true division too.
        middle = count // 2
        if count % 2:
            median = float(block[middle])
        else:
            median = (block[middle - 1] + block[middle]) / 2.0

        mean = sum(block) / float(count)
        mean_of_squares = sum([(x - mean) ** 2 for x in block]) / count
        stats.append((median, mean, math.sqrt(mean_of_squares)))
    return stats
95) 
96) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

97) def process_feed(yaml_items, metric, metric_times):
David Blume Adding file analysis.py

David Blume authored 6 years ago

98)     weekend_time_blocks = [[], [], [], [], [], [], [], []]
99)     weekday_time_blocks = [[], [], [], [], [], [], [], []]
100)     for i in yaml_items:
101)         time_posted = i['orig_posted']
102)         wday = time.localtime(time_posted).tm_wday
103)         value_times = i[metric_times]
104)         values = i[metric]
105)         value_times_indices = [(t - time_posted) / 1800 for t in value_times]
106)         for j in range(len(values)):
107)             if value_times_indices[j] > 7 or value_times_indices[j] < 0:
108)                 continue
109)             if wday == 5 or wday == 6:
110)                 bisect.insort(weekend_time_blocks[value_times_indices[j]], values[j])
111)             else:
112)                 bisect.insort(weekday_time_blocks[value_times_indices[j]], values[j])
113) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

114)     remove_outliers(weekend_time_blocks)
115)     remove_outliers(weekday_time_blocks)
David Blume Adding file analysis.py

David Blume authored 6 years ago

116) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

117)     weekend_stats = calculate_median_mean_stddev(weekend_time_blocks)
118)     weekday_stats = calculate_median_mean_stddev(weekday_time_blocks)