9179b4b6a9f34c3d241400ca691a88c9b4ed30f9
dblume Upgrade to Python3

dblume authored 5 months ago

1) #!/home/dblume/opt/python-3.9.6/bin/python3
David Blume Adding file analysis.py

David Blume authored 6 years ago

2) 
3) import yaml
4) import sys
5) import os
6) import time
7) import traceback
8) import math
9) import bisect
10) 
11) debug = True
12) 
13) 
def get_standard_deviation(l):
    """Return the population standard deviation of the numbers in l.

    l may be any iterable of numbers, including a one-shot generator: it is
    copied into a list first because the values are needed more than once
    (for the mean, for the squared differences and for the count).

    An empty input returns 0.0 instead of raising ZeroDivisionError, which
    matches how calculate_median_mean_stddev reports an empty block.
    """
    values = list(l)
    if not values:
        return 0.0
    mean = sum(values) / len(values)
    # Population variance: the squared differences are divided by N, not N - 1.
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    return math.sqrt(variance)
20) 
21) 
def unique(seq, idfun=None):
    """Return the items of seq in first-seen order with duplicates dropped.

    Two items count as duplicates when idfun maps them to the same marker.
    When idfun is omitted the items themselves are the markers, so they
    must be hashable.
    """
    key_of = idfun if idfun is not None else (lambda x: x)
    seen_markers = set()
    kept = []
    for element in seq:
        marker = key_of(element)
        if marker not in seen_markers:
            seen_markers.add(marker)
            kept.append(element)
    return kept
33) 
34) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def process_comments_for_feed(yaml_items):
    """Return a (mean, std_dev) pair of comment counts for each time block.

    Each item in yaml_items must provide 'orig_posted', 'comment_times' and
    'comments'. Every (time, count) sample is filed under the number of
    whole 1800-second periods between the item's posting time and the
    sample time. Samples that fall before posting or beyond block 7 are
    ignored, so the result always has eight entries.

    The standard deviation is the population form (divided by N). A block
    that received no samples is reported as (0.0, 0.0).
    """
    block_count = 8
    time_blocks = [[] for _ in range(block_count)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        # zip pairs each sample time with its count and stops at the shorter
        # list, so mismatched lengths cannot raise IndexError.
        for sample_time, count in zip(item['comment_times'], item['comments']):
            # Floor division keeps the block number an integer; true
            # division would produce a float, which cannot index a list.
            index = int((sample_time - time_posted) // 1800)
            if 0 <= index < block_count:
                time_blocks[index].append(count)

    stats = []
    for time_block in time_blocks:
        if not time_block:
            # Nothing to average; avoid dividing by zero.
            stats.append((0.0, 0.0))
            continue
        mean = sum(time_block) / len(time_block)
        variance = sum((x - mean) ** 2 for x in time_block) / len(time_block)
        stats.append((mean, math.sqrt(variance)))
    return stats
55) 
56) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def remove_outliers(time_blocks):
    """Trim values from both ends of every block, in place.

    The number of values removed from each end depends on the block size:
      * more than 66 values: int(len(block) * 0.03) from each end
        (about 6% of the block in total);
      * 20 to 66 values: one from each end;
      * fewer than 20 values: none.

    Only the ends of each list are inspected, so the values removed are the
    smallest and largest ones only when the block is sorted; process_feed
    builds its blocks with bisect.insort, which keeps them sorted.

    Returns None; the lists inside time_blocks are modified directly.
    """
    for block in time_blocks:
        size = len(block)
        if size > 66:
            pairs_to_remove = int(size * 0.03)
        elif size > 19:
            pairs_to_remove = 1
        else:
            continue
        # pairs_to_remove is at least 1 here, so the negative slice below can
        # never degenerate into block[-0:], which would empty the list.
        # One slice deletion per end replaces the repeated pop(0) calls,
        # each of which shifted the whole list.
        del block[-pairs_to_remove:]
        del block[:pairs_to_remove]
70) 
71) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def calculate_median_mean_stddev(time_blocks):
    """Return a (median, mean, std_dev) tuple for every block.

    The median is read by position, so each block is expected to be sorted
    already. The standard deviation is the population form (divided by N).
    An empty block is reported as (0.0, 0, 0.0).
    """
    stats = []
    for block in time_blocks:
        count = len(block)
        if count == 0:
            stats.append((0.0, 0, 0.0))
            continue

        # Integer division: list indices must be ints, and true division
        # would yield a float.
        middle = count // 2
        if count % 2:
            median = float(block[middle])
        else:
            # Even count: average the two values around the midpoint.
            median = (block[middle - 1] + block[middle]) / 2.0

        mean = sum(block) / float(count)
        variance = sum((x - mean) ** 2 for x in block) / count
        stats.append((median, mean, math.sqrt(variance)))
    return stats
94) 
95) 
David Blume Better conformance to PEP-8...

David Blume authored 6 years ago

def process_feed(yaml_items, metric, metric_times):
    """Compute per-time-block statistics of a metric, weekends vs. weekdays.

    Parameters:
        yaml_items: iterable of dicts, each providing 'orig_posted' plus the
            two keys named by metric and metric_times.
        metric: key of the list of sampled values.
        metric_times: key of the list of times at which those values were
            sampled.

    Every (time, value) sample is filed under the number of whole
    1800-second periods between the item's posting time and the sample
    time; samples before posting or beyond block 7 are ignored. Items whose
    posting time falls on a Saturday or Sunday in local time go into the
    weekend blocks, all others into the weekday blocks. Each block is kept
    sorted, trimmed with remove_outliers and then summarised.

    Returns:
        (weekend_stats, weekday_stats), each being the list returned by
        calculate_median_mean_stddev for the eight blocks.
    """
    block_count = 8
    weekend_time_blocks = [[] for _ in range(block_count)]
    weekday_time_blocks = [[] for _ in range(block_count)]
    for item in yaml_items:
        time_posted = item['orig_posted']
        # tm_wday counts from Monday = 0, so 5 and 6 are Saturday and Sunday.
        is_weekend = time.localtime(time_posted).tm_wday in (5, 6)
        target_blocks = weekend_time_blocks if is_weekend else weekday_time_blocks
        # zip stops at the shorter list, so a length mismatch between the
        # values and their times cannot raise IndexError.
        for sample_time, value in zip(item[metric_times], item[metric]):
            # Floor division keeps the block number an integer; true
            # division would produce a float, which cannot index a list.
            index = int((sample_time - time_posted) // 1800)
            if 0 <= index < block_count:
                # insort keeps each block sorted, which remove_outliers and
                # the median calculation rely on.
                bisect.insort(target_blocks[index], value)

    remove_outliers(weekend_time_blocks)
    remove_outliers(weekday_time_blocks)

    weekend_stats = calculate_median_mean_stddev(weekend_time_blocks)
    weekday_stats = calculate_median_mean_stddev(weekday_time_blocks)

    return weekend_stats, weekday_stats
120) 
121) 
if __name__ == '__main__':
    start_time = time.time()
    progress_text = []

    try:
        localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199
        #     'tags'                : [ u'Google', u'privacy' ]
        #     'qualified'           : -1
        #     'comment_times'       : [ 1282197199, 1282197407 ]
        #     'comments'            : [ 0, 15 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 5 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 3 ]
        #     'retweet_times'       : [ 1282197199, 1282197407 ]
        #     'retweets'            : [ 0, 43 ]
        #    },
        #    { ... }
        #  ]
        #
        # NOTE(review): the sample above lists the 'slash_comment*' keys
        # twice and has no 'fb_shares' key, although 'fb_shares' is the
        # metric read below -- confirm against the real data file.
        #
        yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
        if os.path.exists(yaml_fullpath):
            # open() replaces the Python 2 only file() builtin, and the
            # with-block closes the handle even when parsing fails.
            with open(yaml_fullpath, 'rb') as f:
                # NOTE(review): yaml.load() without a Loader can construct
                # arbitrary objects and is rejected by PyYAML 6. safe_load
                # only builds plain scalars, lists and dicts; if the data
                # file carries Python-specific tags, switch to an explicit
                # Loader instead. An empty file parses to None, hence "or []".
                items = yaml.safe_load(f) or []
        else:
            print("could not open", yaml_fullpath)
            items = []

        weekend_stats, weekday_stats = process_feed(items, 'fb_shares', 'comment_times')

        # We'll only look at the stats for the time 1:00 to 1:30 after posting.
        weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
        weekend_threshold = weekend_median + (weekend_sigma)
        median, mean, sigma = weekday_stats[2]
        threshold = median + (sigma)
        print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
        print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold))
        for item in items:
            if item['qualified'] == -1:
                # Round-trip through ASCII so unprintable characters become
                # '?' while the title is still printed as text, not as a
                # bytes repr (b'...').
                print("Processing", item['title'].encode('ascii', 'replace').decode('ascii'))
                for i, r_time in enumerate(item['retweet_times']):
                    elapsed = r_time - item['orig_posted']
                    # Only samples from the first 90 minutes are considered.
                    if elapsed < 5400:
                        print("Time %1.1f = %d" % (elapsed / 1800.0, item['retweets'][i]), end=' ')
                        # The weekday threshold is applied to every item.
                        if item['retweets'][i] >= threshold:
                            item['qualified'] = i
                            print("NOW QUALIFIES", end=' ')
                        # Stop after the first sample at or past one hour.
                        if elapsed >= 3600:
                            break
                print()

    except Exception as e:
        exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
        print(exceptional_text, ' '.join(progress_text))