Adding file analysis.py
David Blume

David Blume committed on 2018-01-20 20:49:06
Showing 1 changed files, with 186 additions and 0 deletions.

... ...
@@ -0,0 +1,186 @@
1
+#!/usr/bin/env python
2
+
3
+import yaml
4
+import sys
5
+import os
6
+import time
7
+import traceback
8
+import exceptions
9
+import math
10
+import bisect
11
+
12
+debug = True
13
+
14
+
15
def get_standard_deviation(l):
    """Return the population standard deviation of the numbers in ``l``.

    ``l`` may be any iterable of numbers, including a generator; it is copied
    into a list first because the values are traversed more than once.

    All arithmetic is done in floating point, so integer input does not have
    its mean or variance truncated by integer division.

    Returns 0.0 for an empty input rather than raising ZeroDivisionError.
    """
    values = list(l)
    if not values:
        return 0.0
    count = float(len(values))
    mean = sum(values) / count
    # Population variance: divide by N, not N - 1.
    mean_of_squares = sum((x - mean) ** 2 for x in values) / count
    return math.sqrt(mean_of_squares)
21
+
22
+
23
def unique(seq, idfun=None):
    """Return the items of ``seq`` with duplicates removed, keeping order.

    The first occurrence of each item is kept.  Two items count as duplicates
    when ``idfun(item)`` yields equal values; when ``idfun`` is None the item
    itself is used as the key.  Keys must be hashable.

    Always returns a new list; ``seq`` is not modified.
    """
    seen = set()
    result = []
    for item in seq:
        marker = item if idfun is None else idfun(item)
        if marker not in seen:
            seen.add(marker)
            result.append(item)
    return result
34
+
35
+
36
def Process_comments_for_feed(yaml_items):
    """Return (mean, std_dev) of comment counts for eight 1800-unit blocks.

    Each element of ``yaml_items`` is a mapping with the keys 'orig_posted',
    'comment_times' and 'comments'.  'comment_times' and 'comments' are read
    as parallel sequences: the n-th count was sampled at the n-th time.
    Presumably the times are epoch seconds, which makes each block half an
    hour -- confirm against the data file.

    A sample lands in block ``(sample_time - orig_posted) // 1800``.  Samples
    taken before the post time, or in block 8 or later, are ignored.

    Returns a list of eight ``(mean, std_dev)`` tuples, one per block, using
    the population standard deviation.  A block that received no samples
    yields ``(0.0, 0.0)`` instead of raising ZeroDivisionError.
    """
    block_width = 1800
    block_count = 8
    time_blocks = [[] for _ in range(block_count)]
    for entry in yaml_items:
        time_posted = entry['orig_posted']
        # zip() pairs each sample time with its count and stops at the
        # shorter of the two sequences instead of raising IndexError.
        for sample_time, comment_count in zip(entry['comment_times'],
                                              entry['comments']):
            # Floor division keeps the index an integer on Python 2 and 3,
            # and for float timestamps as well.
            index = int((sample_time - time_posted) // block_width)
            if 0 <= index < block_count:
                time_blocks[index].append(comment_count)

    stats = []
    for time_block in time_blocks:
        if not time_block:
            stats.append((0.0, 0.0))
            continue
        count = float(len(time_block))
        mean = sum(time_block) / count
        mean_of_squares = sum((x - mean) ** 2 for x in time_block) / count
        stats.append((mean, math.sqrt(mean_of_squares)))
    return stats
56
+
57
+
58
def Remove_outliers(time_blocks):
    """Trim the extreme values from both ends of every block, in place.

    Each block is expected to be sorted already, so the first and last
    entries are its smallest and largest values; the blocks are not sorted
    here.

    The number of values dropped from each end depends on the block size:
      * more than 66 values: ``int(len(block) * 0.03)`` from each end
        (about 6% in total),
      * 20 to 66 values: exactly one from each end,
      * fewer than 20 values: nothing is removed.

    The lists inside ``time_blocks`` are modified directly; returns None.
    """
    for block in time_blocks:
        size = len(block)
        if size > 66:
            trim = int(size * 0.03)
        elif size > 19:
            trim = 1
        else:
            trim = 0

        if trim:
            # One slice deletion per end replaces repeated pop(0) calls,
            # each of which shifts the whole list.
            del block[-trim:]
            del block[:trim]
71
+
72
+
73
def Calculate_median_mean_stddev(time_blocks):
    """Return a (median, mean, std_dev) tuple for every block.

    ``time_blocks`` is a sequence of lists of numbers.  The blocks themselves
    are not modified: each is sorted into a private copy so the median is
    correct even if the caller passes unsorted data.

    The standard deviation is the population standard deviation (divide by
    N).  An empty block yields ``(0.0, 0.0, 0.0)``.
    """
    stats = []
    for block in time_blocks:
        ordered = sorted(block)
        count = len(ordered)
        if count == 0:
            stats.append((0.0, 0.0, 0.0))
            continue

        # Floor division keeps the index an integer on Python 2 and 3.
        middle = count // 2
        if count % 2:
            median = float(ordered[middle])
        else:
            median = (ordered[middle - 1] + ordered[middle]) / 2.0

        mean = sum(ordered) / float(count)
        mean_of_squares = sum((x - mean) ** 2 for x in ordered) / count
        stats.append((median, mean, math.sqrt(mean_of_squares)))
    return stats
95
+
96
+
97
def Process_feed(yaml_items, metric, metric_times):
    """Compute per-block statistics of ``metric``, split weekend / weekday.

    Each element of ``yaml_items`` is a mapping holding 'orig_posted' plus
    two parallel sequences stored under the keys named by ``metric`` (the
    sampled values) and ``metric_times`` (when each value was sampled).

    A sample lands in block ``(sample_time - orig_posted) // 1800``; only
    blocks 0 through 7 are kept.  An item counts as a weekend item when
    ``time.localtime(orig_posted).tm_wday`` is 5 or 6 (Saturday or Sunday),
    so the split depends on the machine's local time zone.

    Values are kept sorted inside each block, so Remove_outliers() can trim
    the extremes before Calculate_median_mean_stddev() summarizes them.

    Returns ``(weekend_stats, weekday_stats)``, each a list of eight
    ``(median, mean, std_dev)`` tuples.
    """
    block_width = 1800
    block_count = 8
    weekend_time_blocks = [[] for _ in range(block_count)]
    weekday_time_blocks = [[] for _ in range(block_count)]
    for entry in yaml_items:
        time_posted = entry['orig_posted']
        # tm_wday counts from Monday == 0, so 5 and 6 are the weekend.
        if time.localtime(time_posted).tm_wday in (5, 6):
            target_blocks = weekend_time_blocks
        else:
            target_blocks = weekday_time_blocks

        # zip() pairs each sample time with its value and stops at the
        # shorter of the two sequences instead of raising IndexError.
        for sample_time, value in zip(entry[metric_times], entry[metric]):
            # Floor division keeps the index an integer on Python 2 and 3.
            index = int((sample_time - time_posted) // block_width)
            if 0 <= index < block_count:
                bisect.insort(target_blocks[index], value)

    Remove_outliers(weekend_time_blocks)
    Remove_outliers(weekday_time_blocks)

    weekend_stats = Calculate_median_mean_stddev(weekend_time_blocks)
    weekday_stats = Calculate_median_mean_stddev(weekday_time_blocks)

    return weekend_stats, weekday_stats
121
+
122
+
123
if __name__=='__main__':
    # Script entry point: load techcrunch.yaml from the script's directory,
    # print weekend/weekday statistics, then report which not-yet-qualified
    # items reach the weekday threshold.  The print calls below take a single
    # pre-formatted string so they emit the same text on Python 2 and 3.
    start_time = time.time()
    progress_text = []

    try:
        localdir = os.path.abspath(os.path.dirname(sys.argv[0]))
        #
        # Read in techcrunch.yaml
        #
        # [ { 'title'               : 'Title Text',
        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        #     'author'              : u'MG Siegler',
        #     'orig_posted'         : 1282197199
        #     'tags'                : [ u'Google', u'privacy' ]
        #     'qualified'           : -1
        #     'comment_times'       : [ 1282197199, 1282197407 ]
        #     'comments'            : [ 0, 15 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 5 ]
        #     'slash_comment_times' : [ 1282197199, 1282197407 ]
        #     'slash_comments'      : [ 0, 3 ]
        #     'retweet_times'       : [ 1282197199, 1282197407 ]
        #     'retweets'            : [ 0, 43 ]
        #    },
        #    { ... }
        #  ]
        #
        # NOTE(review): the 'slash_*' keys are listed twice above and
        # 'fb_shares' (used below) is not listed at all -- confirm the real
        # schema against the data file.
        #
        yaml_fullpath = os.path.join(localdir, 'techcrunch.yaml')
        if os.path.exists(yaml_fullpath):
            # NOTE(review): yaml.load() without an explicit Loader can build
            # arbitrary Python objects; only use it on a trusted file.  It is
            # left as-is because the data may rely on Python-specific tags.
            # The with-block closes the file even if parsing raises.
            with open(yaml_fullpath, 'rb') as f:
                # An empty document loads as None; treat it as "no items".
                items = yaml.load(f) or []
        else:
            print("could not open %s" % yaml_fullpath)
            items = []

        # NOTE(review): 'fb_shares' values are paired with 'comment_times'
        # here, and the resulting threshold is compared against 'retweets'
        # below -- confirm this combination is intended.
        weekend_stats, weekday_stats = Process_feed(items, 'fb_shares', 'comment_times')

        # We'll only look at the stats for the time 1:00 to 1:30 after posting.
        weekend_median, weekend_mean, weekend_sigma = weekend_stats[2]
        weekend_threshold = weekend_median + (weekend_sigma)
        median, mean, sigma = weekday_stats[2]
        threshold = median + (sigma)
        print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
        print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold))
        for item in items:
            if item['qualified'] == -1:
                print("Processing %s" % item['title'].encode('ascii', 'replace'))
                # Fragments are gathered and printed as one space-separated
                # line, matching the old trailing-comma print statements.
                fragments = []
                for i, r_time in enumerate(item['retweet_times']):
                    elapsed = r_time - item['orig_posted']
                    if elapsed < 5400:
                        fragments.append("Time %1.1f = %d" % (elapsed / 1800.0, item['retweets'][i]))
                        # NOTE(review): only the weekday threshold is applied;
                        # weekend_threshold is computed but never used here.
                        if item['retweets'][i] >= threshold:
                            item['qualified'] = i
                            fragments.append("NOW QUALIFIES")
                        # Stop after the first sample in the 1:00-1:30 window.
                        if elapsed >= 3600:
                            break
                print(" ".join(fragments))

    except Exception as e:
        exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
        print("%s %s" % (exceptional_text, ' '.join(progress_text)))
        traceback.print_exc(file=sys.stdout)
186
+
0 187