forked from Charcoal-SE/SmokeDetector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbodyfetcher.py
416 lines (357 loc) · 20.7 KB
/
bodyfetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# coding=utf-8
from spamhandling import handle_spam, check_if_spam
from datahandling import (add_or_update_api_data, clear_api_data, store_bodyfetcher_queue, store_bodyfetcher_max_ids,
add_queue_timing_data)
from chatcommunicate import tell_rooms_with
from globalvars import GlobalVars
from operator import itemgetter
from datetime import datetime
import json
import time
import threading
import requests
import copy
from classes import Post, PostParseError
from helpers import log
from itertools import chain
# noinspection PyClassHasNoInit,PyBroadException
class BodyFetcher:
queue = {}
previous_max_ids = {}
# special_cases are the minimum number of posts, for each of the specified sites, which
# need to be in the queue prior to feching posts.
# The number of questions we fetch each day per site is the total of
# new questions + new answers + edits.
# Stack Overflow is handled specially. It's know that some SO edits/new posts don't
# appear on the WebSocket.
# Queue depths were last comprehensively adjusted on 2015-12-30.
special_cases = {
# 2020-11-02:
# Request numbers pre 2020-11-02 are very low due to a now fixed bug.
#
# pre sum of requests
# questions 2020-11-02 2020-02-19 2020-10-28 to
# per day setting requests 2020-11-02
"stackoverflow.com": 3, # _ 6,816 3 360 4,365
"math.stackexchange.com": 2, # _ 596 1 473 6,346
# "ru.stackoverflow.com": 2, # _ 230 10 13 145
# "askubuntu.com": , # _ 140 1 88 1,199
# "es.stackoverflow.com": 2, # _ 138 5 25 225
# "superuser.com": , # _ 122 1 87 1,038
# "physics.stackexchange.com": , # _ 90 1 76 1,161
# "stats.stackexchange.com": 2, # _ 82 5 16 151
# "pt.stackoverflow.com": 2, # _ 73 10 7 75
# "unix.stackexchange.com": , # _ 72 1 76 772
# "electronics.stackexchange.com": , # _ 69 1 46 723
# "serverfault.com": , # _ 62 1 43 582
# "tex.stackexchange.com": 2, # _ 60 5 8 98
# "blender.stackexchange.com": 2, # _ 59 5 8 85
# "salesforce.stackexchange.com": , # _ 49 1 47 472
# "gis.stackexchange.com": 2, # _ 46 3 15 166
# "mathoverflow.net" (time_sensitive) # _ 37 - 33 511
# "english.stackexchange.com": , # _ 36 1 34 382
# "magento.stackexchange.com": 2, # _ 34 3 5 93
# "ell.stackexchange.com": , # _ 33 1 24 365
# "wordpress.stackexchange.com": , # _ 29 1 30 283
# "apple.stackexchange.com": , # _ 29 1 46 294
# "diy.stackexchange.com": , # _ 26 1 24 306
# "mathematica.stackexchange.com": , # _ 25 1 21 384
# "dba.stackexchange.com": , # _ 23 1 31 343
# "datascience.stackexchange.com": , # _ 21 1 17 220
# "chemistry.stackexchange.com": , # _ 20 1 20 140
# "security.stackexchange.com": , # _ 18 1 15 238
# "codereview.stackexchange.com": , # _ 18 5 2 39
# The only reason this is the cut-off is that it was the last in the existing list
# as of 2020-11-01.
}
time_sensitive = ["security.stackexchange.com", "movies.stackexchange.com",
"mathoverflow.net", "gaming.stackexchange.com", "webmasters.stackexchange.com",
"arduino.stackexchange.com", "workplace.stackexchange.com"]
threshold = 1
last_activity_date = 0
api_data_lock = threading.Lock()
queue_modify_lock = threading.Lock()
max_ids_modify_lock = threading.Lock()
queue_timing_modify_lock = threading.Lock()
def add_to_queue(self, post, should_check_site=False):
try:
d = json.loads(json.loads(post)["data"])
except ValueError:
# post didn't contain a valid JSON object in its ["data"] member
# indicative of a server-side socket reset
return
site_base = d["siteBaseHostAddress"]
post_id = d["id"]
# For the Sandbox questions on MSE, we choose to ignore the entire question and all answers.
ignored_mse_questions = [
3122, # Formatting Sandbox
51812, # The API sandbox
296077, # Sandbox archive
]
if post_id in ignored_mse_questions and site_base == "meta.stackexchange.com":
return # don't check meta sandbox, it's full of weird posts
with self.queue_modify_lock:
if site_base not in self.queue:
self.queue[site_base] = {}
# Something about how the queue is being filled is storing Post IDs in a list.
# So, if we get here we need to make sure that the correct types are paseed.
#
# If the item in self.queue[site_base] is a dict, do nothing.
# If the item in self.queue[site_base] is not a dict but is a list or a tuple, then convert to dict and
# then replace the list or tuple with the dict.
# If the item in self.queue[site_base] is neither a dict or a list, then explode.
if type(self.queue[site_base]) is dict:
pass
elif type(self.queue[site_base]) is not dict and type(self.queue[site_base]) in [list, tuple]:
post_list_dict = {}
for post_list_id in self.queue[site_base]:
post_list_dict[post_list_id] = None
self.queue[site_base] = post_list_dict
else:
raise TypeError("A non-iterable is in the queue item for a given site, this will cause errors!")
# This line only works if we are using a dict in the self.queue[site_base] object, which we should be with
# the previous conversion code.
self.queue[site_base][str(post_id)] = datetime.utcnow()
if GlobalVars.flovis is not None:
GlobalVars.flovis.stage('bodyfetcher/enqueued', site_base, post_id,
{sk: list(sq.keys()) for sk, sq in self.queue.items()})
if should_check_site:
self.make_api_call_for_site(site_base)
else:
self.check_queue()
return
def check_queue(self):
for site, values in self.queue.items():
if site in self.special_cases:
if len(values) >= self.special_cases[site]:
self.make_api_call_for_site(site)
return
if site in self.time_sensitive:
if len(values) >= 1 and datetime.utcnow().hour in range(4, 12):
self.make_api_call_for_site(site)
return
# if we don't have any sites with their queue filled, take the first one without a special case
for site, values in self.queue.items():
if site not in self.special_cases and len(values) >= self.threshold:
self.make_api_call_for_site(site)
return
# We're not making an API request, so explicitly store the queue
with self.queue_modify_lock:
store_bodyfetcher_queue()
def print_queue(self):
return '\n'.join(["{0}: {1}".format(key, str(len(values))) for (key, values) in self.queue.items()])
def make_api_call_for_site(self, site):
if site not in self.queue:
return
with self.queue_modify_lock:
new_posts = self.queue.pop(site)
store_bodyfetcher_queue()
new_post_ids = [int(k) for k in new_posts.keys()]
if GlobalVars.flovis is not None:
for post_id in new_post_ids:
GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
{'site': site, 'posts': list(new_posts.keys())})
# Add queue timing data
with self.queue_timing_modify_lock:
post_add_times = [v for k, v in new_posts.items()]
pop_time = datetime.utcnow()
for add_time in post_add_times:
seconds_in_queue = (pop_time - add_time).total_seconds()
add_queue_timing_data(site, seconds_in_queue)
with self.max_ids_modify_lock:
if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
previous_max_id = self.previous_max_ids[site]
intermediate_posts = range(previous_max_id + 1, max(new_post_ids))
# We don't want to go over the 100-post API cutoff, so take the last
# (100-len(new_post_ids)) from intermediate_posts
intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]
# new_post_ids could contain edited posts, so merge it back in
combined = chain(intermediate_posts, new_post_ids)
# Could be duplicates, so uniquify
posts = list(set(combined))
else:
posts = new_post_ids
try:
if max(new_post_ids) > self.previous_max_ids[site]:
self.previous_max_ids[site] = max(new_post_ids)
store_bodyfetcher_max_ids()
except KeyError:
self.previous_max_ids[site] = max(new_post_ids)
store_bodyfetcher_max_ids()
log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
if len(new_post_ids) > 30:
log('debug', "{} +{} more".format(sorted(new_post_ids)[:30], len(new_post_ids) - 30))
else:
log('debug', sorted(new_post_ids))
if len(new_post_ids) == len(posts):
log('debug', "[ *Identical* ]")
elif len(posts) > 30:
log('debug', "{} +{} more".format(sorted(posts)[:30], len(posts) - 30))
else:
log('debug', sorted(posts))
question_modifier = ""
pagesize_modifier = {}
if site == "stackoverflow.com":
# Not all SO questions are shown in the realtime feed. We now
# fetch all recently modified SO questions to work around that.
if self.last_activity_date != 0:
pagesize = "50"
else:
pagesize = "25"
pagesize_modifier = {
'pagesize': pagesize,
'min': str(self.last_activity_date)
}
else:
question_modifier = "/{0}".format(";".join([str(post) for post in posts]))
url = "https://api.stackexchange.com/2.2/questions{}".format(question_modifier)
params = {
'filter': '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
'key': 'IAkbitmze4B8KpacUfLqkw((',
'site': site
}
params.update(pagesize_modifier)
# wait to make sure API has/updates post data
time.sleep(3)
with GlobalVars.api_request_lock:
# Respect backoff, if we were given one
if GlobalVars.api_backoff_time > time.time():
time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
try:
time_request_made = datetime.utcnow().strftime('%H:%M:%S')
response = requests.get(url, params=params, timeout=20).json()
except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
# Any failure in the request being made (timeout or otherwise) should be added back to
# the queue.
with self.queue_modify_lock:
if site in self.queue:
self.queue[site].update(new_posts)
else:
self.queue[site] = new_posts
return
with self.api_data_lock:
add_or_update_api_data(site)
message_hq = ""
with GlobalVars.apiquota_rw_lock:
if "quota_remaining" in response:
quota_remaining = response["quota_remaining"]
if quota_remaining - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0 \
and quota_remaining > 19980:
tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
"Current quota: {1}.".format(GlobalVars.apiquota,
quota_remaining))
sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1),
reverse=True)
api_quota_used_per_site = ""
for site_name, quota_used in sorted_calls_per_site:
sanatized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
api_quota_used_per_site += sanatized_site_name + ": {0}\n".format(str(quota_used))
api_quota_used_per_site = api_quota_used_per_site.strip()
tell_rooms_with("debug", api_quota_used_per_site)
clear_api_data()
if quota_remaining == 0:
tell_rooms_with("debug", "API reports no quota left! May be a glitch.")
tell_rooms_with("debug", str(response)) # No code format for now?
if GlobalVars.apiquota == -1:
tell_rooms_with("debug", "Restart: API quota is {quota}."
.format(quota=quota_remaining))
GlobalVars.apiquota = quota_remaining
else:
message_hq = "The quota_remaining property was not in the API response."
if "error_message" in response:
message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
if "error_id" in response and response["error_id"] == 502:
if GlobalVars.api_backoff_time < time.time() + 12: # Add a backoff of 10 + 2 seconds as a default
GlobalVars.api_backoff_time = time.time() + 12
message_hq += " Backing off on requests for the next 12 seconds."
message_hq += " Previous URL: `{}`".format(url)
if "backoff" in response:
if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
GlobalVars.api_backoff_time = time.time() + response["backoff"]
if len(message_hq) > 0 and "site is required" not in message_hq:
message_hq = message_hq.strip()
if len(message_hq) > 500:
message_hq = "\n" + message_hq
tell_rooms_with("debug", message_hq)
if "items" not in response:
return
if site == "stackoverflow.com":
items = response["items"]
if len(items) > 0 and "last_activity_date" in items[0]:
self.last_activity_date = items[0]["last_activity_date"]
num_scanned = 0
start_time = time.time()
for post in response["items"]:
pnb = copy.deepcopy(post)
if 'body' in pnb:
pnb['body'] = 'Present, but truncated'
if 'answers' in pnb:
del pnb['answers']
if "title" not in post or "body" not in post:
if GlobalVars.flovis is not None and 'question_id' in post:
GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], pnb)
continue
post['site'] = site
try:
post['edited'] = (post['creation_date'] != post['last_edit_date'])
except KeyError:
post['edited'] = False # last_edit_date not present = not edited
try:
post_ = Post(api_response=post)
except PostParseError as err:
log('error', 'Error {0} when parsing post: {1!r}'.format(err, post_))
if GlobalVars.flovis is not None and 'question_id' in post:
GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], pnb)
continue
num_scanned += 1
is_spam, reason, why = check_if_spam(post_)
if is_spam:
try:
if GlobalVars.flovis is not None and 'question_id' in post:
GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
{'post': pnb, 'check_if_spam': [is_spam, reason, why]})
handle_spam(post=post_,
reasons=reason,
why=why)
except Exception as e:
log('error', "Exception in handle_spam:", e)
elif GlobalVars.flovis is not None and 'question_id' in post:
GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
{'post': pnb, 'check_if_spam': [is_spam, reason, why]})
try:
if "answers" not in post:
pass
else:
for answer in post["answers"]:
anb = copy.deepcopy(answer)
if 'body' in anb:
anb['body'] = 'Present, but truncated'
num_scanned += 1
answer["IsAnswer"] = True # Necesssary for Post object
answer["title"] = "" # Necessary for proper Post object creation
answer["site"] = site # Necessary for proper Post object creation
try:
answer['edited'] = (answer['creation_date'] != answer['last_edit_date'])
except KeyError:
answer['edited'] = False # last_edit_date not present = not edited
answer_ = Post(api_response=answer, parent=post_)
is_spam, reason, why = check_if_spam(answer_)
if is_spam:
try:
if GlobalVars.flovis is not None and 'answer_id' in answer:
GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
{'post': anb, 'check_if_spam': [is_spam, reason, why]})
handle_spam(answer_,
reasons=reason,
why=why)
except Exception as e:
log('error', "Exception in handle_spam:", e)
elif GlobalVars.flovis is not None and 'answer_id' in answer:
GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
{'post': anb, 'check_if_spam': [is_spam, reason, why]})
except Exception as e:
log('error', "Exception handling answers:", e)
end_time = time.time()
scan_time = end_time - start_time
GlobalVars.PostScanStat.add_stat(num_scanned, scan_time)
return