# Don't run this as a script - paste code as needed.
########################################################################################################
########################################################################################################
# Export event data to file
from streamcollect.models import Keyword, Event, GeoPoint, Tweet, User, Hashtag, Url, Mention
filename = 'Event_Summaries.txt'
event = Event.objects.all()[0]
with open(filename, 'a') as out_file:
    out_file.write('####################\n')
    out_file.write('{}\n\n'.format(event.name))
    out_file.write('time_start\t\t{}\n'.format(event.time_start))
    out_file.write('time_end\t\t{}\n'.format(event.time_end))
    out_file.write('kw_stream_start\t\t{}\n'.format(event.kw_stream_start))
    out_file.write('kw_stream_end\t\t{}\n'.format(event.kw_stream_end))
    out_file.write('gps_stream_start\t{}\n'.format(event.gps_stream_start))
    out_file.write('gps_stream_end\t\t{}\n'.format(event.gps_stream_end))
    out_file.write('min created_at\t\t{}\n'.format(min(Tweet.objects.filter(data_source__gte=1).values_list('created_at'))[0]))
    out_file.write('max created_at\t\t{}\n'.format(max(Tweet.objects.filter(data_source__gte=1).values_list('created_at'))[0]))
    for i in [1, 2]:
        out_file.write('\nKeyword Priority = {}\n'.format(i))
        for k in Keyword.objects.filter(priority=i):
            out_file.write(k.keyword + '\n')
    ps = GeoPoint.objects.all()
    out_file.write('\nBounding Box')
    out_file.write('\n({}, {}), ({}, {})'.format(ps[0].latitude, ps[0].longitude, ps[1].latitude, ps[1].longitude))
    out_file.write('\n\nTweet counts:')
    out_file.write('\nTotal:\t\t{}'.format(Tweet.objects.all().count()))
    out_file.write('\nTotal >0:\t{}'.format(Tweet.objects.filter(data_source__gt=0).count()))
    for i in range(-1, 5):
        out_file.write('\n{}:\t\t{}'.format(i, Tweet.objects.filter(data_source=i).count()))
    out_file.write('\n\nUser counts:')
    out_file.write('\nTotal:\t\t{}'.format(User.objects.all().count()))
    out_file.write('\nTotal >0:\t{}'.format(User.objects.filter(user_class__gt=0).count()))
    for i in range(-1, 5):
        out_file.write('\n{}:\t\t{}'.format(i, User.objects.filter(user_class=i).count()))
    out_file.write('\n\nHashtags:\t{}'.format(Hashtag.objects.all().count()))
    out_file.write('\nUrls:\t\t{}'.format(Url.objects.all().count()))
    out_file.write('\nMentions:\t{}'.format(Mention.objects.all().count()))
    out_file.write('\n\n')
########################################################################################################
########################################################################################################
# These snippets are used to track data during capture:
import time
from streamcollect.models import Tweet

c = Tweet.objects.all().count()
fifteen_min = Tweet.objects.all().count()
f_counter = 0
while True:
    if f_counter % 900 == 0:  # Slightly more than 15 min, due to other processes in the loop.
        #print('Saved in last window: {}'.format(Tweet.objects.all().count() - fifteen_min))
        print('{} {} {} {} | +{}'.format(
            Tweet.objects.filter(data_source=1).count(),
            Tweet.objects.filter(data_source=2).count(),
            Tweet.objects.filter(data_source=3).count(),
            Tweet.objects.filter(data_source=4).count(),
            Tweet.objects.all().count() - c))
        c = Tweet.objects.all().count()
        fifteen_min = Tweet.objects.all().count()
    time.sleep(10)
    f_counter += 10

# Check worker queue lengths via Celery's inspect API.
from celery.task.control import inspect
len(inspect(['celery@media_worker']).reserved().get('celery@media_worker'))
len(inspect(['celery@stream_worker']).active().get('celery@stream_worker'))
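
# A small monitoring loop built on the same inspect() calls (a sketch; the
# worker names are the examples above, and inspect() returns None for an
# unreachable worker, so guard against that):
import time
from celery.task.control import inspect

while True:
    for worker in ['celery@media_worker', 'celery@stream_worker']:
        insp = inspect([worker])
        reserved = insp.reserved() or {}  # None if the worker is down.
        active = insp.active() or {}
        print('{}: {} reserved, {} active'.format(
            worker, len(reserved.get(worker, [])), len(active.get(worker, []))))
    time.sleep(60)  # Poll once a minute.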
# These snippets are primarily used to remove unwanted Tweets.
# Original use was to remove tweets containing 'pray' and their associated users/entities.

# Find Tweets with a duplicate tweet_id and delete the newer copy.
from django.db.models import Count
from streamcollect.models import Tweet, User

dupes = Tweet.objects.values('tweet_id').annotate(Count('id')).order_by().filter(id__count__gt=1)
ids = dupes.values('tweet_id')
for row in ids:
    tweets = Tweet.objects.filter(tweet_id=row.get('tweet_id'))
    if tweets.count() == 2:
        # The higher primary key is the more recently saved duplicate.
        delete_id = max(tweets[0].id, tweets[1].id)
        Tweet.objects.filter(id=delete_id).delete()
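
# The loop above only handles exactly two copies. A variant that keeps the
# oldest row (lowest primary key) however many duplicates exist (a sketch):
for row in dupes.values('tweet_id'):
    copies = Tweet.objects.filter(tweet_id=row['tweet_id'])
    keep_id = copies.order_by('id').first().id  # Oldest row (lowest primary key).
    copies.exclude(id=keep_id).delete()         # Delete all newer duplicates.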
#-----------------#
# Remove 'pray' tweets (then check for isolated users/tags etc. as above).
t = Tweet.objects.filter(text__icontains='pray').delete()
#t = Tweet.objects.filter(data_source__gte=1).filter(text__icontains='pray').delete()
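
# It can be worth previewing what the filter matches before deleting (a sketch):
print(Tweet.objects.filter(text__icontains='pray').count())
print(Tweet.objects.filter(text__icontains='pray').values_list('text', flat=True)[:10])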
#-----------------#
# Delete Hashtags that are no longer linked to any Tweets.
# Adjust and repeat for Mention and Url.
# This isn't necessary after deleting duplicates, as the remaining copy keeps the link.
from streamcollect.models import Hashtag

count = 0
items = Hashtag.objects.all()
for i in items:
    if i.tweets.all().count() == 0:
        print('count = 0 for hashtag {}'.format(i.hashtag))
        count += 1
        i.delete()
        # Uncomment to only count hashtags including 'pray':
        #if 'pray' in i.hashtag:
        #    print(i.hashtag)
        #else:
        #    count -= 1
print(count)
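
# The same cleanup as a single query, assuming the relation from Hashtag to
# Tweet is named 'tweets' as used above (a sketch):
orphans = Hashtag.objects.filter(tweets=None)
print(orphans.count())
orphans.delete()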
#-----------------#
# Remove/demote class 2 users who no longer belong to any Tweets.
# Note: 'tweet=None' should be 'author=None' on old versions of the models; likewise for the relo field names.
users = User.objects.filter(user_class=2).filter(tweet=None)
for u in users:
    class2 = False
    for r in u.relo_in.all():  # Incoming relo, previously 'target'.
        if r.source_user.user_class == 2:
            print('user has incoming link: {}'.format(r.source_user.screen_name))
            r.source_user.out_degree -= 1
            r.source_user.save()
            class2 = True
    for r in u.relo_out.all():  # Outgoing relo, previously 'source'.
        if r.target_user.user_class == 2:
            print('user has outgoing link: {}'.format(r.target_user.screen_name))
            r.target_user.in_degree -= 1
            r.target_user.save()
            class2 = True
    if class2 is False:
        print('Deleting user.')
        # Delete all relos, then the now-isolated user.
        u.relo_out.all().delete()
        u.relo_in.all().delete()
        u.delete()
    else:
        print('Demoting user.')
        # Keep only relos to other class 2 users, then demote.
        r_out = u.relo_out.all()
        for r in r_out:
            if r.target_user.user_class < 2:
                r.delete()
        r_in = u.relo_in.all()
        for r in r_in:
            if r.source_user.user_class < 2:
                r.delete()
        u.user_class = 0
        u.save()
#-----------------#
# Remove alter User objects which are no longer linked to anyone.
lone_users = User.objects.filter(user_class=0).filter(relo_out=None, relo_in=None) #previously source and target
lone_users.count()
lone_users.delete()
#-----------------#
# Finding earliest and latest dates of Tweets for each class.
for i in range(4):
    early = None
    late = None
    tweets = Tweet.objects.filter(data_source=i)
    for t in tweets:
        if early is None or t.created_at < early:
            early = t.created_at
        if late is None or t.created_at > late:
            late = t.created_at
    print("Earliest for {}: {}".format(i, early))
    print("Latest for {}: {}".format(i, late))
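
# The same values can be computed in the database with Django's aggregation
# API, avoiding the per-row loop (a sketch):
from django.db.models import Min, Max
for i in range(4):
    bounds = Tweet.objects.filter(data_source=i).aggregate(
        early=Min('created_at'), late=Max('created_at'))
    print("Earliest for {}: {}".format(i, bounds['early']))
    print("Latest for {}: {}".format(i, bounds['late']))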
#-----------------#
# Find users added in the last 12 hours (i.e. added by creating relationships).
from datetime import timedelta, datetime, timezone

window = datetime.now(timezone.utc) - timedelta(hours=12)  # TZ-aware UTC timestamp.
new_users = User.objects.filter(added_at__gt=window)
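
# Quick inspection of the result (a sketch; screen_name and added_at are
# fields used elsewhere in this file):
print(new_users.count())
for u in new_users[:10]:
    print(u.screen_name, u.added_at)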
#-----------------#
# Finding users from within a certain bounding box (i.e. to remove a sub-box from the set).
#
# BEFORE USE: Update exclusion_box, and check that the spam-account thresholds match the config.
# Note that the coordinate check inside the loop only tests the south and east edges, so it
# needs customising if the exclusion box is not anchored to the top-left corner of the main box.
geo_users = User.objects.filter(data_source=3)
exclusion_box = [29.1197, -99.66, 30.39, -97.5021]  # [lat_min, lon_min, lat_max, lon_max]
outside = 0
lower = 0
inside = 0
for u in geo_users:
    ts = Tweet.objects.filter(author=u, data_source=3)
    t_outside_box = False
    for t in ts:
        if t.coordinates_lat < exclusion_box[0] or t.coordinates_lon > exclusion_box[3]:
            # South of the box, or east of the box.
            #print("User: {} Tweet outside exclusion_box".format(u.screen_name))
            t_outside_box = True
            outside += 1
            break
        else:
            t.data_source = 0
            t.save()
    if t_outside_box == False:
        # No Tweet outside the box: demote the user to the highest remaining data_source.
        if Tweet.objects.filter(author=u, data_source=2).count() > 0:
            #print("User: {} has lower data_source Tweet".format(u.screen_name))
            lower += 1
            u.data_source = 2
            u.save()
        elif Tweet.objects.filter(author=u, data_source=1).count() > 0:
            #print("User: {} has lower data_source Tweet".format(u.screen_name))
            lower += 1
            u.data_source = 1
            u.save()
        else:
            # Tweets only within the exclusion box: class as data_source = -1
            # and handle in the following block.
            #print("User: {} from within exclusion box".format(u.screen_name))
            u.data_source = -1
            u.save()
            inside += 1
print("Outside box: {}".format(outside))
print("Demoted to lower code: {}".format(lower))
print("Inside box, due for delete/demote check: {}".format(inside))
# Now check whether demoted users are isolated and should be removed.
users = User.objects.filter(data_source=-1)
isolated_count = 0
for u in users:
    linked = False
    for r in u.relo_out.all():
        if r.target_user.user_class == 2:
            linked = True
            break
    for r in u.relo_in.all():
        if r.source_user.user_class == 2:
            linked = True
            break
    if linked == False:
        isolated_count += 1
        u.delete()
    else:
        if (u.followers_count is not None and u.followers_count > 5000) or \
           (u.friends_count is not None and u.friends_count > 5000) or \
           (u.statuses_count is not None and u.statuses_count > 10000):
            # Likely spam account.
            u.user_class = -1
            u.data_source = 0
            u.save()
        else:
            u.user_class = 1
            u.data_source = 0
            u.save()
print("Isolated users deleted: {}".format(isolated_count))

# Remove alter User objects which are no longer linked to anyone.
lone_users = User.objects.filter(user_class__lte=1).filter(relo_out=None, relo_in=None)
lone_users.count()
lone_users.delete()
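
# For reference, a general containment test that works wherever the exclusion
# box sits (a sketch; the loop above only checks the south and east edges
# because its box shares the north and west edges with the main bounding box):
def in_box(lat, lon, box):
    """box = [lat_min, lon_min, lat_max, lon_max], as for exclusion_box above."""
    return box[0] <= lat <= box[2] and box[1] <= lon <= box[3]

#in_box(t.coordinates_lat, t.coordinates_lon, exclusion_box)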
#-----------------#
# Dealing with old databases
#-----------------#
# # Moving the streamed column into the data_source column.
# # Add a temporary data_source column in SQL:
# \d+ streamcollect_tweet  -- View column names
# ALTER TABLE streamcollect_tweet ADD COLUMN data_source2 int;
# UPDATE streamcollect_tweet SET data_source2 = CASE WHEN streamed='f' THEN 0 ELSE 1 END;
# SELECT id, streamed, data_source2 FROM streamcollect_tweet;
#
# # After performing the migrations, copy the saved values across and drop the
# # temporary column:
# UPDATE streamcollect_tweet SET data_source = data_source2;
# ALTER TABLE streamcollect_tweet DROP COLUMN data_source2;
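#
# # Sanity check after the copy (a sketch; every row should now have a
# # data_source value):
# SELECT COUNT(*) FROM streamcollect_tweet WHERE data_source IS NULL;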
# Checking whether a Tweet is within the GPS bounding box and updating data_source
# accordingly (for old databases without the data_source column).
gps = [-99.9590682, 26.5486063, -93.9790001, 30.3893434]  # [lon_min, lat_min, lon_max, lat_max]
ts = Tweet.objects.filter(coordinates_lon__isnull=False)
for t in ts:
    if gps[1] < t.coordinates_lat < gps[3] and gps[0] < t.coordinates_lon < gps[2]:
        t.data_source = 3
        t.save()
        u = t.author
        u.data_source = 3
        u.save()
#-----------------#
# Exporting Tweet dates to file, to view on a graph.
filename = 'irma_date_data_{}.csv'  # Change as needed.
for i in range(4):
    tweets = Tweet.objects.filter(data_source=i)
    with open(filename.format(i), 'w') as datafile:
        for t in tweets:
            datafile.write(str(t.created_at) + '\n')
#-----------------#
# View a histogram of dates (in a fresh venv / Jupyter notebook).
# # Set up the environment:
# python3 -m venv venv
# source venv/bin/activate
# pip3 install pandas matplotlib jupyter
# jupyter notebook
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

filename = 'irma_date_data_{}.csv'  # Change as needed.
data_frames = {}
for i in range(4):  # 4 classes of Tweet data_source.
    try:
        df = pd.read_csv(filename.format(i), header=None)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        continue  # Skip classes with no exported data.
    df[0] = pd.to_datetime(df[0])
    data_frames["data_file_{}".format(i)] = df

# One plot per data_source.
for name in data_frames:
    df = data_frames[name]
    data = df.groupby([df[0].dt.day, df[0].dt.hour]).count()
    data.plot(title='Tweet data_source = {}'.format(name[-1]), legend=False)

# To plot on the same axes (seems to break the x scale, since each series
# carries its own (day, hour) MultiIndex rather than real timestamps):
ax = None
for name in data_frames:
    df = data_frames[name]
    data = df.groupby([df[0].dt.day, df[0].dt.hour]).count()
    ax = data.plot(title='Tweet data_source = {}'.format(name[-1]), legend=False, ax=ax)
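
# A possible fix for the shared x axis (a sketch): bucket on the real
# timestamps with dt.floor('H'), so every series shares one datetime index.
ax = None
for name in data_frames:
    df = data_frames[name]
    counts = df.groupby(df[0].dt.floor('H')).size()  # Tweets per hour.
    ax = counts.plot(label='data_source = {}'.format(name[-1]), legend=True, ax=ax)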