import json
from json.decoder import JSONDecodeError
import os
import glob
import pandas as pd
import numpy as np
from urllib.parse import urlparse
# Feature preprocessing
# Output: 0313 data (per-post JSONs in PrepData0313/ and dataset0313.csv)
base_dir = os.getcwd()
filter_files = glob.glob(base_dir + '/FilteredData/' + '*.json')
out_dir = base_dir + '/PrepData0313/'
original_data = base_dir + '/Master Modeler Competition 2021 - ERASE - FB post collection (1_2017 to current).csv'
list_of_dct = []
list_of_filtered = []
unavailable_json = []
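# Assumption (based on the lookup in the loop below): each file in FilteredData/
# is named '<row label>.json', where the numeric row label indexes the matching
# post in the original CSV. list_of_filtered is initialized here but is not
# populated in this script.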
# Events
# January: Human Trafficking Awareness Month
# Super Bowl: Feb. 5, 2017; Feb. 4, 2018; Feb. 3, 2019; Feb. 2, 2020; Feb. 7, 2021
# Mardi Gras: February 28, 2017; February 13, 2018; March 5, 2019; February 25, 2020; February 16, 2021
# March Madness: second half of March + first week of April; cancelled in 2020
# July 30: World Day Against Trafficking in Persons
original = pd.read_csv(original_data, usecols=['URL', 'date', 'total engagement', 'engagement rate', 'reactions', 'shares', 'comments'])
original = original.rename({'date': 'original_date',
                            "total engagement": "total_engagement",
                            "engagement rate": "engagement_rate"}, axis=1)
class MediaTypeError(Exception):
    pass
class URLMatchError(Exception):
    pass
def np_encoder(obj):
    # json.dump default hook: convert NumPy scalars to native Python types
    if isinstance(obj, np.generic):
        return obj.item()
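# Illustrative note (not executed as part of the pipeline): with this hook,
# json.dumps({'n': np.int64(3)}, default=np_encoder) serializes to '{"n": 3}'
# instead of raising a TypeError on the NumPy scalar.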
for file in filter_files:
    with open(file, 'r') as f:
        filename = os.path.basename(file)
        try:
            dct = json.load(f)
            new_dct = dct  # alias: new_dct and dct refer to the same dictionary
            # check url from json and original data
            url1 = new_dct['url']
            url2 = original.loc[pd.to_numeric(filename[:-5])]['URL']
            # if urlparse(url1).netloc != urlparse(url2).netloc:
            if url1 != url2:
                raise URLMatchError
            else:
                # engagements
                new_dct.update(original.loc[pd.to_numeric(filename[:-5]), original.columns != 'URL'])
                new_dct['date'] = dct['created_time'][0:10]  # keep date as a string because datetime is not JSON-serializable
                new_dct['date_dt'] = pd.to_datetime(new_dct['date'])
                # day of week
                new_dct['day_week'] = new_dct['date_dt'].dayofweek
                new_dct['Mon'] = 0
                new_dct['Tue'] = 0
                new_dct['Wed'] = 0
                new_dct['Thur'] = 0
                new_dct['Fri'] = 0
                new_dct['Sat'] = 0
                if new_dct['day_week'] == 0:
                    new_dct['Mon'] = 1
                elif new_dct['day_week'] == 1:
                    new_dct['Tue'] = 1
                elif new_dct['day_week'] == 2:
                    new_dct['Wed'] = 1
                elif new_dct['day_week'] == 3:
                    new_dct['Thur'] = 1
                elif new_dct['day_week'] == 4:
                    new_dct['Fri'] = 1
                elif new_dct['day_week'] == 5:
                    new_dct['Sat'] = 1
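                # Note: Sunday (day_week == 6) has no dummy column and acts as
                # the implicit reference category for the day-of-week dummies.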
                # event
                new_dct['event'] = 0
                if new_dct['date_dt'].month == 1:
                    new_dct['event'] = 1
                elif new_dct['date_dt'].month == 7 and new_dct['date_dt'].day == 30:
                    new_dct['event'] = 1
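                # Of the events listed at the top of the file, only January
                # (Human Trafficking Awareness Month) and July 30 (World Day
                # Against Trafficking in Persons) set the event flag; the
                # Super Bowl, Mardi Gras, and March Madness dates are not
                # encoded here.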
                # season
                new_dct['season'] = 3
                new_dct['winter'] = 0
                new_dct['spring'] = 0
                new_dct['summer'] = 0
                if new_dct['date_dt'].month == 12 or new_dct['date_dt'].month <= 2:
                    new_dct['winter'] = 1
                    new_dct['season'] = 0
                elif 3 <= new_dct['date_dt'].month <= 5:
                    new_dct['spring'] = 1
                    new_dct['season'] = 1
                elif 6 <= new_dct['date_dt'].month <= 8:
                    new_dct['summer'] = 1
                    new_dct['season'] = 2
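                # Fall (months 9-11) keeps the default season = 3 and has no
                # dummy column, so it is the reference category.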
                # time of day using original_date
                new_dct['hour'] = dct['original_date'][13:-3]
                new_dct['hour'] = pd.to_numeric(new_dct['hour'])
                new_dct['time_day'] = 0
                new_dct['morning'] = 0
                new_dct['afternoon'] = 0
                new_dct['evening'] = 0
                if 6 <= new_dct['hour'] < 12:
                    new_dct['morning'] = 1
                    new_dct['time_day'] = 1
                elif 12 <= new_dct['hour'] < 18:
                    new_dct['afternoon'] = 1
                    new_dct['time_day'] = 2
                elif 18 <= new_dct['hour'] <= 23:
                    new_dct['evening'] = 1
                    new_dct['time_day'] = 3
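                # Assumes 'original_date' (merged in from the CSV) has a
                # fixed-width timestamp so that characters [13:-3] are the
                # hour. Hours 0-5 fall through as night: time_day stays 0 and
                # no dummy column is set.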
                # remove 'date_dt' because a pandas Timestamp is not JSON-serializable
                new_dct.pop('date_dt')
                # emojis, mentions, names
                new_dct['emoji_num'] = 0
                new_dct['mention_num'] = 0
                new_dct['name_num'] = 0
                if 'emojis' in dct.keys() and dct['emojis'] is not None:
                    new_dct['emoji_num'] = len(dct['emojis'])
                if 'mentions' in dct.keys() and dct['mentions'] is not None:
                    new_dct['mention_num'] = len(dct['mentions'])
                if 'names' in dct.keys() and dct['names'] is not None:
                    new_dct['name_num'] = len(dct['names'])
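                # Counts default to 0 when the key is missing or its value is null.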
                # media type
                new_dct['share'] = 0
                new_dct['share_url'] = None
                new_dct['photo'] = 0
                new_dct['photo_url'] = None
                new_dct['video'] = 0
                new_dct['video_url'] = None
                new_dct['media_desc'] = []
                new_dct['thumbnail_url'] = []
                if dct['attachments'] is not None:
                    for media in dct['attachments']:
                        if media['media_type'] in ['avatar', 'profile_media', 'cover_photo']:
                            raise MediaTypeError
                        if media['media_type'] == 'share':
                            new_dct['share'] = 1
                            new_dct['media_desc'] = media['media_description']
                            new_dct['thumbnail_url'] = media['thumbnail_url']
                            if media['media_url'] is not None:
                                new_dct['share_url'] = media['media_url']
                            else:
                                print('Media URL Error: ', filename, new_dct['url'], 'share_url is missing')
                        elif media['media_type'] in ['photo', 'album', 'new_album']:
                            new_dct['photo'] = 1
                            new_dct['media_desc'] = media['media_description']
                            new_dct['thumbnail_url'] = media['thumbnail_url']
                            if media['media_url'] is not None:
                                new_dct['photo_url'] = media['media_url']
                            else:
                                print('Media URL Error: ', filename, new_dct['url'], 'photo_url is missing')
                        elif media['media_type'] in ['video_inline', 'video_direct_response', 'native_templates', 'video', 'map']:
                            new_dct['video'] = 1
                            new_dct['media_desc'] = media['media_description']
                            new_dct['thumbnail_url'] = media['thumbnail_url']
                            if media['media_url'] is not None:
                                new_dct['video_url'] = media['media_url']
                            else:
                                print('Media URL Error: ', filename, new_dct['url'], 'video_url is missing')
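                # Posts whose attachments include an avatar, profile photo, or
                # cover photo are skipped entirely via MediaTypeError. When a
                # post has several attachments, media_desc and thumbnail_url
                # keep the values from the last matching attachment.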
                new_dct['link'] = 0
                if 'urls' in dct.keys() and dct['urls'] is not None:
                    new_dct['link'] = len(dct['urls'])
                with open(out_dir + filename, 'w') as out_file:
                    json.dump(new_dct, out_file, default=np_encoder)
                list_of_dct.append(new_dct)
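                # Note: out_dir (PrepData0313/) must already exist; open() in
                # write mode does not create missing directories.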
        except JSONDecodeError:
            unavailable_json.append(filename)
        except MediaTypeError:
            print('Media Type Error: ', filename, new_dct['url'], 'that has type', media['media_type'], 'is removed')
        except URLMatchError:
            print('URL Matching Error: Could not match', filename[:-5], 'in original data')
df = pd.DataFrame(list_of_dct)
df.to_csv('dataset0313.csv', index=False)  # pass sep='\t' here for tab-separated output