-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtime_split.py
73 lines (66 loc) · 3.8 KB
/
time_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from datetime import datetime
import csv
import pandas as pd
# split all data by time
#
# Copies every LIWC row that belongs to a treated account into one of two
# intermediate CSV files, depending on where the row's date falls relative to
# that account's own interruption date:
#   * 0 to 60 days before (the interruption day itself included) -> "before"
#   * 1 to 60 days after                                         -> "after"
# Rows of untreated accounts and rows outside both windows are dropped.
#
# The files are opened with newline='' as the csv module requires, and rows
# are emitted through csv.writer so every field is written exactly once and
# values containing commas or quotes are escaped properly.
with open('LIWC_TG_averages/bar_going_before.csv', 'w', newline='') as wthird_file, \
        open('LIWC_TG_averages/bar_going_after.csv', 'w', newline='') as wfourth_file, \
        open('LIWC-TreatedGroups/LIWC_bar_going.csv', 'r', newline='') as rfirst_file, \
        open("TreatedGroup/Bar_Going.txt", 'r') as rsecond_file:
    first_content = csv.reader(rfirst_file)
    before_writer = csv.writer(wthird_file, lineterminator='\n')
    after_writer = csv.writer(wfourth_file, lineterminator='\n')

    # write headers into intermediate file
    headers = next(first_content, None)
    if headers is not None:  # None means the LIWC file is completely empty
        before_writer.writerow(headers)
        after_writer.writerow(headers)

    # extract time from original Treated Group file
    # Each non-empty line looks like "<account>_<YYYY-MM-DD>...". The date is
    # stored per account so that every account is later compared against its
    # own interruption date; if an account is listed more than once, the last
    # listed date wins.
    tg_dates = {}
    for line in rsecond_file:
        line = line.strip()
        if not line:
            # tolerate blank lines and a missing or extra trailing newline
            continue
        info = line.split('_')
        tg_dates[info[0]] = datetime.strptime(info[1], '%Y-%m-%d')

    # Find matching account number and time stamps
    for f_row in first_content:
        if not f_row:
            # csv.reader yields an empty list for a blank line
            continue
        # column 1 is matched against the account numbers, column 2 is
        # parsed as the row's date
        date_int = tg_dates.get(f_row[1])
        if date_int is None:
            continue  # account is not part of the treated group
        date = datetime.strptime(f_row[2], '%Y-%m-%d')
        days_from_interruption = (date - date_int).days
        # write all lines approximately 2 months prior to interruption in intermediate before file
        if -60 <= days_from_interruption <= 0:
            before_writer.writerow(f_row)
        # write all lines approximately 2 months after interruption in intermediate after file
        elif 0 < days_from_interruption <= 60:
            after_writer.writerow(f_row)
# use PANDAS to group data by month 2 months before and 2 months after
#
# Loads a "before" and an "after" CSV, averages the numeric columns per
# (userID, year, month) and writes each result to a "*_month.csv" file next to
# its input.
#
# NOTE(review): these paths point at the sport_leaving files, whereas the
# split step earlier in this script writes bar_going_before.csv and
# bar_going_after.csv. Confirm which group this step is meant to aggregate.
group_keys = ['userID', 'year', 'month']
before_data = pd.read_csv("LIWC_TG_averages/sport_leaving_before.csv")
after_data = pd.read_csv("LIWC_TG_averages/sport_leaving_after.csv")
# numeric_only=True restricts the averaging to numeric columns. pandas >= 2.0
# no longer drops text columns (presumably e.g. a date string) silently and
# raises on them instead.
group_before = before_data.groupby(group_keys).mean(numeric_only=True)
group_after = after_data.groupby(group_keys).mean(numeric_only=True)
# The group keys become the index and are written as the leading columns.
group_before.to_csv("LIWC_TG_averages/sport_leaving_before_month.csv")
group_after.to_csv("LIWC_TG_averages/sport_leaving_after_month.csv")