-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
102 lines (80 loc) · 3.67 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from fetchPDFs import *
from handle_data import *
from fetchEncounters import *
from fetchMigrants import *
def get_all_data():
    """Load the existing sheet and find the newest incident date per category.

    Returns:
        A ``(sheet, newest_by_category)`` tuple. ``sheet`` is the frame
        returned by ``fetch_google_sheet_data()`` with missing cells replaced
        by ``""`` and ``date_of_incident`` converted to datetimes.
        ``newest_by_category`` is a Series indexed by ``primary_category``
        holding the maximum ``date_of_incident`` of each group.
    """
    sheet = fetch_google_sheet_data()

    # Blank out missing cells in place so no NaN leaks into later steps.
    sheet.fillna(value="", inplace=True)

    # errors="coerce" turns anything unparseable (including the blanks
    # written just above) into NaT instead of raising.
    incident_dates = pd.to_datetime(sheet["date_of_incident"], errors="coerce")
    sheet["date_of_incident"] = incident_dates

    # Newest incident date within each primary_category.
    per_category = sheet.groupby("primary_category")
    newest_by_category = per_category["date_of_incident"].max()

    return sheet, newest_by_category
# Month names indexed by (month number - 1).
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)

# Column order of the frame returned by combine_data.
_OUTPUT_COLUMNS = [
    'tracking_number', 'date_of_incident', 'Year', 'Month', 'primary_category', 'event_type',
    'event_subtype', 'event_associated_number',
    'event_associated_number_unit', 'latitude', 'longitude', 'description',
    'actor1', 'assoc_actor_1', 'actor2', 'assoc_actor_2', 'actor3',
    'assoc_actor_3', 'name_of_ship1', 'imo_number_of_ship1',
    'type_of_ship1', 'name_of_ship2', 'imo_number_of_ship2',
    'type_of_ship2', 'name_of_ship3', 'imo_number_of_ship3',
    'type_of_ship3', 'flag_states_involved', 'location',
    'location_precision', 'location_url', 'source_name',
    'source_link_external', 'source_name2', 'source_link2_external',
    'compiled_from', 'compilation_source_link_external',
]


def _normalize_incident_date(raw):
    """Return *raw* as a zero-padded ``YYYY-MM-DD`` string, or ``None``.

    Two layouts are recognised: ``YYYY-MM-DD`` and ``DD.MM.YYYY``
    (e.g. ``28.02.2020``). The year must have four digits and the month must
    be in 1..12; the day is only required to be one or two digits (calendar
    validity is not checked). Anything else yields ``None``.
    """
    text = str(raw).strip()
    if text.count("-") == 2:
        year, month, day = text.split("-")
    elif text.count(".") == 2:
        day, month, year = text.split(".")
    else:
        return None

    if not all(piece.isdigit() for piece in (year, month, day)):
        return None
    if len(year) != 4 or len(month) > 2 or len(day) > 2:
        return None
    try:
        year_number, month_number, day_number = int(year), int(month), int(day)
    except ValueError:
        # Some characters (e.g. superscript digits) satisfy isdigit() but
        # are rejected by int().
        return None
    # Month 0 would otherwise index _MONTH_NAMES[-1] and silently become
    # "December"; months above 12 would raise IndexError.
    if not 1 <= month_number <= 12:
        return None
    return "{:04d}-{:02d}-{:02d}".format(year_number, month_number, day_number)


def combine_data(encountersData, migrantsData, *, min_year=2011):
    """Merge encounter and migrant records and derive ``Year`` / ``Month``.

    The two frames are concatenated (row labels are left as they were),
    ``date_of_incident`` is normalised to ``YYYY-MM-DD`` strings, and integer
    ``Year`` and month-name ``Month`` columns are derived from it. Rows whose
    date cannot be interpreted are reported on stdout and dropped, as are rows
    whose year is earlier than ``min_year``.

    Args:
        encountersData: DataFrame of encounter records.
        migrantsData: DataFrame of migrant records.
        min_year: Earliest year to keep (inclusive). Defaults to 2011.

    Returns:
        A DataFrame restricted to the fixed output column order
        (``tracking_number``, ``date_of_incident``, ``Year``, ``Month``, ...).

    Raises:
        KeyError: If a required column is missing from the combined frame.
    """
    combined = pd.concat([encountersData, migrantsData])

    raw_dates = list(combined["date_of_incident"])
    normalized = [_normalize_incident_date(value) for value in raw_dates]

    # Report unparseable dates; those rows are skipped instead of leaving the
    # derived columns shorter than the frame (which made assignment fail).
    for value, iso in zip(raw_dates, normalized):
        if iso is None:
            text = str(value)
            print("==== :", text, len(text))

    # Select by position: after concat the row labels of the two inputs
    # usually overlap, so label-based selection would be ambiguous.
    keep_positions = [pos for pos, iso in enumerate(normalized) if iso is not None]
    iso_dates = [normalized[pos] for pos in keep_positions]
    combined = combined.iloc[keep_positions].copy()

    combined["date_of_incident"] = iso_dates
    combined["Year"] = [int(iso[:4]) for iso in iso_dates]
    combined["Year"] = combined["Year"].astype(int)  # keeps an int dtype even when empty
    combined["Month"] = [_MONTH_NAMES[int(iso[5:7]) - 1] for iso in iso_dates]

    combined = combined[_OUTPUT_COLUMNS]

    # Boolean mask rather than drop(index=...): dropping by label would also
    # remove rows from the other source that share a label with an old row.
    return combined[combined["Year"] >= min_year]
# Script body: runs on import/execution. Loads the existing sheet, fetches
# records from each source, merges them and pushes the result back.

# all_data: current sheet contents; latest_dates: newest incident date per
# primary_category (a lookup by category name).
all_data, latest_dates = get_all_data()
# The lookups below raise KeyError if the sheet has no rows in that category.
# NOTE(review): the date is presumably used as a "fetch only newer than"
# cutoff — confirm in fetchPDFs / fetchEncounters.
pdfData = fetch_pdfs(latest_dates['Piracy and Robbery'])
encountersData = fetch_Encounters(latest_dates['Fishing'])
# Unlike the two fetches above, no date is passed here.
migrantsData = fetch_Migrants()
# Only encounters and migrants go through combine_data (date normalisation,
# Year/Month derivation, column selection).
new_data = combine_data(encountersData, migrantsData)
# PDF records are appended afterwards and therefore bypass combine_data.
# NOTE(review): assumes fetch_pdfs already returns the same column layout — confirm.
new_data = pd.concat([new_data, pdfData])
push_to_spreadsheet(all_data, new_data, latest_dates)
print("Worked!")