fetchMigrants.py
import time
import glob
import re

import pandas as pd

# utilities is a project-local module; the wildcard import is expected to
# provide get_driver() plus Selenium's By and EC (expected_conditions).
from utilities import *


def clean_migrants():
    # Locate the downloaded export; csv_files[0] raises IndexError if no
    # Missing Migrants CSV is present in the working directory.
    csv_files = glob.glob("*Missing_Migrants*.csv")
    # Load CSV
    df = pd.read_csv(csv_files[0])
    # Keyword lists used to filter the dataset. Each location keyword is
    # padded with spaces so it matches whole words inside the location text
    # (e.g. ' island ' matches 'off the island of Lesbos' but not 'islander').
    location_death = [
        ' boat', ' sea ', ' ship ', ' coast ', ' port of ', ' island ',
        ' island(s) ', ' islands ', ' isla ', ' isla(s) ', ' islas ',
        ' isle ', ' isles ', ' isle(s) ', ' isleta ', ' isletas ',
        ' isleta(s) ', ' isola ', ' isole ', ' isoletta ', ' atlantic ',
        ' pacific ', ' arctic ', ' antarctic ', ' indian ', ' shore ',
        ' shores ', ' shore(s) ', ' waters ', ' container ', ' drown ',
        ' capsize ', ' vessel ', ' maritime ', ' bay ', ' off ', ' adrift ',
        ' aground ', ' stranded ', ' drift ', ' drifting ', ' marine ',
        ' beach ', ' boca ', ' gulf ', ' nautical ', ' dock ', ' jetty ',
        ' channel ', ' cape ', ' cabo ', ' cap ', ' sail ', ' sailed ',
        ' sailing ', ' ferry ', ' strait ', ' fisherman ', ' fishermen ',
        ' washed ', ' ashore ', ' overboard ', ' sank ', ' punta ',
        ' harbor ', ' dinghy ', ' dinghies ', ' playa ', ' plage ',
        ' tanjung ', ' cayo ', ' key ', ' bight ', ' inlet ', ' floating ',
        ' sar zone ']
    migration_route = [
        'caribbean to central america', 'central mediterranean',
        'comoros to mayotte', 'dominican republic to puerto rico',
        'eastern mediterranean', 'english channel',
        'horn of africa to yemen crossing', 'venezuela to caribbean',
        'western africa / atlantic route to canary islands',
        'western mediterranean']
    regions = ['Caribbean', 'Mediterranean']
    # Fill NaN cells with sensible defaults before string matching
    df['Migration Route'] = df['Migration Route'].fillna('unknown')
    df['Location of Incident'] = df['Location of Incident'].fillna('unknown')
    df['Region of Incident'] = df['Region of Incident'].fillna('unknown')
    df['Cause of Death'] = df['Cause of Death'].fillna('Mixed or unknown')
    # This helper returns the rows for those who died or went missing at sea
    def get_lost_at_sea(dataframe):
        # First, exclude all US-Mexico border crossing migration routes
        dataframe = dataframe[~dataframe['Migration Route'].str.contains('US-Mexico border crossing', case=False)]
        # Relevant rows are accumulated into this dataframe
        filtered_df = pd.DataFrame()
        # Add rows for those who drowned
        drowned_df = dataframe[dataframe['Cause of Death'].str.contains('drowning', case=False)]
        filtered_df = pd.concat([filtered_df, drowned_df])
        # Add rows whose incident location mentions a maritime keyword. Each
        # keyword is matched against the full dataframe so the filters combine
        # as OR; re-filtering the previous iteration's result would AND them
        # together and quickly empty the frame.
        for word in location_death:
            matches = dataframe[dataframe['Location of Incident'].str.contains(re.escape(word), case=False)]
            filtered_df = pd.concat([filtered_df, matches])
        filtered_df = filtered_df.drop_duplicates()  # Drop duplicate rows that result from this
        # Add rows for sea migration routes
        for word in migration_route:
            matches = dataframe[dataframe['Migration Route'].str.contains(word, case=False)]
            filtered_df = pd.concat([filtered_df, matches])
        filtered_df = filtered_df.drop_duplicates()
        # Add rows for maritime regions
        for region in regions:
            matches = dataframe[dataframe['Region of Incident'].str.contains(region, case=False)]
            filtered_df = pd.concat([filtered_df, matches])
        filtered_df = filtered_df.drop_duplicates()
        return filtered_df
    # Filter the data down to those who died or went missing at sea
    maritime_dataset = get_lost_at_sea(df)
    maritime_dataset = maritime_dataset.reset_index(drop=True)
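    # Illustrative sanity check (an addition, not part of the original
    # pipeline): fail loudly if the maritime filter matched nothing.
    assert not maritime_dataset.empty, "Maritime filter returned no rows"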
    # This helper builds a description string for each row of the dataset
    def get_desc(dataframe):
        description_list = []
        for _, row in dataframe.iterrows():
            # Extract the numbers needed, defaulting missing values to 0
            number_of_dead = int(row['Number of Dead']) if pd.notnull(row['Number of Dead']) else 0
            min_estimated_missing = int(row['Minimum Estimated Number of Missing']) if pd.notnull(row['Minimum Estimated Number of Missing']) else 0
            number_of_survivors = int(row['Number of Survivors']) if pd.notnull(row['Number of Survivors']) else 0
            # Extract the strings needed
            incident_type = str(row['Incident Type']).lower()
            cause_of_death = str(row['Cause of Death']).lower() if row['Cause of Death'] != 'Mixed or unknown' else 'mixed or unknown reasons'
            country_of_origin = row['Country of Origin'] if pd.notnull(row['Country of Origin']) else 'Unknown'
            country_of_origin = country_of_origin.replace(',', ', ').replace('Unknown', 'an unknown country')
            # Replace the last ',' with ', and' in country_of_origin
            last_occurrence_index = country_of_origin.rfind(',')
            if last_occurrence_index != -1:
                country_of_origin = country_of_origin[:last_occurrence_index] + ', and' + country_of_origin[last_occurrence_index + 1:]
            migration_route = row['Migration Route']
            # Choose 'a'/'an'/'the' so the sentence is grammatically correct
            article = 'an' if incident_type[0] in ['a', 'e', 'i', 'o', 'u'] else 'a'
            article_2 = 'an' if migration_route == 'unknown' else 'the'
            # Write the description and add it to the list
            description = f"The International Organization for Migration's Missing Migrant Dataset reported {article} {incident_type} that resulted in {number_of_dead} migrant(s) dead due to {cause_of_death} with at least {min_estimated_missing} migrant(s) missing, and {number_of_survivors} survivor(s) reported. The migrants were reportedly traveling from {country_of_origin} along {article_2} {migration_route} migration route."
            description_list.append(description)
        return description_list
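    # Illustrative output of get_desc for a hypothetical row (values invented
    # for the example, not taken from the dataset):
    # "The International Organization for Migration's Missing Migrant Dataset
    # reported a shipwreck that resulted in 12 migrant(s) dead due to drowning
    # with at least 3 migrant(s) missing, and 5 survivor(s) reported. The
    # migrants were reportedly traveling from Syria along the eastern
    # mediterranean migration route."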
    # The next two helpers parse the Coordinates column when it holds
    # WKT-style 'POINT (lon lat)' strings. They are defined but never called
    # below, which instead assumes a 'lat, lon' format; kept as in the original.
    # This helper returns a list of latitudes from the Coordinates column.
    def get_lat(dataframe):
        lat_list = []  # The list index corresponds to the row number of the dataset
        coords = dataframe['Coordinates'].astype('str')
        lat_lon_list = coords.str.replace('POINT ', '', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False).str.split(' ')
        for coord in lat_lon_list:
            lat_list.append(float(coord[-1].strip(',')))
        return lat_list

    # This helper returns a list of longitudes from the Coordinates column.
    def get_lon(dataframe):
        lon_list = []  # The list index corresponds to the row number of the dataset
        coords = dataframe['Coordinates'].astype('str')
        lat_lon_list = coords.str.replace('POINT ', '', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False).str.split(' ')
        for coord in lat_lon_list:
            lon_list.append(float(coord[0].strip(',')))
        return lon_list
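    # Example (illustrative): for the string 'POINT (-9.456 30.123)',
    # get_lon yields -9.456 and get_lat yields 30.123.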
    # This helper returns a list of the coordinate strings with a lowercase
    # 'point' prefix; also unused below, kept as in the original.
    def get_coords(dataframe):
        coords_list = []
        for _, row in dataframe.iterrows():
            coords_list.append(str(row['Coordinates']).lower())
        return coords_list
    # Build the formatted dataset
    final_dataset = pd.DataFrame(columns=[
        'tracking_number', 'date_of_incident', 'primary_category',
        'event_type', 'event_subtype', 'event_associated_number',
        'event_associated_number_unit', 'latitude', 'longitude',
        'description', 'actor1', 'assoc_actor_1', 'actor2', 'assoc_actor_2',
        'actor3', 'assoc_actor_3', 'name_of_ship1', 'imo_number_of_ship1',
        'type_of_ship1', 'name_of_ship2', 'imo_number_of_ship2',
        'type_of_ship2', 'name_of_ship3', 'imo_number_of_ship3',
        'type_of_ship3', 'flag_states_involved', 'region', 'location',
        'location_precision', 'location_url', 'source_name',
        'source_link_external', 'source_name2', 'source_link2_external',
        'compiled_from', 'compilation_source_link_external'])
    final_dataset['description'] = get_desc(maritime_dataset)
    final_dataset['actor1'] = 'Migrants'
    final_dataset['location'] = maritime_dataset['Location of Incident']
    final_dataset['location_precision'] = 'Point'
    final_dataset['location_url'] = 'https://missingmigrants.iom.int/data'
    final_dataset['source_name'] = maritime_dataset['Information Source']
    final_dataset['source_link_external'] = maritime_dataset['URL']
    final_dataset['compiled_from'] = 'International Organization for Migration - Missing Migrants Dataset'
    final_dataset['compilation_source_link_external'] = 'https://missingmigrants.iom.int/data'
    final_dataset['date_of_incident'] = maritime_dataset['Incident Date']
    final_dataset['primary_category'] = 'Migration'
    final_dataset['event_type'] = 'Migrant(s) Dead or Missing'
    final_dataset['event_subtype'] = maritime_dataset['Cause of Death']
    final_dataset['event_associated_number'] = maritime_dataset['Total Number of Dead and Missing']
    final_dataset['event_associated_number_unit'] = 'Total Number of Dead and Missing'
    # Assumes the Coordinates column holds 'lat, lon' strings
    final_dataset[['latitude', 'longitude']] = maritime_dataset['Coordinates'].str.split(', ', expand=True).astype(float)
    return final_dataset
def fetch_Migrants():
    driver, wait = get_driver()
    driver.get("https://missingmigrants.iom.int/downloads")
    # Wait for the CSV export links to load, then click the last one
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.views-data-export-csv div a')))
    link = driver.find_elements(By.CSS_SELECTOR, 'div.views-data-export-csv div a')[-1]
    link.click()
    timeout = 100  # seconds
    # Poll until the CSV appears in the working directory
    start_time = time.time()
    while len(glob.glob("*Missing_Migrants*.csv")) == 0:
        time.sleep(5)
        # Escape condition: give up if the CSV doesn't download within `timeout`
        if time.time() - start_time > timeout:
            print("Error: CSV download timed out. Skipping this.")
            break
    driver.quit()
    migrant_data = clean_migrants()
    return migrant_data
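
# A minimal usage sketch, assuming the script is run from a writable working
# directory with Selenium and the project's utilities module available. The
# output filename is illustrative, not part of the original script.
if __name__ == "__main__":
    migrants = fetch_Migrants()
    migrants.to_csv("maritime_migrant_incidents.csv", index=False)
    print(f"Saved {len(migrants)} maritime incident records")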