fetchMigrants.py
import time
import glob
import re

import pandas as pd

# utilities is a project-local module; the wildcard import is expected to
# provide get_driver() plus Selenium's By and EC (expected_conditions).
from utilities import *


def clean_migrants():
    # Locate the downloaded export; csv_files[0] raises IndexError if no
    # Missing Migrants CSV is present in the working directory.
    csv_files = glob.glob("*Missing_Migrants*.csv")
    # Load CSV
    df = pd.read_csv(csv_files[0])
    # Keyword lists used to filter the dataset. Each location keyword is
    # padded with spaces so it matches whole words inside the location text
    # (e.g. ' island ' matches 'off the island of Lesbos' but not 'islander').
    location_death = [
        ' boat', ' sea ', ' ship ', ' coast ', ' port of ', ' island ',
        ' island(s) ', ' islands ', ' isla ', ' isla(s) ', ' islas ',
        ' isle ', ' isles ', ' isle(s) ', ' isleta ', ' isletas ',
        ' isleta(s) ', ' isola ', ' isole ', ' isoletta ', ' atlantic ',
        ' pacific ', ' arctic ', ' antarctic ', ' indian ', ' shore ',
        ' shores ', ' shore(s) ', ' waters ', ' container ', ' drown ',
        ' capsize ', ' vessel ', ' maritime ', ' bay ', ' off ', ' adrift ',
        ' aground ', ' stranded ', ' drift ', ' drifting ', ' marine ',
        ' beach ', ' boca ', ' gulf ', ' nautical ', ' dock ', ' jetty ',
        ' channel ', ' cape ', ' cabo ', ' cap ', ' sail ', ' sailed ',
        ' sailing ', ' ferry ', ' strait ', ' fisherman ', ' fishermen ',
        ' washed ', ' ashore ', ' overboard ', ' sank ', ' punta ',
        ' harbor ', ' dinghy ', ' dinghies ', ' playa ', ' plage ',
        ' tanjung ', ' cayo ', ' key ', ' bight ', ' inlet ', ' floating ',
        ' sar zone ']
    migration_route = [
        'caribbean to central america', 'central mediterranean',
        'comoros to mayotte', 'dominican republic to puerto rico',
        'eastern mediterranean', 'english channel',
        'horn of africa to yemen crossing', 'venezuela to caribbean',
        'western africa / atlantic route to canary islands',
        'western mediterranean']
    regions = ['Caribbean', 'Mediterranean']
    # Fill NaN cells with sensible defaults before string matching
    df['Migration Route'] = df['Migration Route'].fillna('unknown')
    df['Location of Incident'] = df['Location of Incident'].fillna('unknown')
    df['Region of Incident'] = df['Region of Incident'].fillna('unknown')
    df['Cause of Death'] = df['Cause of Death'].fillna('Mixed or unknown')
    # This helper returns the rows for those who died or went missing at sea
    def get_lost_at_sea(dataframe):
        # First, exclude all US-Mexico border crossing migration routes
        dataframe = dataframe[~dataframe['Migration Route'].str.contains('US-Mexico border crossing', case=False)]
        # Relevant rows are accumulated into this dataframe
        filtered_df = pd.DataFrame()
        # Add rows for those who drowned
        drowned_df = dataframe[dataframe['Cause of Death'].str.contains('drowning', case=False)]
        filtered_df = pd.concat([filtered_df, drowned_df])
        # Add rows whose incident location mentions a maritime keyword. Each
        # keyword is matched against the full dataframe so the filters combine
        # as OR; re-filtering the previous iteration's result would AND them
        # together and quickly empty the frame.
        for word in location_death:
            matches = dataframe[dataframe['Location of Incident'].str.contains(re.escape(word), case=False)]
            filtered_df = pd.concat([filtered_df, matches])
        filtered_df = filtered_df.drop_duplicates()  # Drop duplicate rows that result from this
        # Add rows for sea migration routes
        for word in migration_route:
            matches = dataframe[dataframe['Migration Route'].str.contains(word, case=False)]
            filtered_df = pd.concat([filtered_df, matches])
        filtered_df = filtered_df.drop_duplicates()
        # Add rows for maritime regions
        for region in regions:
            matches = dataframe[dataframe['Region of Incident'].str.contains(region, case=False)]
            filtered_df = pd.concat([filtered_df, matches])
        filtered_df = filtered_df.drop_duplicates()
        return filtered_df
    # Filter the data down to those who died or went missing at sea
    maritime_dataset = get_lost_at_sea(df)
    maritime_dataset = maritime_dataset.reset_index(drop=True)
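    # Illustrative sanity check (an addition, not part of the original
    # pipeline): fail loudly if the maritime filter matched nothing.
    assert not maritime_dataset.empty, "Maritime filter returned no rows"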
    # This helper builds a description string for each row of the dataset
    def get_desc(dataframe):
        description_list = []
        for _, row in dataframe.iterrows():
            # Extract the numbers needed, defaulting missing values to 0
            number_of_dead = int(row['Number of Dead']) if pd.notnull(row['Number of Dead']) else 0
            min_estimated_missing = int(row['Minimum Estimated Number of Missing']) if pd.notnull(row['Minimum Estimated Number of Missing']) else 0
            number_of_survivors = int(row['Number of Survivors']) if pd.notnull(row['Number of Survivors']) else 0
            # Extract the strings needed
            incident_type = str(row['Incident Type']).lower()
            cause_of_death = str(row['Cause of Death']).lower() if row['Cause of Death'] != 'Mixed or unknown' else 'mixed or unknown reasons'
            country_of_origin = row['Country of Origin'] if pd.notnull(row['Country of Origin']) else 'Unknown'
            country_of_origin = country_of_origin.replace(',', ', ').replace('Unknown', 'an unknown country')
            # Replace the last ',' with ', and' in country_of_origin
            last_occurrence_index = country_of_origin.rfind(',')
            if last_occurrence_index != -1:
                country_of_origin = country_of_origin[:last_occurrence_index] + ', and' + country_of_origin[last_occurrence_index + 1:]
            migration_route = row['Migration Route']
            # Choose 'a'/'an'/'the' so the sentence is grammatically correct
            article = 'an' if incident_type[0] in ['a', 'e', 'i', 'o', 'u'] else 'a'
            article_2 = 'an' if migration_route == 'unknown' else 'the'
            # Write the description and add it to the list
            description = f"The International Organization for Migration's Missing Migrant Dataset reported {article} {incident_type} that resulted in {number_of_dead} migrant(s) dead due to {cause_of_death} with at least {min_estimated_missing} migrant(s) missing, and {number_of_survivors} survivor(s) reported. The migrants were reportedly traveling from {country_of_origin} along {article_2} {migration_route} migration route."
            description_list.append(description)
        return description_list
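    # Illustrative output of get_desc for a hypothetical row (values invented
    # for the example, not taken from the dataset):
    # "The International Organization for Migration's Missing Migrant Dataset
    # reported a shipwreck that resulted in 12 migrant(s) dead due to drowning
    # with at least 3 migrant(s) missing, and 5 survivor(s) reported. The
    # migrants were reportedly traveling from Syria along the eastern
    # mediterranean migration route."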
    # The next two helpers parse the Coordinates column when it holds
    # WKT-style 'POINT (lon lat)' strings. They are defined but never called
    # below, which instead assumes a 'lat, lon' format; kept as in the original.
    # This helper returns a list of latitudes from the Coordinates column.
    def get_lat(dataframe):
        lat_list = []  # The list index corresponds to the row number of the dataset
        coords = dataframe['Coordinates'].astype('str')
        lat_lon_list = coords.str.replace('POINT ', '', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False).str.split(' ')
        for coord in lat_lon_list:
            lat_list.append(float(coord[-1].strip(',')))
        return lat_list

    # This helper returns a list of longitudes from the Coordinates column.
    def get_lon(dataframe):
        lon_list = []  # The list index corresponds to the row number of the dataset
        coords = dataframe['Coordinates'].astype('str')
        lat_lon_list = coords.str.replace('POINT ', '', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False).str.split(' ')
        for coord in lat_lon_list:
            lon_list.append(float(coord[0].strip(',')))
        return lon_list
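    # Example (illustrative): for the string 'POINT (-9.456 30.123)',
    # get_lon yields -9.456 and get_lat yields 30.123.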
    # This helper returns a list of the coordinate strings with a lowercase
    # 'point' prefix; also unused below, kept as in the original.
    def get_coords(dataframe):
        coords_list = []
        for _, row in dataframe.iterrows():
            coords_list.append(str(row['Coordinates']).lower())
        return coords_list
    # Build the formatted dataset
    final_dataset = pd.DataFrame(columns=[
        'tracking_number', 'date_of_incident', 'primary_category',
        'event_type', 'event_subtype', 'event_associated_number',
        'event_associated_number_unit', 'latitude', 'longitude',
        'description', 'actor1', 'assoc_actor_1', 'actor2', 'assoc_actor_2',
        'actor3', 'assoc_actor_3', 'name_of_ship1', 'imo_number_of_ship1',
        'type_of_ship1', 'name_of_ship2', 'imo_number_of_ship2',
        'type_of_ship2', 'name_of_ship3', 'imo_number_of_ship3',
        'type_of_ship3', 'flag_states_involved', 'region', 'location',
        'location_precision', 'location_url', 'source_name',
        'source_link_external', 'source_name2', 'source_link2_external',
        'compiled_from', 'compilation_source_link_external'])
    final_dataset['description'] = get_desc(maritime_dataset)
    final_dataset['actor1'] = 'Migrants'
    final_dataset['location'] = maritime_dataset['Location of Incident']
    final_dataset['location_precision'] = 'Point'
    final_dataset['location_url'] = 'https://missingmigrants.iom.int/data'
    final_dataset['source_name'] = maritime_dataset['Information Source']
    final_dataset['source_link_external'] = maritime_dataset['URL']
    final_dataset['compiled_from'] = 'International Organization for Migration - Missing Migrants Dataset'
    final_dataset['compilation_source_link_external'] = 'https://missingmigrants.iom.int/data'
    final_dataset['date_of_incident'] = maritime_dataset['Incident Date']
    final_dataset['primary_category'] = 'Migration'
    final_dataset['event_type'] = 'Migrant(s) Dead or Missing'
    final_dataset['event_subtype'] = maritime_dataset['Cause of Death']
    final_dataset['event_associated_number'] = maritime_dataset['Total Number of Dead and Missing']
    final_dataset['event_associated_number_unit'] = 'Total Number of Dead and Missing'
    # Assumes the Coordinates column holds 'lat, lon' strings
    final_dataset[['latitude', 'longitude']] = maritime_dataset['Coordinates'].str.split(', ', expand=True).astype(float)
    return final_dataset
def fetch_Migrants():
    driver, wait = get_driver()
    driver.get("https://missingmigrants.iom.int/downloads")
    # Wait for the CSV export links to load, then click the last one
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.views-data-export-csv div a')))
    link = driver.find_elements(By.CSS_SELECTOR, 'div.views-data-export-csv div a')[-1]
    link.click()
    timeout = 100  # seconds
    # Poll until the CSV appears in the working directory
    start_time = time.time()
    while len(glob.glob("*Missing_Migrants*.csv")) == 0:
        time.sleep(5)
        # Escape condition: give up if the CSV doesn't download within `timeout`
        if time.time() - start_time > timeout:
            print("Error: CSV download timed out. Skipping this.")
            break
    driver.quit()
    migrant_data = clean_migrants()
    return migrant_data
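
# A minimal usage sketch, assuming the script is run from a writable working
# directory with Selenium and the project's utilities module available. The
# output filename is illustrative, not part of the original script.
if __name__ == "__main__":
    migrants = fetch_Migrants()
    migrants.to_csv("maritime_migrant_incidents.csv", index=False)
    print(f"Saved {len(migrants)} maritime incident records")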