From 37b47967b5bfd91b04cd35026bc25d0728d5b061 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 15:05:59 -0400 Subject: [PATCH 01/16] wrangle country data --- ebmdatalab/generate-ebmdatalab-stats.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index cac4c5026..09969971c 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -5,6 +5,7 @@ import os import pandas as pd import urllib.request +import geopandas from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -116,6 +117,22 @@ def main(args): fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") + # Identify frequencies of each country in single-country and multi-country clinical trials + multi_countries = trials_df['countries'][trials_df['countries'].str.contains(',')] + multi_countries = pd.Series([country for country_list in multi_countries.str.split(',') for country in country_list]) + multi_country_counts = multi_countries.value_counts() + + single_countries = trials_df['countries'][~trials_df['countries'].str.contains(',')] + single_country_counts = single_countries.value_counts() + single_country_counts = single_country_counts.drop(labels='No Country Given') + + # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data + world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + print(single_country_counts.index) + print(world.index) + #fig, ax = plt.subplots(1, 1) + + exit(0) print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') # The placeholder will be replaced by the actual SHA-1 hash in separate From df5e59cbf4f694f615a750cfc3bb2821afa2f0e5 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 15:35:19 -0400 Subject: [PATCH 02/16] test pycountry --- ebmdatalab/generate-ebmdatalab-stats.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 09969971c..07eecda59 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -6,6 +6,7 @@ import pandas as pd import urllib.request import geopandas +import pycountry from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -126,10 +127,24 @@ def main(args): single_country_counts = single_countries.value_counts() single_country_counts = single_country_counts.drop(labels='No Country Given') + # Match country names in EBM data with ISO codes (more stable than names) + for c in single_country_counts.index: + print(c) + try: + code = pycountry.countries.get(name=c).alpha_3 + except LookupError(): + hits = pycountry.countries.search_fuzzy(c) + if len(hits) == 1: + code = + print(single_country_codes) # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) print(single_country_counts.index) - print(world.index) + print(world.head()) + + for unmatched_country in single_country_counts.index[~single_country_counts.index.isin(world["name"])]: + print(pycountry.countries.get(name=unmatched_country)) + #fig, ax = plt.subplots(1, 1) exit(0) From 76ff8b845d284e7be0d5bca2a31a7f2ac0509dc5 Mon Sep 17 
00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 18:56:54 -0400 Subject: [PATCH 03/16] data cleaning --- ebmdatalab/generate-ebmdatalab-stats.py | 86 +++++++++++++++++++------ 1 file changed, 65 insertions(+), 21 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 07eecda59..7fd3ad764 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -37,6 +37,45 @@ def extract_citekey(results_url): citekey = short_doi_url.replace('https://doi.org', 'doi:10') return citekey +def assign_ISO(countries): + # Match country names with ISO codes + # Input: pd.Series of country names + # Returns: dictionary of matches + + # Need to hard code a few countries that aren't registered using standard names, so + # initializing the single_country_codes database with these values + country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", + "Democratic Republic of the Congo": "COD"} + + # Identify the most likely 3-letter ISO code for each country + failed_matches = list() + for country in countries: + if country not in country_codes.keys(): + try: + hit = pycountry.countries.get(name=country) + if hit == None: + # If the name isn't an exact match, try alternatives + # .search_fuzzy matching returns a list, whereas .get retrieves data as class Country + hit = pycountry.countries.search_fuzzy(country) + if len(hit) > 1: + hit = pycountry.countries.search_fuzzy(country + ",") + elif type(hit) == None: + hit = pycountry.countries.get(official_name=country) + except LookupError: + failed_matches.append(country) + continue + + if type(hit) == list and len(hit) == 1: + country_codes[country] = hit[0].alpha_3 + elif type(hit) == list or type(hit) == None: + failed_matches.append(country) + else: + country_codes[country] = hit.alpha_3 + + # Print warning about failures and return successes as dictionary + print("Failed to assign country codes to:", ", ".join(failed_matches)) + return(country_codes) + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' @@ -118,29 +157,34 @@ def main(args): fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") - # Identify frequencies of each country in single-country and multi-country clinical trials - multi_countries = trials_df['countries'][trials_df['countries'].str.contains(',')] - multi_countries = pd.Series([country for country_list in multi_countries.str.split(',') for country in country_list]) - multi_country_counts = multi_countries.value_counts() - - single_countries = trials_df['countries'][~trials_df['countries'].str.contains(',')] - single_country_counts = single_countries.value_counts() - single_country_counts = single_country_counts.drop(labels='No Country Given') - - # Match country names in EBM data with ISO codes (more stable than names) - for c in single_country_counts.index: - print(c) - try: - code = pycountry.countries.get(name=c).alpha_3 - except LookupError(): - hits = pycountry.countries.search_fuzzy(c) - if len(hits) == 1: - code = - print(single_country_codes) + # Identify the names of each country in single-country and multi-country clinical trials + # Multi refers to trials that have multiple country names, comma-separated + # One trial lists every country on Earth and formatted the data inconsistently, 
so drop it + valid_country = trials_df[trials_df['countries'] != "No Country Given"] + valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] + single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] + multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] + multi_countries = pd.Series( + [country for country_list in multi_countries.str.split(',') for country in country_list] + ) + + # Identify the 3-letter ISO codes for each unique country + unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() + country_codes = assign_ISO(unique_countries) + + #multi_countries_codes = pd.DataFrame(multi_countries).join( + print(pd.DataFrame.from_dict(country_codes, orient="index") ) + print(multi_countries.index) + #, columns=["countries", "code"]), on="countries") + #print(multi_countries_codes) + #multi_country_counts = multi_countries.value_counts() + #single_country_codes = assign_ISO(single_countries) + #single_country_counts = pd.DataFrame(single_countries .value_counts() + #print(single_country_counts) + exit(0) + # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - print(single_country_counts.index) - print(world.head()) for unmatched_country in single_country_counts.index[~single_country_counts.index.isin(world["name"])]: print(pycountry.countries.get(name=unmatched_country)) From e7df1d8a8cc0fc851b316cbfbff74a34b5130298 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 21:40:57 -0400 Subject: [PATCH 04/16] attempt to merge df, still buggy --- ebmdatalab/generate-ebmdatalab-stats.py | 33 +++++++++++-------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 7fd3ad764..06a77cb2a 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -170,28 +170,23 @@ def main(args): # Identify the 3-letter ISO codes for each unique country unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() - country_codes = assign_ISO(unique_countries) - - #multi_countries_codes = pd.DataFrame(multi_countries).join( - print(pd.DataFrame.from_dict(country_codes, orient="index") ) - print(multi_countries.index) - #, columns=["countries", "code"]), on="countries") - #print(multi_countries_codes) - #multi_country_counts = multi_countries.value_counts() - #single_country_codes = assign_ISO(single_countries) - #single_country_counts = pd.DataFrame(single_countries .value_counts() - #print(single_country_counts) - exit(0) + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), orient="index", columns=["iso_a3"]) - # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data - world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + # Map the ISO codes onto the country data and count the frequency + single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = single_countries_codes.dropna() + single_countries_counts = single_countries_codes.value_counts() + multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = 
multi_countries_codes.dropna() + multi_countries_counts = multi_countries_codes.value_counts() - for unmatched_country in single_country_counts.index[~single_country_counts.index.isin(world["name"])]: - print(pycountry.countries.get(name=unmatched_country)) - + # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data + world_data = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')).set_index("iso_a3") + countries_mapping = world_data.join(pd.DataFrame(single_countries_counts)) #.join(multi_countries_counts) + print(countries_mapping) #fig, ax = plt.subplots(1, 1) - - exit(0) + #print(pd.DataFrame(multi_countries_counts, index=multi_countries_counts["iso_a3"]).join(world, on="iso_a3")) + print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') # The placeholder will be replaced by the actual SHA-1 hash in separate From 1019feb6099531aec1b9856de42f692a00e5643d Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 09:41:28 -0400 Subject: [PATCH 05/16] generate side-by-side choropleths --- ebmdatalab/generate-ebmdatalab-stats.py | 42 +++++++++++++++++++------ ebmdatalab/generate-ebmdatalab-stats.sh | 3 +- 2 files changed, 35 insertions(+), 10 deletions(-) mode change 100644 => 100755 ebmdatalab/generate-ebmdatalab-stats.sh diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 06a77cb2a..942f75314 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -7,6 +7,8 @@ import urllib.request import geopandas import pycountry +import geoplot as gplt +from datetime import date from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -156,6 +158,8 @@ def main(args): fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") + + print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') # Identify the names of each country in single-country and multi-country clinical trials # Multi refers to trials that have multiple country names, comma-separated @@ -175,19 +179,34 @@ def main(args): # Map the ISO codes onto the country data and count the frequency single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts() + single_countries_counts = single_countries_codes.value_counts().rename("single_country_counts") + multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts() + multi_countries_counts = multi_countries_codes.value_counts().rename("multi_country_counts") - # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data + # Map frequency data onto the geopandas geographical data world_data = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')).set_index("iso_a3") - countries_mapping = world_data.join(pd.DataFrame(single_countries_counts)) #.join(multi_countries_counts) - print(countries_mapping) - #fig, ax = plt.subplots(1, 1) - #print(pd.DataFrame(multi_countries_counts, index=multi_countries_counts["iso_a3"]).join(world, on="iso_a3")) - - print(f'Wrote {args.output_figure}.png and 
{args.output_figure}.svg') + countries_mapping = world_data.merge( + pd.DataFrame(single_countries_counts), how="inner", left_index=True, right_index=True).merge( + pd.DataFrame(multi_countries_counts), how="inner", left_index=True, right_index=True) + + # Generate two-part choropleth visualizing world map with number of clinical trial data counted + fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) + ax1.set_title("Locations of Single-Country Clinical Trials") + ax1 = gplt.choropleth(countries_mapping, hue = countries_mapping['single_country_counts'], + legend=True, ax=ax1) #countries_mapping.plot(column='single_country_counts', ax=ax1, legend=True) + ax2.set_title("Locations of Multi-Country Clinical Trials") + ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_country_counts'], + legend=True, ax=ax2) + ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), + xy=(-168, -68)) + + plt.savefig(args.output_map + '.png', bbox_inches = "tight") + plt.savefig(args.output_map + '.svg', bbox_inches = "tight") + + print(f'Wrote {args.output_map}.png and {args.output_map}.svg') + exit(0) # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed @@ -217,6 +236,11 @@ def main(args): 'statistics without file type extension. Will be saved ' \ 'as .png and .svg.', type=str) + parser.add_argument('output_map', + help='Path of the output choropleth (world map figure) ' \ + 'with geographic clinical trial frequencies, without file ' \ + 'type extension. Will be saved as .png and .svg.', + type=str) args = parser.parse_args() main(args) diff --git a/ebmdatalab/generate-ebmdatalab-stats.sh b/ebmdatalab/generate-ebmdatalab-stats.sh old mode 100644 new mode 100755 index 72345c7e6..e845f0b8e --- a/ebmdatalab/generate-ebmdatalab-stats.sh +++ b/ebmdatalab/generate-ebmdatalab-stats.sh @@ -14,6 +14,7 @@ export EBM_COMMIT_DATE=$(echo $EBM_COMMIT_JSON | python -c "import sys, json; pr EBM_INPUT_JSON=ebmdatalab/trials_latest.json EBM_STATS_JSON=ebmdatalab/ebmdatalab-stats.json EBM_FIG=ebmdatalab/ebmdatalab-trials +EBM_MAP=ebmdatalab/ebmdatalab-map echo "Downloading EBM Data Lab COVID-19 TrialsTracker data from commit $EBM_COMMIT_SHA authored $EBM_COMMIT_DATE" curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COMMIT_SHA/$EBM_REPO_PATH > $EBM_INPUT_JSON @@ -22,6 +23,6 @@ curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COM # and run the version-figures.sh script to update the EBM_STATS_JSON with the # versioned figure URL echo "Generating EBM Data Lab COVID-19 TrialsTracker statistics and figure" -python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG +python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG $EBM_MAP rm $EBM_INPUT_JSON From 72834c4aaa93c8ed8edcd383fbbcbb4a8a725392 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 12:37:53 -0400 Subject: [PATCH 06/16] tried to use geoplot, switching back --- ebmdatalab/generate-ebmdatalab-stats.py | 33 ++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 942f75314..58f96b8f6 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -9,6 +9,8 @@ import pycountry import geoplot as gplt from datetime import date +from bokeh.models import 
GeoJSONDataSource, LinearColorMapper, ColorBar +from bokeh import palettes from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -179,25 +181,34 @@ def main(args): # Map the ISO codes onto the country data and count the frequency single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts().rename("single_country_counts") - + single_countries_counts = single_countries_codes.value_counts().rename("single_countries_counts") multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts().rename("multi_country_counts") + multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") - # Map frequency data onto the geopandas geographical data - world_data = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')).set_index("iso_a3") - countries_mapping = world_data.merge( - pd.DataFrame(single_countries_counts), how="inner", left_index=True, right_index=True).merge( - pd.DataFrame(multi_countries_counts), how="inner", left_index=True, right_index=True) + # Map frequency data onto the geopandas geographical data for units with ISO code + # geopandas uses -99 as N/A for this field + countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] + for count_data in [single_countries_counts, multi_countries_counts]: + countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), how="left", left_on="iso_a3", right_index=True) # Generate two-part choropleth visualizing world map with number of clinical trial data counted + color_palette = LinearColorMapper(palette=palettes.Magma[256], + low=1, + high=max( + countries_mapping["single_countries_counts"].max(skipna=True), + countries_mapping["multi_countries_counts"].max(skipna=True)), + nan_color = '#d9d9d9') fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) ax1.set_title("Locations of Single-Country Clinical Trials") - ax1 = gplt.choropleth(countries_mapping, hue = countries_mapping['single_country_counts'], - legend=True, ax=ax1) #countries_mapping.plot(column='single_country_counts', ax=ax1, legend=True) + ax1 = gplt.choropleth(countries_mapping, + projection=geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')), + hue = countries_mapping['single_countries_counts'].dropna(), + legend=True, + ax=ax1) ax2.set_title("Locations of Multi-Country Clinical Trials") - ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_country_counts'], + ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_countries_counts'], legend=True, ax=ax2) ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), xy=(-168, -68)) From 1d1525973cd2ccb371678c658d683668d61be535 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 13:36:07 -0400 Subject: [PATCH 07/16] generate choropleth with geopandas --- ebmdatalab/generate-ebmdatalab-stats.py | 27 ++++++++++--------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py 
b/ebmdatalab/generate-ebmdatalab-stats.py index 58f96b8f6..f1f0230f5 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -189,29 +189,24 @@ def main(args): # Map frequency data onto the geopandas geographical data for units with ISO code # geopandas uses -99 as N/A for this field countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] for count_data in [single_countries_counts, multi_countries_counts]: countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), how="left", left_on="iso_a3", right_index=True) # Generate two-part choropleth visualizing world map with number of clinical trial data counted - color_palette = LinearColorMapper(palette=palettes.Magma[256], - low=1, - high=max( - countries_mapping["single_countries_counts"].max(skipna=True), - countries_mapping["multi_countries_counts"].max(skipna=True)), - nan_color = '#d9d9d9') fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) - ax1.set_title("Locations of Single-Country Clinical Trials") - ax1 = gplt.choropleth(countries_mapping, - projection=geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')), - hue = countries_mapping['single_countries_counts'].dropna(), - legend=True, - ax=ax1) - ax2.set_title("Locations of Multi-Country Clinical Trials") - ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_countries_counts'], - legend=True, ax=ax2) + fig.patch.set_visible(False) + ax1.axis('off') + ax2.axis('off') + countries_mapping.boundary.plot(ax=ax1, edgecolor="black") + countries_mapping.plot(column='single_countries_counts', ax=ax1, legend=True) + ax1.set_title("Number of Single-Country Clinical Trials Recruiting by Country") + countries_mapping.boundary.plot(ax=ax2, edgecolor="black") + countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") + ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), - xy=(-168, -68)) + xy=(-10, -10)) plt.savefig(args.output_map + '.png', bbox_inches = "tight") plt.savefig(args.output_map + '.svg', bbox_inches = "tight") From dc38c1ad1e0b66074caa2641868267bd98005ff2 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 14:07:13 -0400 Subject: [PATCH 08/16] clean up code and fig --- ebmdatalab/generate-ebmdatalab-stats.py | 44 +++++++++++++++---------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index f1f0230f5..1e60e3b93 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -7,10 +7,7 @@ import urllib.request import geopandas import pycountry -import geoplot as gplt from datetime import date -from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar -from bokeh import palettes from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -42,9 +39,13 @@ def extract_citekey(results_url): return citekey def assign_ISO(countries): - # Match country names with ISO codes - # Input: pd.Series of country names - # Returns: dictionary of matches + """ + Match country names with ISO codes + Input: series of country names + Returns: dictionary of 
matches + :type countries: pd.Series + """ + # Need to hard code a few countries that aren't registered using standard names, so # initializing the single_country_codes database with these values @@ -57,13 +58,13 @@ def assign_ISO(countries): if country not in country_codes.keys(): try: hit = pycountry.countries.get(name=country) - if hit == None: + if hit is None: # If the name isn't an exact match, try alternatives # .search_fuzzy matching returns a list, whereas .get retrieves data as class Country hit = pycountry.countries.search_fuzzy(country) if len(hit) > 1: hit = pycountry.countries.search_fuzzy(country + ",") - elif type(hit) == None: + elif type(hit) is None: hit = pycountry.countries.get(official_name=country) except LookupError: failed_matches.append(country) @@ -71,14 +72,14 @@ def assign_ISO(countries): if type(hit) == list and len(hit) == 1: country_codes[country] = hit[0].alpha_3 - elif type(hit) == list or type(hit) == None: + elif type(hit) == list or type(hit) is None: failed_matches.append(country) else: country_codes[country] = hit.alpha_3 # Print warning about failures and return successes as dictionary - print("Failed to assign country codes to:", ", ".join(failed_matches)) - return(country_codes) + print("Could not assign country codes to:", ", ".join(failed_matches)) + return country_codes # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): @@ -163,8 +164,9 @@ def main(args): print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - # Identify the names of each country in single-country and multi-country clinical trials - # Multi refers to trials that have multiple country names, comma-separated + # Clean and separate the names of each country in single-country and multi-country clinical trials + # Single-country trials have only a single name (string) in the `countries` field + # Multi refers to trials that have multiple names, comma-separated # One trial lists every country on Earth and formatted the data inconsistently, so drop it valid_country = trials_df[trials_df['countries'] != "No Country Given"] valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] @@ -175,24 +177,31 @@ def main(args): ) # Identify the 3-letter ISO codes for each unique country + # Remove any leading/trailing whitespace that may result from splitting above unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), orient="index", columns=["iso_a3"]) # Map the ISO codes onto the country data and count the frequency - single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = pd.DataFrame(single_countries, + index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() single_countries_counts = single_countries_codes.value_counts().rename("single_countries_counts") - multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") # Map frequency data onto the geopandas geographical data for units with ISO 
code # geopandas uses -99 as N/A for this field + # We don't need to evaluate Antarctica countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] for count_data in [single_countries_counts, multi_countries_counts]: - countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), how="left", left_on="iso_a3", right_index=True) + countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), + how="left", + left_on="iso_a3", + right_index=True) # Generate two-part choropleth visualizing world map with number of clinical trial data counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) @@ -206,13 +215,12 @@ def main(args): countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), - xy=(-10, -10)) + xy=(0,0), xycoords="axes points") plt.savefig(args.output_map + '.png', bbox_inches = "tight") plt.savefig(args.output_map + '.svg', bbox_inches = "tight") print(f'Wrote {args.output_map}.png and {args.output_map}.svg') - exit(0) # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed From fd738427e7671bfae42d2f580ad91bb7ecffe78c Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 14:26:32 -0400 Subject: [PATCH 09/16] update environment.yml --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 61204fcd9..068f1b720 100644 --- a/environment.yml +++ b/environment.yml @@ -6,5 +6,7 @@ dependencies: - pandas=1.0.3 - pip=20.0 - python=3.7.6 + - geopandas=0.8.1 + - pycountry==20.7.3 - pip: - git+https://github.com/manubot/manubot@a57ccf0be6972329ff3010eaaa0c5df7ccebb2d5 From f7db92c83fd5f017d070d7609dc49f50146219b3 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 14:52:19 -0400 Subject: [PATCH 10/16] linted --- ebmdatalab/generate-ebmdatalab-stats.py | 70 ++++++++++++++----------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 1e60e3b93..31436927a 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -1,13 +1,13 @@ import argparse import datetime import json +from datetime import date import matplotlib.pyplot as plt import os import pandas as pd import urllib.request import geopandas import pycountry -from datetime import date from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -21,7 +21,7 @@ def convert_date(git_date): # by the parser # https://en.wikipedia.org/wiki/ISO_8601#Coordinated_Universal_Time_(UTC) git_date = git_date.replace('Z', '+00:00') - + # Remove the leading zero of the day # Assumes the year will not begin with 0 return datetime.datetime.fromisoformat(git_date).strftime('%B %d, %Y').replace(' 0', ' ') @@ -59,8 +59,9 @@ def assign_ISO(countries): try: hit = pycountry.countries.get(name=country) if hit is None: - # If the name isn't an exact match, try alternatives - # .search_fuzzy matching returns a list, whereas .get retrieves data as class Country + # .get retrieves data as class Country + # if it can't find a match, try 
alternative methods + # .search_fuzzy matching returns a list hit = pycountry.countries.search_fuzzy(country) if len(hit) > 1: hit = pycountry.countries.search_fuzzy(country + ",") @@ -109,27 +110,27 @@ def main(args): assert (len(header) == len(trials_df.columns)) trials_df.columns = header trials_df = trials_df.set_index('index') - + ebm_stats['ebm_trials'] = f'{len(trials_df.index):,}' - + # Get the most recent trial update most_recent_update = pd.to_datetime(trials_df['last_updated']).max() # Remove the leading zero of the day # Assumes the year will not begin with 0 most_recent_update = most_recent_update.strftime('%B %d, %Y').replace(' 0', ' ') ebm_stats['ebm_date_pretty'] = most_recent_update - + trial_results = trials_df[trials_df['results_url'] != 'No Results']['results_url'] ebm_stats['ebm_trials_results'] = f'{len(trial_results):,}' - + # Some results entries have multiple URLs trial_results_citekeys = [extract_citekey(results_url) for results in trial_results for results_url in results.split()] ebm_stats['ebm_trials_results_citekeys'] = sorted(set(trial_results_citekeys)) - + plt.rc('font', size=14) plt.rc('figure', titlesize=24) fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 12), constrained_layout=True) - + # Plot trial recruitment status # Only include trials with a recruitment status recruitment_counts = trials_df['recruitment_status'].value_counts(ascending=True) @@ -143,14 +144,14 @@ def main(args): phase_counts = phase_counts.drop(labels='Not Applicable') ax = phase_counts.plot(kind='barh', ax=axes[0, 1]) ax.set_title('Clinical trials phase') - + # Plot study type # Only include study types used in >= 5 trials study_type_counts = trials_df['study_type'].value_counts(ascending=True) study_type_counts = study_type_counts[study_type_counts >= 5] ax = study_type_counts.plot(kind='barh', ax=axes[1, 0]) ax.set_title('Clinical trials study type') - + # Plot common interventions # Only include trials with an intervention and interventions in >= 10 trials intervention_counts = trials_df['intervention'].value_counts(ascending=True) @@ -158,16 +159,17 @@ def main(args): intervention_counts = intervention_counts[intervention_counts >= 10] ax = intervention_counts.plot(kind='barh', ax=axes[1, 1]) ax.set_title('Clinical trials common interventions') - + fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - - # Clean and separate the names of each country in single-country and multi-country clinical trials - # Single-country trials have only a single name (string) in the `countries` field - # Multi refers to trials that have multiple names, comma-separated - # One trial lists every country on Earth and formatted the data inconsistently, so drop it + + # Clean and separate the names of each country in single-country and + # multi-country clinical trials. Single-country trials have only a single + # name (string) in the `countries` field. Multi-country trials have + # multiple comma-separated names. + # Drop 1 trial that lists every country. 
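
As an aside to the pycountry-based matching that assign_ISO performs in the patches above, the short standalone sketch below (an editor's illustration, not part of any diff) shows the exact-name lookup with a fuzzy-match fallback; the helper name iso3 and the example country names are illustrative assumptions, not code from the repository.

import pycountry

def iso3(name):
    """Best-guess ISO alpha-3 code for a country name, or None if unresolved."""
    match = pycountry.countries.get(name=name)  # exact common-name lookup; None if absent
    if match is None:
        try:
            candidates = pycountry.countries.search_fuzzy(name)  # returns a list
        except LookupError:
            return None
        if len(candidates) != 1:
            return None  # ambiguous; assign_ISO records these as failed matches
        match = candidates[0]
    return match.alpha_3

print(iso3("Japan"))        # JPN
print(iso3("South Korea"))  # the patch hard-codes "KOR" rather than rely on fuzzy matching
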
valid_country = trials_df[trials_df['countries'] != "No Country Given"] valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] @@ -179,15 +181,19 @@ def main(args): # Identify the 3-letter ISO codes for each unique country # Remove any leading/trailing whitespace that may result from splitting above unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() - country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), orient="index", columns=["iso_a3"]) + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), + orient="index", + columns=["iso_a3"]) # Map the ISO codes onto the country data and count the frequency single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts().rename("single_countries_counts") - multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), - index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + single_countries_counts = \ + single_countries_codes.value_counts().rename("single_countries_counts") + multi_countries_codes = \ + pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") @@ -203,7 +209,7 @@ def main(args): left_on="iso_a3", right_index=True) - # Generate two-part choropleth visualizing world map with number of clinical trial data counted + # Generate two-part choropleth of world map with # of clinical trials counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) fig.patch.set_visible(False) ax1.axis('off') @@ -214,22 +220,24 @@ def main(args): countries_mapping.boundary.plot(ax=ax2, edgecolor="black") countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") - ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), - xy=(0,0), xycoords="axes points") + ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % + date.today().strftime("%b-%d-%Y"), + xy=(0, 0), xycoords="axes points") - plt.savefig(args.output_map + '.png', bbox_inches = "tight") - plt.savefig(args.output_map + '.svg', bbox_inches = "tight") + plt.savefig(args.output_map + '.png', bbox_inches="tight") + plt.savefig(args.output_map + '.svg', bbox_inches="tight") print(f'Wrote {args.output_map}.png and {args.output_map}.svg') - + # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' - + # Tabulate number of trials for pharmaceuticals of interest - ebm_stats['ebm_tocilizumab_ct'] = str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) - + ebm_stats['ebm_tocilizumab_ct'] = \ + str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) + with open(args.output_json, 'w') as out_file: json.dump(ebm_stats, out_file, indent=2, sort_keys=True) print(f'Wrote {args.output_json}') From 8e034e5e48cbec002d4589921dcd0be52eb0d837 Mon Sep 17 
00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 15:03:32 -0400 Subject: [PATCH 11/16] remove extra newline --- ebmdatalab/generate-ebmdatalab-stats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 31436927a..476dd2ff8 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -46,7 +46,6 @@ def assign_ISO(countries): :type countries: pd.Series """ - # Need to hard code a few countries that aren't registered using standard names, so # initializing the single_country_codes database with these values country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", From a626b18078bc0f5c248e4cad5c5fb1237eba03c0 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 18:08:06 -0400 Subject: [PATCH 12/16] fix issue with geopandas world dataset --- ebmdatalab/generate-ebmdatalab-stats.py | 41 +++++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 31436927a..3ba205c6a 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -48,7 +48,7 @@ def assign_ISO(countries): # Need to hard code a few countries that aren't registered using standard names, so - # initializing the single_country_codes database with these values + # initializing the country_codes database with these irregular values country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", "Democratic Republic of the Congo": "COD"} @@ -82,6 +82,21 @@ def assign_ISO(countries): print("Could not assign country codes to:", ", ".join(failed_matches)) return country_codes +def lowres_fix(world): + """ + There is an issue with the map data source from geopandas where ISO codes are missing + for several countries. 
This fix was proposed by @tommycarstensen at + https://github.com/geopandas/geopandas/issues/1041 + + :param world: dataframe (read in with geopandas) + :return: dataframe (geopandas formatted) + """ + world.loc[world['name'] == 'France', 'iso_a3'] = 'FRA' + world.loc[world['name'] == 'Norway', 'iso_a3'] = 'NOR' + world.loc[world['name'] == 'Somaliland', 'iso_a3'] = 'SOM' + world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' + return world + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' @@ -189,25 +204,31 @@ def main(args): single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = \ - single_countries_codes.value_counts().rename("single_countries_counts") + single_countries_counts = single_countries_codes.value_counts() + #single_countries_counts = single_countries_counts.rename("single_countries_counts") multi_countries_codes = \ pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") + multi_countries_counts = multi_countries_codes.value_counts() + #multi_countries_counts = multi_countries_counts.rename("multi_countries_counts") + all_counts = single_countries_counts.\ + to_frame(name = 'single_countries_counts').\ + merge(multi_countries_counts.to_frame(name='multi_countries_counts'), + how="outer", + left_index=True, + right_index=True) # Map frequency data onto the geopandas geographical data for units with ISO code # geopandas uses -99 as N/A for this field # We don't need to evaluate Antarctica countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] - countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] - for count_data in [single_countries_counts, multi_countries_counts]: - countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), - how="left", - left_on="iso_a3", - right_index=True) + countries_mapping = lowres_fix(countries_mapping) + countries_mapping = countries_mapping.merge(all_counts, + how="left", + left_on="iso_a3", + right_index=True) # Generate two-part choropleth of world map with # of clinical trials counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) From 559e2c3775ee00579ef0fbee309502e62e75a66e Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 18:31:42 -0400 Subject: [PATCH 13/16] linter --- ebmdatalab/generate-ebmdatalab-stats.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index a76282c8d..ce3b6c391 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -5,7 +5,6 @@ import matplotlib.pyplot as plt import os import pandas as pd -import urllib.request import geopandas import pycountry @@ -57,22 +56,22 @@ def assign_ISO(countries): if country not in country_codes.keys(): try: hit = pycountry.countries.get(name=country) - if hit is None: + if isinstance(hit, type(None)): # .get retrieves data as class Country # if 
it can't find a match, try alternative methods # .search_fuzzy matching returns a list hit = pycountry.countries.search_fuzzy(country) if len(hit) > 1: hit = pycountry.countries.search_fuzzy(country + ",") - elif type(hit) is None: + elif isinstance(hit, type(None)): hit = pycountry.countries.get(official_name=country) except LookupError: failed_matches.append(country) continue - if type(hit) == list and len(hit) == 1: + if isinstance(hit, list) and len(hit) == 1: country_codes[country] = hit[0].alpha_3 - elif type(hit) == list or type(hit) is None: + elif isinstance(hit, (list, type(None))): failed_matches.append(country) else: country_codes[country] = hit.alpha_3 @@ -204,15 +203,12 @@ def main(args): index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() single_countries_counts = single_countries_codes.value_counts() - #single_countries_counts = single_countries_counts.rename("single_countries_counts") multi_countries_codes = \ pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() multi_countries_counts = multi_countries_codes.value_counts() - #multi_countries_counts = multi_countries_counts.rename("multi_countries_counts") - all_counts = single_countries_counts.\ - to_frame(name = 'single_countries_counts').\ + all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ merge(multi_countries_counts.to_frame(name='multi_countries_counts'), how="outer", left_index=True, @@ -222,12 +218,13 @@ def main(args): # geopandas uses -99 as N/A for this field # We don't need to evaluate Antarctica countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] countries_mapping = lowres_fix(countries_mapping) + countries_mapping = countries_mapping[(countries_mapping.name != "Antarctica") & + (countries_mapping.iso_a3 != "-99")] countries_mapping = countries_mapping.merge(all_counts, how="left", - left_on="iso_a3", - right_index=True) + right_index=True, + left_on="iso_a3") # Generate two-part choropleth of world map with # of clinical trials counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) @@ -279,7 +276,7 @@ def main(args): 'as .png and .svg.', type=str) parser.add_argument('output_map', - help='Path of the output choropleth (world map figure) ' \ + help='Path of the output choropleth (world map figure) ' \ 'with geographic clinical trial frequencies, without file ' \ 'type extension. 
Will be saved as .png and .svg.', type=str) From 22440a88dac2401b40d9d357f3beda7bf9c28c76 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Thu, 13 Aug 2020 19:11:05 -0400 Subject: [PATCH 14/16] handle exceptions --- ebmdatalab/generate-ebmdatalab-stats.py | 89 ++++++++++++++++--------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index ce3b6c391..c168f672f 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -37,53 +37,81 @@ def extract_citekey(results_url): citekey = short_doi_url.replace('https://doi.org', 'doi:10') return citekey + +def check_none(value): + """Raises ValueError if value is type None, else returns value""" + if isinstance(value, type(None)): + raise ValueError + return value + + +def find_country(country): + """ .get retrieves data as class Country + .search_fuzzy matching returns a list + try a few ways to identify a match and + return as soon as find something valid + Input is country name (string) + If no match found, return None""" + try: + hit = pycountry.countries.get(name=country) + hit = check_none(hit) + return hit + except (LookupError, ValueError): + try: + hit = pycountry.countries.get(official_name=country) + hit = check_none(hit) + return hit + except (LookupError, ValueError): + try: + hit = pycountry.countries.search_fuzzy(country) + hit = check_none(hit) + if type(hit) == list and len(hit) == 1: + return hit[0] + raise ValueError + except (LookupError, ValueError): + try: + hit = pycountry.countries.search_fuzzy(country + ",") + hit = check_none(hit) + if isinstance(hit, list) and len(hit) == 1: + return hit[0] + else: + raise ValueError + except (LookupError, ValueError): + return None + + def assign_ISO(countries): - """ - Match country names with ISO codes + """ Match country names with ISO codes Input: series of country names Returns: dictionary of matches - :type countries: pd.Series - """ - + :type countries: pd.Series """ # Need to hard code a few countries that aren't registered using standard names, so # initializing the country_codes database with these irregular values country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", - "Democratic Republic of the Congo": "COD"} + "Democratic Republic of the Congo": "COD", "UAE": "ARE"} # Identify the most likely 3-letter ISO code for each country failed_matches = list() for country in countries: if country not in country_codes.keys(): - try: - hit = pycountry.countries.get(name=country) - if isinstance(hit, type(None)): - # .get retrieves data as class Country - # if it can't find a match, try alternative methods - # .search_fuzzy matching returns a list - hit = pycountry.countries.search_fuzzy(country) - if len(hit) > 1: - hit = pycountry.countries.search_fuzzy(country + ",") - elif isinstance(hit, type(None)): - hit = pycountry.countries.get(official_name=country) - except LookupError: - failed_matches.append(country) - continue - - if isinstance(hit, list) and len(hit) == 1: - country_codes[country] = hit[0].alpha_3 - elif isinstance(hit, (list, type(None))): - failed_matches.append(country) - else: + # Need to query the pycountry package but it can fail for a + # few reasons. 
Use function to avoid LookupError issues and + # try all the different ways that might help to match a + # country name to its ISO code + hit = find_country(country) + if not isinstance(hit, type(None)): country_codes[country] = hit.alpha_3 - + else: + failed_matches.append(country) # Print warning about failures and return successes as dictionary print("Could not assign country codes to:", ", ".join(failed_matches)) return country_codes + def lowres_fix(world): - """ - There is an issue with the map data source from geopandas where ISO codes are missing - for several countries. This fix was proposed by @tommycarstensen at + """There is an issue with the map data source from geopandas where + ISO codes are missing for several countries. This fix was proposed + by @tommycarstensen at https://github.com/geopandas/geopandas/issues/1041 :param world: dataframe (read in with geopandas) @@ -95,6 +123,7 @@ def lowres_fix(world): world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' return world + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' From a47021c740a6b7b855f04b43cf0feebb583f6933 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Fri, 14 Aug 2020 10:13:23 -0400 Subject: [PATCH 15/16] Apply @agitter's suggestions Co-authored-by: Anthony Gitter --- ebmdatalab/generate-ebmdatalab-stats.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index c168f672f..486fc2583 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -104,7 +104,7 @@ def assign_ISO(countries): else: failed_matches.append(country) # Print warning about failures and return successes as dictionary - print("Could not assign country codes to:", ", ".join(failed_matches)) + print("Could not assign country codes to: ", ", ".join(failed_matches)) return country_codes @@ -279,7 +279,8 @@ def main(args): # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' - +ebm_stats['ebm_map_figure'] = \ + f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_map}.svg' # Tabulate number of trials for pharmaceuticals of interest ebm_stats['ebm_tocilizumab_ct'] = \ str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) From 9bd74236495fd67674c561deb92774128304c814 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Fri, 14 Aug 2020 11:00:11 -0400 Subject: [PATCH 16/16] rearrange blocks to keep plots together --- ebmdatalab/generate-ebmdatalab-stats.py | 77 +++++++++++++------------ 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 486fc2583..86e9dbf28 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -123,6 +123,44 @@ def lowres_fix(world): world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' return world +def tabulate_countries(trials_df): + # Clean and separate the names of each country in single-country and + # multi-country clinical trials. Single-country trials have only a single + # name (string) in the `countries` field. Multi-country trials have + # multiple comma-separated names. 
+ # Drop 1 trial that lists every country. + valid_country = trials_df[trials_df['countries'] != "No Country Given"] + valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] + single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] + multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] + multi_countries = pd.Series( + [country for country_list in multi_countries.str.split(',') for country in country_list] + ) + + # Identify the 3-letter ISO codes for each unique country + # Remove any leading/trailing whitespace that may result from splitting above + unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), + orient="index", + columns=["iso_a3"]) + + # Map the ISO codes onto the country data and count the frequency + single_countries_codes = pd.DataFrame(single_countries, + index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = single_countries_codes.dropna() + single_countries_counts = single_countries_codes.value_counts() + multi_countries_codes = \ + pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = multi_countries_codes.dropna() + multi_countries_counts = multi_countries_codes.value_counts() + all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ + merge(multi_countries_counts.to_frame(name='multi_countries_counts'), + how="outer", + left_index=True, + right_index=True) + return all_counts + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): @@ -207,41 +245,8 @@ def main(args): print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - # Clean and separate the names of each country in single-country and - # multi-country clinical trials. Single-country trials have only a single - # name (string) in the `countries` field. Multi-country trials have - # multiple comma-separated names. - # Drop 1 trial that lists every country. 
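
As a side note to the tabulate_countries helper introduced above, this toy sketch (separate from the patch series) shows the value_counts plus outer-merge pattern it relies on; the ISO codes and counts are invented for illustration.

import pandas as pd

single = pd.Series(["USA", "USA", "FRA"])  # ISO codes from single-country trials (made up)
multi = pd.Series(["USA", "DEU"])          # ISO codes from multi-country trials (made up)

all_counts = (
    single.value_counts().to_frame(name="single_countries_counts")
    .merge(multi.value_counts().to_frame(name="multi_countries_counts"),
           how="outer", left_index=True, right_index=True)
)
print(all_counts)  # countries present in only one series get NaN in the other column
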
- valid_country = trials_df[trials_df['countries'] != "No Country Given"] - valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] - single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] - multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] - multi_countries = pd.Series( - [country for country_list in multi_countries.str.split(',') for country in country_list] - ) - - # Identify the 3-letter ISO codes for each unique country - # Remove any leading/trailing whitespace that may result from splitting above - unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() - country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), - orient="index", - columns=["iso_a3"]) - - # Map the ISO codes onto the country data and count the frequency - single_countries_codes = pd.DataFrame(single_countries, - index=single_countries).join(country_codes)["iso_a3"] - single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts() - multi_countries_codes = \ - pd.DataFrame(multi_countries.str.strip(), - index=multi_countries.str.strip()).join(country_codes)["iso_a3"] - multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts() - all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ - merge(multi_countries_counts.to_frame(name='multi_countries_counts'), - how="outer", - left_index=True, - right_index=True) + # Count geographic representation by ISO3 code + all_counts = tabulate_countries(trials_df) # Map frequency data onto the geopandas geographical data for units with ISO code # geopandas uses -99 as N/A for this field @@ -279,7 +284,7 @@ def main(args): # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' -ebm_stats['ebm_map_figure'] = \ + ebm_stats['ebm_map_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_map}.svg' # Tabulate number of trials for pharmaceuticals of interest ebm_stats['ebm_tocilizumab_ct'] = \
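
To close, a minimal standalone sketch of the merge-then-plot choropleth pattern the final script uses, assuming geopandas 0.8.x (where the bundled naturalearth_lowres dataset used by the patches is still available); the counts, output file name, and figure title are illustrative only.

import geopandas
import matplotlib.pyplot as plt
import pandas as pd

# Made-up counts keyed by ISO alpha-3, standing in for tabulate_countries() output
counts = pd.DataFrame({"single_countries_counts": {"USA": 120, "GBR": 40, "IND": 55}})

world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
# The patches' lowres_fix() repairs missing iso_a3 codes (France, Norway, etc.) here;
# this sketch simply drops Antarctica and any remaining -99 placeholders
world = world[(world.name != "Antarctica") & (world.iso_a3 != "-99")]
world = world.merge(counts, how="left", left_on="iso_a3", right_index=True)

fig, ax = plt.subplots(figsize=(12, 6))
ax.axis("off")
world.boundary.plot(ax=ax, edgecolor="black", linewidth=0.3)
world.plot(column="single_countries_counts", ax=ax, legend=True)
ax.set_title("Single-country clinical trials per country (illustrative data)")
fig.savefig("choropleth-sketch.png", bbox_inches="tight")

Countries absent from the counts frame are left unfilled by the left merge, which is the same behavior the two-panel figure in the final patch depends on.
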