From 37b47967b5bfd91b04cd35026bc25d0728d5b061 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 15:05:59 -0400 Subject: [PATCH 01/16] wrangle country data --- ebmdatalab/generate-ebmdatalab-stats.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index cac4c5026..09969971c 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -5,6 +5,7 @@ import os import pandas as pd import urllib.request +import geopandas from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -116,6 +117,22 @@ def main(args): fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") + # Identify frequencies of each country in single-country and multi-country clinical trials + multi_countries = trials_df['countries'][trials_df['countries'].str.contains(',')] + multi_countries = pd.Series([country for country_list in multi_countries.str.split(',') for country in country_list]) + multi_country_counts = multi_countries.value_counts() + + single_countries = trials_df['countries'][~trials_df['countries'].str.contains(',')] + single_country_counts = single_countries.value_counts() + single_country_counts = single_country_counts.drop(labels='No Country Given') + + # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data + world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + print(single_country_counts.index) + print(world.index) + #fig, ax = plt.subplots(1, 1) + + exit(0) print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') # The placeholder will be replaced by the actual SHA-1 hash in separate From df5e59cbf4f694f615a750cfc3bb2821afa2f0e5 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 15:35:19 -0400 Subject: [PATCH 02/16] test pycountry --- ebmdatalab/generate-ebmdatalab-stats.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 09969971c..07eecda59 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -6,6 +6,7 @@ import pandas as pd import urllib.request import geopandas +import pycountry from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -126,10 +127,24 @@ def main(args): single_country_counts = single_countries.value_counts() single_country_counts = single_country_counts.drop(labels='No Country Given') + # Match country names in EBM data with ISO codes (more stable than names) + for c in single_country_counts.index: + print(c) + try: + code = pycountry.countries.get(name=c).alpha_3 + except LookupError(): + hits = pycountry.countries.search_fuzzy(c) + if len(hits) == 1: + code = + print(single_country_codes) # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) print(single_country_counts.index) - print(world.index) + print(world.head()) + + for unmatched_country in single_country_counts.index[~single_country_counts.index.isin(world["name"])]: + print(pycountry.countries.get(name=unmatched_country)) + #fig, ax = plt.subplots(1, 1) exit(0) From 76ff8b845d284e7be0d5bca2a31a7f2ac0509dc5 Mon Sep 17 
00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 18:56:54 -0400 Subject: [PATCH 03/16] data cleaning --- ebmdatalab/generate-ebmdatalab-stats.py | 86 +++++++++++++++++++------ 1 file changed, 65 insertions(+), 21 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 07eecda59..7fd3ad764 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -37,6 +37,45 @@ def extract_citekey(results_url): citekey = short_doi_url.replace('https://doi.org', 'doi:10') return citekey +def assign_ISO(countries): + # Match country names with ISO codes + # Input: pd.Series of country names + # Returns: dictionary of matches + + # Need to hard code a few countries that aren't registered using standard names, so + # initializing the single_country_codes database with these values + country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", + "Democratic Republic of the Congo": "COD"} + + # Identify the most likely 3-letter ISO code for each country + failed_matches = list() + for country in countries: + if country not in country_codes.keys(): + try: + hit = pycountry.countries.get(name=country) + if hit == None: + # If the name isn't an exact match, try alternatives + # .search_fuzzy matching returns a list, whereas .get retrieves data as class Country + hit = pycountry.countries.search_fuzzy(country) + if len(hit) > 1: + hit = pycountry.countries.search_fuzzy(country + ",") + elif type(hit) == None: + hit = pycountry.countries.get(official_name=country) + except LookupError: + failed_matches.append(country) + continue + + if type(hit) == list and len(hit) == 1: + country_codes[country] = hit[0].alpha_3 + elif type(hit) == list or type(hit) == None: + failed_matches.append(country) + else: + country_codes[country] = hit.alpha_3 + + # Print warning about failures and return successes as dictionary + print("Failed to assign country codes to:", ", ".join(failed_matches)) + return(country_codes) + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' @@ -118,29 +157,34 @@ def main(args): fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") - # Identify frequencies of each country in single-country and multi-country clinical trials - multi_countries = trials_df['countries'][trials_df['countries'].str.contains(',')] - multi_countries = pd.Series([country for country_list in multi_countries.str.split(',') for country in country_list]) - multi_country_counts = multi_countries.value_counts() - - single_countries = trials_df['countries'][~trials_df['countries'].str.contains(',')] - single_country_counts = single_countries.value_counts() - single_country_counts = single_country_counts.drop(labels='No Country Given') - - # Match country names in EBM data with ISO codes (more stable than names) - for c in single_country_counts.index: - print(c) - try: - code = pycountry.countries.get(name=c).alpha_3 - except LookupError(): - hits = pycountry.countries.search_fuzzy(c) - if len(hits) == 1: - code = - print(single_country_codes) + # Identify the names of each country in single-country and multi-country clinical trials + # Multi refers to trials that have multiple country names, comma-separated + # One trial lists every country on Earth and formatted the data inconsistently, 
so drop it + valid_country = trials_df[trials_df['countries'] != "No Country Given"] + valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] + single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] + multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] + multi_countries = pd.Series( + [country for country_list in multi_countries.str.split(',') for country in country_list] + ) + + # Identify the 3-letter ISO codes for each unique country + unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() + country_codes = assign_ISO(unique_countries) + + #multi_countries_codes = pd.DataFrame(multi_countries).join( + print(pd.DataFrame.from_dict(country_codes, orient="index") ) + print(multi_countries.index) + #, columns=["countries", "code"]), on="countries") + #print(multi_countries_codes) + #multi_country_counts = multi_countries.value_counts() + #single_country_codes = assign_ISO(single_countries) + #single_country_counts = pd.DataFrame(single_countries .value_counts() + #print(single_country_counts) + exit(0) + # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - print(single_country_counts.index) - print(world.head()) for unmatched_country in single_country_counts.index[~single_country_counts.index.isin(world["name"])]: print(pycountry.countries.get(name=unmatched_country)) From e7df1d8a8cc0fc851b316cbfbff74a34b5130298 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Tue, 4 Aug 2020 21:40:57 -0400 Subject: [PATCH 04/16] attempt to merge df, still buggy --- ebmdatalab/generate-ebmdatalab-stats.py | 33 +++++++++++-------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 7fd3ad764..06a77cb2a 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -170,28 +170,23 @@ def main(args): # Identify the 3-letter ISO codes for each unique country unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() - country_codes = assign_ISO(unique_countries) - - #multi_countries_codes = pd.DataFrame(multi_countries).join( - print(pd.DataFrame.from_dict(country_codes, orient="index") ) - print(multi_countries.index) - #, columns=["countries", "code"]), on="countries") - #print(multi_countries_codes) - #multi_country_counts = multi_countries.value_counts() - #single_country_codes = assign_ISO(single_countries) - #single_country_counts = pd.DataFrame(single_countries .value_counts() - #print(single_country_counts) - exit(0) + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), orient="index", columns=["iso_a3"]) - # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data - world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + # Map the ISO codes onto the country data and count the frequency + single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = single_countries_codes.dropna() + single_countries_counts = single_countries_codes.value_counts() + multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = 
multi_countries_codes.dropna() + multi_countries_counts = multi_countries_codes.value_counts() - for unmatched_country in single_country_counts.index[~single_country_counts.index.isin(world["name"])]: - print(pycountry.countries.get(name=unmatched_country)) - + # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data + world_data = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')).set_index("iso_a3") + countries_mapping = world_data.join(pd.DataFrame(single_countries_counts)) #.join(multi_countries_counts) + print(countries_mapping) #fig, ax = plt.subplots(1, 1) - - exit(0) + #print(pd.DataFrame(multi_countries_counts, index=multi_countries_counts["iso_a3"]).join(world, on="iso_a3")) + print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') # The placeholder will be replaced by the actual SHA-1 hash in separate From 1019feb6099531aec1b9856de42f692a00e5643d Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 09:41:28 -0400 Subject: [PATCH 05/16] generate side-by-side choropleths --- ebmdatalab/generate-ebmdatalab-stats.py | 42 +++++++++++++++++++------ ebmdatalab/generate-ebmdatalab-stats.sh | 3 +- 2 files changed, 35 insertions(+), 10 deletions(-) mode change 100644 => 100755 ebmdatalab/generate-ebmdatalab-stats.sh diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 06a77cb2a..942f75314 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -7,6 +7,8 @@ import urllib.request import geopandas import pycountry +import geoplot as gplt +from datetime import date from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -156,6 +158,8 @@ def main(args): fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") + + print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') # Identify the names of each country in single-country and multi-country clinical trials # Multi refers to trials that have multiple country names, comma-separated @@ -175,19 +179,34 @@ def main(args): # Map the ISO codes onto the country data and count the frequency single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts() + single_countries_counts = single_countries_codes.value_counts().rename("single_country_counts") + multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts() + multi_countries_counts = multi_countries_codes.value_counts().rename("multi_country_counts") - # Generate two-pane choropleth visualizing world map with number of representations in clinical trial data + # Map frequency data onto the geopandas geographical data world_data = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')).set_index("iso_a3") - countries_mapping = world_data.join(pd.DataFrame(single_countries_counts)) #.join(multi_countries_counts) - print(countries_mapping) - #fig, ax = plt.subplots(1, 1) - #print(pd.DataFrame(multi_countries_counts, index=multi_countries_counts["iso_a3"]).join(world, on="iso_a3")) - - print(f'Wrote {args.output_figure}.png and 
{args.output_figure}.svg') + countries_mapping = world_data.merge( + pd.DataFrame(single_countries_counts), how="inner", left_index=True, right_index=True).merge( + pd.DataFrame(multi_countries_counts), how="inner", left_index=True, right_index=True) + + # Generate two-part choropleth visualizing world map with number of clinical trial data counted + fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) + ax1.set_title("Locations of Single-Country Clinical Trials") + ax1 = gplt.choropleth(countries_mapping, hue = countries_mapping['single_country_counts'], + legend=True, ax=ax1) #countries_mapping.plot(column='single_country_counts', ax=ax1, legend=True) + ax2.set_title("Locations of Multi-Country Clinical Trials") + ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_country_counts'], + legend=True, ax=ax2) + ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), + xy=(-168, -68)) + + plt.savefig(args.output_map + '.png', bbox_inches = "tight") + plt.savefig(args.output_map + '.svg', bbox_inches = "tight") + + print(f'Wrote {args.output_map}.png and {args.output_map}.svg') + exit(0) # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed @@ -217,6 +236,11 @@ def main(args): 'statistics without file type extension. Will be saved ' \ 'as .png and .svg.', type=str) + parser.add_argument('output_map', + help='Path of the output choropleth (world map figure) ' \ + 'with geographic clinical trial frequencies, without file ' \ + 'type extension. Will be saved as .png and .svg.', + type=str) args = parser.parse_args() main(args) diff --git a/ebmdatalab/generate-ebmdatalab-stats.sh b/ebmdatalab/generate-ebmdatalab-stats.sh old mode 100644 new mode 100755 index 72345c7e6..e845f0b8e --- a/ebmdatalab/generate-ebmdatalab-stats.sh +++ b/ebmdatalab/generate-ebmdatalab-stats.sh @@ -14,6 +14,7 @@ export EBM_COMMIT_DATE=$(echo $EBM_COMMIT_JSON | python -c "import sys, json; pr EBM_INPUT_JSON=ebmdatalab/trials_latest.json EBM_STATS_JSON=ebmdatalab/ebmdatalab-stats.json EBM_FIG=ebmdatalab/ebmdatalab-trials +EBM_MAP=ebmdatalab/ebmdatalab-map echo "Downloading EBM Data Lab COVID-19 TrialsTracker data from commit $EBM_COMMIT_SHA authored $EBM_COMMIT_DATE" curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COMMIT_SHA/$EBM_REPO_PATH > $EBM_INPUT_JSON @@ -22,6 +23,6 @@ curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COM # and run the version-figures.sh script to update the EBM_STATS_JSON with the # versioned figure URL echo "Generating EBM Data Lab COVID-19 TrialsTracker statistics and figure" -python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG +python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG $EBM_MAP rm $EBM_INPUT_JSON From 72834c4aaa93c8ed8edcd383fbbcbb4a8a725392 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 12:37:53 -0400 Subject: [PATCH 06/16] tried to use geoplot, switching back --- ebmdatalab/generate-ebmdatalab-stats.py | 33 ++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 942f75314..58f96b8f6 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -9,6 +9,8 @@ import pycountry import geoplot as gplt from datetime import date +from bokeh.models import 
GeoJSONDataSource, LinearColorMapper, ColorBar +from bokeh import palettes from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -179,25 +181,34 @@ def main(args): # Map the ISO codes onto the country data and count the frequency single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts().rename("single_country_counts") - + single_countries_counts = single_countries_codes.value_counts().rename("single_countries_counts") multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts().rename("multi_country_counts") + multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") - # Map frequency data onto the geopandas geographical data - world_data = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')).set_index("iso_a3") - countries_mapping = world_data.merge( - pd.DataFrame(single_countries_counts), how="inner", left_index=True, right_index=True).merge( - pd.DataFrame(multi_countries_counts), how="inner", left_index=True, right_index=True) + # Map frequency data onto the geopandas geographical data for units with ISO code + # geopandas uses -99 as N/A for this field + countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] + for count_data in [single_countries_counts, multi_countries_counts]: + countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), how="left", left_on="iso_a3", right_index=True) # Generate two-part choropleth visualizing world map with number of clinical trial data counted + color_palette = LinearColorMapper(palette=palettes.Magma[256], + low=1, + high=max( + countries_mapping["single_countries_counts"].max(skipna=True), + countries_mapping["multi_countries_counts"].max(skipna=True)), + nan_color = '#d9d9d9') fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) ax1.set_title("Locations of Single-Country Clinical Trials") - ax1 = gplt.choropleth(countries_mapping, hue = countries_mapping['single_country_counts'], - legend=True, ax=ax1) #countries_mapping.plot(column='single_country_counts', ax=ax1, legend=True) + ax1 = gplt.choropleth(countries_mapping, + projection=geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')), + hue = countries_mapping['single_countries_counts'].dropna(), + legend=True, + ax=ax1) ax2.set_title("Locations of Multi-Country Clinical Trials") - ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_country_counts'], + ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_countries_counts'], legend=True, ax=ax2) ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), xy=(-168, -68)) From 1d1525973cd2ccb371678c658d683668d61be535 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 13:36:07 -0400 Subject: [PATCH 07/16] generate choropleth with geopandas --- ebmdatalab/generate-ebmdatalab-stats.py | 27 ++++++++++--------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py 
b/ebmdatalab/generate-ebmdatalab-stats.py index 58f96b8f6..f1f0230f5 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -189,29 +189,24 @@ def main(args): # Map frequency data onto the geopandas geographical data for units with ISO code # geopandas uses -99 as N/A for this field countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] for count_data in [single_countries_counts, multi_countries_counts]: countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), how="left", left_on="iso_a3", right_index=True) # Generate two-part choropleth visualizing world map with number of clinical trial data counted - color_palette = LinearColorMapper(palette=palettes.Magma[256], - low=1, - high=max( - countries_mapping["single_countries_counts"].max(skipna=True), - countries_mapping["multi_countries_counts"].max(skipna=True)), - nan_color = '#d9d9d9') fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) - ax1.set_title("Locations of Single-Country Clinical Trials") - ax1 = gplt.choropleth(countries_mapping, - projection=geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')), - hue = countries_mapping['single_countries_counts'].dropna(), - legend=True, - ax=ax1) - ax2.set_title("Locations of Multi-Country Clinical Trials") - ax2 = gplt.choropleth(countries_mapping, hue = countries_mapping['multi_countries_counts'], - legend=True, ax=ax2) + fig.patch.set_visible(False) + ax1.axis('off') + ax2.axis('off') + countries_mapping.boundary.plot(ax=ax1, edgecolor="black") + countries_mapping.plot(column='single_countries_counts', ax=ax1, legend=True) + ax1.set_title("Number of Single-Country Clinical Trials Recruiting by Country") + countries_mapping.boundary.plot(ax=ax2, edgecolor="black") + countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") + ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), - xy=(-168, -68)) + xy=(-10, -10)) plt.savefig(args.output_map + '.png', bbox_inches = "tight") plt.savefig(args.output_map + '.svg', bbox_inches = "tight") From dc38c1ad1e0b66074caa2641868267bd98005ff2 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 14:07:13 -0400 Subject: [PATCH 08/16] clean up code and fig --- ebmdatalab/generate-ebmdatalab-stats.py | 44 +++++++++++++++---------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index f1f0230f5..1e60e3b93 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -7,10 +7,7 @@ import urllib.request import geopandas import pycountry -import geoplot as gplt from datetime import date -from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar -from bokeh import palettes from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -42,9 +39,13 @@ def extract_citekey(results_url): return citekey def assign_ISO(countries): - # Match country names with ISO codes - # Input: pd.Series of country names - # Returns: dictionary of matches + """ + Match country names with ISO codes + Input: series of country names + Returns: dictionary of 
matches + :type countries: pd.Series + """ + # Need to hard code a few countries that aren't registered using standard names, so # initializing the single_country_codes database with these values @@ -57,13 +58,13 @@ def assign_ISO(countries): if country not in country_codes.keys(): try: hit = pycountry.countries.get(name=country) - if hit == None: + if hit is None: # If the name isn't an exact match, try alternatives # .search_fuzzy matching returns a list, whereas .get retrieves data as class Country hit = pycountry.countries.search_fuzzy(country) if len(hit) > 1: hit = pycountry.countries.search_fuzzy(country + ",") - elif type(hit) == None: + elif type(hit) is None: hit = pycountry.countries.get(official_name=country) except LookupError: failed_matches.append(country) @@ -71,14 +72,14 @@ def assign_ISO(countries): if type(hit) == list and len(hit) == 1: country_codes[country] = hit[0].alpha_3 - elif type(hit) == list or type(hit) == None: + elif type(hit) == list or type(hit) is None: failed_matches.append(country) else: country_codes[country] = hit.alpha_3 # Print warning about failures and return successes as dictionary - print("Failed to assign country codes to:", ", ".join(failed_matches)) - return(country_codes) + print("Could not assign country codes to:", ", ".join(failed_matches)) + return country_codes # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): @@ -163,8 +164,9 @@ def main(args): print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - # Identify the names of each country in single-country and multi-country clinical trials - # Multi refers to trials that have multiple country names, comma-separated + # Clean and separate the names of each country in single-country and multi-country clinical trials + # Single-country trials have only a single name (string) in the `countries` field + # Multi refers to trials that have multiple names, comma-separated # One trial lists every country on Earth and formatted the data inconsistently, so drop it valid_country = trials_df[trials_df['countries'] != "No Country Given"] valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] @@ -175,24 +177,31 @@ def main(args): ) # Identify the 3-letter ISO codes for each unique country + # Remove any leading/trailing whitespace that may result from splitting above unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), orient="index", columns=["iso_a3"]) # Map the ISO codes onto the country data and count the frequency - single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = pd.DataFrame(single_countries, + index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() single_countries_counts = single_countries_codes.value_counts().rename("single_countries_counts") - multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") # Map frequency data onto the geopandas geographical data for units with ISO 
code # geopandas uses -99 as N/A for this field + # We don't need to evaluate Antarctica countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] for count_data in [single_countries_counts, multi_countries_counts]: - countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), how="left", left_on="iso_a3", right_index=True) + countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), + how="left", + left_on="iso_a3", + right_index=True) # Generate two-part choropleth visualizing world map with number of clinical trial data counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) @@ -206,13 +215,12 @@ def main(args): countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), - xy=(-10, -10)) + xy=(0,0), xycoords="axes points") plt.savefig(args.output_map + '.png', bbox_inches = "tight") plt.savefig(args.output_map + '.svg', bbox_inches = "tight") print(f'Wrote {args.output_map}.png and {args.output_map}.svg') - exit(0) # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed From fd738427e7671bfae42d2f580ad91bb7ecffe78c Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 14:26:32 -0400 Subject: [PATCH 09/16] update environment.yml --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 61204fcd9..068f1b720 100644 --- a/environment.yml +++ b/environment.yml @@ -6,5 +6,7 @@ dependencies: - pandas=1.0.3 - pip=20.0 - python=3.7.6 + - geopandas=0.8.1 + - pycountry==20.7.3 - pip: - git+https://github.com/manubot/manubot@a57ccf0be6972329ff3010eaaa0c5df7ccebb2d5 From f7db92c83fd5f017d070d7609dc49f50146219b3 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 14:52:19 -0400 Subject: [PATCH 10/16] linted --- ebmdatalab/generate-ebmdatalab-stats.py | 70 ++++++++++++++----------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 1e60e3b93..31436927a 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -1,13 +1,13 @@ import argparse import datetime import json +from datetime import date import matplotlib.pyplot as plt import os import pandas as pd import urllib.request import geopandas import pycountry -from datetime import date from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -21,7 +21,7 @@ def convert_date(git_date): # by the parser # https://en.wikipedia.org/wiki/ISO_8601#Coordinated_Universal_Time_(UTC) git_date = git_date.replace('Z', '+00:00') - + # Remove the leading zero of the day # Assumes the year will not begin with 0 return datetime.datetime.fromisoformat(git_date).strftime('%B %d, %Y').replace(' 0', ' ') @@ -59,8 +59,9 @@ def assign_ISO(countries): try: hit = pycountry.countries.get(name=country) if hit is None: - # If the name isn't an exact match, try alternatives - # .search_fuzzy matching returns a list, whereas .get retrieves data as class Country + # .get retrieves data as class Country + # if it can't find a match, try 
alternative methods + # .search_fuzzy matching returns a list hit = pycountry.countries.search_fuzzy(country) if len(hit) > 1: hit = pycountry.countries.search_fuzzy(country + ",") @@ -109,27 +110,27 @@ def main(args): assert (len(header) == len(trials_df.columns)) trials_df.columns = header trials_df = trials_df.set_index('index') - + ebm_stats['ebm_trials'] = f'{len(trials_df.index):,}' - + # Get the most recent trial update most_recent_update = pd.to_datetime(trials_df['last_updated']).max() # Remove the leading zero of the day # Assumes the year will not begin with 0 most_recent_update = most_recent_update.strftime('%B %d, %Y').replace(' 0', ' ') ebm_stats['ebm_date_pretty'] = most_recent_update - + trial_results = trials_df[trials_df['results_url'] != 'No Results']['results_url'] ebm_stats['ebm_trials_results'] = f'{len(trial_results):,}' - + # Some results entries have multiple URLs trial_results_citekeys = [extract_citekey(results_url) for results in trial_results for results_url in results.split()] ebm_stats['ebm_trials_results_citekeys'] = sorted(set(trial_results_citekeys)) - + plt.rc('font', size=14) plt.rc('figure', titlesize=24) fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 12), constrained_layout=True) - + # Plot trial recruitment status # Only include trials with a recruitment status recruitment_counts = trials_df['recruitment_status'].value_counts(ascending=True) @@ -143,14 +144,14 @@ def main(args): phase_counts = phase_counts.drop(labels='Not Applicable') ax = phase_counts.plot(kind='barh', ax=axes[0, 1]) ax.set_title('Clinical trials phase') - + # Plot study type # Only include study types used in >= 5 trials study_type_counts = trials_df['study_type'].value_counts(ascending=True) study_type_counts = study_type_counts[study_type_counts >= 5] ax = study_type_counts.plot(kind='barh', ax=axes[1, 0]) ax.set_title('Clinical trials study type') - + # Plot common interventions # Only include trials with an intervention and interventions in >= 10 trials intervention_counts = trials_df['intervention'].value_counts(ascending=True) @@ -158,16 +159,17 @@ def main(args): intervention_counts = intervention_counts[intervention_counts >= 10] ax = intervention_counts.plot(kind='barh', ax=axes[1, 1]) ax.set_title('Clinical trials common interventions') - + fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - - # Clean and separate the names of each country in single-country and multi-country clinical trials - # Single-country trials have only a single name (string) in the `countries` field - # Multi refers to trials that have multiple names, comma-separated - # One trial lists every country on Earth and formatted the data inconsistently, so drop it + + # Clean and separate the names of each country in single-country and + # multi-country clinical trials. Single-country trials have only a single + # name (string) in the `countries` field. Multi-country trials have + # multiple comma-separated names. + # Drop 1 trial that lists every country. 
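
As an aside to the pycountry-based matching that assign_ISO performs in the patches above, the short standalone sketch below (an editor's illustration, not part of any diff) shows the exact-name lookup with a fuzzy-match fallback; the helper name iso3 and the example country names are illustrative assumptions, not code from the repository.

import pycountry

def iso3(name):
    """Best-guess ISO alpha-3 code for a country name, or None if unresolved."""
    match = pycountry.countries.get(name=name)  # exact common-name lookup; None if absent
    if match is None:
        try:
            candidates = pycountry.countries.search_fuzzy(name)  # returns a list
        except LookupError:
            return None
        if len(candidates) != 1:
            return None  # ambiguous; assign_ISO records these as failed matches
        match = candidates[0]
    return match.alpha_3

print(iso3("Japan"))        # JPN
print(iso3("South Korea"))  # the patch hard-codes "KOR" rather than rely on fuzzy matching
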
valid_country = trials_df[trials_df['countries'] != "No Country Given"] valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] @@ -179,15 +181,19 @@ def main(args): # Identify the 3-letter ISO codes for each unique country # Remove any leading/trailing whitespace that may result from splitting above unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() - country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), orient="index", columns=["iso_a3"]) + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), + orient="index", + columns=["iso_a3"]) # Map the ISO codes onto the country data and count the frequency single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts().rename("single_countries_counts") - multi_countries_codes = pd.DataFrame(multi_countries.str.strip(), - index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + single_countries_counts = \ + single_countries_codes.value_counts().rename("single_countries_counts") + multi_countries_codes = \ + pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") @@ -203,7 +209,7 @@ def main(args): left_on="iso_a3", right_index=True) - # Generate two-part choropleth visualizing world map with number of clinical trial data counted + # Generate two-part choropleth of world map with # of clinical trials counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) fig.patch.set_visible(False) ax1.axis('off') @@ -214,22 +220,24 @@ def main(args): countries_mapping.boundary.plot(ax=ax2, edgecolor="black") countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") - ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % date.today().strftime("%b-%d-%Y"), - xy=(0,0), xycoords="axes points") + ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % + date.today().strftime("%b-%d-%Y"), + xy=(0, 0), xycoords="axes points") - plt.savefig(args.output_map + '.png', bbox_inches = "tight") - plt.savefig(args.output_map + '.svg', bbox_inches = "tight") + plt.savefig(args.output_map + '.png', bbox_inches="tight") + plt.savefig(args.output_map + '.svg', bbox_inches="tight") print(f'Wrote {args.output_map}.png and {args.output_map}.svg') - + # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' - + # Tabulate number of trials for pharmaceuticals of interest - ebm_stats['ebm_tocilizumab_ct'] = str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) - + ebm_stats['ebm_tocilizumab_ct'] = \ + str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) + with open(args.output_json, 'w') as out_file: json.dump(ebm_stats, out_file, indent=2, sort_keys=True) print(f'Wrote {args.output_json}') From 8e034e5e48cbec002d4589921dcd0be52eb0d837 Mon Sep 17 
00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 15:03:32 -0400 Subject: [PATCH 11/16] remove extra newline --- ebmdatalab/generate-ebmdatalab-stats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 31436927a..476dd2ff8 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -46,7 +46,6 @@ def assign_ISO(countries): :type countries: pd.Series """ - # Need to hard code a few countries that aren't registered using standard names, so # initializing the single_country_codes database with these values country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", From a626b18078bc0f5c248e4cad5c5fb1237eba03c0 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 18:08:06 -0400 Subject: [PATCH 12/16] fix issue with geopandas world dataset --- ebmdatalab/generate-ebmdatalab-stats.py | 41 +++++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 31436927a..3ba205c6a 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -48,7 +48,7 @@ def assign_ISO(countries): # Need to hard code a few countries that aren't registered using standard names, so - # initializing the single_country_codes database with these values + # initializing the country_codes database with these irregular values country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", "Democratic Republic of the Congo": "COD"} @@ -82,6 +82,21 @@ def assign_ISO(countries): print("Could not assign country codes to:", ", ".join(failed_matches)) return country_codes +def lowres_fix(world): + """ + There is an issue with the map data source from geopandas where ISO codes are missing + for several countries. 
This fix was proposed by @tommycarstensen at + https://github.com/geopandas/geopandas/issues/1041 + + :param world: dataframe (read in with geopandas) + :return: dataframe (geopandas formatted) + """ + world.loc[world['name'] == 'France', 'iso_a3'] = 'FRA' + world.loc[world['name'] == 'Norway', 'iso_a3'] = 'NOR' + world.loc[world['name'] == 'Somaliland', 'iso_a3'] = 'SOM' + world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' + return world + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' @@ -189,25 +204,31 @@ def main(args): single_countries_codes = pd.DataFrame(single_countries, index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() - single_countries_counts = \ - single_countries_codes.value_counts().rename("single_countries_counts") + single_countries_counts = single_countries_codes.value_counts() + #single_countries_counts = single_countries_counts.rename("single_countries_counts") multi_countries_codes = \ pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts().rename("multi_countries_counts") + multi_countries_counts = multi_countries_codes.value_counts() + #multi_countries_counts = multi_countries_counts.rename("multi_countries_counts") + all_counts = single_countries_counts.\ + to_frame(name = 'single_countries_counts').\ + merge(multi_countries_counts.to_frame(name='multi_countries_counts'), + how="outer", + left_index=True, + right_index=True) # Map frequency data onto the geopandas geographical data for units with ISO code # geopandas uses -99 as N/A for this field # We don't need to evaluate Antarctica countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] - countries_mapping = countries_mapping[countries_mapping['iso_a3'] != "-99"] - for count_data in [single_countries_counts, multi_countries_counts]: - countries_mapping = countries_mapping.merge(pd.DataFrame(count_data), - how="left", - left_on="iso_a3", - right_index=True) + countries_mapping = lowres_fix(countries_mapping) + countries_mapping = countries_mapping.merge(all_counts, + how="left", + left_on="iso_a3", + right_index=True) # Generate two-part choropleth of world map with # of clinical trials counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) From 559e2c3775ee00579ef0fbee309502e62e75a66e Mon Sep 17 00:00:00 2001 From: HM Rando Date: Wed, 5 Aug 2020 18:31:42 -0400 Subject: [PATCH 13/16] linter --- ebmdatalab/generate-ebmdatalab-stats.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index a76282c8d..ce3b6c391 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -5,7 +5,6 @@ import matplotlib.pyplot as plt import os import pandas as pd -import urllib.request import geopandas import pycountry @@ -57,22 +56,22 @@ def assign_ISO(countries): if country not in country_codes.keys(): try: hit = pycountry.countries.get(name=country) - if hit is None: + if isinstance(hit, type(None)): # .get retrieves data as class Country # if 
it can't find a match, try alternative methods # .search_fuzzy matching returns a list hit = pycountry.countries.search_fuzzy(country) if len(hit) > 1: hit = pycountry.countries.search_fuzzy(country + ",") - elif type(hit) is None: + elif isinstance(hit, type(None)): hit = pycountry.countries.get(official_name=country) except LookupError: failed_matches.append(country) continue - if type(hit) == list and len(hit) == 1: + if isinstance(hit, list) and len(hit) == 1: country_codes[country] = hit[0].alpha_3 - elif type(hit) == list or type(hit) is None: + elif isinstance(hit, (list, type(None))): failed_matches.append(country) else: country_codes[country] = hit.alpha_3 @@ -204,15 +203,12 @@ def main(args): index=single_countries).join(country_codes)["iso_a3"] single_countries_codes = single_countries_codes.dropna() single_countries_counts = single_countries_codes.value_counts() - #single_countries_counts = single_countries_counts.rename("single_countries_counts") multi_countries_codes = \ pd.DataFrame(multi_countries.str.strip(), index=multi_countries.str.strip()).join(country_codes)["iso_a3"] multi_countries_codes = multi_countries_codes.dropna() multi_countries_counts = multi_countries_codes.value_counts() - #multi_countries_counts = multi_countries_counts.rename("multi_countries_counts") - all_counts = single_countries_counts.\ - to_frame(name = 'single_countries_counts').\ + all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ merge(multi_countries_counts.to_frame(name='multi_countries_counts'), how="outer", left_index=True, @@ -222,12 +218,13 @@ def main(args): # geopandas uses -99 as N/A for this field # We don't need to evaluate Antarctica countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - countries_mapping = countries_mapping[countries_mapping.name != "Antarctica"] countries_mapping = lowres_fix(countries_mapping) + countries_mapping = countries_mapping[(countries_mapping.name != "Antarctica") & + (countries_mapping.iso_a3 != "-99")] countries_mapping = countries_mapping.merge(all_counts, how="left", - left_on="iso_a3", - right_index=True) + right_index=True, + left_on="iso_a3") # Generate two-part choropleth of world map with # of clinical trials counted fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) @@ -279,7 +276,7 @@ def main(args): 'as .png and .svg.', type=str) parser.add_argument('output_map', - help='Path of the output choropleth (world map figure) ' \ + help='Path of the output choropleth (world map figure) ' \ 'with geographic clinical trial frequencies, without file ' \ 'type extension. 
Will be saved as .png and .svg.', type=str) From 22440a88dac2401b40d9d357f3beda7bf9c28c76 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Thu, 13 Aug 2020 19:11:05 -0400 Subject: [PATCH 14/16] handle exceptions --- ebmdatalab/generate-ebmdatalab-stats.py | 89 ++++++++++++++++--------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index ce3b6c391..c168f672f 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -37,53 +37,81 @@ def extract_citekey(results_url): citekey = short_doi_url.replace('https://doi.org', 'doi:10') return citekey + +def check_none(value): + """Raises ValueError if value is type None, else returns value""" + if isinstance(value, type(None)): + raise ValueError + return value + + +def find_country(country): + """ .get retrieves data as class Country + .search_fuzzy matching returns a list + try a few ways to identify a match and + return as soon as find something valid + Input is country name (string) + If no match found, return None""" + try: + hit = pycountry.countries.get(name=country) + hit = check_none(hit) + return hit + except (LookupError, ValueError): + try: + hit = pycountry.countries.get(official_name=country) + hit = check_none(hit) + return hit + except (LookupError, ValueError): + try: + hit = pycountry.countries.search_fuzzy(country) + hit = check_none(hit) + if type(hit) == list and len(hit) == 1: + return hit[0] + raise ValueError + except (LookupError, ValueError): + try: + hit = pycountry.countries.search_fuzzy(country + ",") + hit = check_none(hit) + if isinstance(hit, list) and len(hit) == 1: + return hit[0] + else: + raise ValueError + except (LookupError, ValueError): + return None + + def assign_ISO(countries): - """ - Match country names with ISO codes + """ Match country names with ISO codes Input: series of country names Returns: dictionary of matches - :type countries: pd.Series - """ - + :type countries: pd.Series """ # Need to hard code a few countries that aren't registered using standard names, so # initializing the country_codes database with these irregular values country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", - "Democratic Republic of the Congo": "COD"} + "Democratic Republic of the Congo": "COD", "UAE": "ARE"} # Identify the most likely 3-letter ISO code for each country failed_matches = list() for country in countries: if country not in country_codes.keys(): - try: - hit = pycountry.countries.get(name=country) - if isinstance(hit, type(None)): - # .get retrieves data as class Country - # if it can't find a match, try alternative methods - # .search_fuzzy matching returns a list - hit = pycountry.countries.search_fuzzy(country) - if len(hit) > 1: - hit = pycountry.countries.search_fuzzy(country + ",") - elif isinstance(hit, type(None)): - hit = pycountry.countries.get(official_name=country) - except LookupError: - failed_matches.append(country) - continue - - if isinstance(hit, list) and len(hit) == 1: - country_codes[country] = hit[0].alpha_3 - elif isinstance(hit, (list, type(None))): - failed_matches.append(country) - else: + # Need to query the pycountry package but it can fail for a + # few reasons. 
Use function to avoid LookupError issues and + # try all the different ways that might help to match a + # country name to its ISO code + hit = find_country(country) + if not isinstance(hit, type(None)): country_codes[country] = hit.alpha_3 - + else: + failed_matches.append(country) # Print warning about failures and return successes as dictionary print("Could not assign country codes to:", ", ".join(failed_matches)) return country_codes + def lowres_fix(world): - """ - There is an issue with the map data source from geopandas where ISO codes are missing - for several countries. This fix was proposed by @tommycarstensen at + """There is an issue with the map data source from geopandas where + ISO codes are missing for several countries. This fix was proposed + by @tommycarstensen at https://github.com/geopandas/geopandas/issues/1041 :param world: dataframe (read in with geopandas) @@ -95,6 +123,7 @@ def lowres_fix(world): world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' return world + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' From a47021c740a6b7b855f04b43cf0feebb583f6933 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Fri, 14 Aug 2020 10:13:23 -0400 Subject: [PATCH 15/16] Apply @agitter's suggestions Co-authored-by: Anthony Gitter --- ebmdatalab/generate-ebmdatalab-stats.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index c168f672f..486fc2583 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -104,7 +104,7 @@ def assign_ISO(countries): else: failed_matches.append(country) # Print warning about failures and return successes as dictionary - print("Could not assign country codes to:", ", ".join(failed_matches)) + print("Could not assign country codes to: ", ", ".join(failed_matches)) return country_codes @@ -279,7 +279,8 @@ def main(args): # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' - +ebm_stats['ebm_map_figure'] = \ + f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_map}.svg' # Tabulate number of trials for pharmaceuticals of interest ebm_stats['ebm_tocilizumab_ct'] = \ str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) From 9bd74236495fd67674c561deb92774128304c814 Mon Sep 17 00:00:00 2001 From: HM Rando Date: Fri, 14 Aug 2020 11:00:11 -0400 Subject: [PATCH 16/16] rearrange blocks to keep plots together --- ebmdatalab/generate-ebmdatalab-stats.py | 77 +++++++++++++------------ 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index 486fc2583..86e9dbf28 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -123,6 +123,44 @@ def lowres_fix(world): world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' return world +def tabulate_countries(trials_df): + # Clean and separate the names of each country in single-country and + # multi-country clinical trials. Single-country trials have only a single + # name (string) in the `countries` field. Multi-country trials have + # multiple comma-separated names. 
+ # Drop 1 trial that lists every country. + valid_country = trials_df[trials_df['countries'] != "No Country Given"] + valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] + single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] + multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] + multi_countries = pd.Series( + [country for country_list in multi_countries.str.split(',') for country in country_list] + ) + + # Identify the 3-letter ISO codes for each unique country + # Remove any leading/trailing whitespace that may result from splitting above + unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), + orient="index", + columns=["iso_a3"]) + + # Map the ISO codes onto the country data and count the frequency + single_countries_codes = pd.DataFrame(single_countries, + index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = single_countries_codes.dropna() + single_countries_counts = single_countries_codes.value_counts() + multi_countries_codes = \ + pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = multi_countries_codes.dropna() + multi_countries_counts = multi_countries_codes.value_counts() + all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ + merge(multi_countries_counts.to_frame(name='multi_countries_counts'), + how="outer", + left_index=True, + right_index=True) + return all_counts + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): @@ -207,41 +245,8 @@ def main(args): print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - # Clean and separate the names of each country in single-country and - # multi-country clinical trials. Single-country trials have only a single - # name (string) in the `countries` field. Multi-country trials have - # multiple comma-separated names. - # Drop 1 trial that lists every country. 
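
As a side note to the tabulate_countries helper introduced above, this toy sketch (separate from the patch series) shows the value_counts plus outer-merge pattern it relies on; the ISO codes and counts are invented for illustration.

import pandas as pd

single = pd.Series(["USA", "USA", "FRA"])  # ISO codes from single-country trials (made up)
multi = pd.Series(["USA", "DEU"])          # ISO codes from multi-country trials (made up)

all_counts = (
    single.value_counts().to_frame(name="single_countries_counts")
    .merge(multi.value_counts().to_frame(name="multi_countries_counts"),
           how="outer", left_index=True, right_index=True)
)
print(all_counts)  # countries present in only one series get NaN in the other column
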
- valid_country = trials_df[trials_df['countries'] != "No Country Given"] - valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] - single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] - multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] - multi_countries = pd.Series( - [country for country_list in multi_countries.str.split(',') for country in country_list] - ) - - # Identify the 3-letter ISO codes for each unique country - # Remove any leading/trailing whitespace that may result from splitting above - unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() - country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), - orient="index", - columns=["iso_a3"]) - - # Map the ISO codes onto the country data and count the frequency - single_countries_codes = pd.DataFrame(single_countries, - index=single_countries).join(country_codes)["iso_a3"] - single_countries_codes = single_countries_codes.dropna() - single_countries_counts = single_countries_codes.value_counts() - multi_countries_codes = \ - pd.DataFrame(multi_countries.str.strip(), - index=multi_countries.str.strip()).join(country_codes)["iso_a3"] - multi_countries_codes = multi_countries_codes.dropna() - multi_countries_counts = multi_countries_codes.value_counts() - all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ - merge(multi_countries_counts.to_frame(name='multi_countries_counts'), - how="outer", - left_index=True, - right_index=True) + # Count geographic representation by ISO3 code + all_counts = tabulate_countries(trials_df) # Map frequency data onto the geopandas geographical data for units with ISO code # geopandas uses -99 as N/A for this field @@ -279,7 +284,7 @@ def main(args): # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' -ebm_stats['ebm_map_figure'] = \ + ebm_stats['ebm_map_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_map}.svg' # Tabulate number of trials for pharmaceuticals of interest ebm_stats['ebm_tocilizumab_ct'] = \
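
To close, a minimal standalone sketch of the merge-then-plot choropleth pattern the final script uses, assuming geopandas 0.8.x (where the bundled naturalearth_lowres dataset used by the patches is still available); the counts, output file name, and figure title are illustrative only.

import geopandas
import matplotlib.pyplot as plt
import pandas as pd

# Made-up counts keyed by ISO alpha-3, standing in for tabulate_countries() output
counts = pd.DataFrame({"single_countries_counts": {"USA": 120, "GBR": 40, "IND": 55}})

world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
# The patches' lowres_fix() repairs missing iso_a3 codes (France, Norway, etc.) here;
# this sketch simply drops Antarctica and any remaining -99 placeholders
world = world[(world.name != "Antarctica") & (world.iso_a3 != "-99")]
world = world.merge(counts, how="left", left_on="iso_a3", right_index=True)

fig, ax = plt.subplots(figsize=(12, 6))
ax.axis("off")
world.boundary.plot(ax=ax, edgecolor="black", linewidth=0.3)
world.plot(column="single_countries_counts", ax=ax, legend=True)
ax.set_title("Single-country clinical trials per country (illustrative data)")
fig.savefig("choropleth-sketch.png", bbox_inches="tight")

Countries absent from the counts frame are left unfilled by the left merge, which is the same behavior the two-panel figure in the final patch depends on.
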