diff --git a/ebmdatalab/generate-ebmdatalab-stats.py b/ebmdatalab/generate-ebmdatalab-stats.py index cac4c5026..86e9dbf28 100644 --- a/ebmdatalab/generate-ebmdatalab-stats.py +++ b/ebmdatalab/generate-ebmdatalab-stats.py @@ -1,10 +1,12 @@ import argparse import datetime import json +from datetime import date import matplotlib.pyplot as plt import os import pandas as pd -import urllib.request +import geopandas +import pycountry from manubot.cite.citekey import url_to_citekey from manubot.cite.doi import get_short_doi_url @@ -18,7 +20,7 @@ def convert_date(git_date): # by the parser # https://en.wikipedia.org/wiki/ISO_8601#Coordinated_Universal_Time_(UTC) git_date = git_date.replace('Z', '+00:00') - + # Remove the leading zero of the day # Assumes the year will not begin with 0 return datetime.datetime.fromisoformat(git_date).strftime('%B %d, %Y').replace(' 0', ' ') @@ -35,6 +37,131 @@ def extract_citekey(results_url): citekey = short_doi_url.replace('https://doi.org', 'doi:10') return citekey + +def check_none(value): + """Raises ValueError if value is type None, else returns value""" + if isinstance(value, type(None)): + raise ValueError + return value + + +def find_country(country): + """ .get retrieves data as class Country + .search_fuzzy matching returns a list + try a few ways to identify a match and + return as soon as find something valid + Input is country name (string) + If no match found, return None""" + try: + hit = pycountry.countries.get(name=country) + hit = check_none(hit) + return hit + except (LookupError, ValueError): + try: + hit = pycountry.countries.get(official_name=country) + hit = check_none(hit) + return hit + except (LookupError, ValueError): + try: + hit = pycountry.countries.search_fuzzy(country) + hit = check_none(hit) + if type(hit) == list and len(hit) == 1: + return hit[0] + raise ValueError + except (LookupError, ValueError): + try: + hit = pycountry.countries.search_fuzzy(country + ",") + hit = check_none(hit) + if isinstance(hit, list) and len(hit) == 1: + return hit[0] + else: + raise ValueError + except (LookupError, ValueError): + return None + + +def assign_ISO(countries): + """ Match country names with ISO codes + Input: series of country names + Returns: dictionary of matches + :type countries: pd.Series """ + # Need to hard code a few countries that aren't registered using standard names, so + # initializing the country_codes database with these irregular values + country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD", + "Democratic Republic of the Congo": "COD", "UAE": "ARE"} + + # Identify the most likely 3-letter ISO code for each country + failed_matches = list() + for country in countries: + if country not in country_codes.keys(): + # Need to query the pycountry package but it can fail for a + # few reasons. Use function to avoid LookupError issues and + # try all the different ways that might help to match a + # country name to its ISO code + hit = find_country(country) + if not isinstance(hit, type(None)): + country_codes[country] = hit.alpha_3 + else: + failed_matches.append(country) + # Print warning about failures and return successes as dictionary + print("Could not assign country codes to: ", ", ".join(failed_matches)) + return country_codes + + +def lowres_fix(world): + """There is an issue with the map data source from geopandas where + ISO codes are missing for several countries. This fix was proposed + by @tommycarstensen at + https://github.com/geopandas/geopandas/issues/1041 + + :param world: dataframe (read in with geopandas) + :return: dataframe (geopandas formatted) + """ + world.loc[world['name'] == 'France', 'iso_a3'] = 'FRA' + world.loc[world['name'] == 'Norway', 'iso_a3'] = 'NOR' + world.loc[world['name'] == 'Somaliland', 'iso_a3'] = 'SOM' + world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS' + return world + +def tabulate_countries(trials_df): + # Clean and separate the names of each country in single-country and + # multi-country clinical trials. Single-country trials have only a single + # name (string) in the `countries` field. Multi-country trials have + # multiple comma-separated names. + # Drop 1 trial that lists every country. + valid_country = trials_df[trials_df['countries'] != "No Country Given"] + valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"] + single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')] + multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')] + multi_countries = pd.Series( + [country for country_list in multi_countries.str.split(',') for country in country_list] + ) + + # Identify the 3-letter ISO codes for each unique country + # Remove any leading/trailing whitespace that may result from splitting above + unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates() + country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries), + orient="index", + columns=["iso_a3"]) + + # Map the ISO codes onto the country data and count the frequency + single_countries_codes = pd.DataFrame(single_countries, + index=single_countries).join(country_codes)["iso_a3"] + single_countries_codes = single_countries_codes.dropna() + single_countries_counts = single_countries_codes.value_counts() + multi_countries_codes = \ + pd.DataFrame(multi_countries.str.strip(), + index=multi_countries.str.strip()).join(country_codes)["iso_a3"] + multi_countries_codes = multi_countries_codes.dropna() + multi_countries_counts = multi_countries_codes.value_counts() + all_counts = single_countries_counts.to_frame(name='single_countries_counts').\ + merge(multi_countries_counts.to_frame(name='multi_countries_counts'), + how="outer", + left_index=True, + right_index=True) + return all_counts + + # Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb def main(args): '''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset''' @@ -63,27 +190,27 @@ def main(args): assert (len(header) == len(trials_df.columns)) trials_df.columns = header trials_df = trials_df.set_index('index') - + ebm_stats['ebm_trials'] = f'{len(trials_df.index):,}' - + # Get the most recent trial update most_recent_update = pd.to_datetime(trials_df['last_updated']).max() # Remove the leading zero of the day # Assumes the year will not begin with 0 most_recent_update = most_recent_update.strftime('%B %d, %Y').replace(' 0', ' ') ebm_stats['ebm_date_pretty'] = most_recent_update - + trial_results = trials_df[trials_df['results_url'] != 'No Results']['results_url'] ebm_stats['ebm_trials_results'] = f'{len(trial_results):,}' - + # Some results entries have multiple URLs trial_results_citekeys = [extract_citekey(results_url) for results in trial_results for results_url in results.split()] ebm_stats['ebm_trials_results_citekeys'] = sorted(set(trial_results_citekeys)) - + plt.rc('font', size=14) plt.rc('figure', titlesize=24) fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 12), constrained_layout=True) - + # Plot trial recruitment status # Only include trials with a recruitment status recruitment_counts = trials_df['recruitment_status'].value_counts(ascending=True) @@ -97,14 +224,14 @@ def main(args): phase_counts = phase_counts.drop(labels='Not Applicable') ax = phase_counts.plot(kind='barh', ax=axes[0, 1]) ax.set_title('Clinical trials phase') - + # Plot study type # Only include study types used in >= 5 trials study_type_counts = trials_df['study_type'].value_counts(ascending=True) study_type_counts = study_type_counts[study_type_counts >= 5] ax = study_type_counts.plot(kind='barh', ax=axes[1, 0]) ax.set_title('Clinical trials study type') - + # Plot common interventions # Only include trials with an intervention and interventions in >= 10 trials intervention_counts = trials_df['intervention'].value_counts(ascending=True) @@ -112,20 +239,57 @@ def main(args): intervention_counts = intervention_counts[intervention_counts >= 10] ax = intervention_counts.plot(kind='barh', ax=axes[1, 1]) ax.set_title('Clinical trials common interventions') - + fig.savefig(args.output_figure + '.png', bbox_inches = "tight") fig.savefig(args.output_figure + '.svg', bbox_inches = "tight") - + print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg') - + + # Count geographic representation by ISO3 code + all_counts = tabulate_countries(trials_df) + + # Map frequency data onto the geopandas geographical data for units with ISO code + # geopandas uses -99 as N/A for this field + # We don't need to evaluate Antarctica + countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) + countries_mapping = lowres_fix(countries_mapping) + countries_mapping = countries_mapping[(countries_mapping.name != "Antarctica") & + (countries_mapping.iso_a3 != "-99")] + countries_mapping = countries_mapping.merge(all_counts, + how="left", + right_index=True, + left_on="iso_a3") + + # Generate two-part choropleth of world map with # of clinical trials counted + fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16)) + fig.patch.set_visible(False) + ax1.axis('off') + ax2.axis('off') + countries_mapping.boundary.plot(ax=ax1, edgecolor="black") + countries_mapping.plot(column='single_countries_counts', ax=ax1, legend=True) + ax1.set_title("Number of Single-Country Clinical Trials Recruiting by Country") + countries_mapping.boundary.plot(ax=ax2, edgecolor="black") + countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples") + ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country") + ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' % + date.today().strftime("%b-%d-%Y"), + xy=(0, 0), xycoords="axes points") + + plt.savefig(args.output_map + '.png', bbox_inches="tight") + plt.savefig(args.output_map + '.svg', bbox_inches="tight") + + print(f'Wrote {args.output_map}.png and {args.output_map}.svg') + # The placeholder will be replaced by the actual SHA-1 hash in separate # script after the updated image is committed ebm_stats['ebm_trials_figure'] = \ f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg' - + ebm_stats['ebm_map_figure'] = \ + f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_map}.svg' # Tabulate number of trials for pharmaceuticals of interest - ebm_stats['ebm_tocilizumab_ct'] = str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) - + ebm_stats['ebm_tocilizumab_ct'] = \ + str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum()) + with open(args.output_json, 'w') as out_file: json.dump(ebm_stats, out_file, indent=2, sort_keys=True) print(f'Wrote {args.output_json}') @@ -146,6 +310,11 @@ def main(args): 'statistics without file type extension. Will be saved ' \ 'as .png and .svg.', type=str) + parser.add_argument('output_map', + help='Path of the output choropleth (world map figure) ' \ + 'with geographic clinical trial frequencies, without file ' \ + 'type extension. Will be saved as .png and .svg.', + type=str) args = parser.parse_args() main(args) diff --git a/ebmdatalab/generate-ebmdatalab-stats.sh b/ebmdatalab/generate-ebmdatalab-stats.sh old mode 100644 new mode 100755 index 72345c7e6..e845f0b8e --- a/ebmdatalab/generate-ebmdatalab-stats.sh +++ b/ebmdatalab/generate-ebmdatalab-stats.sh @@ -14,6 +14,7 @@ export EBM_COMMIT_DATE=$(echo $EBM_COMMIT_JSON | python -c "import sys, json; pr EBM_INPUT_JSON=ebmdatalab/trials_latest.json EBM_STATS_JSON=ebmdatalab/ebmdatalab-stats.json EBM_FIG=ebmdatalab/ebmdatalab-trials +EBM_MAP=ebmdatalab/ebmdatalab-map echo "Downloading EBM Data Lab COVID-19 TrialsTracker data from commit $EBM_COMMIT_SHA authored $EBM_COMMIT_DATE" curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COMMIT_SHA/$EBM_REPO_PATH > $EBM_INPUT_JSON @@ -22,6 +23,6 @@ curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COM # and run the version-figures.sh script to update the EBM_STATS_JSON with the # versioned figure URL echo "Generating EBM Data Lab COVID-19 TrialsTracker statistics and figure" -python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG +python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG $EBM_MAP rm $EBM_INPUT_JSON diff --git a/environment.yml b/environment.yml index 61204fcd9..068f1b720 100644 --- a/environment.yml +++ b/environment.yml @@ -6,5 +6,7 @@ dependencies: - pandas=1.0.3 - pip=20.0 - python=3.7.6 + - geopandas=0.8.1 + - pycountry==20.7.3 - pip: - git+https://github.com/manubot/manubot@a57ccf0be6972329ff3010eaaa0c5df7ccebb2d5