Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Map where clinical trials are recruiting #569

Merged
merged 17 commits into from
Aug 14, 2020
201 changes: 185 additions & 16 deletions ebmdatalab/generate-ebmdatalab-stats.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import argparse
import datetime
import json
from datetime import date
import matplotlib.pyplot as plt
import os
import pandas as pd
import urllib.request
import geopandas
import pycountry

from manubot.cite.citekey import url_to_citekey
from manubot.cite.doi import get_short_doi_url
Expand All @@ -18,7 +20,7 @@ def convert_date(git_date):
# by the parser
# https://en.wikipedia.org/wiki/ISO_8601#Coordinated_Universal_Time_(UTC)
git_date = git_date.replace('Z', '+00:00')

# Remove the leading zero of the day
# Assumes the year will not begin with 0
return datetime.datetime.fromisoformat(git_date).strftime('%B %d, %Y').replace(' 0', ' ')
Expand All @@ -35,6 +37,131 @@ def extract_citekey(results_url):
citekey = short_doi_url.replace('https://doi.org', 'doi:10')
return citekey


def check_none(value):
"""Raises ValueError if value is type None, else returns value"""
if isinstance(value, type(None)):
raise ValueError
return value


def find_country(country):
""" .get retrieves data as class Country
.search_fuzzy matching returns a list
try a few ways to identify a match and
return as soon as find something valid
Input is country name (string)
If no match found, return None"""
try:
hit = pycountry.countries.get(name=country)
hit = check_none(hit)
return hit
except (LookupError, ValueError):
try:
hit = pycountry.countries.get(official_name=country)
hit = check_none(hit)
return hit
except (LookupError, ValueError):
try:
hit = pycountry.countries.search_fuzzy(country)
hit = check_none(hit)
if type(hit) == list and len(hit) == 1:
return hit[0]
raise ValueError
except (LookupError, ValueError):
try:
hit = pycountry.countries.search_fuzzy(country + ",")
hit = check_none(hit)
if isinstance(hit, list) and len(hit) == 1:
return hit[0]
else:
raise ValueError
except (LookupError, ValueError):
return None


def assign_ISO(countries):
""" Match country names with ISO codes
Input: series of country names
Returns: dictionary of matches
:type countries: pd.Series """
# Need to hard code a few countries that aren't registered using standard names, so
# initializing the country_codes database with these irregular values
country_codes = {"South Korea": "KOR", "Democratic Republic of Congo": "COD",
"Democratic Republic of the Congo": "COD", "UAE": "ARE"}

# Identify the most likely 3-letter ISO code for each country
failed_matches = list()
for country in countries:
if country not in country_codes.keys():
# Need to query the pycountry package but it can fail for a
# few reasons. Use function to avoid LookupError issues and
# try all the different ways that might help to match a
# country name to its ISO code
hit = find_country(country)
if not isinstance(hit, type(None)):
country_codes[country] = hit.alpha_3
else:
failed_matches.append(country)
# Print warning about failures and return successes as dictionary
print("Could not assign country codes to: ", ", ".join(failed_matches))
return country_codes


def lowres_fix(world):
"""There is an issue with the map data source from geopandas where
ISO codes are missing for several countries. This fix was proposed
by @tommycarstensen at
https://github.com/geopandas/geopandas/issues/1041

:param world: dataframe (read in with geopandas)
:return: dataframe (geopandas formatted)
"""
world.loc[world['name'] == 'France', 'iso_a3'] = 'FRA'
world.loc[world['name'] == 'Norway', 'iso_a3'] = 'NOR'
world.loc[world['name'] == 'Somaliland', 'iso_a3'] = 'SOM'
world.loc[world['name'] == 'Kosovo', 'iso_a3'] = 'RKS'
return world

def tabulate_countries(trials_df):
# Clean and separate the names of each country in single-country and
# multi-country clinical trials. Single-country trials have only a single
# name (string) in the `countries` field. Multi-country trials have
# multiple comma-separated names.
# Drop 1 trial that lists every country.
valid_country = trials_df[trials_df['countries'] != "No Country Given"]
valid_country = valid_country[valid_country['trial_id'] != "ISRCTN80453162"]
single_countries = valid_country['countries'][~valid_country['countries'].str.contains(',')]
multi_countries = valid_country["countries"][valid_country["countries"].str.contains(',')]
multi_countries = pd.Series(
[country for country_list in multi_countries.str.split(',') for country in country_list]
)

# Identify the 3-letter ISO codes for each unique country
# Remove any leading/trailing whitespace that may result from splitting above
unique_countries = single_countries.append(multi_countries).str.strip().drop_duplicates()
country_codes = pd.DataFrame.from_dict(assign_ISO(unique_countries),
orient="index",
columns=["iso_a3"])

# Map the ISO codes onto the country data and count the frequency
single_countries_codes = pd.DataFrame(single_countries,
index=single_countries).join(country_codes)["iso_a3"]
single_countries_codes = single_countries_codes.dropna()
single_countries_counts = single_countries_codes.value_counts()
multi_countries_codes = \
pd.DataFrame(multi_countries.str.strip(),
index=multi_countries.str.strip()).join(country_codes)["iso_a3"]
multi_countries_codes = multi_countries_codes.dropna()
multi_countries_counts = multi_countries_codes.value_counts()
all_counts = single_countries_counts.to_frame(name='single_countries_counts').\
merge(multi_countries_counts.to_frame(name='multi_countries_counts'),
how="outer",
left_index=True,
right_index=True)
return all_counts


# Inspired by https://github.com/greenelab/meta-review/blob/master/analyses/deep-review-contrib/03.contrib-stats.ipynb
def main(args):
'''Extract statistics from the EBM Data Lab COVID-19 TrialsTracker dataset'''
Expand Down Expand Up @@ -63,27 +190,27 @@ def main(args):
assert (len(header) == len(trials_df.columns))
trials_df.columns = header
trials_df = trials_df.set_index('index')

ebm_stats['ebm_trials'] = f'{len(trials_df.index):,}'

# Get the most recent trial update
most_recent_update = pd.to_datetime(trials_df['last_updated']).max()
# Remove the leading zero of the day
# Assumes the year will not begin with 0
most_recent_update = most_recent_update.strftime('%B %d, %Y').replace(' 0', ' ')
ebm_stats['ebm_date_pretty'] = most_recent_update

trial_results = trials_df[trials_df['results_url'] != 'No Results']['results_url']
ebm_stats['ebm_trials_results'] = f'{len(trial_results):,}'

# Some results entries have multiple URLs
trial_results_citekeys = [extract_citekey(results_url) for results in trial_results for results_url in results.split()]
ebm_stats['ebm_trials_results_citekeys'] = sorted(set(trial_results_citekeys))

plt.rc('font', size=14)
plt.rc('figure', titlesize=24)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 12), constrained_layout=True)

# Plot trial recruitment status
# Only include trials with a recruitment status
recruitment_counts = trials_df['recruitment_status'].value_counts(ascending=True)
Expand All @@ -97,35 +224,72 @@ def main(args):
phase_counts = phase_counts.drop(labels='Not Applicable')
ax = phase_counts.plot(kind='barh', ax=axes[0, 1])
ax.set_title('Clinical trials phase')

# Plot study type
# Only include study types used in >= 5 trials
study_type_counts = trials_df['study_type'].value_counts(ascending=True)
study_type_counts = study_type_counts[study_type_counts >= 5]
ax = study_type_counts.plot(kind='barh', ax=axes[1, 0])
ax.set_title('Clinical trials study type')

# Plot common interventions
# Only include trials with an intervention and interventions in >= 10 trials
intervention_counts = trials_df['intervention'].value_counts(ascending=True)
intervention_counts = intervention_counts.drop(labels='No Intervention')
intervention_counts = intervention_counts[intervention_counts >= 10]
ax = intervention_counts.plot(kind='barh', ax=axes[1, 1])
ax.set_title('Clinical trials common interventions')

fig.savefig(args.output_figure + '.png', bbox_inches = "tight")
fig.savefig(args.output_figure + '.svg', bbox_inches = "tight")

print(f'Wrote {args.output_figure}.png and {args.output_figure}.svg')


# Count geographic representation by ISO3 code
all_counts = tabulate_countries(trials_df)

# Map frequency data onto the geopandas geographical data for units with ISO code
# geopandas uses -99 as N/A for this field
# We don't need to evaluate Antarctica
countries_mapping = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
countries_mapping = lowres_fix(countries_mapping)
countries_mapping = countries_mapping[(countries_mapping.name != "Antarctica") &
(countries_mapping.iso_a3 != "-99")]
countries_mapping = countries_mapping.merge(all_counts,
how="left",
right_index=True,
left_on="iso_a3")

# Generate two-part choropleth of world map with # of clinical trials counted
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 16))
fig.patch.set_visible(False)
ax1.axis('off')
ax2.axis('off')
countries_mapping.boundary.plot(ax=ax1, edgecolor="black")
countries_mapping.plot(column='single_countries_counts', ax=ax1, legend=True)
ax1.set_title("Number of Single-Country Clinical Trials Recruiting by Country")
countries_mapping.boundary.plot(ax=ax2, edgecolor="black")
countries_mapping.plot(column='multi_countries_counts', ax=ax2, legend=True, cmap="Purples")
ax2.set_title("Number of Multi-Country Clinical Trials Recruiting by Country")
ax2.annotate(f'Source: EBM Data Lab COVID-19 TrialsTracker, %s' %
date.today().strftime("%b-%d-%Y"),
xy=(0, 0), xycoords="axes points")

plt.savefig(args.output_map + '.png', bbox_inches="tight")
plt.savefig(args.output_map + '.svg', bbox_inches="tight")

print(f'Wrote {args.output_map}.png and {args.output_map}.svg')

# The placeholder will be replaced by the actual SHA-1 hash in separate
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest moving this block to follow line 208 (the Wrote {args.output_figure}.png and {args.output_figure}.svg message) to keep the trials figure code grouped together.

# script after the updated image is committed
ebm_stats['ebm_trials_figure'] = \
f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_figure}.svg'

ebm_stats['ebm_map_figure'] = \
f'https://github.com/greenelab/covid19-review/raw/$FIGURE_COMMIT_SHA/{args.output_map}.svg'
# Tabulate number of trials for pharmaceuticals of interest
ebm_stats['ebm_tocilizumab_ct'] = str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum())

ebm_stats['ebm_tocilizumab_ct'] = \
str(trials_df['intervention'].str.contains('tocilizumab', case=False).sum())

with open(args.output_json, 'w') as out_file:
json.dump(ebm_stats, out_file, indent=2, sort_keys=True)
print(f'Wrote {args.output_json}')
Expand All @@ -146,6 +310,11 @@ def main(args):
'statistics without file type extension. Will be saved ' \
'as .png and .svg.',
type=str)
parser.add_argument('output_map',
help='Path of the output choropleth (world map figure) ' \
'with geographic clinical trial frequencies, without file ' \
'type extension. Will be saved as .png and .svg.',
type=str)

args = parser.parse_args()
main(args)
3 changes: 2 additions & 1 deletion ebmdatalab/generate-ebmdatalab-stats.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export EBM_COMMIT_DATE=$(echo $EBM_COMMIT_JSON | python -c "import sys, json; pr
EBM_INPUT_JSON=ebmdatalab/trials_latest.json
EBM_STATS_JSON=ebmdatalab/ebmdatalab-stats.json
EBM_FIG=ebmdatalab/ebmdatalab-trials
EBM_MAP=ebmdatalab/ebmdatalab-map

echo "Downloading EBM Data Lab COVID-19 TrialsTracker data from commit $EBM_COMMIT_SHA authored $EBM_COMMIT_DATE"
curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COMMIT_SHA/$EBM_REPO_PATH > $EBM_INPUT_JSON
Expand All @@ -22,6 +23,6 @@ curl -fsSL https://github.com/ebmdatalab/covid_trials_tracker-covid/raw/$EBM_COM
# and run the version-figures.sh script to update the EBM_STATS_JSON with the
# versioned figure URL
echo "Generating EBM Data Lab COVID-19 TrialsTracker statistics and figure"
python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG
python ebmdatalab/generate-ebmdatalab-stats.py $EBM_INPUT_JSON $EBM_STATS_JSON $EBM_FIG $EBM_MAP

rm $EBM_INPUT_JSON
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@ dependencies:
- pandas=1.0.3
- pip=20.0
- python=3.7.6
- geopandas=0.8.1
- pycountry==20.7.3
- pip:
- git+https://github.com/manubot/manubot@a57ccf0be6972329ff3010eaaa0c5df7ccebb2d5