Merge pull request #47 from mrc-ide/ebola-cfr-r1

Additional analysis for Ebola R1
mrc-ide · Jul 24, 2024 · e5d0ab6 · e5d0ab6
2 parents 47a8b6c + dce8b55
commit e5d0ab6
Show file tree

Hide file tree

Showing 3 changed files with 167 additions and 5 deletions.
diff --git a/ebola_workflow.R b/ebola_workflow.R
@@ -77,7 +77,8 @@ orderly_location_pull_packet()
 orderly_run("ebola_reproduction_number", parameters = list(pathogen = "EBOLA"))
 
 # Severity (CFR/IFR)
-orderly_run("ebola_severity", parameters = list(pathogen = "EBOLA"))
+orderly_run("ebola_severity")
+orderly_run("ebola_severity_additional")
 
 # Human Delays
 orderly_run("ebola_delays", parameters = list(pathogen = "EBOLA"))
@@ -117,5 +118,7 @@ orderly_run("ebola_summary", parameters = list(pathogen = "EBOLA"))
 orderly_run("ebola_figures", parameters = list(pathogen = "EBOLA"))
 
 # Figures and tables for supplementary material
-orderly_run("ebola_supplementary_figures", parameters = list(pathogen = "EBOLA"))
+# This task will not run because it pulls in pdfs from the previous tasks and
+# we have commented out docx2pdf conversion.
+# orderly_run("ebola_supplementary_figures", parameters = list(pathogen = "EBOLA"))
 
diff --git a/src/ebola_severity/orderly.R b/src/ebola_severity/orderly.R
@@ -15,8 +15,6 @@ library(doconv)
 
 orderly_strict_mode()
 
-orderly_parameters(pathogen = "EBOLA")
-
 orderly_artefact(
   "Plots and tables for severity parameters",
   c(
@@ -40,7 +38,7 @@ orderly_artefact(
 # Get data from db_compilation
 orderly_dependency(
   "db_compilation",
-  "latest(parameter:pathogen == this:pathogen)",
+  "latest(parameter:pathogen == 'EBOLA')",
   c(
     "articles.csv",
     "parameters.csv"
@@ -190,6 +188,8 @@ ordered_dat <- sev_dat %>%
     )
   )
 
+saveRDS(ordered_dat, "ebola_severity.rds")
+orderly_artefact("Ebola severity estimates", "ebola_severity.rds")
 # Pull numbers
 ordered_dat %>% nrow() # 166
 length(unique(ordered_dat$covidence_id)) # 130

diff --git a/src/ebola_severity_additional/ebola_severity_additional.R b/src/ebola_severity_additional/ebola_severity_additional.R
@@ -0,0 +1,159 @@
+library(dplyr)
+library(orderly2)
+library(epireview)
+orderly_dependency("ebola_severity", "latest", "ebola_severity.rds")
+orderly_shared_resource("ebola_functions.R" = "ebola_functions.R")
+orderly_shared_resource("ebola_visualisation.R" = "ebola_visualisation.R")
+
+source("ebola_functions.R")
+source("ebola_visualisation.R")
+
+cfr <- readRDS("ebola_severity.rds")
+## Removing this as it is a CFR across multiple outbreaks from 1976 to 2014
+idx <- cfr$covidence_id %in% 2594 & cfr$cfr_ifr_numerator %in% 526
+cfr <- cfr[!idx, ]
+## Text says 96; table says 88. I can't see where the 96 comes from
+## Going with 88. No CI provided for this estimate
+idx <- cfr$covidence_id %in% 2594 & cfr$cfr_ifr_numerator %in% 252
+cfr$parameter_value[idx] <- 88
+cfr$parameter_uncertainty_lower_value[idx] <- NA
+cfr$parameter_uncertainty_upper_value[idx] <- NA
+
+## Study in pregnant women; naive estimate
+cfr$cfr_ifr_method[cfr$covidence_id %in% 1889] <- "Naive"
+
+## https://doi.org/10.1093/cid/ciz678 estimates CFR with and without
+## imputation of missing data. Only number of cases without impuation
+## has been extracted, even where the CFR itself is based on imputed
+## data. Fixing this.
+cfr$population_sample_size[cfr$covidence_id %in% 16500 & 
+  cfr$cfr_ifr_method %in% "Adjusted"] <- 33338
+cfr$population_sample_size[cfr$covidence_id %in% 16500 & 
+  cfr$cfr_ifr_method %in% "Naive" & cfr$parameter_value == 71.9] <- 33338  
+
+## Adjusted CFRs, high-quality studies
+adjusted_high <- cfr[cfr$cfr_ifr_method %in% "Adjusted" & 
+  cfr$article_qa_score > 50, ]
+adjusted_high <- param_pm_uncertainty(adjusted_high) |> 
+  mark_multiple_estimates(label_type = "letters") |>
+  reorder_studies()
+p1 <- forest_plot(
+  adjusted_high, col_by = "population_country", facet_by = "ebola_species") +
+labs(x = "Case fatality rate (%)", 
+  title = "Adjusted case fatality rates in high-quality studies") +
+  theme(axis.text.y = element_text(size = 8))
+
+ggsave("adjusted_high.png", p1, width = 10, height = 6, units = "in")
+
+adjusted_all <- cfr[cfr$cfr_ifr_method %in% "Adjusted", ]
+adjusted_all <- param_pm_uncertainty(adjusted_all) |> 
+  mark_multiple_estimates(label_type = "letters") |>
+  reorder_studies()
+p2 <- forest_plot(
+  adjusted_all, col_by = "population_country", facet_by = "ebola_species") +
+labs(x = "Case fatality rate (%)", 
+  title = "Adjusted CFR estimates") +
+  theme(axis.text.y = element_text(size = 8))
+
+ggsave("adjusted_all.png", p2, width = 10, height = 6, units = "in")
+
+naive_all <- cfr[cfr$cfr_ifr_method %in% "Naive", ]
+naive_all <- param_pm_uncertainty(naive_all) |> 
+  mark_multiple_estimates(label_type = "letters") |>
+  reorder_studies()
+
+p3 <- forest_plot(
+  naive_all, col_by = "population_country", facet_by = "ebola_species") +
+labs(x = "Case fatality rate (%)",
+  title = "Naive CFR estimates") +
+  theme(axis.text.y = element_text(size = 6))
+
+ggsave("naive_all.png", p3, width = 10, height = 6, units = "in")
+
+## Retain only general population estimates
+pb <- cfr[cfr$population_sample_type %in% "Population based" & 
+  cfr$article_qa_score > 50, ]
+pb <- param_pm_uncertainty(pb)  
+pb <- mark_multiple_estimates(pb, label_type = "letters")
+pb <- reorder_studies(pb)
+p4 <- forest_plot(
+  pb, col_by = "population_country", facet_by = "ebola_species"
+) +
+  labs(x = "Case fatality rate (%)", 
+    title = "Population-based CFR estimates in high-quality studies") +
+  theme(axis.text.y = element_text(size = 8))
+
+ggsave("population_based.png", p4, width = 10, height = 6, units = "in")
+
+pb <- cfr[cfr$population_sample_type %in% "Population based", ]
+pb <- param_pm_uncertainty(pb)  
+pb <- mark_multiple_estimates(pb, label_type = "letters")
+pb <- reorder_studies(pb)
+p5 <- forest_plot(
+  pb, col_by = "population_country", facet_by = "ebola_species"
+) +
+  labs(x = "Case fatality rate (%)", 
+    title = "Population-based CFR estimates") +
+  theme(axis.text.y = element_text(size = 8))
+
+ggsave("population_based_all.png", p5, width = 10, height = 6, units = "in")
+
+## Estimates where the population_sample_type is not population-based; could be 
+## missing or hospital-based etc
+gb <- cfr[!cfr$population_sample_type %in% "Population based", ]
+gb <- param_pm_uncertainty(gb)  
+gb <- mark_multiple_estimates(gb, label_type = "letters")
+gb <- reorder_studies(gb)
+p5 <- forest_plot(
+  gb, col_by = "population_country", facet_by = "ebola_species"
+) +
+  labs(x = "Case fatality rate (%)", 
+    title = "Population-based CFR estimates") +
+  theme(axis.text.y = element_text(size = 8))
+
+ggsave("not_population_based_all.png", p5, width = 10, height = 6, units = "in")
+
+## Distribution of central values by denominator
+wa <- cfr[cfr$outbreak %in% "West Africa 2013-2016", ]
+wa$sample_size <- case_when(
+  wa$cfr_ifr_denominator <= 100 ~ "0-100",
+  wa$cfr_ifr_denominator > 100 & wa$cfr_ifr_denominator <= 1000 ~ 
+    "101-1000",
+  wa$cfr_ifr_denominator > 1000 ~ " > 1000",
+  TRUE ~ NA_character_
+)
+
+p <- ggplot(wa, aes(y = parameter_value, x = cfr_ifr_denominator)) +
+  geom_point(alpha = 0.5) +
+  facet_wrap(~sample_size, scales = "free_x") + 
+  labs(x = "Case fatality rate (%)", 
+    title = "Distribution of central values by denominator") +
+  theme(legend.position = "bottom") +
+  ylim(0, 100)
+
+ggsave("cfr_denominator.png", width = 10, height = 6, units = "in")
+
+weighted_cfr <- function(x) {
+  sum(
+    (x$parameter_value * x$cfr_ifr_denominator) / 
+    sum(x$cfr_ifr_denominator, na.rm = TRUE),
+    na.rm = TRUE
+  )
+}
+w_overall <- weighted_cfr(cfr)
+uw_overall <- mean(cfr$parameter_value, na.rm = TRUE)
+w_by_method <- split(cfr, cfr$cfr_ifr_method) |>
+  map_dfr(weighted_cfr, .id = "cfr_ifr_method") 
+uw_by_method <- cfr |>
+  group_by(cfr_ifr_method) |>
+  summarise(uw = mean(parameter_value, na.rm = TRUE))
+
+p <- ggplot(cfr, aes(parameter_value)) + 
+  geom_histogram(binwidth = 1, position = position_dodge()) +
+  facet_wrap(~cfr_ifr_method, ncol = 1) +
+  ##geom_vline(xintercept = weighted_cfr, color = "red") +
+  labs(x = "Case fatality rate (%)", 
+    title = "Distribution of central values") +
+    theme_minimal()
+
+ggsave("cfr_hist.png", p, width = 10, height = 6, units = "in")