Merge pull request #4 from toumban1/integrate_2023_changes

Integrate 2023 changes
MSKCC-Epi-Bio · Aug 26, 2024 · 1036d2b · 1036d2b
2 parents 6f9c013 + ec26164
commit 1036d2b
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 6 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,7 +30,7 @@ LazyData: true
 URL: https://github.com/MSKCC-Epi-Bio/gnomeR,
     https://mskcc-epi-bio.github.io/gnomeR/
 BugReports: https://github.com/MSKCC-Epi-Bio/gnomeR/issues
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Depends: R (>= 3.6)
 biocViews: 
     ComplexHeatmap,

diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R
@@ -6,6 +6,7 @@
 #' @param t Threshold value between 0 and 1 to subset by. Default is 10% (.1).
 #' @param other_vars One or more column names (quoted or unquoted) in data to be retained
 #' in resulting data frame. Default is NULL.
+#' @param by Variable used to subset the data. Default is NULL.
 #' @return a data frame with a `sample_id` column and columns for
 #' alterations over the given prevalence threshold of `t`.
 #'
@@ -22,7 +23,7 @@
 #'gene_binary %>%
 #'  subset_by_frequency()
 #'
-subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
+subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NULL) {
 
 
   # Checks ------------------------------------------------------------------
@@ -46,11 +47,19 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
                         arg_name = "other_vars"
     )
 
+  # Define by variable
+
+  by <-
+    .select_to_varnames({{ by }},
+                        data = gene_binary,
+                        arg_name = "by"
+    )
+
   # data frame of only alterations
   alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars))
 
   # Remove all NA columns ----------------------------------------------
-  all_na_alt <- apply(alt_only,  2, function(x) {
+  all_na_alt <- apply(alt_only, 2, function(x) {
      sum(is.na(x)) == nrow(alt_only)
   })
 
@@ -59,10 +68,19 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
 
 
   # Check Numeric Class -----------------------------------------------------
-  .abort_if_not_numeric(alt_only)
+
+  if (is.null(by)) {
+    .abort_if_not_numeric(alt_only)
+  }
+  else {
+    .abort_if_not_numeric(select(alt_only, -any_of(by)))
+  }
 
 
   # Calc Frequency ----------------------------------------------------------
+
+  if(is.null(by)){
+
   counts <- apply(alt_only, 2,  function(x) {sum(x, na.rm = TRUE)})
   num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))
 
@@ -74,5 +92,39 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
                           all_of(alts_over_thresh))
 
   return(subset_binary)
+  }
+  else{
+
+    alt_data <-
+      alt_only |>
+      group_by(across(any_of(by))) |>
+      summarise_all(list(sum = ~ sum(.), total = ~ sum(!is.na(.))), na.rm = T)
+
+    alt_group_data <-
+      alt_data |>
+      pivot_longer(-any_of(by),
+                   names_to = c("gene")) |>
+      separate(gene, into = c("gene", "measure"), sep = "_") |>
+      pivot_wider(names_from = measure,
+                  values_from = value) |>
+      mutate(propo = sum/total) |>
+      arrange(desc(propo))
+
+    alts_over_thresh_grp <-
+      alt_group_data |>
+      filter(propo > t) |>
+      group_by(gene) |>
+      select(gene) |>
+      unique() |>
+      unlist() |>
+      as.vector()
+
+    subset_binary <- select(gene_binary, "sample_id",
+                            any_of(by),
+                            any_of(other_vars),
+                            all_of(alts_over_thresh_grp))
+
+    return(subset_binary)
 
+  }
 }
diff --git a/README.Rmd b/README.Rmd
@@ -18,6 +18,7 @@ knitr::opts_chunk$set(
 library(dplyr)
 library(gtsummary)
 library(gnomeR)
+
 ```
 
 # gnomeR

diff --git a/man/subset_by_frequency.Rd b/man/subset_by_frequency.Rd
diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf
diff --git a/tests/testthat/test-plots.R b/tests/testthat/test-plots.R
@@ -287,5 +287,4 @@ test_that("mutation_viz works", {
 #
 # })
 
-# testthat::test_file("C:\\Users\\toumban\\OneDrive - Memorial Sloan Kettering Cancer Center\\Desktop\\gnomeR\\tests\\testthat\\test-plots.R")
 
diff --git a/tests/testthat/test-subset_by_freq.R b/tests/testthat/test-subset_by_freq.R
@@ -174,3 +174,77 @@ test_that("Pass `other_vars` as strings works", {
   expect_equal(names(sub2), names(sub))
 })
 
+
+test_that("Check for unknown `by` variable", {
+
+  bm <- bind_rows(
+    "gen50" = c(rep(0, 5), rep(1, 5)),
+    "gene20" = c(rep(0, 8), rep(1, 2)),
+    "gene0" = c(rep(0, 10), rep(1, 0)),
+    "sex" = rep(c("F", "M"), 5),
+    "stage" = rep(c("I", "II"), 5)) %>%
+    mutate(sample_id = as.character(1:nrow(.)))
+
+  expect_error(bm |>
+                 subset_by_frequency(t = .1, other_vars = c(sex, stage), by = grade),
+               "Error in `by=` argument input. Select from")
+
+})
+
+
+test_that("Check `by` variable works", {
+
+  bm <- bind_rows(
+    "gen50" = c(rep(0, 5), rep(1, 5)),
+    "gene20" = c(rep(0, 8), rep(1, 2)),
+    "gene0" = c(rep(0, 10), rep(1, 0)),
+    "sex" = rep(c("F", "M"), 5),
+    "stage" = rep(c("I", "II"), 5)) %>%
+    mutate(sample_id = as.character(1:nrow(.)))
+
+  sub <- bm %>%
+    subset_by_frequency(t = .1, other_vars = stage, by = sex)
+
+  expect_equal(setdiff(names(bm), names(sub)), c("gene0"))
+
+  bm1 <- bind_rows(
+    "gen50" = c(rep(0, 5), rep(1, 5)),
+    "gene20" = c(rep(0, 8), rep(1, 2)),
+    "gene0" = c(rep(0, 10), rep(1, 0)),
+    "sex" = c(rep("F", 4), rep("M", 4), "F", "M"),
+    "stage" = rep(c("I", "II"), 5)) %>%
+    mutate(sample_id = as.character(1:nrow(.)))
+
+  sub1 <- bm1 %>%
+    subset_by_frequency(t = .25, other_vars = stage, by = sex)
+
+  sub2 <- bm1 %>%
+    subset_by_frequency(t = .85, other_vars = stage, by = sex)
+
+  expect_equal(setdiff(names(bm1), names(sub1)), c("gene20", "gene0"))
+
+  expect_equal(setdiff(names(bm1), names(sub2)), c("gen50", "gene20", "gene0"))
+})
+
+
+test_that("Check categorical `by` variable works", {
+
+  bm <- bind_rows(
+    "gen50" = c(rep(0, 5), rep(1, 5)),
+    "gene20" = c(rep(0, 8), rep(1, 2)),
+    "gene10" = c(rep(0, 9), rep(1, 1)),
+    "gene0" = c(rep(0, 10), rep(1, 0)),
+    "sex" = c(rep("F", 4), rep("M", 4), "F", "M"),
+    "stage" = rep(c("I", "II"), 5),
+    "grade" = c(rep(1:4, 2), 1, 2)) %>%
+    mutate(sample_id = as.character(1:nrow(.)))
+
+  sub <- subset_by_frequency(bm, other_vars = c("sex", "stage"), by = grade)
+
+  sub1 <- subset_by_frequency(bm, t = 0.35, other_vars = c("sex", "stage"), by = grade)
+
+  expect_equal(setdiff(names(bm), names(sub)), c("gene0"))
+
+  expect_equal(setdiff(names(bm), names(sub1)), c("gene20", "gene10", "gene0"))
+})
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -287,5 +287,4 @@ test_that("mutation_viz works", {
		#
		# })

		# testthat::test_file("C:\\Users\\toumban\\OneDrive - Memorial Sloan Kettering Cancer Center\\Desktop\\gnomeR\\tests\\testthat\\test-plots.R")