Skip to content

Commit

Permalink
Merge pull request #4 from toumban1/integrate_2023_changes
Browse files Browse the repository at this point in the history
Integrate 2023 changes
  • Loading branch information
toumban1 authored Aug 26, 2024
2 parents 6f9c013 + ec26164 commit 1036d2b
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 6 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ LazyData: true
URL: https://github.com/MSKCC-Epi-Bio/gnomeR,
https://mskcc-epi-bio.github.io/gnomeR/
BugReports: https://github.com/MSKCC-Epi-Bio/gnomeR/issues
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Depends: R (>= 3.6)
biocViews:
ComplexHeatmap,
Expand Down
58 changes: 55 additions & 3 deletions R/subset-by-frequency.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#' @param t Threshold value between 0 and 1 to subset by. Default is 10% (.1).
#' @param other_vars One or more column names (quoted or unquoted) in data to be retained
#' in resulting data frame. Default is NULL.
#' @param by Variable used to subset the data. Default is NULL.
#' @return a data frame with a `sample_id` column and columns for
#' alterations over the given prevalence threshold of `t`.
#'
Expand All @@ -22,7 +23,7 @@
#'gene_binary %>%
#' subset_by_frequency()
#'
subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NULL) {


# Checks ------------------------------------------------------------------
Expand All @@ -46,11 +47,19 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
arg_name = "other_vars"
)

# Define by variable

by <-
.select_to_varnames({{ by }},
data = gene_binary,
arg_name = "by"
)

# data frame of only alterations
alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars))

# Remove all NA columns ----------------------------------------------
all_na_alt <- apply(alt_only, 2, function(x) {
all_na_alt <- apply(alt_only, 2, function(x) {
sum(is.na(x)) == nrow(alt_only)
})

Expand All @@ -59,10 +68,19 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {


# Check Numeric Class -----------------------------------------------------
.abort_if_not_numeric(alt_only)

if (is.null(by)) {
.abort_if_not_numeric(alt_only)
}
else {
.abort_if_not_numeric(select(alt_only, -any_of(by)))
}


# Calc Frequency ----------------------------------------------------------

if(is.null(by)){

counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)})
num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))

Expand All @@ -74,5 +92,39 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
all_of(alts_over_thresh))

return(subset_binary)
}
else{

alt_data <-
alt_only |>
group_by(across(any_of(by))) |>
summarise_all(list(sum = ~ sum(.), total = ~ sum(!is.na(.))), na.rm = T)

alt_group_data <-
alt_data |>
pivot_longer(-any_of(by),
names_to = c("gene")) |>
separate(gene, into = c("gene", "measure"), sep = "_") |>
pivot_wider(names_from = measure,
values_from = value) |>
mutate(propo = sum/total) |>
arrange(desc(propo))

alts_over_thresh_grp <-
alt_group_data |>
filter(propo > t) |>
group_by(gene) |>
select(gene) |>
unique() |>
unlist() |>
as.vector()

subset_binary <- select(gene_binary, "sample_id",
any_of(by),
any_of(other_vars),
all_of(alts_over_thresh_grp))

return(subset_binary)

}
}
1 change: 1 addition & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ knitr::opts_chunk$set(
library(dplyr)
library(gtsummary)
library(gnomeR)
```

# gnomeR
Expand Down
4 changes: 3 additions & 1 deletion man/subset_by_frequency.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified tests/testthat/Rplots.pdf
Binary file not shown.
1 change: 0 additions & 1 deletion tests/testthat/test-plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -287,5 +287,4 @@ test_that("mutation_viz works", {
#
# })

# testthat::test_file("C:\\Users\\toumban\\OneDrive - Memorial Sloan Kettering Cancer Center\\Desktop\\gnomeR\\tests\\testthat\\test-plots.R")

74 changes: 74 additions & 0 deletions tests/testthat/test-subset_by_freq.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,77 @@ test_that("Pass `other_vars` as strings works", {
expect_equal(names(sub2), names(sub))
})


test_that("Check for unknown `by` variable", {

bm <- bind_rows(
"gen50" = c(rep(0, 5), rep(1, 5)),
"gene20" = c(rep(0, 8), rep(1, 2)),
"gene0" = c(rep(0, 10), rep(1, 0)),
"sex" = rep(c("F", "M"), 5),
"stage" = rep(c("I", "II"), 5)) %>%
mutate(sample_id = as.character(1:nrow(.)))

expect_error(bm |>
subset_by_frequency(t = .1, other_vars = c(sex, stage), by = grade),
"Error in `by=` argument input. Select from")

})


test_that("Check `by` variable works", {

bm <- bind_rows(
"gen50" = c(rep(0, 5), rep(1, 5)),
"gene20" = c(rep(0, 8), rep(1, 2)),
"gene0" = c(rep(0, 10), rep(1, 0)),
"sex" = rep(c("F", "M"), 5),
"stage" = rep(c("I", "II"), 5)) %>%
mutate(sample_id = as.character(1:nrow(.)))

sub <- bm %>%
subset_by_frequency(t = .1, other_vars = stage, by = sex)

expect_equal(setdiff(names(bm), names(sub)), c("gene0"))

bm1 <- bind_rows(
"gen50" = c(rep(0, 5), rep(1, 5)),
"gene20" = c(rep(0, 8), rep(1, 2)),
"gene0" = c(rep(0, 10), rep(1, 0)),
"sex" = c(rep("F", 4), rep("M", 4), "F", "M"),
"stage" = rep(c("I", "II"), 5)) %>%
mutate(sample_id = as.character(1:nrow(.)))

sub1 <- bm1 %>%
subset_by_frequency(t = .25, other_vars = stage, by = sex)

sub2 <- bm1 %>%
subset_by_frequency(t = .85, other_vars = stage, by = sex)

expect_equal(setdiff(names(bm1), names(sub1)), c("gene20", "gene0"))

expect_equal(setdiff(names(bm1), names(sub2)), c("gen50", "gene20", "gene0"))
})


test_that("Check categorical `by` variable works", {

bm <- bind_rows(
"gen50" = c(rep(0, 5), rep(1, 5)),
"gene20" = c(rep(0, 8), rep(1, 2)),
"gene10" = c(rep(0, 9), rep(1, 1)),
"gene0" = c(rep(0, 10), rep(1, 0)),
"sex" = c(rep("F", 4), rep("M", 4), "F", "M"),
"stage" = rep(c("I", "II"), 5),
"grade" = c(rep(1:4, 2), 1, 2)) %>%
mutate(sample_id = as.character(1:nrow(.)))

sub <- subset_by_frequency(bm, other_vars = c("sex", "stage"), by = grade)

sub1 <- subset_by_frequency(bm, t = 0.35, other_vars = c("sex", "stage"), by = grade)

expect_equal(setdiff(names(bm), names(sub)), c("gene0"))

expect_equal(setdiff(names(bm), names(sub1)), c("gene20", "gene10", "gene0"))
})

0 comments on commit 1036d2b

Please sign in to comment.