Skip to content

Commit

Permalink
version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed Apr 30, 2024
1 parent 55336dc commit 6ea11e2
Show file tree
Hide file tree
Showing 126 changed files with 3,407 additions and 1,717 deletions.
4 changes: 4 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
^NOTICE$
^analyses$
^dev$
^cran-comments\.md$
^CRAN-SUBMISSION$
^DESCRIPTION-github$
^DESCRIPTION-cran$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@
.Rhistory
.RData
.Ruserdata
DESCRIPTION-github
DESCRIPTION-cran
CRAN-SUBMISSION
cran-comments.md
/dev/**
43 changes: 21 additions & 22 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
Package: goat
Type: Package
Title: Geneset Ordinal Association Test (GOAT)
Version: 0.9.6
Authors@R:
Title: Gene Set Analysis Using the Gene Set Ordinal Association Test
Version: 1.0
Authors@R:
person(given = "Frank",
family = "Koopmans",
role = c("aut", "cre"),
email = "frank.koopmans@vu.nl",
email = "ftwkoopmans@gmail.com",
comment = c(ORCID = "0000-0002-4973-5732"))
Description:
Perform geneset enrichment analyses using the GOAT algorithm.
URL: https://github.com/ftwkoopmans/goat
Description:
Perform gene set enrichment analyses using the Gene set Ordinal
Association Test (GOAT) algorithm and visualize your
results. Koopmans, F. (2024) <doi:10.1101/2023.12.10.570979>.
URL: https://github.com/ftwkoopmans/goat/
License: Apache License (>= 2)
Depends:
Depends:
R (>= 4.1.0),
dplyr (>= 1.0.3)
Imports:
Imports:
tibble (>= 3.0.0),
tidyselect (>= 1.2.0),
tidyr (>= 1.1.2),
Expand All @@ -27,27 +29,24 @@ Imports:
vctrs (>= 0.3.8),
MonoPoly (>= 0.3-10),
ggplot2 (>= 3.3.0),
pheatmap,
treemap,
pheatmap (>= 1.0.8),
treemap (>= 2.4),
igraph (>= 1.2.5),
ggraph (>= 2.0.0),
ggraph (>= 2.0.0)
Suggests:
AnnotationDbi,
GO.db,
org.Hs.eg.db
Suggests:
org.Hs.eg.db,
fgsea,
iDEA,
future (>= 1.21.0),
future.apply (>= 1.9.0),
testthat (>= 3.0.0)
Remotes:
github::ctlab/fgsea,
github::xzhoulab/iDEA
BiocViews:
LinkingTo: Rcpp
Encoding: UTF-8
LazyData: true
LazyDataCompression: xz
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.0
RoxygenNote: 7.3.1
Config/testthat/edition: 3
Language: en-US
Remotes:
github::ctlab/fgsea
BiocViews:
2 changes: 1 addition & 1 deletion LICENSE.note
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ All code and documentation in the goat R package as a whole is distributed under

The goat R package includes the 'RunningStat' implementation of "Welford running variance" provided on John Cook's blog at https://www.johndcook.com/blog/standard_deviation/

The goat R package bundles a number of publicly available datasets that were adopted from published scientific manuscripts. Respective PubMed identifiers are included in the dataset names.
The goat R package bundles publicly available data that were adopted from published scientific manuscripts. Respective PubMed identifiers are included in the dataset names.
15 changes: 13 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
# Generated by roxygen2: do not edit by hand

export(cluster_genesets)
export(darken_color)
export(download_genesets_goatrepo)
export(download_goat_manuscript_data)
export(filter_genesets)
export(gg_color_hue)
export(go_gene2go)
export(go_obo)
export(goat_logo)
export(goat_print_version)
export(goat_version)
export(hgnc_lookuptable)
export(hgnc_idmap_table)
export(lighten_color)
export(load_genesets_gmtfile)
export(load_genesets_go_bioconductor)
export(load_genesets_go_fromfile)
Expand All @@ -16,9 +23,14 @@ export(partition_genes)
export(plot_heatmap)
export(plot_lollipop)
export(plot_network)
export(plot_volcano)
export(rankscore)
export(rankscore_fixed_order)
export(reduce_genesets)
export(save_genesets)
export(score_geneset_directionality)
export(score_geneset_oddsratio)
export(string_trunc_right)
export(symbol_to_entrez)
export(test_genesets)
export(test_genesets_fisherexact)
Expand All @@ -27,7 +39,6 @@ export(test_genesets_goat_fitfunction)
export(test_genesets_goat_precomputed)
export(test_genesets_gsea)
export(test_genesets_hypergeometric)
export(test_genesets_idea)
export(treemap_data)
export(treemap_plot)
import(dplyr)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# *News*

# goat 1.0 (2024-04-28)

First public release.
11 changes: 2 additions & 9 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
#' Previously published datasets that were used as examples in the GOAT manuscript
#'
#' @format ## `goat_example_datasets`
#' a list of data.frames that represent differential expression analysis from high-throughput datasets
#' @source The name of each dataset in `goat_example_datasets` contains the respective PubMed identifier
"goat_example_datasets"



#' Precomputed parameters used by the GOAT algorithm
#'
#' @description these parameters are used by goat to efficiently perform geneset testing without bootstrapping
#' @format ## `goat_nulldistributions`
#' precomputed null distribution parameters, used by goat to efficiently perform geneset testing without bootstrapping
#' a data.frame with precomputed GOAT null distribution parameters
"goat_nulldistributions"
159 changes: 159 additions & 0 deletions R/download_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@

#' Download the datasets that were used in the GOAT manuscript
#'
#' @description
#' Downloads OMICs-based datasets that were used in the GOAT manuscript from the GOAT GitHub page.
#' This file is cached in the output directory and only needs to be downloaded once. Multiple datasets
#' are included and their names include the respective PubMed identifiers (PMID).
#'
#' The download is first written to a temporary file next to the final location and only moved into
#' place after it completed successfully, so a failed or interrupted download never replaces
#' previously cached data nor leaves a truncated file behind that is later mistaken for a valid cache.
#'
#' If you encounter technical difficulties, try to;
#'
#' 1) download the file by copy/pasting this URL into your browser: https://github.com/ftwkoopmans/goat/raw/main/analyses/goat_manuscript_datasets.rda
#' 2) load the data in R using the following 2 lines of code, here assuming you stored the downloaded file at C:/data/goat_manuscript_datasets.rda
#'
#' `load("C:/data/goat_manuscript_datasets.rda")`
#'
#' `genelist = goat_manuscript_datasets[["Wingo 2020:mass-spec:PMID32424284"]]`
#'
#' @param output_dir full path to the directory where the downloaded files should be stored. Directory is created if it does not exist.
#' e.g. `output_dir="~/data"` on unix systems, `output_dir="C:/data"` on Windows, or set to `output_dir=getwd()` to write output to the current working directory
#' @param ignore_cache boolean, set to TRUE to force re-download and ignore cached data, if any. Default: FALSE
#' @return a list of genelist data tables. The names of the list represent the datasets,
#' values in the list are data tables that can be used as a "genelist" in the GOAT R package.
#' An error is raised if the output directory cannot be created, the download fails, or the
#' (cached) file cannot be loaded or does not contain the expected variable
#' @export
download_goat_manuscript_data = function(output_dir, ignore_cache = FALSE) {
  stopifnot("parameter output_dir must be a single string and represent a directory on your computer" = length(output_dir) == 1 && is.character(output_dir) && !is.na(output_dir))
  stopifnot("parameter ignore_cache must be a single boolean value" = length(ignore_cache) == 1 && ignore_cache %in% c(TRUE, FALSE))
  sprintf_template_downloadfail = "failed to download %s and store it at %s\nTry an alternative output_dir parameter or follow the download_goat_manuscript_data() function documentation to manually download the file and load it in R"

  # create dir if it does not exist
  if(!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
    if(!dir.exists(output_dir)) {
      stop(paste0("Could not create the requested output directory: ", output_dir, "\nTry to provide an existing directory as parameter for 'output_dir'"))
    }
  }

  # load from cache, or download if not available
  filename = paste0(output_dir, "/goat_manuscript_datasets.rda")
  url = "https://github.com/ftwkoopmans/goat/raw/main/analyses/goat_manuscript_datasets.rda"
  if(ignore_cache || !file.exists(filename)) {
    message(paste("downloading", url, "..."))

    # download into a temporary sibling file; the cache location is only touched after success
    filename_partial = paste0(filename, ".part")
    on.exit(unlink(filename_partial), add = TRUE)

    # download.file() either throws an error or returns a non-zero status code on failure
    status = tryCatch(
      utils::download.file(url, filename_partial, mode = "wb"),
      error = function(err) err
    )
    download_ok = !inherits(status, "error") &&
      isTRUE(status == 0) &&
      file.exists(filename_partial) &&
      isTRUE(file.size(filename_partial) > 0)
    if(!download_ok) {
      details = if(inherits(status, "error")) paste0("\n", conditionMessage(status)) else ""
      stop(paste0(sprintf(sprintf_template_downloadfail, url, filename), details))
    }

    # move the completed download into place; fall back to copying if renaming is not possible
    moved = suppressWarnings(file.rename(filename_partial, filename)) ||
      file.copy(filename_partial, filename, overwrite = TRUE)
    if(!moved || !file.exists(filename)) {
      stop(sprintf(sprintf_template_downloadfail, url, filename))
    }
    message(paste0("downloaded data was stored at: ", filename))
  } else {
    message(paste0("cached data was retrieved from: ", filename))
  }

  # load RData file into a private environment so the caller's workspace is not modified
  e = new.env()
  load_error = tryCatch({
    load(filename, envir = e)
    NULL
  }, error = function(err) err)
  if(!is.null(load_error)) {
    stop(paste0(
      "failed to load RData file '", filename, "'; it may be corrupted or incomplete. ",
      "Call this function again with parameter ignore_cache=TRUE to download it anew\n",
      conditionMessage(load_error)
    ))
  }

  # validate that the expected variable is present
  if(!is.list(e$goat_manuscript_datasets)) {
    stop("failed to load RData file; it did not contain expected variable 'goat_manuscript_datasets'")
  }

  return(e$goat_manuscript_datasets)
}



#' Download and parse geneset collections from the GOAT GitHub repository
#'
#' @description while the Bioconductor repository is extensive, contains data for many species and is a part of
#' a larger infrastructure, it might contain outdated GO data when the user is not using the latest R version.
#' If users are on an R version that is a few years old, so will the GO data from Bioconductor be.
#'
#' As an alternative, we store gene2go data from NCBI (for Human genes only!) at the GOAT GitHub repository.
#' This function allows for a convenient way to download this data and then parse the genesets.
#'
#' Alternatively you can browse the file in the data branch of the GOAT GitHub repository and download these files manually,
#' then load them via the GOAT R function `load_genesets_go_fromfile()`.
#'
#' To view all available data you can open this URL in a browser; https://github.com/ftwkoopmans/goat/tree/data
#'
#' New data is automatically added biannually. The first available version is 2024-01-01, the next 2024-06-01, then 2025-01-01, and so on.
#'
#' Each file is first downloaded to a temporary file next to its final location and only moved into
#' place after the download completed successfully, so a failed or interrupted download never replaces
#' previously cached files nor leaves a truncated file behind that is later mistaken for a valid cache.
#'
#' @examples \donttest{
#'   # note: this example will download 2 files of approx 10MB in total
#'
#'   # store the downloaded files in the following directory. Here, the temporary file
#'   # directory is used. Alternatively, consider storing this data in a more permanent location.
#'   # e.g. output_dir="~/data/go" on unix systems or output_dir="C:/data/go" on Windows
#'   output_dir = tempdir()
#'
#'   # download data files with GO annotations, version 2024-01-01 (default parameter)
#'   # these are then parsed with the load_genesets_go_fromfile() function
#'   # if the files are already available at output_dir, the download step is skipped
#'   genesets_asis = download_genesets_goatrepo(output_dir)
#'
#'   ### for a basic example on how to use the data obtained here,
#'   ### refer to the example included at function documentation of: test_genesets()
#' }
#' @param output_dir full path to the directory where the downloaded files should be stored. Directory is created if it does not exist.
#' e.g. `output_dir="~/data"` on unix systems, `output_dir="C:/data"` on Windows, or set to `output_dir=getwd()` to write output to the current working directory
#' @param type the type of genesets to download. Currently, only "GO" is supported (default)
#' @param version the dataset version. This must be a date in format YYYY-MM-DD. Example: "2024-01-01" (default). View all available versions at https://github.com/ftwkoopmans/goat/tree/data
#' @param ignore_cache boolean, set to TRUE to force re-download and ignore cached data, if any. Default: FALSE
#' @return result from the respective geneset parser function. e.g. if parameter `type` was set to "GO" (default), this function returns the result of `load_genesets_go_fromfile()`. The data returned by this function is typically used as input for `filter_genesets()`, c.f. full example at documentation for test_genesets()
#' @export
download_genesets_goatrepo = function(output_dir, type = "GO", version = "2024-01-01", ignore_cache = FALSE) {
  stopifnot("parameter type must be a single string. The only supported option for now is 'GO' (default)" = length(type) == 1 && is.character(type) && !is.na(type) && type %in% c("GO"))
  stopifnot("parameter version must be a single string that represents a date, see function documentation" = length(version) == 1 && is.character(version) && !is.na(version) && grepl("^\\d\\d\\d\\d\\-\\d\\d-\\d\\d$", version))
  stopifnot("parameter output_dir must be a single string and represent a directory on your computer" = length(output_dir) == 1 && is.character(output_dir) && !is.na(output_dir))
  stopifnot("parameter ignore_cache must be a single boolean value" = length(ignore_cache) == 1 && ignore_cache %in% c(TRUE, FALSE))

  # create dir if it does not exist
  if(!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
    if(!dir.exists(output_dir)) {
      stop(paste0("Could not create the requested output directory: ", output_dir, "\nTry to provide an existing directory as parameter for 'output_dir'"))
    }
  }

  if(type == "GO") {
    file_g2g = sprintf("%s/gene2go_%s.gz", output_dir, version)
    file_obo = sprintf("%s/go_%s.obo.gz", output_dir, version)
    url_g2g = sprintf("https://github.com/ftwkoopmans/goat/raw/data/go/%s/gene2go_human_%s.gz", version, version)
    url_obo = sprintf("https://github.com/ftwkoopmans/goat/raw/data/go/%s/go_%s.obo.gz", version, version)
    any_download = FALSE
    sprintf_template_downloadfail = "failed to download %s and store it at %s\nMost likely causes are 1) the requested file/version does not exist (try the default parameter!) and 2) Internet connection issues (try to download the here mentioned URL by copy/pasting in your browser).\nPlease refer to the download_genesets_goatrepo() function documentation to learn how you can find available versions (besides the default parameter)"

    # private helper: download `url` to `destfile` via a temporary sibling file, raise an error on any failure.
    # the final location is only touched after the download succeeded
    download_or_stop = function(url, destfile) {
      message(paste("downloading", url, "..."))
      destfile_partial = paste0(destfile, ".part")
      on.exit(unlink(destfile_partial), add = TRUE)

      # download.file() either throws an error or returns a non-zero status code on failure
      status = tryCatch(
        utils::download.file(url, destfile_partial, mode = "wb"),
        error = function(err) err
      )
      download_ok = !inherits(status, "error") &&
        isTRUE(status == 0) &&
        file.exists(destfile_partial) &&
        isTRUE(file.size(destfile_partial) > 0)
      if(!download_ok) {
        details = if(inherits(status, "error")) paste0("\n", conditionMessage(status)) else ""
        stop(paste0(sprintf(sprintf_template_downloadfail, url, destfile), details))
      }

      # move the completed download into place; fall back to copying if renaming is not possible
      moved = suppressWarnings(file.rename(destfile_partial, destfile)) ||
        file.copy(destfile_partial, destfile, overwrite = TRUE)
      if(!moved || !file.exists(destfile)) {
        stop(sprintf(sprintf_template_downloadfail, url, destfile))
      }
      invisible(TRUE)
    }

    # attempt to download if not available on disk
    if(ignore_cache || !file.exists(file_g2g)) {
      download_or_stop(url_g2g, file_g2g)
      any_download = TRUE
    }

    # attempt to download if not available on disk
    if(ignore_cache || !file.exists(file_obo)) {
      download_or_stop(url_obo, file_obo)
      any_download = TRUE
    }

    if(any_download) {
      message(paste("downloaded geneset files were stored at:", output_dir))
    } else {
      message(paste("cached geneset files were retrieved from:", output_dir))
    }

    return(load_genesets_go_fromfile(file_gene2go = file_g2g, file_goobo = file_obo))
  }

  # ... other types may be added in the future

}


6 changes: 3 additions & 3 deletions R/filter_genesets.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ filter_genesets = function(genesets, genelist, min_overlap = 10L, max_overlap =
if(!is.finite(max_size)) max_size = Inf # users can provide NA to disable filtering

if(min_signif > 0) {
cat("Warning: the 'min_signif' parameter is enabled. Be careful, this is \"prefiltering\" and will affect the correcteness / calibration of estimated geneset p-values. For GOAT and GSEA, this is NOT RECOMMENDED\n")
warning("the 'min_signif' parameter is enabled. Be careful, this is \"prefiltering\" and will affect the correcteness / calibration of estimated geneset p-values. For GOAT and GSEA, this is NOT RECOMMENDED")
}

# settings as string
Expand Down Expand Up @@ -78,7 +78,7 @@ filter_genesets = function(genesets, genelist, min_overlap = 10L, max_overlap =
group_by(ngenes) |> # can only be a dupe if vector length is equal, so efficiently check within same-length
mutate(isdupe = finddupes(genes)) |>
ungroup() |>
filter(isdupe == F) |>
filter(isdupe == FALSE) |>
select(-isdupe)
}

Expand All @@ -96,7 +96,7 @@ filter_genesets = function(genesets, genelist, min_overlap = 10L, max_overlap =
}

if(nrow(x) == 0) {
cat("filter_genesets() yields an empty result !\nAre the gene identifiers in your 'genesets' and 'genelist tables of the same type? e.g. both tables should contain NCBI Entrez gene IDs, or both use HGNC identifiers, or Ensembl gene IDs. Another common mistake is using different species, so double-check that both tables contain e.g. human gene identifiers\n")
warning("filter_genesets() yields an empty result !\nAre the gene identifiers in your 'genesets' and 'genelist tables of the same type? e.g. both tables should contain NCBI Entrez gene IDs, or both use HGNC identifiers, or Ensembl gene IDs. Another common mistake is using different species, so double-check that both tables contain e.g. human gene identifiers")
}

attr(x, "settings") <- c(attr(genesets, "settings"), settings)
Expand Down
Loading

0 comments on commit 6ea11e2

Please sign in to comment.