Sync

microbiome · Apr 15, 2024 · bbf1760 · bbf1760
2 parents 917cf7f + cb796e4
commit bbf1760
Show file tree

Hide file tree

Showing 15 changed files with 1,302 additions and 141 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,7 +33,8 @@ Suggests:
     glue, 
     sessioninfo, 
     miaTime, 
-    microbiomeDataSets, 
+    microbiomeDataSets,
+    microbiome,
     curatedMetagenomicData, 
     ggplot2, 
     BiocParallel, 

diff --git a/inst/assets/bibliography.bib b/inst/assets/bibliography.bib
@@ -1,3 +1,79 @@
+
+
+
+
+
+
+@Book{Aitchison1986,
+  author =	 {Aitchison, J},
+  title =	 {The Statistical Analysis of Compositional Data},
+  address =	 {London, UK},
+  publisher =	 {Chapman & Hall},
+  year =	 1986
+}
+
+
+  
+
+
+
+@Article{Keshavan2010,
+  author =	 {Keshavan, RH and Montanari, A and Oh, S},
+  title =	 {Matrix Completion From a Few Entries},
+  journal =	 {IEEE Transactions on Information Theory},
+  volume =	 56,
+  pages =	 {2980--2998},
+  year =	 2010
+}
+
+
+
+
+
+
+@Article{Martino2019,
+  author =	 {Martino, C and Morton, J.T. and Marotz, C.A. and Thompson, L.R. and Tripathi, A and Knight, R and Zengler, K},
+  title =	 {A novel sparse compositional technique reveals microbial perturbations},
+  journal =	 {mSystems},
+  volume =	 4,
+  issue =	 1,
+  doi =		 {10.1128/mSystems.00016-19},
+  year =	 2019 
+}
+
+@Article{Gloor2017,
+  author =	 {Gloor, GB and Macklaim, JM and Pawlowsky-Glahn, V
+                  and Egozcue, JJ},
+  title =	 {Microbiome Datasets Are Compositional: And This Is
+                  Not Optional},
+  journal =	 {Frontiers in Microbiology},
+  volume =	 8,
+  issue =	 2224,
+  doi =		 {10.3389/fmicb.2017.02224},
+  year =	 2017
+}
+
+
+
+@Article{Karwowska2024,
+  author =	 {Zuzanna Karwowska and Oliver Aasmets and Estonian Biobank Research Team and Tomasz Kosciolek and Elin Org},
+  title =	 {Effects of Data Transformation and Model Selection on Feature Importance in Microbiome Classification Data},
+  journal =	 {bioRxiv},
+  doi =		 {10.1101/2023.09.19.558406},
+  year =	 {2024}
+}
+
+@Article{Giliberti2022,
+  author =	 {Giliberti, R and Cavaliere, S and Mauriello, IE and Ercolini, D and Pasolli, E},
+  title =	 {Host phenotype classification from human microbiome data is mainly driven by the presence of microbial taxa.},
+  journal =	 {PLoS Comput Biol.},
+  volume =	 18,
+  number =	 4,
+  pages =	 {e1010066},
+  doi =		 {10.1371/journal.pcbi.1010066},
+  year =	 {2022}
+}
+
 @Manual{serizay2023,
     title = {BiocBook: Write, publish and maintain versioned Quarto books with Bioconductor},
     author = {Jacques Serizay},

diff --git a/inst/pages/04_containers.qmd b/inst/pages/04_containers.qmd
@@ -374,7 +374,6 @@ Load required packages.
 
 ```{r dada2_2}
 library(mia)
-library(ggplot2)
 library(BiocManager)
 library(Biostrings)
 ```

diff --git a/inst/pages/11_taxonomic_information.qmd b/inst/pages/11_taxonomic_information.qmd
@@ -118,17 +118,17 @@ specified taxonomic rank.
 head(getUniqueFeatures(tse, rank = "Phylum"))
 ```
 
-### Generate a taxonomic tree on the fly {#sec-fly-tree}
+### Generate a hierarchy tree on the fly {#sec-fly-tree}
 
-To create a taxonomic tree, `taxonomyTree` used the information and returns a
+To create a hierarchy tree, `getHierarchyTree` uses the information and returns a
 `phylo` object. Duplicate information from the `rowData` is removed.
 
 ```{r}
-taxonomyTree(tse)
+getHierarchyTree(tse)
 ```
 
 ```{r}
-tse <- addTaxonomyTree(tse)
+tse <- addHierarchyTree(tse)
 tse
 ```
 
@@ -219,7 +219,7 @@ tse <- transformAssay(tse, assay.type = "clr", method = "z",
                       MARGIN = "features")
 
 # Cluster (with euclidean distance) on the features of the z assay
-tse <- cluster(tse,
+tse <- addCluster(tse,
                assay.type = "z",
                clust.col = "hclustEuclidean",
 	       MARGIN = "features",
@@ -231,7 +231,7 @@ kendall_dissimilarity <- function(x) {
 }
 
 # Cluster (with Kendall dissimilarity) on the features of the z assay
-tse <- cluster(tse,
+tse <- addCluster(tse,
                assay.type = "z",
                clust.col = "hclustKendall",
        	       MARGIN = "features", 	       
@@ -256,11 +256,11 @@ plot the histogram of the clusters.
 ```{r taxa_clustering_histogram}
 library(ggplot2)
 library(patchwork) # TO arrange several plots as a grid
-plot1 <- ggplot(as.data.frame(rowData(tse)), aes(x = clusters_euclidean)) +
+plot1 <- ggplot(rowData(tse), aes(x = clusters_euclidean)) +
     geom_bar() +
     labs(title = "CAG size distribution (Euclidean distance)",
          x = "Clusters", y = "Feature count (n)")
-plot2 <- ggplot(as.data.frame(rowData(tse)), aes(x = clusters_kendall)) +
+plot2 <- ggplot(rowData(tse), aes(x = clusters_kendall)) +
     geom_bar() +
     labs(title = "CAG size distribution (1 - tau)",
          x = "Clusters", y = "Feature count (n)")
@@ -277,40 +277,133 @@ tse_merged
 We can note that it worked as planned since there were 5 clusters and there are
 now 5 rows.
 
-## Data transformation {#sec-assay-transform}
-
-Data transformations are common in microbiome analysis. Examples
-include the logarithmic transformation, calculation of relative
-abundances (percentages), and compositionality-aware transformations
-such as the centered log-ratio transformation (clr).
 
-In mia package, transformations are applied to abundance data. The transformed 
-abundance table is stored back to 'assays'. mia includes transformation 
-function ('transformAssay()') which applies sample-wise or column-wise 
-transformation when MARGIN = 'samples', feature-wise or row-wise transformation 
-when MARGIN = 'features'.
+## Data transformation {#sec-assay-transform}
 
-For a complete list of available transformations and parameters, see function 
+Data transformations are common in (microbial) ecology [@Legendre2001]
+and used to improve compatibility with assumptions related to specific
+statistical methods, mitigate biases, enhance the comparability of
+samples or features, or to obtain more interpretable values.
+
+Examples include the logarithmic transformation, calculation of
+relative abundances (percentages), and compositionality-aware
+transformations such as the centered log-ratio transformation (clr).
+
+Let us summarize some commonly used transformations in microbiome data
+science; further details and benchmarks are available in the
+references.
+
+ * 'relabundance' relative transformation; also known as total sum
+   scaling (TSS) and compositional transformation. This converts
+   counts into proportions (on the scale [0, 1]) that sum up
+   to 1. Much of the currently available taxonomic abundance data from
+   high-throughput assays (16S, metagenomic sequencing) is
+   compositional by nature, even if the data is provided as counts
+   [@Gloor2017].
+
+ * 'clr' Centered log ratio transformation [@Aitchison1986] is used to
+   reduce data skewness and compositionality bias in relative
+   abundances, while bringing the data to the logarithmic scale. This
+   transformation is frequently applied in microbial ecology
+   [@Gloor2017]. However, this transformation only applies to positive
+   values. The usual solution is to add a pseudocount, which adds another
+   type of bias in the data. The robust clr transformation ('rclr')
+   aims to circumvent the need to add a pseudocount. While the
+   resulting values from these transformations are difficult to interpret
+   directly, this transformation may enhance comparability of relative
+   differences between samples. It is part of a broader Aitchison
+   family of transformations; the additive log ratio transformation
+   ('alr') is also available. The robust clr ('rclr') is similar to
+   regular clr (see above) but allows data with zeroes and avoids the
+   need to add a pseudocount [@Keshavan2010; @Martino2019].
+
+ * 'pa' presence/absence transformation ignores abundances and only
+   indicates whether the given feature is detected above the given
+   threshold (default: 0). This simple transformation is relatively
+   widely used in ecological research. It has shown good performance
+   in microbiome-based classification [@Giliberti2022; @Karwowska2024].
+
+ * 'z' Z transformation scales data to zero mean and unit variance;
+   this is used to bring features (or samples) to more comparable
+   levels in terms of mean and scale of the values. This can enhance
+   visualization and interpretation of the data.
+
+ * 'log', 'log2', 'log10' Logarithmic transformations; used e.g. to
+   reduce data skewness; with compositional data the `clr` (or `rclr`)
+   transformation is often preferred.
+
+ * 'hellinger' Hellinger transformation equals the square root of
+   relative abundances. This ecological transformation can be useful
+   if we are interested in changes in relative abundances.
+
+ * 'rank' Rank transformation replaces each value by its rank. Also
+   see 'rrank' (relative rank transformation). This has use for
+   instance in non-parametric statistics.
+
+ * Other available transformations include Chi square ('chi.square'),
+   Frequency transformation ('frequency'), and Make margin sum of
+   squares equal to one ('normalize').
+
+
+### Transforming the data in practice
+
+Transformations on abundance assays can be performed with
+`mia::transformAssay()`, keeping both the original and the transformed
+assay(s). The transformed abundance assay is then stored back to the
+'assays' slot in the data object. The function applies sample-wise or
+column-wise transformation when MARGIN = 'samples', feature-wise or
+row-wise transformation when MARGIN = 'features'.
+
+A complete list of available transformations and parameters is
+available in the function
 [help](https://microbiome.github.io/mia/reference/transformAssay.html).
 
+
 ```{r}
-tse <- tseGlobalPatterns
+# Load example data
+library(mia)
+data("GlobalPatterns", package = "mia")
+tse <- GlobalPatterns
+
+# Transform "counts" assay to relative abundances ("relabundance"), with pseudocount 1 
 tse <- transformAssay(tse, assay.type = "counts", method = "relabundance", pseudocount = 1)
+
+# Transform relative abundance assay ("relabundance") to "clr", using pseudocount if necessary;
+# name the resulting assay "clr"
 tse <- transformAssay(x = tse, assay.type = "relabundance", method = "clr", 
-                      pseudocount = 1, name = "clr")
+                      pseudocount = TRUE, name = "clr")
+
+```
+
+
+Get the values in the resulting assay, and view some of the first
+entries of it with the `head` command.
 
+```{r}
 head(assay(tse, "clr"))
 ```
 
--   In 'pa' transformation, abundance table is converted to present/absent table.
+
+In the 'pa' transformation, the abundance table is converted to a
+presence/absence table that ignores abundances and only indicates
+whether the given feature is detected. This simple transformation is
+relatively widely used in ecological research. It has shown good
+performance in microbiome-based classification
+[@Giliberti2022; @Karwowska2024].
+
 
 ```{r}
+# Here, `assay.type` is not explicitly specified.
+# In that case, the function uses the "counts" assay for the transformation.
 tse <- transformAssay(tse, method = "pa")
-
 head(assay(tse, "pa"))
 ```
 
+You can now view the entire list of abundance assays in your data object with:
+
 ```{r}
-# list of abundance tables that assays slot contains
 assays(tse)
 ```
+
+
diff --git a/inst/pages/12_quality_control.qmd b/inst/pages/12_quality_control.qmd
@@ -186,7 +186,7 @@ top_phyla_mean <- getTopFeatures(altExp(tse,"Phylum"),
                              top=5L,
                              assay.type="counts")
 x <- unsplitByRanks(tse, ranks = taxonomyRanks(tse)[1:6])
-x <- addTaxonomyTree(x)
+x <- addHierarchyTree(x)
 ```
 
 After some preparation, the data is assembled and can be plotted with
@@ -250,7 +250,7 @@ histogram (left), or by sorting the samples by library size (right).
 ```{r plot-viz-lib-size-1, fig.width=8, fig.height=4, fig.cap="Library size distribution."}
 library(ggplot2)
 
-p1 <- ggplot(as.data.frame(colData(tse))) +
+p1 <- ggplot(colData(tse)) +
         geom_histogram(aes(x = sum), color = "black", fill = "gray", bins = 30) +
         labs(x = "Library size", y = "Frequency (n)") + 
         # scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), 
@@ -284,7 +284,6 @@ Library sizes other variables from `colData` can be
 visualized by using specified function called `plotColData`.
 
 ```{r plot-viz-lib-size-2, fig.width=8, fig.height=4, fig.cap="Library sizes per sample."}
-library(ggplot2)
 # Sort samples by read count, order the factor levels, and store back to tse as DataFrame
 # TODO: plotColData could include an option for sorting samples based on colData variables
 colData(tse) <- as.data.frame(colData(tse)) %>%