Merge pull request #10 from enriquea/dev

Allow to handle missingness and add some imputation methods
enriquea · Mar 28, 2021 · ec45940 · ec45940
2 parents bf44a28 + ecc06ce
commit ec45940
Show file tree

Hide file tree

Showing 21 changed files with 843 additions and 75 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,9 +26,10 @@ Suggests:
     testthat,
     knitr,
     rmarkdown
+Depends: R (>= 2.10)
 LazyData: TRUE
 URL: http://github.com/enriquea/feseR
 BugReports: http://github.com/enriquea/feseR/issues
 VignetteBuilder: knitr
-RoxygenNote: 6.0.1
+RoxygenNote: 7.0.2
 Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,8 @@ export(filter.corr)
 export(filter.gain.inf)
 export(filter.matrix.corr)
 export(filter.pca)
+export(filterMissingnessRate)
+export(imputeMatrix)
 export(plot_corr)
 export(plot_pca)
 export(rfeRF)

diff --git a/R/fs_functions.R b/R/fs_functions.R
@@ -193,7 +193,6 @@ filter.pca <- function(features, center = TRUE, scale = TRUE, cum.var.cutoff = 1
 }
 
 
-## wrapper rfeRF
 
 #' rfeRF
 #'
@@ -203,37 +202,18 @@ filter.pca <- function(features, center = TRUE, scale = TRUE, cum.var.cutoff = 1
 #' @param class Response variable as numeric vector. It will be coerced to factor.
 #' @param number.cv Number of cross-validation folds (10 default). Used during training phase.
 #' @param group.sizes  A numeric vector of integers corresponding to the number of features that should be retained.
+#' @param metric Metric to evaluate performance ('Accuracy' (default), 'Kappa' or 'ROC').
+#' @param tolerance Allow tolerance for evaluation metric (Default zero).
+#' @param verbose Make the output verbose. 
 #'
 #' @return A list the elements. See \code{\link[caret]{rfe}} for more details.
 #'
 #'
 #' @export
-
-
-rfeRF.old <- function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,100,5))) {
-
-  # Matrix input validation
-  valid.matrix(mx = features)
-
-  if(!is.vector(class) & (nrow(features) != length(class))){
-    stop('Error processing input data...')
-  }
-
-  #### recursive feature elimination-random forest
-  rfProfile <- caret::rfe(x = features,
-                          y = as.factor(class),
-                          maximize = TRUE,
-                          metric = 'Accuracy',
-                          sizes = group.sizes,
-                          rfeControl = caret::rfeControl(functions = caret::rfFuncs,
-                                                         method = "cv",
-                                                         number = number.cv,
-                                                         verbose = FALSE))
-
-  return(rfProfile) # return RF profile
-}
-
-rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15, 100, 5)), metric = "Accuracy", verbose = TRUE, tolerance = 0) {
+#' 
+rfeRF = function(features, class, number.cv = 10, 
+                 group.sizes = c(1:10, seq(15, 100, 5)), 
+                 metric = "Accuracy", verbose = TRUE, tolerance = 0) {
 
     # Matrix input validation
     valid.matrix(mx = features)
@@ -303,6 +283,7 @@ rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,
 #' @param number.cv See \code{\link{rfeRF}} for description.
 #' @param group.sizes See \code{\link{rfeRF}} for description.
 #' @param metric Metric to evaluate performance ('Accuracy' (default), 'Kappa' or 'ROC').
+#' @param tolerance Allow tolerance for evaluation metric (Default zero).
 #' @param verbose Make the output verbose. 
 #'
 #' @param extfolds Number of times (default 10) to repeat the entire FS process randomizing the dataset (test/training).
@@ -318,8 +299,7 @@ rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,
 #'
 #'
 #' @export
-
-
+#' 
 combineFS = function(features, class, univariate = "corr", mincorr = 0.3, 
                      n.percent = 0.75, zero.gain.out = TRUE, multivariate = "mcorr", 
                      maxcorr = 0.75, cum.var.cutoff = 1, wrapper = "rfe.rf", 
@@ -496,5 +476,3 @@ rfFuncs2$rank <- function(object, x, y) {
 }
 
 
-
-
diff --git a/R/utils.R b/R/utils.R
@@ -1,5 +1,5 @@
 
-## Useful functions for checking/comparing matrices
+## Utils functions
 
 #' valid.matrix
 #'
@@ -8,8 +8,7 @@
 #' @param mx A numeric matrix as input.
 #'
 #' @return A matrix.
-#'
-
+#' 
 valid.matrix <- function(mx) {
   if(!is.matrix(mx)) {
     stop('Expected a matrix as input...')
@@ -30,12 +29,11 @@ valid.matrix <- function(mx) {
 #'
 #' @param input.matrix A numeric matrix as input.
 #' @param output.matrix A numeric matrix as input.
-#' @param description A brief description to print out.
+#' @param description Short description summarizing this action.
 #'
 #' @return A message object.
 #'
-
-compare.matrix <- function(input.matrix, output.matrix, description = 'Number of removed features:') {
+compare.matrix <- function(input.matrix, output.matrix, description = 'Percent of removed features:') {
 
   if(!is.matrix(input.matrix) & !is.matrix(output.matrix)) {
     stop('Expected a matrix as input...')
@@ -47,6 +45,128 @@ compare.matrix <- function(input.matrix, output.matrix, description = 'Number of
   if (ncol2 < 2){
     stop('Removed all features from input matrix...')
   } else {
-    message(paste(description, round((100 - ncol2/ncol1*100), 2), '%', sep = ' '))
+    pct_removed = round(((ncol1 - ncol2)/ncol1*100), 2)
+    message(paste0(
+      'Kept ', ncol2, ' features out of ', ncol1, '. ',
+       description, ' ', pct_removed, '%'))
   }
 }
+
+
+#' computeMissingRate
+#'
+#' @param v Input vector
+#'
+#' @return Missing rate value
+#'
+computeMissingRate <- function(v){
+   v <- as.numeric(v)
+   mr <- sum(is.na(v)) / length(v)
+   return(round(mr, 2))
+}
+
+
+
+#' getMissingnessRateByFeatures
+#'
+#' Given an input matrix, compute the missingness rate for every feature (expected in cols)
+#'
+#' @param mx A numeric matrix as input.
+#'
+#' @return A dataframe with features and missingness rate.
+#'
+getMissingnessRateByFeatures <- function(mx){
+  if(length(colnames(mx)) == 0) stop('Column (feature) names are missing...')
+  missingness_rate <- apply(mx, 2, computeMissingRate)
+  df <- data.frame(features=names(missingness_rate),
+                   missingness=missingness_rate)
+  return(df)
+}
+
+
+#' impute
+#'
+#' @param v Input numeric vector
+#' @param method One of 'mean' of 'median'
+#'
+#' @return A vector with missing values imputed.
+#'
+impute <- function(v, method='mean'){
+   if(!method %in% c('mean', 'median')){
+     stop('Imputation method must be one of mean or median...')
+     }
+   v <- as.numeric(v)
+   if(method=='mean'){
+     v[is.na(v)] <- mean(v, na.rm = T)
+   } 
+   if(method=='median'){
+     v[is.na(v)] <- median(v, na.rm = T)
+   }
+   return(v)
+}
+
+
+#' imputeMatrix
+#' 
+#' Impute missing values with the mean or median across samples (rows)
+#' for every feature (cols).
+#'
+#' @param mx  A numeric matrix as input.
+#' @param method Imputation method for missing values (mean or median).
+#'
+#' @return Imputed matrix
+#' @export
+#'
+imputeMatrix <- function(mx, method='mean'){
+    if(!is.matrix(mx)) {
+      stop('Expected a matrix as input...')
+    }
+    if (length(rownames(mx)) == 0) {
+      stop('Row names are missing...')
+    }
+    if (length(colnames(mx)) == 0) {
+    stop('Column names are missing...')
+    }
+    rnames <- rownames(mx)
+    message(paste0("Imputing matrix with method: ", method))
+    mx.imputed <- apply(mx, 2, impute, method = method)
+    row.names(mx.imputed) <- rnames
+   return(mx.imputed)
+}
+
+
+#' filterMissingnessRate
+#'
+#' @param mx A numeric matrix as input.
+#' @param max_missing_rate Maximal missing rate allow for a feature (default: 0.25).
+#'
+#' @return Filtered matrix with features (col) passing the max missing rate threshold removed.
+#' @export
+#'
+filterMissingnessRate <- function(mx, max_missing_rate=0.25){
+  if(!is.matrix(mx)) {
+    stop('Expected a matrix as input...')
+  }
+  if (length(rownames(mx)) == 0) {
+    stop('Row names are missing...')
+  } 
+  if (length(colnames(mx)) == 0) {
+    stop('Column names are missing...')
+  } 
+
+  df = getMissingnessRateByFeatures(mx)
+
+  features_to_keep = subset(df, df[,'missingness'] < 
+                              max_missing_rate)[,'features']
+  message(
+    paste0("Found ", length(features_to_keep),
+           ' features with max missing rate ', max_missing_rate))
+
+  # keep features passing the missing rate cutoff
+  mx.filtered = mx[,colnames(mx) %in% features_to_keep]
+
+  compare.matrix(input.matrix = mx,
+                 output.matrix = mx.filtered)
+
+  return(mx.filtered)
+}
diff --git a/R/visualization.R b/R/visualization.R
@@ -122,14 +122,15 @@ plot_corr <- function(features, corr.method = 'pearson') {
 #'
 #' This function allows to plot different ggplot charts in the same grid.
 #'
-#' @param plotlists this param allow iterate thorugth a set of plots to
+#' @param plotlist list of ggplot objects
 #' @param file if the user wants to plot to a file, it can use the file paramterer
 #' @param cols Number of columns to be use.
 #' @param layout the layout to be use
+#' 
+multiplot <- function(plotlist = NULL, file, cols = 1, layout = NULL) {
 
-multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
-
-  plots <- c(list(...), plotlist)
+  # plots <- c(list(...), plotlist)
+  plots <- plotlist
   numPlots = length(plots)
 
   if (is.null(layout)) {

diff --git a/data/GSE48760.rda b/data/GSE48760.rda
diff --git a/data/GSE5325.rda b/data/GSE5325.rda
diff --git a/data/TNBC.rda b/data/TNBC.rda
diff --git a/man/combineFS.Rd b/man/combineFS.Rd
diff --git a/man/compare.matrix.Rd b/man/compare.matrix.Rd
diff --git a/man/computeMissingRate.Rd b/man/computeMissingRate.Rd