Skip to content

Commit

Permalink
Merge pull request #10 from enriquea/dev
Browse files Browse the repository at this point in the history
Allow to handle missingness and add some imputation methods
  • Loading branch information
enriquea authored Mar 28, 2021
2 parents bf44a28 + ecc06ce commit ec45940
Show file tree
Hide file tree
Showing 21 changed files with 843 additions and 75 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ Suggests:
testthat,
knitr,
rmarkdown
Depends: R (>= 2.10)
LazyData: TRUE
URL: http://github.com/enriquea/feseR
BugReports: http://github.com/enriquea/feseR/issues
VignetteBuilder: knitr
RoxygenNote: 6.0.1
RoxygenNote: 7.0.2
Roxygen: list(markdown = TRUE)
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ export(filter.corr)
export(filter.gain.inf)
export(filter.matrix.corr)
export(filter.pca)
export(filterMissingnessRate)
export(imputeMatrix)
export(plot_corr)
export(plot_pca)
export(rfeRF)
Expand Down
40 changes: 9 additions & 31 deletions R/fs_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ filter.pca <- function(features, center = TRUE, scale = TRUE, cum.var.cutoff = 1
}


## wrapper rfeRF

#' rfeRF
#'
Expand All @@ -203,37 +202,18 @@ filter.pca <- function(features, center = TRUE, scale = TRUE, cum.var.cutoff = 1
#' @param class Response variable as numeric vector. It will be coerced to factor.
#' @param number.cv Number of cross-validation folds (10 default). Used during training phase.
#' @param group.sizes A numeric vector of integers corresponding to the number of features that should be retained.
#' @param metric Metric to evaluate performance ('Accuracy' (default), 'Kappa' or 'ROC').
#' @param tolerance Allow tolerance for evaluation metric (Default zero).
#' @param verbose Make the output verbose.
#'
#' @return A list the elements. See \code{\link[caret]{rfe}} for more details.
#'
#'
#' @export


rfeRF.old <- function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,100,5))) {

# Matrix input validation
valid.matrix(mx = features)

if(!is.vector(class) & (nrow(features) != length(class))){
stop('Error processing input data...')
}

#### recursive feature elimination-random forest
rfProfile <- caret::rfe(x = features,
y = as.factor(class),
maximize = TRUE,
metric = 'Accuracy',
sizes = group.sizes,
rfeControl = caret::rfeControl(functions = caret::rfFuncs,
method = "cv",
number = number.cv,
verbose = FALSE))

return(rfProfile) # return RF profile
}

rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15, 100, 5)), metric = "Accuracy", verbose = TRUE, tolerance = 0) {
#'
rfeRF = function(features, class, number.cv = 10,
group.sizes = c(1:10, seq(15, 100, 5)),
metric = "Accuracy", verbose = TRUE, tolerance = 0) {

# Matrix input validation
valid.matrix(mx = features)
Expand Down Expand Up @@ -303,6 +283,7 @@ rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,
#' @param number.cv See \code{\link{rfeRF}} for description.
#' @param group.sizes See \code{\link{rfeRF}} for description.
#' @param metric Metric to evaluate performance ('Accuracy' (default), 'Kappa' or 'ROC').
#' @param tolerance Allow tolerance for evaluation metric (Default zero).
#' @param verbose Make the output verbose.
#'
#' @param extfolds Number of times (default 10) to repeat the entire FS process randomizing the dataset (test/training).
Expand All @@ -318,8 +299,7 @@ rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,
#'
#'
#' @export


#'
combineFS = function(features, class, univariate = "corr", mincorr = 0.3,
n.percent = 0.75, zero.gain.out = TRUE, multivariate = "mcorr",
maxcorr = 0.75, cum.var.cutoff = 1, wrapper = "rfe.rf",
Expand Down Expand Up @@ -496,5 +476,3 @@ rfFuncs2$rank <- function(object, x, y) {
}




134 changes: 127 additions & 7 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

## Useful functions for checking/comparing matrices
## Utils functions

#' valid.matrix
#'
Expand All @@ -8,8 +8,7 @@
#' @param mx A numeric matrix as input.
#'
#' @return A matrix.
#'

#'
valid.matrix <- function(mx) {
if(!is.matrix(mx)) {
stop('Expected a matrix as input...')
Expand All @@ -30,12 +29,11 @@ valid.matrix <- function(mx) {
#'
#' @param input.matrix A numeric matrix as input.
#' @param output.matrix A numeric matrix as input.
#' @param description A brief description to print out.
#' @param description Short description summarizing this action.
#'
#' @return A message object.
#'

compare.matrix <- function(input.matrix, output.matrix, description = 'Number of removed features:') {
compare.matrix <- function(input.matrix, output.matrix, description = 'Percent of removed features:') {

if(!is.matrix(input.matrix) & !is.matrix(output.matrix)) {
stop('Expected a matrix as input...')
Expand All @@ -47,6 +45,128 @@ compare.matrix <- function(input.matrix, output.matrix, description = 'Number of
if (ncol2 < 2){
stop('Removed all features from input matrix...')
} else {
message(paste(description, round((100 - ncol2/ncol1*100), 2), '%', sep = ' '))
pct_removed = round(((ncol1 - ncol2)/ncol1*100), 2)
message(paste0(
'Kept ', ncol2, ' features out of ', ncol1, '. ',
description, ' ', pct_removed, '%'))
}
}


#' computeMissingRate
#'
#' @param v Input vector
#'
#' @return Missing rate value
#'
computeMissingRate <- function(v){
v <- as.numeric(v)
mr <- sum(is.na(v)) / length(v)
return(round(mr, 2))
}



#' getMissingnessRateByFeatures
#'
#' Given an input matrix, compute the missingness rate for every feature (expected in cols)
#'
#' @param mx A numeric matrix as input.
#'
#' @return A dataframe with features and missingness rate.
#'
getMissingnessRateByFeatures <- function(mx){
if(length(colnames(mx)) == 0) stop('Column (feature) names are missing...')
missingness_rate <- apply(mx, 2, computeMissingRate)
df <- data.frame(features=names(missingness_rate),
missingness=missingness_rate)
return(df)
}


#' impute
#'
#' @param v Input numeric vector
#' @param method One of 'mean' of 'median'
#'
#' @return A vector with missing values imputed.
#'
impute <- function(v, method='mean'){
if(!method %in% c('mean', 'median')){
stop('Imputation method must be one of mean or median...')
}
v <- as.numeric(v)
if(method=='mean'){
v[is.na(v)] <- mean(v, na.rm = T)
}
if(method=='median'){
v[is.na(v)] <- median(v, na.rm = T)
}
return(v)
}


#' imputeMatrix
#'
#' Impute missing values with the mean or median across samples (rows)
#' for every feature (cols).
#'
#' @param mx A numeric matrix as input.
#' @param method Imputation method for missing values (mean or median).
#'
#' @return Imputed matrix
#' @export
#'
imputeMatrix <- function(mx, method='mean'){
if(!is.matrix(mx)) {
stop('Expected a matrix as input...')
}
if (length(rownames(mx)) == 0) {
stop('Row names are missing...')
}
if (length(colnames(mx)) == 0) {
stop('Column names are missing...')
}
rnames <- rownames(mx)
message(paste0("Imputing matrix with method: ", method))
mx.imputed <- apply(mx, 2, impute, method = method)
row.names(mx.imputed) <- rnames
return(mx.imputed)
}


#' filterMissingnessRate
#'
#' @param mx A numeric matrix as input.
#' @param max_missing_rate Maximal missing rate allow for a feature (default: 0.25).
#'
#' @return Filtered matrix with features (col) passing the max missing rate threshold removed.
#' @export
#'
filterMissingnessRate <- function(mx, max_missing_rate=0.25){
if(!is.matrix(mx)) {
stop('Expected a matrix as input...')
}
if (length(rownames(mx)) == 0) {
stop('Row names are missing...')
}
if (length(colnames(mx)) == 0) {
stop('Column names are missing...')
}

df = getMissingnessRateByFeatures(mx)

features_to_keep = subset(df, df[,'missingness'] <
max_missing_rate)[,'features']
message(
paste0("Found ", length(features_to_keep),
' features with max missing rate ', max_missing_rate))

# keep features passing the missing rate cutoff
mx.filtered = mx[,colnames(mx) %in% features_to_keep]

compare.matrix(input.matrix = mx,
output.matrix = mx.filtered)

return(mx.filtered)
}
9 changes: 5 additions & 4 deletions R/visualization.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,15 @@ plot_corr <- function(features, corr.method = 'pearson') {
#'
#' This function allows to plot different ggplot charts in the same grid.
#'
#' @param plotlists this param allow iterate thorugth a set of plots to
#' @param plotlist list of ggplot objects
#' @param file if the user wants to plot to a file, it can use the file paramterer
#' @param cols Number of columns to be use.
#' @param layout the layout to be use
#'
multiplot <- function(plotlist = NULL, file, cols = 1, layout = NULL) {

multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {

plots <- c(list(...), plotlist)
# plots <- c(list(...), plotlist)
plots <- plotlist
numPlots = length(plots)

if (is.null(layout)) {
Expand Down
Binary file modified data/GSE48760.rda
Binary file not shown.
Binary file modified data/GSE5325.rda
Binary file not shown.
Binary file modified data/TNBC.rda
Binary file not shown.
34 changes: 27 additions & 7 deletions man/combineFS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions man/compare.matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/computeMissingRate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit ec45940

Please sign in to comment.