Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow to handle missingness and add some imputation methods #10

Merged
merged 8 commits into from
Mar 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ Suggests:
testthat,
knitr,
rmarkdown
Depends: R (>= 2.10)
LazyData: TRUE
URL: http://github.com/enriquea/feseR
BugReports: http://github.com/enriquea/feseR/issues
VignetteBuilder: knitr
RoxygenNote: 6.0.1
RoxygenNote: 7.0.2
Roxygen: list(markdown = TRUE)
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ export(filter.corr)
export(filter.gain.inf)
export(filter.matrix.corr)
export(filter.pca)
export(filterMissingnessRate)
export(imputeMatrix)
export(plot_corr)
export(plot_pca)
export(rfeRF)
Expand Down
40 changes: 9 additions & 31 deletions R/fs_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ filter.pca <- function(features, center = TRUE, scale = TRUE, cum.var.cutoff = 1
}


## wrapper rfeRF

#' rfeRF
#'
Expand All @@ -203,37 +202,18 @@ filter.pca <- function(features, center = TRUE, scale = TRUE, cum.var.cutoff = 1
#' @param class Response variable as numeric vector. It will be coerced to factor.
#' @param number.cv Number of cross-validation folds (10 default). Used during training phase.
#' @param group.sizes A numeric vector of integers corresponding to the number of features that should be retained.
#' @param metric Metric to evaluate performance ('Accuracy' (default), 'Kappa' or 'ROC').
#' @param tolerance Allow tolerance for evaluation metric (Default zero).
#' @param verbose Make the output verbose.
#'
#' @return A list the elements. See \code{\link[caret]{rfe}} for more details.
#'
#'
#' @export


rfeRF.old <- function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,100,5))) {

# Matrix input validation
valid.matrix(mx = features)

if(!is.vector(class) & (nrow(features) != length(class))){
stop('Error processing input data...')
}

#### recursive feature elimination-random forest
rfProfile <- caret::rfe(x = features,
y = as.factor(class),
maximize = TRUE,
metric = 'Accuracy',
sizes = group.sizes,
rfeControl = caret::rfeControl(functions = caret::rfFuncs,
method = "cv",
number = number.cv,
verbose = FALSE))

return(rfProfile) # return RF profile
}

rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15, 100, 5)), metric = "Accuracy", verbose = TRUE, tolerance = 0) {
#'
rfeRF = function(features, class, number.cv = 10,
group.sizes = c(1:10, seq(15, 100, 5)),
metric = "Accuracy", verbose = TRUE, tolerance = 0) {

# Matrix input validation
valid.matrix(mx = features)
Expand Down Expand Up @@ -303,6 +283,7 @@ rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,
#' @param number.cv See \code{\link{rfeRF}} for description.
#' @param group.sizes See \code{\link{rfeRF}} for description.
#' @param metric Metric to evaluate performance ('Accuracy' (default), 'Kappa' or 'ROC').
#' @param tolerance Allow tolerance for evaluation metric (Default zero).
#' @param verbose Make the output verbose.
#'
#' @param extfolds Number of times (default 10) to repeat the entire FS process randomizing the dataset (test/training).
Expand All @@ -318,8 +299,7 @@ rfeRF = function(features, class, number.cv = 10, group.sizes = c(1:10, seq(15,
#'
#'
#' @export


#'
combineFS = function(features, class, univariate = "corr", mincorr = 0.3,
n.percent = 0.75, zero.gain.out = TRUE, multivariate = "mcorr",
maxcorr = 0.75, cum.var.cutoff = 1, wrapper = "rfe.rf",
Expand Down Expand Up @@ -496,5 +476,3 @@ rfFuncs2$rank <- function(object, x, y) {
}




134 changes: 127 additions & 7 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

## Useful functions for checking/comparing matrices
## Utils functions

#' valid.matrix
#'
Expand All @@ -8,8 +8,7 @@
#' @param mx A numeric matrix as input.
#'
#' @return A matrix.
#'

#'
valid.matrix <- function(mx) {
if(!is.matrix(mx)) {
stop('Expected a matrix as input...')
Expand All @@ -30,12 +29,11 @@ valid.matrix <- function(mx) {
#'
#' @param input.matrix A numeric matrix as input.
#' @param output.matrix A numeric matrix as input.
#' @param description A brief description to print out.
#' @param description Short description summarizing this action.
#'
#' @return A message object.
#'

compare.matrix <- function(input.matrix, output.matrix, description = 'Number of removed features:') {
compare.matrix <- function(input.matrix, output.matrix, description = 'Percent of removed features:') {

if(!is.matrix(input.matrix) & !is.matrix(output.matrix)) {
stop('Expected a matrix as input...')
Expand All @@ -47,6 +45,128 @@ compare.matrix <- function(input.matrix, output.matrix, description = 'Number of
if (ncol2 < 2){
stop('Removed all features from input matrix...')
} else {
message(paste(description, round((100 - ncol2/ncol1*100), 2), '%', sep = ' '))
pct_removed = round(((ncol1 - ncol2)/ncol1*100), 2)
message(paste0(
'Kept ', ncol2, ' features out of ', ncol1, '. ',
description, ' ', pct_removed, '%'))
}
}


#' computeMissingRate
#'
#' @param v Input vector
#'
#' @return Missing rate value
#'
computeMissingRate <- function(v){
v <- as.numeric(v)
mr <- sum(is.na(v)) / length(v)
return(round(mr, 2))
}



#' getMissingnessRateByFeatures
#'
#' Given an input matrix, compute the missingness rate for every feature (expected in cols)
#'
#' @param mx A numeric matrix as input.
#'
#' @return A dataframe with features and missingness rate.
#'
getMissingnessRateByFeatures <- function(mx){
if(length(colnames(mx)) == 0) stop('Column (feature) names are missing...')
missingness_rate <- apply(mx, 2, computeMissingRate)
df <- data.frame(features=names(missingness_rate),
missingness=missingness_rate)
return(df)
}


#' impute
#'
#' @param v Input numeric vector
#' @param method One of 'mean' of 'median'
#'
#' @return A vector with missing values imputed.
#'
impute <- function(v, method='mean'){
if(!method %in% c('mean', 'median')){
stop('Imputation method must be one of mean or median...')
}
v <- as.numeric(v)
if(method=='mean'){
v[is.na(v)] <- mean(v, na.rm = T)
}
if(method=='median'){
v[is.na(v)] <- median(v, na.rm = T)
}
return(v)
}


#' imputeMatrix
#'
#' Impute missing values with the mean or median across samples (rows)
#' for every feature (cols).
#'
#' @param mx A numeric matrix as input.
#' @param method Imputation method for missing values (mean or median).
#'
#' @return Imputed matrix
#' @export
#'
imputeMatrix <- function(mx, method='mean'){
if(!is.matrix(mx)) {
stop('Expected a matrix as input...')
}
if (length(rownames(mx)) == 0) {
stop('Row names are missing...')
}
if (length(colnames(mx)) == 0) {
stop('Column names are missing...')
}
rnames <- rownames(mx)
message(paste0("Imputing matrix with method: ", method))
mx.imputed <- apply(mx, 2, impute, method = method)
row.names(mx.imputed) <- rnames
return(mx.imputed)
}


#' filterMissingnessRate
#'
#' @param mx A numeric matrix as input.
#' @param max_missing_rate Maximal missing rate allow for a feature (default: 0.25).
#'
#' @return Filtered matrix with features (col) passing the max missing rate threshold removed.
#' @export
#'
filterMissingnessRate <- function(mx, max_missing_rate=0.25){
if(!is.matrix(mx)) {
stop('Expected a matrix as input...')
}
if (length(rownames(mx)) == 0) {
stop('Row names are missing...')
}
if (length(colnames(mx)) == 0) {
stop('Column names are missing...')
}

df = getMissingnessRateByFeatures(mx)

features_to_keep = subset(df, df[,'missingness'] <
max_missing_rate)[,'features']
message(
paste0("Found ", length(features_to_keep),
' features with max missing rate ', max_missing_rate))

# keep features passing the missing rate cutoff
mx.filtered = mx[,colnames(mx) %in% features_to_keep]

compare.matrix(input.matrix = mx,
output.matrix = mx.filtered)

return(mx.filtered)
}
9 changes: 5 additions & 4 deletions R/visualization.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,15 @@ plot_corr <- function(features, corr.method = 'pearson') {
#'
#' This function allows to plot different ggplot charts in the same grid.
#'
#' @param plotlists this param allow iterate thorugth a set of plots to
#' @param plotlist list of ggplot objects
#' @param file if the user wants to plot to a file, it can use the file paramterer
#' @param cols Number of columns to be use.
#' @param layout the layout to be use
#'
multiplot <- function(plotlist = NULL, file, cols = 1, layout = NULL) {

multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {

plots <- c(list(...), plotlist)
# plots <- c(list(...), plotlist)
plots <- plotlist
numPlots = length(plots)

if (is.null(layout)) {
Expand Down
Binary file modified data/GSE48760.rda
Binary file not shown.
Binary file modified data/GSE5325.rda
Binary file not shown.
Binary file modified data/TNBC.rda
Binary file not shown.
34 changes: 27 additions & 7 deletions man/combineFS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions man/compare.matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/computeMissingRate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading