diff --git a/DESCRIPTION b/DESCRIPTION index 060556b3..63651dd0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,20 +2,19 @@ Package: ggRandomForests Type: Package Title: Visually Exploring Random Forests Version: 2.0.1 -Date: 2016-08-10 +Date: 2016-09-07 Author: John Ehrlinger Maintainer: John Ehrlinger License: GPL (>=3) -VignetteBuilder: knitr URL: https://github.com/ehrlinger/ggRandomForests BugReports: https://github.com/ehrlinger/ggRandomForests/issues Description: Graphic elements for exploring Random Forests using the 'randomForest' or 'randomForestSRC' package for survival, regression and classification forests and 'ggplot2' package plotting. Depends: - R (>= 3.1.0) + R (>= 3.1.0), + randomForestSRC (>= 1.5.5) Imports: - randomForestSRC (>= 1.5.5), randomForest, ggplot2, survival, diff --git a/R/cache_rfsrc_datasets.R b/R/cache_rfsrc_datasets.R index 66136959..f849f569 100644 --- a/R/cache_rfsrc_datasets.R +++ b/R/cache_rfsrc_datasets.R @@ -17,21 +17,21 @@ #' deal with thoses changes. We make the function available to end users to #' create objects for further experimentation. #' -#' There are five cached data set types: -#' '\itemize{ -#' \item \code{\link{rfsrc_data}} - \code{\link[randomForestSRC]{rfsrc}} objects. -#' \item \code{\link{varsel_data}} - \code{\link[randomForestSRC]{var.select}} -#' minimal depth variable selection objects. -#' \item \code{\link{interaction_data}} - -#' \code{\link[randomForestSRC]{find.interaction}} minimal depth, -#' pairwise variable interaction matrices. -#' \item \code{\link{partial_data}} - \code{\link[randomForestSRC]{plot.variable}} -#' objects -#' (\code{partial=TRUE}) for partial variable dependence. -#' \item \code{\link{partial_coplot_data}} - -#' \code{\link[randomForestSRC]{plot.variable}} objects -#' (\code{partial=TRUE}) for partial variable dependence. -#' } +# There are five cached data set types: +# '\itemize{ +# \item \code{\link{rfsrc_data}} - \code{\link[randomForestSRC]{rfsrc}} objects. 
+# \item \code{\link{varsel_data}} - \code{\link[randomForestSRC]{var.select}} +# minimal depth variable selection objects. +# \item \code{\link{interaction_data}} - +# \code{\link[randomForestSRC]{find.interaction}} minimal depth, +# pairwise variable interaction matrices. +# \item \code{\link{partial_data}} - \code{\link[randomForestSRC]{plot.variable}} +# objects +# (\code{partial=TRUE}) for partial variable dependence. +# \item \code{\link{partial_coplot_data}} - +# \code{\link[randomForestSRC]{plot.variable}} objects +# (\code{partial=TRUE}) for partial variable dependence. +# } #' #' For the following data sets: #' #'\itemize{ @@ -46,11 +46,11 @@ #' @seealso \code{iris} \code{airq} \code{mtcars} \code{\link[MASS]{Boston}} #' \code{\link[randomForestSRC]{pbc}} #' \code{\link[randomForestSRC]{veteran}} -#' \code{\link{rfsrc_data}} -#' \code{\link{varsel_data}} -#' \code{\link{interaction_data}} -#' \code{\link{partial_data}} -#' \code{\link{partial_coplot_data}} +# \code{\link{rfsrc_data}} +# \code{\link{varsel_data}} +# \code{\link{interaction_data}} +# \code{\link{partial_data}} +# \code{\link{partial_coplot_data}} #' #' @importFrom randomForestSRC rfsrc var.select plot.variable find.interaction #' @importFrom utils data diff --git a/R/calc_roc.R b/R/calc_roc.R index 8b86e5f5..54659c1a 100644 --- a/R/calc_roc.R +++ b/R/calc_roc.R @@ -43,8 +43,8 @@ #' #' @examples #' ## Taken from the gg_roc example -#' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) -#' data(rfsrc_iris) +#' rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#' #data(rfsrc_iris) #' gg_dta <- calc_roc.rfsrc(rfsrc_iris, rfsrc_iris$yvar, which.outcome=1, oob=TRUE) #' gg_dta <- calc_roc.rfsrc(rfsrc_iris, rfsrc_iris$yvar, which.outcome=1, oob=FALSE) #' @export @@ -130,8 +130,8 @@ calc_roc <- calc_roc.rfsrc #' @examples #' ## #' ## Taken from the gg_roc example -#' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) -#' data(rfsrc_iris) +#' rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#' #data(rfsrc_iris) 
#' #' \dontrun{ #' gg_dta <- gg_roc(rfsrc_iris, which.outcome=1) diff --git a/R/combine.gg_partial.R b/R/combine.gg_partial.R index 3d993f8d..fbc2d94c 100644 --- a/R/combine.gg_partial.R +++ b/R/combine.gg_partial.R @@ -28,6 +28,7 @@ #' @importFrom parallel mclapply #' #' @examples +#' \dontrun{ #' # Load a set of plot.variable partial plot data #' data(partial_pbc) #' @@ -61,7 +62,7 @@ #' ggpart[[ind]] <- NULL #' } #' plot(ggpart, panel=TRUE) -#' +#' } #' #' @export combine.gg_partial <- function(x, y, lbls, ...){ diff --git a/R/gg_error.R b/R/gg_error.R index 3a449bbc..f5f68dc9 100644 --- a/R/gg_error.R +++ b/R/gg_error.R @@ -50,9 +50,9 @@ #' ## ------------------------------------------------------------ #' ## ------------- iris data #' ## You can build a randomForest -#' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#' rfsrc_iris <- rfsrc(Species ~ ., data = iris) #' # ... or load a cached randomForestSRC object -#' data(rfsrc_iris, package="ggRandomForests") +#' # data(rfsrc_iris, package="ggRandomForests") #' #' # Get a data.frame containing error rates #' gg_dta<- gg_error(rfsrc_iris) @@ -73,7 +73,7 @@ #' # Plot the gg_error object #' plot(gg_dta) #' } -#' +#'\dontrun{ #' ## ------------- Boston data #' data(rfsrc_Boston, package="ggRandomForests") #' @@ -82,7 +82,7 @@ #' #' # Plot the gg_error object #' plot(gg_dta) -#' +#'} #' \dontrun{ #' ## ------------- mtcars data #' @@ -105,14 +105,14 @@ #' gg_dta <- gg_error(rfsrc_veteran) #' plot(gg_dta) #' } -#' +#'\dontrun{ #' ## ------------- pbc data #' # Load a cached randomForestSRC object #' data(rfsrc_pbc, package="ggRandomForests") #' #' gg_dta <- gg_error(rfsrc_pbc) #' plot(gg_dta) -#' +#'} #' @export gg_error gg_error.rfsrc gg_error.randomForest gg_error.randomForest.formula gg_error <- function (object, ...) 
{ UseMethod("gg_error", object) diff --git a/R/gg_interaction.R b/R/gg_interaction.R index e4f13227..02c1a7ff 100644 --- a/R/gg_interaction.R +++ b/R/gg_interaction.R @@ -56,6 +56,7 @@ #' ## ------------------------------------------------------------ #' ## find interactions, classification setting #' ## ------------------------------------------------------------ +#' \dontrun{ #' ## -------- iris data #' ## iris.obj <- rfsrc(Species ~., data = iris) #' ## TODO: VIMP interactions not handled yet.... @@ -66,7 +67,7 @@ #' #' plot(gg_dta, xvar="Petal.Width") #' plot(gg_dta, panel=TRUE) -#' +#' } #' ## ------------------------------------------------------------ #' ## find interactions, regression setting #' ## ------------------------------------------------------------ @@ -85,13 +86,13 @@ #' #' plot(gg_dta, panel=TRUE) #' } -#' +#' \dontrun{ #' ## -------- Boston data #' data(interaction_Boston, package="ggRandomForests") #' gg_dta <- gg_interaction(interaction_Boston) #' #' plot(gg_dta, panel=TRUE) -#' +#' } #' \dontrun{ #' ## -------- mtcars data #' data(interaction_mtcars, package="ggRandomForests") @@ -107,12 +108,13 @@ #' ## data(pbc, package = "randomForestSRC") #' ## pbc.obj <- rfsrc(Surv(days,status) ~ ., pbc, nsplit = 10) #' ## interaction_pbc <- randomForestSRC::find.interaction(pbc.obj, nvar = 8) +#' \dontrun{ #' data(interaction_pbc, package="ggRandomForests") #' gg_dta <- gg_interaction(interaction_pbc) #' #' plot(gg_dta, xvar="bili") #' plot(gg_dta, panel=TRUE) -#' +#' } #' \dontrun{ #' ## -------- veteran data #' data(interaction_veteran, package="ggRandomForests") diff --git a/R/gg_minimal_depth.R b/R/gg_minimal_depth.R index 24a8c0d8..0a430a80 100644 --- a/R/gg_minimal_depth.R +++ b/R/gg_minimal_depth.R @@ -42,7 +42,8 @@ #' ## ------------------------------------------------------------ #' ## classification example #' ## ------------------------------------------------------------ -#' ## -------- iris data +#'\dontrun{ +#' ## -------- iris data #' ## 
You can build a randomForest #' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) #' # varsel_iris <- randomForestSRC::var.select(rfsrc_iris) @@ -54,7 +55,7 @@ #' #' # Plot the gg_minimal_depth object #' plot(gg_dta) -#' +#' } #' ## ------------------------------------------------------------ #' ## Regression example #' ## ------------------------------------------------------------ @@ -71,13 +72,13 @@ #' # Plot the gg_minimal_depth object #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- Boston data #' data(varsel_Boston, package="ggRandomForests") #' #' # Get a data.frame containing error rates #' plot(gg_minimal_depth(varsel_Boston)) -#' +#' } #' \dontrun{ #' ## -------- mtcars data #' data(varsel_mtcars, package="ggRandomForests") @@ -102,13 +103,13 @@ #' gg_dta <- gg_minimal_depth(varsel_veteran) #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- pbc data #' data(varsel_pbc, package="ggRandomForests") #' #' gg_dta <- gg_minimal_depth(varsel_pbc) #' plot(gg_dta) -#' +#' } #' @aliases gg_minimal_depth gg_minimal_depth.randomForest gg_minimal_depth.rfsrc #' #' @export diff --git a/R/gg_minimal_vimp.R b/R/gg_minimal_vimp.R index bec322dd..cacf0bf0 100644 --- a/R/gg_minimal_vimp.R +++ b/R/gg_minimal_vimp.R @@ -35,6 +35,7 @@ #' ## ------------------------------------------------------------ #' ## classification example #' ## ------------------------------------------------------------ +#' \dontrun{ #' ## -------- iris data #' ## You can build a randomForest #' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) @@ -47,7 +48,7 @@ #' #' # Plot the gg_minimal_depth object #' plot(gg_dta) -#' +#' } #' ## ------------------------------------------------------------ #' ## Regression example #' ## ------------------------------------------------------------ @@ -64,7 +65,7 @@ #' # Plot the gg_minimal_vimp object #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- Boston data #' data(varsel_Boston, package="ggRandomForests") #' @@ -73,7 +74,7 @@ #' #' # Plot the 
gg_minimal_vimp object #' plot(gg_dta) -#' +#' } #' \dontrun{ #' ## -------- mtcars data #' data(varsel_mtcars, package="ggRandomForests") @@ -99,12 +100,13 @@ #' gg_dta <- gg_minimal_vimp(varsel_veteran) #' plot(gg_dta) #' } +#' \dontrun{ #' ## -------- pbc data #' data(varsel_pbc, package="ggRandomForests") #' #' gg_dta <- gg_minimal_vimp(varsel_pbc) #' plot(gg_dta) -#' +#' } #' @aliases gg_minimal_vimp gg_minimal_vimp.randomForest gg_minimal_vimp.rfsrc #' @export gg_minimal_vimp <- function (object, ...) { diff --git a/R/gg_partial.R b/R/gg_partial.R index fe02aad5..1d1de3f4 100644 --- a/R/gg_partial.R +++ b/R/gg_partial.R @@ -50,7 +50,7 @@ #' ## classification #' ## ------------------------------------------------------------ #' ## -------- iris data -#' +#' \dontrun{ #' ## iris "Petal.Width" partial dependence plot #' ## #' # rfsrc_iris <- rfsrc(Species ~., data = iris) @@ -60,7 +60,7 @@ #' #' gg_dta <- gg_partial(partial_iris) #' plot(gg_dta) -#' +#' } #' ## ------------------------------------------------------------ #' ## regression #' ## ------------------------------------------------------------ @@ -82,13 +82,13 @@ #' gg_dta[["Month"]] <- NULL #' plot(gg_dta, panel=TRUE) #' } -#' +#' \dontrun{ #' ## -------- Boston data #' data(partial_Boston, package="ggRandomForests") #' #' gg_dta <- gg_partial(partial_Boston) #' plot(gg_dta, panel=TRUE) -#' +#' } #' \dontrun{ #' ## -------- mtcars data #' data(partial_mtcars, package="ggRandomForests") @@ -145,6 +145,7 @@ #' gg_dta.cat[["karno"]] <- gg_dta.cat[["diagtime"]] <- gg_dta.cat[["age"]] <- NULL #' plot(gg_dta.cat, panel=TRUE, notch=TRUE) #' } +#' \dontrun{ #' ## -------- pbc data #' data("partial_pbc", package = "ggRandomForests") #' data("varsel_pbc", package = "ggRandomForests") @@ -173,7 +174,7 @@ #' #' #plot(pbc_ggpart[["edema"]], panel=TRUE) #, #' # notch = TRUE, alpha = .3, outlier.shape = NA) -#' +#' } #' @aliases gg_partial gg_partial_list gg_partial.rfsrc gg_partial.randomForest #' @name gg_partial 
#' @name gg_partial_list diff --git a/R/gg_partial_coplot.R b/R/gg_partial_coplot.R index 646efe15..fe1712d1 100644 --- a/R/gg_partial_coplot.R +++ b/R/gg_partial_coplot.R @@ -17,6 +17,7 @@ #' @importFrom parallel mclapply #' #' @examples +#' \dontrun{ #' # Load the forest #' data(rfsrc_pbc, package="ggRandomForests") #' @@ -28,7 +29,7 @@ #' #' # Create the conditional groups and add to the gg_variable object #' copper_grp <- cut(ggvar$copper, breaks = copper_cts) -#' +#' } #' \dontrun{ #' ## We would run this, but it's expensive #' partial_coplot_pbc <- gg_partial_coplot(rfsrc_pbc, xvar = "bili", @@ -37,12 +38,13 @@ #' time = 1, #' show.plots = FALSE) #' } +#' \dontrun{ #' ## so load the cached set #' data(partial_coplot_pbc, package="ggRandomForests") #' #' # Partial coplot #' plot(partial_coplot_pbc) #, se = FALSE) -#' +#' } #' #' @export gg_partial_coplot.rfsrc <- function(object, diff --git a/R/gg_rfsrc.R b/R/gg_rfsrc.R index 07c4e3dc..449dcf22 100644 --- a/R/gg_rfsrc.R +++ b/R/gg_rfsrc.R @@ -44,8 +44,8 @@ #' ## classification example #' ## ------------------------------------------------------------ #' ## -------- iris data -#' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) -#' data(rfsrc_iris, package="ggRandomForests") +#' rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#' #data(rfsrc_iris, package="ggRandomForests") #' gg_dta<- gg_rfsrc(rfsrc_iris) #' #' plot(gg_dta) @@ -61,11 +61,11 @@ #' #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- Boston data #' data(rfsrc_Boston, package="ggRandomForests") #' plot(rfsrc_Boston) -#' +#' } #' \dontrun{ #' ## -------- mtcars data #' data(rfsrc_mtcars, package="ggRandomForests") @@ -91,11 +91,11 @@ #' gg_dta <- gg_rfsrc(rfsrc_veteran, by="trt") #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- pbc data #' ## We don't run this because of bootstrap confidence limits #' data(rfsrc_pbc, package = "ggRandomForests") -#' +#' } #' \dontrun{ #' gg_dta <- gg_rfsrc(rfsrc_pbc) #' plot(gg_dta) @@ -103,10 +103,10 @@ #' 
gg_dta <- gg_rfsrc(rfsrc_pbc, conf.int=.95) #' plot(gg_dta) #' } -#' +#' \dontrun{ #' gg_dta <- gg_rfsrc(rfsrc_pbc, by="treatment") #' plot(gg_dta) -#' +#'} #' #' @aliases gg_rfsrc gg_rfsrc.rfsrc diff --git a/R/gg_roc.R b/R/gg_roc.R index 9201881d..002c5ee5 100644 --- a/R/gg_roc.R +++ b/R/gg_roc.R @@ -34,8 +34,8 @@ #' ## classification example #' ## ------------------------------------------------------------ #' ## -------- iris data -#' #rfsrc_iris <- rfsrc(Species ~ ., data = iris) -#' data(rfsrc_iris, package="ggRandomForests") +#' rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#' #data(rfsrc_iris, package="ggRandomForests") #' #' # ROC for setosa #' gg_dta <- gg_roc(rfsrc_iris, which.outcome=1) diff --git a/R/gg_survival.R b/R/gg_survival.R index c1fd0f53..760dadb0 100644 --- a/R/gg_survival.R +++ b/R/gg_survival.R @@ -35,6 +35,7 @@ #' @seealso \code{\link{kaplan}} \code{\link{nelson}} \code{\link{plot.gg_survival}} #' #' @examples +#' \dontrun{ #' ## -------- pbc data #' data(pbc, package="randomForestSRC") #' pbc$time <- pbc$days/364.25 @@ -58,7 +59,7 @@ #' data=pbc, by="treatment", conf.int=.68) #' #' plot(gg_dta, error="lines") -#' +#' } #' @export gg_survival <- function(interval=NULL, censor=NULL, by=NULL, data, diff --git a/R/gg_variable.R b/R/gg_variable.R index fc5435fd..d95cefaf 100644 --- a/R/gg_variable.R +++ b/R/gg_variable.R @@ -49,8 +49,8 @@ #' ## ------------------------------------------------------------ #' ## -------- iris data #' ## iris -#' #rfsrc_iris <- rfsrc(Species ~., data = iris) -#' data(rfsrc_iris, package="ggRandomForests") +#' rfsrc_iris <- rfsrc(Species ~., data = iris) +#' #data(rfsrc_iris, package="ggRandomForests") #' #' gg_dta <- gg_variable(rfsrc_iris) #' plot(gg_dta, xvar="Sepal.Width") diff --git a/R/gg_vimp.R b/R/gg_vimp.R index f25e69af..a40f2110 100644 --- a/R/gg_vimp.R +++ b/R/gg_vimp.R @@ -42,8 +42,8 @@ #' ## classification example #' ## ------------------------------------------------------------ #' ## -------- iris 
data -#' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) -#' data(rfsrc_iris, package="ggRandomForests") +#' rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#' #data(rfsrc_iris, package="ggRandomForests") #' gg_dta <- gg_vimp(rfsrc_iris) #' plot(gg_dta) #' @@ -57,12 +57,12 @@ #' gg_dta <- gg_vimp(rfsrc_airq) #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- Boston data #' data(rfsrc_Boston, package="ggRandomForests") #' gg_dta <- gg_vimp(rfsrc_Boston) #' plot(gg_dta) -#' +#' } #' \dontrun{ #' ## -------- mtcars data #' data(rfsrc_mtcars, package="ggRandomForests") @@ -78,7 +78,7 @@ #' gg_dta <- gg_vimp(rfsrc_veteran) #' plot(gg_dta) #' } -#' +#' \dontrun{ #' ## -------- pbc data #' data(rfsrc_pbc, package="ggRandomForests") #' gg_dta <- gg_vimp(rfsrc_pbc) @@ -87,7 +87,7 @@ #' # Restrict to only the top 10. #' gg_dta <- gg_vimp(rfsrc_pbc, nvar=10) #' plot(gg_dta) - +#' } #' @aliases gg_vimp gg_vimp.rfsrc gg_vimp.randomForest gg_vimp.randomForest.formula #' @export gg_vimp <- function (object, nvar, ...) { diff --git a/R/interaction_data.R b/R/interaction_data.R deleted file mode 100644 index 3f28b7a2..00000000 --- a/R/interaction_data.R +++ /dev/null @@ -1,169 +0,0 @@ -#' Cached \code{\link[randomForestSRC]{find.interaction}} matrix objects for examples, -#' diagnostics and vignettes. -#' -#' Data sets storing \code{\link[randomForestSRC]{find.interaction}} matrix objects corresponding to -#' training data according to the following naming convention: -#'\itemize{ -#' \item \code{interaction_iris} - from a randomForestSR[C] for the \code{iris} data set. -#' \item \code{interaction_Boston} - from a randomForestS[R]C for the \code{Boston} housing -#' data set (\code{MASS} package). -#' \item \code{interaction_pbc} - from a randomForest[S]RC for the \code{pbc} data set -#' (\code{randomForestSRC} package) -#' } -#' -#' @details -#' Constructing the minimal depth interaction matrices on randomForestsSRC objects are -#' computationally expensive. 
We cache \code{\link[randomForestSRC]{find.interaction}} matrix objects -#' to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -#' (see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) -#' -#' For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -#' (see \code{\link{rfsrc_data}}), then calculate the minimal depth variable interaction -#' table with \code{\link[randomForestSRC]{find.interaction}}. Each data set is built with the -#' \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version listed -#' in the \code{ggRandomForests} DESCRIPTION file. -#' -#' \itemize{ -#' \item \code{interaction_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -#' the measurements in centimeters of the variables sepal length and width and -#' petal length and width, respectively, for 50 flowers from each of 3 species -#' of iris. Build a classification random forest for predicting the species (setosa, -#' versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). -#' -#' \item \code{interaction_airq} - The \code{airquality} data set is from the New York State -#' Department of Conservation (ozone data) and the National Weather Service -#' (meteorological data) collected in New York, from May to September 1973. Build regression -#' random forest for predicting \code{Ozone} on 5 covariates and 153 observations. -#' -#' \item \code{interaction_mtcars} - The \code{mtcars} data was extracted from the 1974 Motor -#' Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and -#' performance for 32 automobiles (1973-74 models). Build a regression random forest for -#' predicting mpg on 10 covariates and 32 observations. -#' -#' \item \code{interaction_Boston} - The \code{Boston} housing values in suburbs of Boston from the -#' \code{MASS} package. 
Build a regression random forest for predicting medv (median home -#' values) on 13 covariates and 506 observations. -#' -#' \item \code{interaction_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -#' cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -#' referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -#' randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -#' the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -#' package. Build a survival random forest for time-to-event death data with 17 covariates and -#' 312 observations (remaining 106 observations are held out). -#' -#' \item \code{interaction_veteran} - Veteran's Administration randomized trial of two treatment -#' regimens for lung cancer. Build a survival random forest for time-to-event death data -#' with 6 covariates and 137 observations. -#' } -#' -#' @seealso \code{iris} \code{\link[MASS]{Boston}} -#' \code{\link[randomForestSRC]{pbc}} -#' \code{\link[randomForestSRC]{find.interaction}} -#' \code{\link{rfsrc_data}} -#' \code{\link{cache_rfsrc_datasets}} -#' \code{\link{gg_interaction}} -#' \code{\link{plot.gg_interaction}} -#' -#' @examples -#' \dontrun{ -#' #--------------------------------------------------------------------- -#' # iris data - classification random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_iris, package="ggRandomForests") -#' -#' # The interaction table -#' interaction_iris <- find.interaction(rfsrc_iris) -#' -#' # plot the forest interaction table -#' gg_dta <- gg_interaction(interaction_iris) -#' plot(gg_dta, panel=TRUE) -#' -#' #--------------------------------------------------------------------- -#' # MASS::Boston data - regression random forest -#' 
#--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_Boston, package="ggRandomForests") -#' -#' # The interaction table -#' interaction_Boston <- find.interaction(rfsrc_Boston) -#' -#' # plot the forest interaction table -#' gg_dta <- gg_interaction(interaction_Boston) -#' plot(gg_dta, panel=TRUE) -#' -#' #--------------------------------------------------------------------- -#' # randomForestSRC::pbc data - survival random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_pbc, package="ggRandomForests") -#' -#' # The interaction table -#' interaction_pbc <- find.interaction(rfsrc_pbc) -#' -#' # plot the forest interaction table -#' gg_dta <- gg_interaction(interaction_pbc) -#' plot(gg_dta, panel=TRUE) -#' -#' } -#' -#' @references -#' #--------------------- -#' randomForestSRC -#' --------------------- -#' -#' Ishwaran H. and Kogalur U.B. (2014). Random Forests for -#' Survival, Regression and Classification (RF-SRC), R package -#' version 1.5.5. -#' -#' Ishwaran H. and Kogalur U.B. (2007). Random survival forests -#' for R. R News 7(2), 25-31. -#' -#' Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -#' (2008). Random survival forests. Ann. Appl. Statist. 2(3), -#' 841-860. -#' -#' #--------------------- -#' Boston data set -#' --------------------- -#' -#' Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying -#' Influential Data and Sources of Collinearity. New York: Wiley. -#' -#' Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." -#' J. Environ. Economics and Management 5: 81-102. -#' -#' #--------------------- -#' Iris data set -#' --------------------- -#' -#' Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -#' Wadsworth \& Brooks/Cole. (has iris3 as iris.) -#' -#' Fisher, R. A. 
(1936) The use of multiple measurements in taxonomic problems. -#' Annals of Eugenics, 7, Part II, 179-188. -#' -#' Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -#' of the American Iris Society, 59, 2-5. -#' -#' #--------------------- -#' pbc data set -#' --------------------- -#' -#' Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -#' New York: Wiley. -#' -#' T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -#' Springer-Verlag, New York. ISBN: 0-387-98784-3. -#' -#' @aliases interaction_data interaction_iris interaction_Boston interaction_pbc -#' @docType data -#' @keywords datasets -#' @format \code{\link[randomForestSRC]{find.interaction}} matrix -#' @name interaction_data -#' @name interaction_iris -#' @name interaction_Boston -#' @name interaction_pbc -#' -NULL diff --git a/R/partial.rfsrc.R b/R/partial.rfsrc.R index ad83cad5..6ac41f30 100644 --- a/R/partial.rfsrc.R +++ b/R/partial.rfsrc.R @@ -113,7 +113,7 @@ #' #' ## iris #' #rfsrc_iris <- rfsrc(Species ~., data = iris) -#' data(rfsrc_iris, package="ggRandomForests") +#' #data(rfsrc_iris, package="ggRandomForests") #' #gg_dta <- partial.rfsrc(rfsrc_iris, ) #' #' \dontrun{ diff --git a/R/partial_coplot_data.R b/R/partial_coplot_data.R deleted file mode 100644 index 736e4b28..00000000 --- a/R/partial_coplot_data.R +++ /dev/null @@ -1,96 +0,0 @@ -#' Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, diagnostics and vignettes. -#' -#' Data sets storing \code{\link[randomForestSRC]{rfsrc}} objects corresponding to -#' training data according to the following naming convention: -#'\itemize{ -#' \item \code{partial_coplot_Boston} - randomForestS[R]C for the \code{Boston} housing -#' data set (\code{MASS} package). -#' } -#' -#' @details -#' Constructing random forests are computationally expensive. 
-#' We cache \code{\link[randomForestSRC]{rfsrc}} objects to improve the \code{ggRandomForests} -#' examples, diagnostics and vignettes run times. -#' (see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) -#' -#' For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}}. Tuning parameters used -#' in each case are documented in the examples. Each data set is built with the -#' \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version listed -#' in the \code{ggRandomForests} DESCRIPTION file. -#' -#' \itemize{ -#' \item \code{partial_coplot_Boston} - The \code{Boston} housing values in suburbs of Boston from the -#' \code{MASS} package. Build a regression random forest for predicting medv (median home -#' values) on 13 covariates and 506 observations. -#' -#' } -#' -#' @seealso \code{\link[MASS]{Boston}} \code{\link[randomForestSRC]{plot.variable}} -#' \code{\link{cache_rfsrc_datasets}} -#' -#' @examples -#' \dontrun{ -#' #--------------------------------------------------------------------- -#' # MASS::Boston data - regression random forest -#' #--------------------------------------------------------------------- -#' data(Boston_rfsrc, package="ggRandomForests") -#' -#' # Cut the codependent variable -#' rm_pts <- cut_distribution(rfsrc_Boston$xvar$rm, groups=6) -#' rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks=rm_pts) -#' -#' # plot.variable for lstat on subsets of rm (this will take some time.) -#' partial_coplot_Boston <- gg_partial_coplot(rfsrc_Boston, xvar="lstat", -#' groups=rm_grp, -#' show.plots=FALSE) -#' -#' } -#' -#' @references -#' #--------------------- -#' randomForestSRC -#' --------------------- -#' -#' Ishwaran H. and Kogalur U.B. (2014). Random Forests for -#' Survival, Regression and Classification (RF-SRC), R package -#' version 1.5.5. -#' -#' Ishwaran H. and Kogalur U.B. (2007). Random survival forests -#' for R. R News 7(2), 25-31. 
-#' -#' Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -#' (2008). Random survival forests. Ann. Appl. Statist. 2(3), -#' 841-860. -#' -#' #--------------------- -#' Boston data set -#' --------------------- -#' -#' Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying -#' Influential Data and Sources of Collinearity. New York: Wiley. -#' -#' Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." -#' J. Environ. Economics and Management 5: 81-102. -#' -#' #--------------------- -#' pbc data set -#' --------------------- -#' -#' Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -#' New York: Wiley. -#' -#' T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -#' Springer-Verlag, New York. ISBN: 0-387-98784-3. -#' -#' -#' @aliases partial_coplot_data partial_coplot_Boston partial_coplot_Boston2 partial_coplot_pbc partial_coplot_pbc2 -#' @docType data -#' @keywords datasets -#' @format List of \code{\link[randomForestSRC]{plot.variable}} objects -#' @name partial_coplot_data -#' @name partial_coplot_Boston -#' @name partial_coplot_Boston2 -#' @name partial_coplot_pbc -#' @name partial_coplot_pbc2 -#' -NULL \ No newline at end of file diff --git a/R/partial_data.R b/R/partial_data.R deleted file mode 100644 index 5fddefa1..00000000 --- a/R/partial_data.R +++ /dev/null @@ -1,164 +0,0 @@ -#' Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, -#' diagnostics and vignettes. -#' -#' Data sets storing \code{\link[randomForestSRC]{plot.variable}} objects corresponding to -#' training data according to the following naming convention: -#'\itemize{ -#' \item \code{partial_iris} - from a randomForestSR[C] for the \code{iris} data set. -#' \item \code{partial_Boston} - from a randomForestS[R]C for the \code{Boston} housing -#' data set (\code{MASS} package). 
-#' \item \code{partial_pbc} - from a randomForest[S]RC for the \code{pbc} data set -#' (\code{randomForestSRC} package) -#' } -#' -#' @details -#' Constructing partial plot data with the randomForestsSRC::plot.variable function are -#' computationally expensive. We cache \code{\link[randomForestSRC]{plot.variable}} objects -#' to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -#' (see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) -#' -#' For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -#' (see \code{\link{rfsrc_data}}), then calculate the partial plot data with -#' \code{\link[randomForestSRC]{plot.variable}} function, setting \code{partial=TRUE}. Each data set is -#' built with the \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version -#' listed in the \code{ggRandomForests} DESCRIPTION file. -#' -#' \itemize{ -#' \item \code{partial_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -#' the measurements in centimeters of the variables sepal length and width and -#' petal length and width, respectively, for 50 flowers from each of 3 species -#' of iris. Build a classification random forest for predicting the species (setosa, -#' versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). -#' -#' \item \code{partial_Boston} - The \code{Boston} housing values in suburbs of Boston from the -#' \code{MASS} package. Build a regression random forest for predicting medv (median home -#' values) on 13 covariates and 506 observations. -#' -#' \item \code{partial_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -#' cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -#' referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -#' randomized placebo controlled trial of the drug D-penicillamine. 
312 cases participated in -#' the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -#' package. Build a survival random forest for time-to-event death data with 17 covariates and -#' 312 observations (remaining 106 observations are held out). -#' } -#' -#' @seealso \code{iris} \code{MASS::Boston} -#' \code{\link[randomForestSRC]{pbc}} -#' \code{\link[randomForestSRC]{plot.variable}} -#' \code{\link{rfsrc_data}} -#' \code{\link{cache_rfsrc_datasets}} -#' \code{\link{gg_partial}} -#' \code{\link{plot.gg_partial}} -#' -#' @examples -#' \dontrun{ -#' #--------------------------------------------------------------------- -#' # iris data - classification random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_iris, package="ggRandomForests") -#' -#' # The plot.variable call -#' partial_iris <- plot.variable(rfsrc_iris, -#' partial=TRUE, show.plots=FALSE) -#' -#' # plot the forest partial plots -#' gg_dta <- gg_partial(partial_iris) -#' plot(gg_dta, panel=TRUE) -#' -#' #--------------------------------------------------------------------- -#' # MASS::Boston data - regression random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_Boston, package="ggRandomForests") -#' -#' # The plot.variable call -#' partial_Boston <- plot.variable(rfsrc_Boston, -#' partial=TRUE, show.plots = FALSE ) -#' -#' # plot the forest partial plots -#' gg_dta <- gg_partial(partial_Boston) -#' plot(gg_dta, panel=TRUE) -#' -#' #--------------------------------------------------------------------- -#' # randomForestSRC::pbc data - survival random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_pbc, package="ggRandomForests") -#' -#' # The plot.variable call - -#' # survival 
requires a time point specification. -#' # for the pbc data, we want 1, 3 and 5 year survival. -#' partial_pbc <- lapply(c(1,3,5), function(tm){ -#' plot.variable(rfsrc_pbc, surv.type = "surv", -#' time = tm, -#' xvar.names = xvar, -#' partial = TRUE, -#' show.plots = FALSE) -#' }) -#' -#' # plot the forest partial plots -#' gg_dta <- gg_partial(partial_pbc) -#' plot(gg_dta) -#' } -#' -#' @references -#' #--------------------- -#' randomForestSRC -#' --------------------- -#' -#' Ishwaran H. and Kogalur U.B. (2014). Random Forests for -#' Survival, Regression and Classification (RF-SRC), R package -#' version 1.5.5. -#' -#' Ishwaran H. and Kogalur U.B. (2007). Random survival forests -#' for R. R News 7(2), 25-31. -#' -#' Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -#' (2008). Random survival forests. Ann. Appl. Statist. 2(3), -#' 841-860. -#' -#' #--------------------- -#' Boston data set -#' --------------------- -#' -#' Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying -#' Influential Data and Sources of Collinearity. New York: Wiley. -#' -#' Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." -#' J. Environ. Economics and Management 5: 81-102. -#' -#' #--------------------- -#' Iris data set -#' --------------------- -#' -#' Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -#' Wadsworth \& Brooks/Cole. (has iris3 as iris.) -#' -#' Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -#' Annals of Eugenics, 7, Part II, 179-188. -#' -#' Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -#' of the American Iris Society, 59, 2-5. -#' -#' #--------------------- -#' pbc data set -#' --------------------- -#' -#' Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -#' New York: Wiley. 
-#' -#' T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -#' Springer-Verlag, New York. ISBN: 0-387-98784-3. -#' -#' @aliases partial_data partial_iris partial_Boston partial_pbc -#' @docType data -#' @keywords datasets -#' @format \code{\link[randomForestSRC]{plot.variable}} -#' @name partial_data -#' @name partial_iris -#' @name partial_Boston -#' @name partial_pbc -#' -NULL diff --git a/R/partial_surface_data.R b/R/partial_surface_data.R deleted file mode 100644 index 7d7a2dee..00000000 --- a/R/partial_surface_data.R +++ /dev/null @@ -1,145 +0,0 @@ -#' Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, -#' diagnostics and vignettes. -#' -#' Data sets storing \code{\link[randomForestSRC]{plot.variable}} objects corresponding to -#' training data according to the following naming convention: -#'\itemize{ -#' \item \code{partial_Boston_surf} - from a randomForestS[R]C for the \code{Boston} housing -#' data set (\code{MASS} package). -#' \item \code{partial_pbc_surf} - from a randomForest[S]RC for the \code{pbc} data set -#' (\code{randomForestSRC} package) -#' \item \code{partial_pbc_time} - from a randomForest[S]RC for the \code{pbc} data set -#' (\code{randomForestSRC} package) -#' } -#' -#' @details -#' Constructing partial plot data with the randomForestsSRC::plot.variable function are -#' computationally expensive. We cache \code{\link[randomForestSRC]{plot.variable}} objects -#' to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -#' (see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) -#' -#' For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -#' (see \code{\link{rfsrc_data}}), then calculate the partial plot data with -#' \code{\link[randomForestSRC]{plot.variable}} function, setting \code{partial=TRUE}. 
Each data set is -#' built with the \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version -#' listed in the \code{ggRandomForests} DESCRIPTION file. -#' -#' \itemize{ -#' \item \code{partial_Boston} - The \code{Boston} housing values in suburbs of Boston from the -#' \code{MASS} package. Build a regression random forest for predicting medv (median home -#' values) on 13 covariates and 506 observations. -#' -#' \item \code{partial_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -#' cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -#' referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -#' randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -#' the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -#' package. Build a survival random forest for time-to-event death data with 17 covariates and -#' 312 observations (remaining 106 observations are held out). 
-#' } -#' -#' @seealso \code{\link[MASS]{Boston}} -#' \code{\link[randomForestSRC]{pbc}} -#' \code{\link[randomForestSRC]{plot.variable}} -#' \code{\link{rfsrc_data}} -#' \code{\link{cache_rfsrc_datasets}} -#' \code{\link{gg_partial}} -#' \code{\link{plot.gg_partial}} -#' -#' @examples -#' \dontrun{ -#' #--------------------------------------------------------------------- -#' # MASS::Boston data - regression random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_Boston, package="ggRandomForests") -#' -#' # The plot.variable call -#' partial_Boston <- plot.variable(rfsrc_Boston, -#' partial=TRUE, show.plots = FALSE ) -#' -#' # plot the forest partial plots -#' gg_dta <- gg_partial(partial_Boston) -#' plot(gg_dta, panel=TRUE) -#' -#' #--------------------------------------------------------------------- -#' # randomForestSRC::pbc data - survival random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_pbc, package="ggRandomForests") -#' -#' # Restrict the time of interest to less than 5 years. 
-#' time_pts <- rfsrc_pbc$time.interest[which(rfsrc_pbc$time.interest<=5)] -#' -#' # Find the 50 points in time, evenly space along the distribution of -#' # event times for a series of partial dependence curves -#' time_cts <-quantile_pts(time_pts, groups = 50) -#' -#' # Generate the gg_partial_coplot data object -#' system.time(partial_pbc_time <- lapply(time_cts, function(ct){ -#' plot.variable(rfsrc_pbc, xvar = "bili", time = ct, -#' npts = 50, show.plots = FALSE, -#' partial = TRUE, surv.type="surv") -#' })) -#' # user system elapsed -#' # 2561.313 81.446 2641.707 -#' -#' # Find the quantile points to create 50 cut points -#' alb_partial_pts <-quantile_pts(rfsrc_pbc$xvar$albumin, groups = 50) -#' -#' system.time(partial_pbc_surf <- lapply(alb_partial_pts, function(ct){ -#' rfsrc_pbc$xvar$albumin <- ct -#' plot.variable(rfsrc_pbc, xvar = "bili", time = 1, -#' npts = 50, show.plots = FALSE, -#' partial = TRUE, surv.type="surv") -#' })) -#' # user system elapsed -#' # 2547.482 91.978 2671.870 -#' -#' } -#' -#' @references -#' #--------------------- -#' randomForestSRC -#' --------------------- -#' -#' Ishwaran H. and Kogalur U.B. (2014). Random Forests for -#' Survival, Regression and Classification (RF-SRC), R package -#' version 1.5.5. -#' -#' Ishwaran H. and Kogalur U.B. (2007). Random survival forests -#' for R. R News 7(2), 25-31. -#' -#' Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -#' (2008). Random survival forests. Ann. Appl. Statist. 2(3), -#' 841-860. -#' -#' #--------------------- -#' Boston data set -#' --------------------- -#' -#' Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying -#' Influential Data and Sources of Collinearity. New York: Wiley. -#' -#' Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." -#' J. Environ. Economics and Management 5: 81-102. 
-#' -#' #--------------------- -#' pbc data set -#' --------------------- -#' -#' Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -#' New York: Wiley. -#' -#' T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -#' Springer-Verlag, New York. ISBN: 0-387-98784-3. -#' -#' @aliases partial_surface_data partial_Boston_surf partial_pbc_surf partial_pbc_time -#' @docType data -#' @keywords datasets -#' @format list of \code{\link[randomForestSRC]{plot.variable}} objects -#' @name partial_surface_data -#' @name partial_Boston_surf -#' @name partial_pbc_surf -#' @name partial_pbc_time -NULL diff --git a/R/print.gg_minimal_depth.R b/R/print.gg_minimal_depth.R index 7a9b22a1..8c19a1c2 100644 --- a/R/print.gg_minimal_depth.R +++ b/R/print.gg_minimal_depth.R @@ -25,6 +25,7 @@ #' ## ------------------------------------------------------------ #' ## classification example #' ## ------------------------------------------------------------ +#' \dontrun{ #' ## You can build a randomForest #' # rfsrc_iris <- rfsrc(Species ~ ., data = iris) #' # varsel_iris <- var.select(rfsrc_iris) @@ -34,7 +35,7 @@ #' # Get a data.frame containing minimaldepth measures #' gg_dta <- gg_minimal_depth(varsel_iris) #' print(gg_dta) -#' +#' } #' ## ------------------------------------------------------------ #' ## regression example #' ## ------------------------------------------------------------ @@ -49,7 +50,7 @@ #' # To nicely print a rfsrc::var.select output... #' print(varsel_airq) #' } -#' +#' \dontrun{ #' # ... or load a cached randomForestSRC object #' data(varsel_Boston, package="ggRandomForests") #' @@ -59,7 +60,7 @@ #' #' # To nicely print a rfsrc::var.select output... 
#' print(varsel_Boston) -#' +#' } #' @export print.gg_minimal_depth <- function(x, ...){ gg_dta <- x diff --git a/R/quantile_pts.R b/R/quantile_pts.R index d4c4b8ab..5abab07d 100644 --- a/R/quantile_pts.R +++ b/R/quantile_pts.R @@ -20,6 +20,7 @@ #' @importFrom stats quantile #' #' @examples +#' \dontrun{ #' data(rfsrc_Boston) #' #' # To create 6 intervals, we want 7 points. @@ -30,7 +31,7 @@ #' rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks=rm_pts) #' #' summary(rm_grp) -#' +#' } #' @export quantile_pts <- function(object, groups, intervals=FALSE){ # We need one more break than group, diff --git a/R/rfsrc_data.R b/R/rfsrc_data.R deleted file mode 100644 index 79ce515e..00000000 --- a/R/rfsrc_data.R +++ /dev/null @@ -1,170 +0,0 @@ -#' Cached \code{\link[randomForestSRC]{rfsrc}} objects for examples, diagnostics and vignettes. -#' -#' Data sets storing \code{\link[randomForestSRC]{rfsrc}} objects corresponding to -#' training data according to the following naming convention: -#'\itemize{ -#' \item \code{rfsrc_iris} - randomForestSR[C] for the \code{iris} data set. -#' \item \code{rfsrc_Boston} - randomForestS[R]C for the \code{Boston} housing -#' data set (\code{MASS} package). -#' \item \code{rfsrc_pbc} - randomForest[S]RC for the \code{pbc} data set -#' (\code{randomForestSRC} package) -#' } -#' -#' @details -#' Constructing random forests are computationally expensive. -#' We cache \code{\link[randomForestSRC]{rfsrc}} objects to improve the \code{ggRandomForests} -#' examples, diagnostics and vignettes run times. -#' (see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) -#' -#' For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}}. Tuning parameters used -#' in each case are documented in the examples. Each data set is built with the -#' \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version listed -#' in the \code{ggRandomForests} DESCRIPTION file. 
-#' -#' \itemize{ -#' \item \code{rfsrc_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -#' the measurements in centimeters of the variables sepal length and width and -#' petal length and width, respectively, for 50 flowers from each of 3 species -#' of iris. Build a classification random forest for predicting the species (setosa, -#' versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). -#' -#' \item \code{rfsrc_Boston} - The \code{Boston} housing values in suburbs of Boston from the -#' \code{MASS} package. Build a regression random forest for predicting medv (median home -#' values) on 13 covariates and 506 observations. -#' -#' \item \code{rfsrc_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -#' cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -#' referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -#' randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -#' the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -#' package. Build a survival random forest for time-to-event death data with 17 covariates and -#' 312 observations (remaining 106 observations are held out). 
-#' } -#' -#' @seealso \code{iris} \code{\link[MASS]{Boston}} -#' \code{\link[randomForestSRC]{pbc}} -#' \code{\link[randomForestSRC]{rfsrc}} -#' \code{\link{cache_rfsrc_datasets}} -#' \code{\link{gg_rfsrc}} -#' \code{\link{plot.gg_rfsrc}} -#' \code{\link{gg_error}} -#' \code{\link{plot.gg_error}} -#' -#' @examples -#' \dontrun{ -#' #--------------------------------------------------------------------- -#' # iris data - classification random forest -#' #--------------------------------------------------------------------- -#' # rfsrc grow call -#' rfsrc_iris <- rfsrc(Species ~., data = iris) -#' -#' # plot the forest generalization error convergence -#' gg_dta <- gg_error(rfsrc_iris) -#' plot(gg_dta) -#' -#' # Plot the forest predictions -#' gg_dta <- gg_rfsrc(rfsrc_iris) -#' plot(gg_dta) -#' -#' #--------------------------------------------------------------------- -#' # MASS::Boston data - regression random forest -#' #--------------------------------------------------------------------- -#' # Load the data... -#' data(Boston, package="MASS") -#' Boston$chas <- as.logical(Boston$chas) -#' -#' # rfsrc grow call -#' rfsrc_Boston <- rfsrc(medv~., data=Boston) -#' -#' # plot the forest generalization error convergence -#' gg_dta <- gg_error(rfsrc_Boston) -#' plot(gg_dta) -#' -#' # Plot the forest predictions -#' gg_dta <- gg_rfsrc(rfsrc_Boston) -#' plot(gg_dta) -#' -#' #--------------------------------------------------------------------- -#' # randomForestSRC::pbc data - survival random forest -#' #--------------------------------------------------------------------- -#' # Load the data... -#' # For simplicity here. We do a bit of data tidying -#' # before running the stored random forest. 
-#' data(pbc, package="randomForestSRC") -#' -#' # Remove non-randomized cases -#' dta.train <- pbc[-which(is.na(pbc$treatment)),] -#' -#' # rfsrc grow call -#' rfsrc_pbc <- rfsrc(Surv(years, status) ~ ., dta.train, nsplit = 10, -#' na.action="na.impute") -#' -#' # plot the forest generalization error convergence -#' gg_dta <- gg_error(rfsrc_pbc) -#' plot(gg_dta) -#' -#' # Plot the forest predictions -#' gg_dta <- gg_rfsrc(rfsrc_pbc) -#' plot(gg_dta) -#' -#' } -#' -#' @references -#' #--------------------- -#' randomForestSRC -#' --------------------- -#' -#' Ishwaran H. and Kogalur U.B. (2014). Random Forests for -#' Survival, Regression and Classification (RF-SRC), R package -#' version 1.5.5. -#' -#' Ishwaran H. and Kogalur U.B. (2007). Random survival forests -#' for R. R News 7(2), 25-31. -#' -#' Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -#' (2008). Random survival forests. Ann. Appl. Statist. 2(3), -#' 841-860. -#' -#' #--------------------- -#' Boston data set -#' --------------------- -#' -#' Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying -#' Influential Data and Sources of Collinearity. New York: Wiley. -#' -#' Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." -#' J. Environ. Economics and Management 5: 81-102. -#' -#' #--------------------- -#' Iris data set -#' --------------------- -#' -#' Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -#' Wadsworth \& Brooks/Cole. (has iris3 as iris.) -#' -#' Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -#' Annals of Eugenics, 7, Part II, 179-188. -#' -#' Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -#' of the American Iris Society, 59, 2-5. -#' -#' #--------------------- -#' pbc data set -#' --------------------- -#' -#' Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -#' New York: Wiley. 
-#' -#' T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -#' Springer-Verlag, New York. ISBN: 0-387-98784-3. -#' -#' @aliases rfsrc_data rfsrc_iris rfsrc_Boston rfsrc_pbc rfsrc_pbc_test -#' @docType data -#' @keywords datasets -#' @format \code{\link[randomForestSRC]{rfsrc}} object -#' @name rfsrc_data -#' @name rfsrc_iris -#' @name rfsrc_Boston -#' @name rfsrc_pbc -#' @name rfsrc_pbc_test -NULL diff --git a/R/surface_matrix.gg_partial_coplot.R b/R/surface_matrix.gg_partial_coplot.R index 74e154d0..5e62ae03 100644 --- a/R/surface_matrix.gg_partial_coplot.R +++ b/R/surface_matrix.gg_partial_coplot.R @@ -9,6 +9,7 @@ #' column names. #' #' @examples +#' \dontrun{ #' ## From vignette(randomForestRegression, package="ggRandomForests") #' ## #' data(rfsrc_Boston) @@ -32,7 +33,7 @@ #' # Transform the gg_partial_coplot object into a list of three named matrices #' # for surface plotting with plot3D::surf3D #' srf <- surface_matrix(partial_surf, c("lstat", "rm", "yhat")) -#' +#' } #' #' \dontrun{ #' # surf3D is in the plot3D package. diff --git a/R/varsel_data.R b/R/varsel_data.R deleted file mode 100644 index a536553e..00000000 --- a/R/varsel_data.R +++ /dev/null @@ -1,159 +0,0 @@ -#' Cached \code{\link[randomForestSRC]{var.select}} objects for examples, -#' diagnostics and vignettes. -#' -#' Data sets storing \code{\link[randomForestSRC]{var.select}} objects corresponding to -#' training data according to the following naming convention: -#'\itemize{ -#' \item \code{varsel_iris} - from a randomForestSR[C] for the \code{iris} data set. -#' \item \code{varsel_Boston} - from a randomForestS[R]C for the \code{Boston} housing -#' data set (\code{MASS} package). -#' \item \code{varsel_pbc} - from a randomForest[S]RC for the \code{pbc} data set -#' (\code{randomForestSRC} package) -#' } -#' -#' @details -#' Constructing minimal depth variable selection with the randomForestsSRC::var.select function -#' is computationally expensive. 
We cache \code{\link[randomForestSRC]{var.select}} objects -#' to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -#' (see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) -#' -#' For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -#' (see \code{\link{rfsrc_data}}), then calculate the minimal depth variable selection with -#' \code{\link[randomForestSRC]{var.select}} function, setting \code{method="md"}. Each data set is -#' built with the \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version -#' listed in the \code{ggRandomForests} DESCRIPTION file. -#' -#' \itemize{ -#' \item \code{varsel_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -#' the measurements in centimeters of the variables sepal length and width and -#' petal length and width, respectively, for 50 flowers from each of 3 species -#' of iris. Build a classification random forest for predicting the species (setosa, -#' versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). -#' -#' \item \code{varsel_Boston} - The \code{Boston} housing values in suburbs of Boston from the -#' \code{MASS} package. Build a regression random forest for predicting medv (median home -#' values) on 13 covariates and 506 observations. -#' -#' \item \code{varsel_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -#' cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -#' referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -#' randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -#' the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -#' package. Build a survival random forest for time-to-event death data with 17 covariates and -#' 312 observations (remaining 106 observations are held out). 
-#' -#' } -#' -#' @seealso \code{iris} \code{\link[MASS]{Boston}} -#' \code{\link[randomForestSRC]{pbc}} -#' \code{\link[randomForestSRC]{var.select}} -#' \code{\link{rfsrc_data}} -#' \code{\link{cache_rfsrc_datasets}} -#' \code{\link{gg_minimal_depth}} -#' \code{\link{plot.gg_minimal_depth}} -#' \code{\link{gg_minimal_vimp}} -#' \code{\link{plot.gg_minimal_vimp}} -#' -#' @examples -#' \dontrun{ -#' #--------------------------------------------------------------------- -#' # iris data - classification random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_iris, package="ggRandomForests") -#' -#' # The var.select call -#' varsel_iris <- var.select(rfsrc_iris) -#' -#' # plot the forestminimal depth ranking -#' gg_dta <- gg_minimal_depth(varsel_iris) -#' plot(gg_dta) -#' -#' -#' #--------------------------------------------------------------------- -#' # MASS::Boston data - regression random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_Boston, package="ggRandomForests") -#' -#' # The var.select call -#' varsel_Boston <- var.select(rfsrc_Boston) -#' -#' # plot the forestminimal depth ranking -#' gg_dta <- gg_minimal_depth(varsel_Boston) -#' plot(gg_dta) -#' -#' #--------------------------------------------------------------------- -#' # randomForestSRC::pbc data - survival random forest -#' #--------------------------------------------------------------------- -#' # load the rfsrc object from the cached data -#' data(rfsrc_pbc, package="ggRandomForests") -#' -#' # The var.select call -#' varsel_pbc <- var.select(rfsrc_pbc) -#' -#' # plot the forestminimal depth ranking -#' gg_dta <- gg_minimal_depth(varsel_pbc) -#' plot(gg_dta) -#' -#' } -#' -#' @references -#' #--------------------- -#' randomForestSRC -#' --------------------- -#' -#' Ishwaran H. and Kogalur U.B. (2014). 
Random Forests for -#' Survival, Regression and Classification (RF-SRC), R package -#' version 1.5.5. -#' -#' Ishwaran H. and Kogalur U.B. (2007). Random survival forests -#' for R. R News 7(2), 25-31. -#' -#' Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -#' (2008). Random survival forests. Ann. Appl. Statist. 2(3), -#' 841-860. -#' -#' #--------------------- -#' Boston data set -#' --------------------- -#' -#' Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying -#' Influential Data and Sources of Collinearity. New York: Wiley. -#' -#' Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." -#' J. Environ. Economics and Management 5: 81-102. -#' -#' #--------------------- -#' Iris data set -#' --------------------- -#' -#' Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -#' Wadsworth \& Brooks/Cole. (has iris3 as iris.) -#' -#' Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -#' Annals of Eugenics, 7, Part II, 179-188. -#' -#' Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -#' of the American Iris Society, 59, 2-5. -#' -#' #--------------------- -#' pbc data set -#' --------------------- -#' -#' Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -#' New York: Wiley. -#' -#' T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -#' Springer-Verlag, New York. ISBN: 0-387-98784-3. 
-#' -#' @aliases varsel_data varsel_iris varsel_Boston varsel_pbc -#' @docType data -#' @keywords datasets -#' @format \code{\link[randomForestSRC]{var.select}} object -#' @name varsel_data -#' @name varsel_iris -#' @name varsel_Boston -#' @name varsel_pbc -#' -NULL diff --git a/README.md b/README.md index 3d18ede4..a0381d31 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ ggRandomForests: Visually Exploring Random Forests ======================================================== -[![DOI](https://zenodo.org/badge/5745/ehrlinger/ggRandomForests.png)](http://dx.doi.org/10.5281/zenodo.11526) -[![CRAN version](http://www.r-pkg.org/badges/version/ggRandomForests)](http://cran.r-project.org/package=ggRandomForests) -![cranlogs](http://cranlogs.r-pkg.org./badges/ggRandomForests) +[![DOI](https://zenodo.org/badge/5745/ehrlinger/ggRandomForests.png)](https://dx.doi.org/10.5281/zenodo.11526) +[![CRAN version](https://www.r-pkg.org/badges/version/ggRandomForests)](https://cran.r-project.org/package=ggRandomForests) +![cranlogs](https://cranlogs.r-pkg.org./badges/ggRandomForests) -![active](http://www.repostatus.org/badges/latest/active.svg) +![active](https://www.repostatus.org/badges/latest/active.svg) [![Build Status](https://travis-ci.org/ehrlinger/ggRandomForests.svg?branch=master)](https://travis-ci.org/ehrlinger/ggRandomForests) [![Coverage Status](https://coveralls.io/repos/ehrlinger/ggRandomForests/badge.svg?branch=master&service=github)](https://coveralls.io/github/ehrlinger/ggRandomForests?branch=master) -[ggRandomForests](http://CRAN.R-project.org/package=ggRandomForests) will help uncover variable associations in the random forests models. The package is designed for use with the [randomForest](http://CRAN.R-project.org/package=randomForest) package (A. Liaw and M. Wiener 2002) or the [randomForestSRC](http://CRAN.R-project.org/package=randomForestSRC) package (Iswaran et.al. 
2014, 2008, 2007) for survival, regression and classification random forests and uses the [ggplot2](http://CRAN.R-project.org/package=ggplot2) package (Wickham 2009) for plotting diagnostic and variable association results. [ggRandomForests](http://CRAN.R-project.org/package=ggRandomForests) is structured to extract data objects from [randomForestSRC](http://CRAN.R-project.org/package=randomForestSRC) or [randomForest](http://CRAN.R-project.org/package=randomForest) objects and provides S3 functions for printing and plotting these objects. +[ggRandomForests](https://cran.r-project.org/package=ggRandomForests) will help uncover variable associations in the random forests models. The package is designed for use with the [randomForest](https://cran.r-project.org/package=randomForest) package (A. Liaw and M. Wiener 2002) or the [randomForestSRC](https://cran.r-project.org/package=randomForestSRC) package (Iswaran et.al. 2014, 2008, 2007) for survival, regression and classification random forests and uses the [ggplot2](https://cran.r-project.org/package=ggplot2) package (Wickham 2009) for plotting diagnostic and variable association results. [ggRandomForests](https://cran.r-project.org/package=ggRandomForests) is structured to extract data objects from [randomForestSRC](https://cran.r-project.org/package=randomForestSRC) or [randomForest](https://cran.r-project.org/package=randomForest) objects and provides S3 functions for printing and plotting these objects. -The [randomForestSRC](http://CRAN.R-project.org/package=randomForestSRC) package provides a unified treatment of Breiman's (2001) random forests for a variety of data settings. Regression and classification forests are grown when the response is numeric or categorical (factor) while survival and competing risk forests (Ishwaran et al. 2008, 2012) are grown for right-censored survival data. Recently, suppport for the [randomForest](http://CRAN.R-project.org/package=randomForest) package (A. Liaw and M. 
Wiener 2002) for regression and classification forests has also been added. +The [randomForestSRC](https://cran.r-project.org/package=randomForestSRC) package provides a unified treatment of Breiman's (2001) random forests for a variety of data settings. Regression and classification forests are grown when the response is numeric or categorical (factor) while survival and competing risk forests (Ishwaran et al. 2008, 2012) are grown for right-censored survival data. Recently, suppport for the [randomForest](https://cran.r-project.org/package=randomForest) package (A. Liaw and M. Wiener 2002) for regression and classification forests has also been added. Many of the figures created by the `ggRandomForests` package are also available directly from within the `randomForestSRC` or `randomForest` package. However, `ggRandomForests` offers the following advantages: @@ -21,7 +21,7 @@ Many of the figures created by the `ggRandomForests` package are also available * The use of `ggplot2` for plotting. We chose to use the `ggplot2` package for our figures to allow users flexibility in modifying the figures to their liking. Each S3 plot function returns either a single `ggplot2` object, or a `list` of `ggplot2` objects, allowing users to use additional `ggplot2` functions or themes to modify and customize the figures to their liking. The package has recently been extended for Breiman and Cutler's Random Forests for Classification and -Regression package [randomForest](http://CRAN.R-project.org/package=randomForest) where possible. Though methods have been provided for all `gg_*` functions, the unsupported functions will return an error message indicating where support is still lacking. +Regression package [randomForest](https://cran.r-project.org/package=randomForest) where possible. Though methods have been provided for all `gg_*` functions, the unsupported functions will return an error message indicating where support is still lacking. 
## References diff --git a/cran-comments.md b/cran-comments.md index 33bf3cd7..887d9983 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,5 +1,8 @@ This is ggRandomForests package submission v2.0.1 -------------------------------------------------------------------------------- * Correct a bug in survival plots when predicting on future data without a known outcome. -* Additional Vignettes are now at https://github.com/ehrlinger/ggRFVignette +* ALL Vignettes are now at https://github.com/ehrlinger/ggRFVignette +* All tests are being moved to https://github.com/ehrlinger/ggRFVignette +* Begin work on rewriting all checks to not use cached data. + This will require more runtime, and hence we will run fewer of them on CRAN release. * Minor bug and documentation fixes. \ No newline at end of file diff --git a/data/interaction_Boston.rda b/data/interaction_Boston.rda deleted file mode 100644 index 45cf58cb..00000000 Binary files a/data/interaction_Boston.rda and /dev/null differ diff --git a/data/interaction_iris.rda b/data/interaction_iris.rda deleted file mode 100644 index d7ec996e..00000000 Binary files a/data/interaction_iris.rda and /dev/null differ diff --git a/data/interaction_pbc.rda b/data/interaction_pbc.rda deleted file mode 100644 index 45581c6f..00000000 Binary files a/data/interaction_pbc.rda and /dev/null differ diff --git a/data/partial_Boston.rda b/data/partial_Boston.rda deleted file mode 100644 index 5fecd165..00000000 Binary files a/data/partial_Boston.rda and /dev/null differ diff --git a/data/partial_Boston_surf.rda b/data/partial_Boston_surf.rda deleted file mode 100644 index ec12c2fa..00000000 Binary files a/data/partial_Boston_surf.rda and /dev/null differ diff --git a/data/partial_coplot_Boston.rda b/data/partial_coplot_Boston.rda deleted file mode 100644 index 8aa2f423..00000000 Binary files a/data/partial_coplot_Boston.rda and /dev/null differ diff --git a/data/partial_coplot_Boston2.rda b/data/partial_coplot_Boston2.rda deleted file 
mode 100644 index 7a95af75..00000000 Binary files a/data/partial_coplot_Boston2.rda and /dev/null differ diff --git a/data/partial_coplot_pbc.rda b/data/partial_coplot_pbc.rda deleted file mode 100644 index 3e207744..00000000 Binary files a/data/partial_coplot_pbc.rda and /dev/null differ diff --git a/data/partial_coplot_pbc2.rda b/data/partial_coplot_pbc2.rda deleted file mode 100644 index fff17182..00000000 Binary files a/data/partial_coplot_pbc2.rda and /dev/null differ diff --git a/data/partial_iris.rda b/data/partial_iris.rda deleted file mode 100644 index 8c405fed..00000000 Binary files a/data/partial_iris.rda and /dev/null differ diff --git a/data/partial_pbc.rda b/data/partial_pbc.rda deleted file mode 100644 index 525e69ed..00000000 Binary files a/data/partial_pbc.rda and /dev/null differ diff --git a/data/partial_pbc_surf.rda b/data/partial_pbc_surf.rda deleted file mode 100644 index dcc5bb4c..00000000 Binary files a/data/partial_pbc_surf.rda and /dev/null differ diff --git a/data/partial_pbc_time.rda b/data/partial_pbc_time.rda deleted file mode 100644 index 1ea366aa..00000000 Binary files a/data/partial_pbc_time.rda and /dev/null differ diff --git a/data/rfsrc_Boston.rda b/data/rfsrc_Boston.rda deleted file mode 100644 index 6b88d612..00000000 Binary files a/data/rfsrc_Boston.rda and /dev/null differ diff --git a/data/rfsrc_iris.rda b/data/rfsrc_iris.rda deleted file mode 100644 index c623346f..00000000 Binary files a/data/rfsrc_iris.rda and /dev/null differ diff --git a/data/rfsrc_pbc.rda b/data/rfsrc_pbc.rda deleted file mode 100644 index 624cedaa..00000000 Binary files a/data/rfsrc_pbc.rda and /dev/null differ diff --git a/data/rfsrc_pbc_test.rda b/data/rfsrc_pbc_test.rda deleted file mode 100644 index 5eef3446..00000000 Binary files a/data/rfsrc_pbc_test.rda and /dev/null differ diff --git a/data/varsel_Boston.rda b/data/varsel_Boston.rda deleted file mode 100644 index 9f8ee018..00000000 Binary files a/data/varsel_Boston.rda and /dev/null differ 
diff --git a/data/varsel_iris.rda b/data/varsel_iris.rda deleted file mode 100644 index 258841fb..00000000 Binary files a/data/varsel_iris.rda and /dev/null differ diff --git a/data/varsel_pbc.rda b/data/varsel_pbc.rda deleted file mode 100644 index 7f067def..00000000 Binary files a/data/varsel_pbc.rda and /dev/null differ diff --git a/inst/CITATION b/inst/CITATION deleted file mode 100644 index d70f6e95..00000000 --- a/inst/CITATION +++ /dev/null @@ -1,10 +0,0 @@ -year <- sub("-.*", "", meta$Date) -note <- sprintf("R package version %s", meta$Version) - -bibentry("Manual", -title="ggRandomForests: Visually Exploring Random Forests", -author=as.person("John Ehrlinger"), -url="http://cran.r-project.org/package=ggRandomForests", -note=note, -year=year -) diff --git a/man/cache_rfsrc_datasets.Rd b/man/cache_rfsrc_datasets.Rd index ec2dba9a..63536e31 100644 --- a/man/cache_rfsrc_datasets.Rd +++ b/man/cache_rfsrc_datasets.Rd @@ -30,21 +30,6 @@ some functionality. This function was created to help the package developer deal with thoses changes. We make the function available to end users to create objects for further experimentation. -There are five cached data set types: -'\itemize{ -\item \code{\link{rfsrc_data}} - \code{\link[randomForestSRC]{rfsrc}} objects. -\item \code{\link{varsel_data}} - \code{\link[randomForestSRC]{var.select}} -minimal depth variable selection objects. -\item \code{\link{interaction_data}} - -\code{\link[randomForestSRC]{find.interaction}} minimal depth, -pairwise variable interaction matrices. -\item \code{\link{partial_data}} - \code{\link[randomForestSRC]{plot.variable}} -objects -(\code{partial=TRUE}) for partial variable dependence. -\item \code{\link{partial_coplot_data}} - -\code{\link[randomForestSRC]{plot.variable}} objects -(\code{partial=TRUE}) for partial variable dependence. 
-} For the following data sets: #'\itemize{ @@ -59,11 +44,6 @@ For the following data sets: \seealso{ \code{iris} \code{airq} \code{mtcars} \code{\link[MASS]{Boston}} \code{\link[randomForestSRC]{pbc}} -\code{\link[randomForestSRC]{veteran}} -\code{\link{rfsrc_data}} -\code{\link{varsel_data}} -\code{\link{interaction_data}} -\code{\link{partial_data}} -\code{\link{partial_coplot_data}} +\code{\link[randomForestSRC]{veteran}} } diff --git a/man/calc_auc.Rd b/man/calc_auc.Rd index 0a41201c..689c1405 100644 --- a/man/calc_auc.Rd +++ b/man/calc_auc.Rd @@ -25,8 +25,8 @@ the ROC curve. \examples{ ## ## Taken from the gg_roc example -# rfsrc_iris <- rfsrc(Species ~ ., data = iris) -data(rfsrc_iris) +rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#data(rfsrc_iris) \dontrun{ gg_dta <- gg_roc(rfsrc_iris, which.outcome=1) diff --git a/man/calc_roc.rfsrc.Rd b/man/calc_roc.rfsrc.Rd index b4ca1f34..d695f608 100644 --- a/man/calc_roc.rfsrc.Rd +++ b/man/calc_roc.rfsrc.Rd @@ -35,8 +35,8 @@ for use by the end user. } \examples{ ## Taken from the gg_roc example -# rfsrc_iris <- rfsrc(Species ~ ., data = iris) -data(rfsrc_iris) + rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#data(rfsrc_iris) gg_dta <- calc_roc.rfsrc(rfsrc_iris, rfsrc_iris$yvar, which.outcome=1, oob=TRUE) gg_dta <- calc_roc.rfsrc(rfsrc_iris, rfsrc_iris$yvar, which.outcome=1, oob=FALSE) } diff --git a/man/combine.gg_partial.Rd b/man/combine.gg_partial.Rd index f9c9a003..85bc70d4 100644 --- a/man/combine.gg_partial.Rd +++ b/man/combine.gg_partial.Rd @@ -36,6 +36,7 @@ The second call will append a single \code{lbls} label to the \code{\link{gg_partial}} object. 
} \examples{ +\dontrun{ # Load a set of plot.variable partial plot data data(partial_pbc) @@ -69,7 +70,7 @@ for(ind in nms){ ggpart[[ind]] <- NULL } plot(ggpart, panel=TRUE) - +} } diff --git a/man/gg_error.Rd b/man/gg_error.Rd index ff78a225..1baecedf 100644 --- a/man/gg_error.Rd +++ b/man/gg_error.Rd @@ -34,9 +34,9 @@ for connecting to the S3 \code{\link{plot.gg_error}} function. ## ------------------------------------------------------------ ## ------------- iris data ## You can build a randomForest -# rfsrc_iris <- rfsrc(Species ~ ., data = iris) +rfsrc_iris <- rfsrc(Species ~ ., data = iris) # ... or load a cached randomForestSRC object -data(rfsrc_iris, package="ggRandomForests") +# data(rfsrc_iris, package="ggRandomForests") # Get a data.frame containing error rates gg_dta<- gg_error(rfsrc_iris) @@ -57,7 +57,7 @@ gg_dta<- gg_error(rfsrc_airq) # Plot the gg_error object plot(gg_dta) } - +\dontrun{ ## ------------- Boston data data(rfsrc_Boston, package="ggRandomForests") @@ -66,7 +66,7 @@ gg_dta<- gg_error(rfsrc_Boston) # Plot the gg_error object plot(gg_dta) - +} \dontrun{ ## ------------- mtcars data @@ -89,14 +89,14 @@ rfsrc_veteran <- rfsrc(Surv(time, status) ~ ., data = dta$veteran, ...) gg_dta <- gg_error(rfsrc_veteran) plot(gg_dta) } - +\dontrun{ ## ------------- pbc data # Load a cached randomForestSRC object data(rfsrc_pbc, package="ggRandomForests") gg_dta <- gg_error(rfsrc_pbc) plot(gg_dta) - +} } \references{ Breiman L. (2001). Random forests, Machine Learning, 45:5-32. diff --git a/man/gg_interaction.Rd b/man/gg_interaction.Rd index 93921d70..cd930521 100644 --- a/man/gg_interaction.Rd +++ b/man/gg_interaction.Rd @@ -28,6 +28,7 @@ function with all optional arguments. 
## ------------------------------------------------------------ ## find interactions, classification setting ## ------------------------------------------------------------ +\dontrun{ ## -------- iris data ## iris.obj <- rfsrc(Species ~., data = iris) ## TODO: VIMP interactions not handled yet.... @@ -38,7 +39,7 @@ gg_dta <- gg_interaction(interaction_iris) plot(gg_dta, xvar="Petal.Width") plot(gg_dta, panel=TRUE) - +} ## ------------------------------------------------------------ ## find interactions, regression setting ## ------------------------------------------------------------ @@ -57,13 +58,13 @@ plot(gg_dta, xvar="Solar.R") plot(gg_dta, panel=TRUE) } - +\dontrun{ ## -------- Boston data data(interaction_Boston, package="ggRandomForests") gg_dta <- gg_interaction(interaction_Boston) plot(gg_dta, panel=TRUE) - +} \dontrun{ ## -------- mtcars data data(interaction_mtcars, package="ggRandomForests") @@ -79,12 +80,13 @@ plot(gg_dta, panel=TRUE) ## data(pbc, package = "randomForestSRC") ## pbc.obj <- rfsrc(Surv(days,status) ~ ., pbc, nsplit = 10) ## interaction_pbc <- randomForestSRC::find.interaction(pbc.obj, nvar = 8) +\dontrun{ data(interaction_pbc, package="ggRandomForests") gg_dta <- gg_interaction(interaction_pbc) plot(gg_dta, xvar="bili") plot(gg_dta, panel=TRUE) - +} \dontrun{ ## -------- veteran data data(interaction_veteran, package="ggRandomForests") diff --git a/man/gg_minimal_depth.Rd b/man/gg_minimal_depth.Rd index d189cf2d..be397665 100644 --- a/man/gg_minimal_depth.Rd +++ b/man/gg_minimal_depth.Rd @@ -30,7 +30,8 @@ function takes the output from \code{[randomForestSRC]{var.select}} and creates ## ------------------------------------------------------------ ## classification example ## ------------------------------------------------------------ -## -------- iris data +\dontrun{ + ## -------- iris data ## You can build a randomForest # rfsrc_iris <- rfsrc(Species ~ ., data = iris) # varsel_iris <- randomForestSRC::var.select(rfsrc_iris) @@ -42,7 
+43,7 @@ gg_dta<- gg_minimal_depth(varsel_iris) # Plot the gg_minimal_depth object plot(gg_dta) - +} ## ------------------------------------------------------------ ## Regression example ## ------------------------------------------------------------ @@ -59,13 +60,13 @@ gg_dta<- gg_minimal_depth(varsel_airq) # Plot the gg_minimal_depth object plot(gg_dta) } - +\dontrun{ ## -------- Boston data data(varsel_Boston, package="ggRandomForests") # Get a data.frame containing error rates plot(gg_minimal_depth(varsel_Boston)) - +} \dontrun{ ## -------- mtcars data data(varsel_mtcars, package="ggRandomForests") @@ -90,13 +91,13 @@ data(varsel_veteran, package="ggRandomForests") gg_dta <- gg_minimal_depth(varsel_veteran) plot(gg_dta) } - +\dontrun{ ## -------- pbc data data(varsel_pbc, package="ggRandomForests") gg_dta <- gg_minimal_depth(varsel_pbc) plot(gg_dta) - +} } \seealso{ \code{[randomForestSRC]{var.select}} \code{\link{plot.gg_minimal_depth}} diff --git a/man/gg_minimal_vimp.Rd b/man/gg_minimal_vimp.Rd index 4227b3d5..3850f67f 100644 --- a/man/gg_minimal_vimp.Rd +++ b/man/gg_minimal_vimp.Rd @@ -30,6 +30,7 @@ Minimal depth vs VIMP camparison by variable rankings. 
## ------------------------------------------------------------ ## classification example ## ------------------------------------------------------------ +\dontrun{ ## -------- iris data ## You can build a randomForest # rfsrc_iris <- rfsrc(Species ~ ., data = iris) @@ -42,7 +43,7 @@ gg_dta<- gg_minimal_vimp(varsel_iris) # Plot the gg_minimal_depth object plot(gg_dta) - +} ## ------------------------------------------------------------ ## Regression example ## ------------------------------------------------------------ @@ -59,7 +60,7 @@ gg_dta<- gg_minimal_vimp(varsel_airq) # Plot the gg_minimal_vimp object plot(gg_dta) } - +\dontrun{ ## -------- Boston data data(varsel_Boston, package="ggRandomForests") @@ -68,7 +69,7 @@ gg_dta<- gg_minimal_vimp(varsel_Boston) # Plot the gg_minimal_vimp object plot(gg_dta) - +} \dontrun{ ## -------- mtcars data data(varsel_mtcars, package="ggRandomForests") @@ -94,11 +95,12 @@ data(varsel_veteran, package="ggRandomForests") gg_dta <- gg_minimal_vimp(varsel_veteran) plot(gg_dta) } +\dontrun{ ## -------- pbc data data(varsel_pbc, package="ggRandomForests") gg_dta <- gg_minimal_vimp(varsel_pbc) plot(gg_dta) - +} } diff --git a/man/gg_partial.Rd b/man/gg_partial.Rd index d94ff4ff..f306b0ef 100644 --- a/man/gg_partial.Rd +++ b/man/gg_partial.Rd @@ -38,7 +38,7 @@ An option \code{named} argument can name a column for merging multiple plots tog ## classification ## ------------------------------------------------------------ ## -------- iris data - +\dontrun{ ## iris "Petal.Width" partial dependence plot ## # rfsrc_iris <- rfsrc(Species ~., data = iris) @@ -48,7 +48,7 @@ data(partial_iris, package="ggRandomForests") gg_dta <- gg_partial(partial_iris) plot(gg_dta) - +} ## ------------------------------------------------------------ ## regression ## ------------------------------------------------------------ @@ -70,13 +70,13 @@ plot(gg_dta.m, notch=TRUE) gg_dta[["Month"]] <- NULL plot(gg_dta, panel=TRUE) } - +\dontrun{ ## -------- Boston 
data data(partial_Boston, package="ggRandomForests") gg_dta <- gg_partial(partial_Boston) plot(gg_dta, panel=TRUE) - +} \dontrun{ ## -------- mtcars data data(partial_mtcars, package="ggRandomForests") @@ -133,6 +133,7 @@ plot(gg_dta, panel=TRUE) gg_dta.cat[["karno"]] <- gg_dta.cat[["diagtime"]] <- gg_dta.cat[["age"]] <- NULL plot(gg_dta.cat, panel=TRUE, notch=TRUE) } +\dontrun{ ## -------- pbc data data("partial_pbc", package = "ggRandomForests") data("varsel_pbc", package = "ggRandomForests") @@ -161,7 +162,7 @@ plot(ggpart, panel = TRUE) #plot(pbc_ggpart[["edema"]], panel=TRUE) #, # notch = TRUE, alpha = .3, outlier.shape = NA) - + } } \references{ Friedman, Jerome H. 2000. "Greedy Function Approximation: A Gradient Boosting diff --git a/man/gg_partial_coplot.rfsrc.Rd b/man/gg_partial_coplot.rfsrc.Rd index 890575bd..ae2c87ad 100644 --- a/man/gg_partial_coplot.rfsrc.Rd +++ b/man/gg_partial_coplot.rfsrc.Rd @@ -30,6 +30,7 @@ Data structures for stratified partial coplots } \examples{ +\dontrun{ # Load the forest data(rfsrc_pbc, package="ggRandomForests") @@ -41,7 +42,7 @@ copper_cts <-quantile_pts(ggvar$copper, groups = 6, intervals = TRUE) # Create the conditional groups and add to the gg_variable object copper_grp <- cut(ggvar$copper, breaks = copper_cts) - +} \dontrun{ ## We would run this, but it's expensive partial_coplot_pbc <- gg_partial_coplot(rfsrc_pbc, xvar = "bili", @@ -50,12 +51,13 @@ partial_coplot_pbc <- gg_partial_coplot(rfsrc_pbc, xvar = "bili", time = 1, show.plots = FALSE) } +\dontrun{ ## so load the cached set data(partial_coplot_pbc, package="ggRandomForests") # Partial coplot plot(partial_coplot_pbc) #, se = FALSE) - + } } diff --git a/man/gg_rfsrc.rfsrc.Rd b/man/gg_rfsrc.rfsrc.Rd index 33f373a1..453a957f 100644 --- a/man/gg_rfsrc.rfsrc.Rd +++ b/man/gg_rfsrc.rfsrc.Rd @@ -35,8 +35,8 @@ forest prediction. 
## classification example ## ------------------------------------------------------------ ## -------- iris data -# rfsrc_iris <- rfsrc(Species ~ ., data = iris) -data(rfsrc_iris, package="ggRandomForests") +rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#data(rfsrc_iris, package="ggRandomForests") gg_dta<- gg_rfsrc(rfsrc_iris) plot(gg_dta) @@ -52,11 +52,11 @@ gg_dta<- gg_rfsrc(rfsrc_airq) plot(gg_dta) } - +\dontrun{ ## -------- Boston data data(rfsrc_Boston, package="ggRandomForests") plot(rfsrc_Boston) - +} \dontrun{ ## -------- mtcars data data(rfsrc_mtcars, package="ggRandomForests") @@ -82,11 +82,11 @@ plot(gg_dta) gg_dta <- gg_rfsrc(rfsrc_veteran, by="trt") plot(gg_dta) } - +\dontrun{ ## -------- pbc data ## We don't run this because of bootstrap confidence limits data(rfsrc_pbc, package = "ggRandomForests") - +} \dontrun{ gg_dta <- gg_rfsrc(rfsrc_pbc) plot(gg_dta) @@ -94,10 +94,10 @@ plot(gg_dta) gg_dta <- gg_rfsrc(rfsrc_pbc, conf.int=.95) plot(gg_dta) } - +\dontrun{ gg_dta <- gg_rfsrc(rfsrc_pbc, by="treatment") plot(gg_dta) - +} } \seealso{ diff --git a/man/gg_roc.rfsrc.Rd b/man/gg_roc.rfsrc.Rd index 7c723426..0040ae60 100644 --- a/man/gg_roc.rfsrc.Rd +++ b/man/gg_roc.rfsrc.Rd @@ -28,8 +28,8 @@ The sensitivity and specificity of a randomForests classification object. ## classification example ## ------------------------------------------------------------ ## -------- iris data -#rfsrc_iris <- rfsrc(Species ~ ., data = iris) -data(rfsrc_iris, package="ggRandomForests") +rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#data(rfsrc_iris, package="ggRandomForests") # ROC for setosa gg_dta <- gg_roc(rfsrc_iris, which.outcome=1) diff --git a/man/gg_survival.Rd b/man/gg_survival.Rd index 97b02bec..fcf0b392 100644 --- a/man/gg_survival.Rd +++ b/man/gg_survival.Rd @@ -33,6 +33,7 @@ survival estimates using either \code{\link{nelson}}-aalen or \code{\link{kaplan}}-meier estimates. 
} \examples{ +\dontrun{ ## -------- pbc data data(pbc, package="randomForestSRC") pbc$time <- pbc$days/364.25 @@ -56,7 +57,7 @@ gg_dta <- gg_survival(interval="time", censor="status", data=pbc, by="treatment", conf.int=.68) plot(gg_dta, error="lines") - +} } \seealso{ \code{\link{kaplan}} \code{\link{nelson}} \code{\link{plot.gg_survival}} diff --git a/man/gg_variable.Rd b/man/gg_variable.Rd index 004b0d44..069a3618 100644 --- a/man/gg_variable.Rd +++ b/man/gg_variable.Rd @@ -41,8 +41,8 @@ or the output from the \code{\link[randomForestSRC]{plot.variable}} function. ## ------------------------------------------------------------ ## -------- iris data ## iris -#rfsrc_iris <- rfsrc(Species ~., data = iris) -data(rfsrc_iris, package="ggRandomForests") + rfsrc_iris <- rfsrc(Species ~., data = iris) +#data(rfsrc_iris, package="ggRandomForests") gg_dta <- gg_variable(rfsrc_iris) plot(gg_dta, xvar="Sepal.Width") diff --git a/man/gg_vimp.Rd b/man/gg_vimp.Rd index 0110b3a6..1137562d 100644 --- a/man/gg_vimp.Rd +++ b/man/gg_vimp.Rd @@ -30,8 +30,8 @@ a \code{\link[randomForestSRC]{rfsrc}} object. 
## classification example ## ------------------------------------------------------------ ## -------- iris data -# rfsrc_iris <- rfsrc(Species ~ ., data = iris) -data(rfsrc_iris, package="ggRandomForests") +rfsrc_iris <- rfsrc(Species ~ ., data = iris) +#data(rfsrc_iris, package="ggRandomForests") gg_dta <- gg_vimp(rfsrc_iris) plot(gg_dta) @@ -45,12 +45,12 @@ data(rfsrc_airq, package="ggRandomForests") gg_dta <- gg_vimp(rfsrc_airq) plot(gg_dta) } - +\dontrun{ ## -------- Boston data data(rfsrc_Boston, package="ggRandomForests") gg_dta <- gg_vimp(rfsrc_Boston) plot(gg_dta) - +} \dontrun{ ## -------- mtcars data data(rfsrc_mtcars, package="ggRandomForests") @@ -66,7 +66,7 @@ data(rfsrc_veteran, package="ggRandomForests") gg_dta <- gg_vimp(rfsrc_veteran) plot(gg_dta) } - +\dontrun{ ## -------- pbc data data(rfsrc_pbc, package="ggRandomForests") gg_dta <- gg_vimp(rfsrc_pbc) @@ -76,6 +76,7 @@ plot(gg_dta) gg_dta <- gg_vimp(rfsrc_pbc, nvar=10) plot(gg_dta) } +} \references{ Ishwaran H. (2007). Variable importance in binary regression trees and forests, \emph{Electronic J. Statist.}, 1:519-537. diff --git a/man/interaction_data.Rd b/man/interaction_data.Rd deleted file mode 100644 index 1a7ba235..00000000 --- a/man/interaction_data.Rd +++ /dev/null @@ -1,185 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/interaction_data.R -\docType{data} -\name{interaction_data} -\alias{interaction_Boston} -\alias{interaction_data} -\alias{interaction_iris} -\alias{interaction_pbc} -\title{Cached \code{\link[randomForestSRC]{find.interaction}} matrix objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{find.interaction}} matrix objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{interaction_iris} - from a randomForestSR[C] for the \code{iris} data set. 
-\item \code{interaction_Boston} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{interaction_pbc} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -}} -\format{\code{\link[randomForestSRC]{find.interaction}} matrix} -\description{ -Cached \code{\link[randomForestSRC]{find.interaction}} matrix objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{find.interaction}} matrix objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{interaction_iris} - from a randomForestSR[C] for the \code{iris} data set. -\item \code{interaction_Boston} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{interaction_pbc} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -} -} -\details{ -Constructing the minimal depth interaction matrices on randomForestsSRC objects are -computationally expensive. We cache \code{\link[randomForestSRC]{find.interaction}} matrix objects -to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -(see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) - -For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -(see \code{\link{rfsrc_data}}), then calculate the minimal depth variable interaction -table with \code{\link[randomForestSRC]{find.interaction}}. Each data set is built with the -\code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version listed -in the \code{ggRandomForests} DESCRIPTION file. - -\itemize{ -\item \code{interaction_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -the measurements in centimeters of the variables sepal length and width and -petal length and width, respectively, for 50 flowers from each of 3 species -of iris. 
Build a classification random forest for predicting the species (setosa, -versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). - -\item \code{interaction_airq} - The \code{airquality} data set is from the New York State -Department of Conservation (ozone data) and the National Weather Service -(meteorological data) collected in New York, from May to September 1973. Build regression -random forest for predicting \code{Ozone} on 5 covariates and 153 observations. - -\item \code{interaction_mtcars} - The \code{mtcars} data was extracted from the 1974 Motor -Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and -performance for 32 automobiles (1973-74 models). Build a regression random forest for -predicting mpg on 10 covariates and 32 observations. - -\item \code{interaction_Boston} - The \code{Boston} housing values in suburbs of Boston from the -\code{MASS} package. Build a regression random forest for predicting medv (median home -values) on 13 covariates and 506 observations. - -\item \code{interaction_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -package. Build a survival random forest for time-to-event death data with 17 covariates and -312 observations (remaining 106 observations are held out). - -\item \code{interaction_veteran} - Veteran's Administration randomized trial of two treatment -regimens for lung cancer. Build a survival random forest for time-to-event death data -with 6 covariates and 137 observations. 
-} -} -\examples{ -\dontrun{ -#--------------------------------------------------------------------- -# iris data - classification random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_iris, package="ggRandomForests") - -# The interaction table -interaction_iris <- find.interaction(rfsrc_iris) - -# plot the forest interaction table -gg_dta <- gg_interaction(interaction_iris) -plot(gg_dta, panel=TRUE) - -#--------------------------------------------------------------------- -# MASS::Boston data - regression random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_Boston, package="ggRandomForests") - -# The interaction table -interaction_Boston <- find.interaction(rfsrc_Boston) - -# plot the forest interaction table -gg_dta <- gg_interaction(interaction_Boston) -plot(gg_dta, panel=TRUE) - -#--------------------------------------------------------------------- -# randomForestSRC::pbc data - survival random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_pbc, package="ggRandomForests") - -# The interaction table -interaction_pbc <- find.interaction(rfsrc_pbc) - -# plot the forest interaction table -gg_dta <- gg_interaction(interaction_pbc) -plot(gg_dta, panel=TRUE) - -} - -} -\references{ -#--------------------- - randomForestSRC ---------------------- - -Ishwaran H. and Kogalur U.B. (2014). Random Forests for -Survival, Regression and Classification (RF-SRC), R package -version 1.5.5. - -Ishwaran H. and Kogalur U.B. (2007). Random survival forests -for R. R News 7(2), 25-31. - -Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -(2008). Random survival forests. Ann. Appl. Statist. 2(3), -841-860. - -#--------------------- - Boston data set ---------------------- - - Belsley, D.A., E. Kuh, and R.E. 
Welsch. 1980. Regression Diagnostics. Identifying - Influential Data and Sources of Collinearity. New York: Wiley. - -Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." - J. Environ. Economics and Management 5: 81-102. - -#--------------------- - Iris data set ---------------------- - -Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -Wadsworth \& Brooks/Cole. (has iris3 as iris.) - -Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -Annals of Eugenics, 7, Part II, 179-188. - -Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -of the American Iris Society, 59, 2-5. - -#--------------------- - pbc data set ---------------------- - -Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -New York: Wiley. - -T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -Springer-Verlag, New York. ISBN: 0-387-98784-3. -} -\seealso{ -\code{iris} \code{\link[MASS]{Boston}} -\code{\link[randomForestSRC]{pbc}} - \code{\link[randomForestSRC]{find.interaction}} - \code{\link{rfsrc_data}} - \code{\link{cache_rfsrc_datasets}} - \code{\link{gg_interaction}} - \code{\link{plot.gg_interaction}} -} -\keyword{datasets} - diff --git a/man/partial.rfsrc.Rd b/man/partial.rfsrc.Rd index 8260d3f7..3761f35e 100644 --- a/man/partial.rfsrc.Rd +++ b/man/partial.rfsrc.Rd @@ -116,7 +116,7 @@ plot.variable(mtcars.obj, partial = TRUE, smooth.lines = TRUE) ## iris #rfsrc_iris <- rfsrc(Species ~., data = iris) -data(rfsrc_iris, package="ggRandomForests") +#data(rfsrc_iris, package="ggRandomForests") #gg_dta <- partial.rfsrc(rfsrc_iris, ) \dontrun{ diff --git a/man/partial_coplot_data.Rd b/man/partial_coplot_data.Rd deleted file mode 100644 index 2bfebb64..00000000 --- a/man/partial_coplot_data.Rd +++ /dev/null @@ -1,107 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/partial_coplot_data.R -\docType{data} 
-\name{partial_coplot_data} -\alias{partial_coplot_Boston} -\alias{partial_coplot_Boston2} -\alias{partial_coplot_data} -\alias{partial_coplot_pbc} -\alias{partial_coplot_pbc2} -\title{Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{rfsrc}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{partial_coplot_Boston} - randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -}} -\format{List of \code{\link[randomForestSRC]{plot.variable}} objects} -\description{ -Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{rfsrc}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{partial_coplot_Boston} - randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -} -} -\details{ -Constructing random forests are computationally expensive. -We cache \code{\link[randomForestSRC]{rfsrc}} objects to improve the \code{ggRandomForests} -examples, diagnostics and vignettes run times. -(see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) - -For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}}. Tuning parameters used -in each case are documented in the examples. Each data set is built with the -\code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version listed -in the \code{ggRandomForests} DESCRIPTION file. - -\itemize{ -\item \code{partial_coplot_Boston} - The \code{Boston} housing values in suburbs of Boston from the -\code{MASS} package. Build a regression random forest for predicting medv (median home -values) on 13 covariates and 506 observations. 
- -} -} -\examples{ -\dontrun{ -#--------------------------------------------------------------------- -# MASS::Boston data - regression random forest -#--------------------------------------------------------------------- -data(Boston_rfsrc, package="ggRandomForests") - -# Cut the codependent variable -rm_pts <- cut_distribution(rfsrc_Boston$xvar$rm, groups=6) -rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks=rm_pts) - -# plot.variable for lstat on subsets of rm (this will take some time.) - partial_coplot_Boston <- gg_partial_coplot(rfsrc_Boston, xvar="lstat", - groups=rm_grp, - show.plots=FALSE) - -} - -} -\references{ -#--------------------- - randomForestSRC ---------------------- - -Ishwaran H. and Kogalur U.B. (2014). Random Forests for -Survival, Regression and Classification (RF-SRC), R package -version 1.5.5. - -Ishwaran H. and Kogalur U.B. (2007). Random survival forests -for R. R News 7(2), 25-31. - -Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -(2008). Random survival forests. Ann. Appl. Statist. 2(3), -841-860. - -#--------------------- - Boston data set ---------------------- - - Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying - Influential Data and Sources of Collinearity. New York: Wiley. - -Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." - J. Environ. Economics and Management 5: 81-102. - -#--------------------- - pbc data set ---------------------- - -Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -New York: Wiley. - -T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -Springer-Verlag, New York. ISBN: 0-387-98784-3. 
-} -\seealso{ -\code{\link[MASS]{Boston}} \code{\link[randomForestSRC]{plot.variable}} - \code{\link{cache_rfsrc_datasets}} -} -\keyword{datasets} - diff --git a/man/partial_data.Rd b/man/partial_data.Rd deleted file mode 100644 index 9eb6251e..00000000 --- a/man/partial_data.Rd +++ /dev/null @@ -1,180 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/partial_data.R -\docType{data} -\name{partial_data} -\alias{partial_Boston} -\alias{partial_data} -\alias{partial_iris} -\alias{partial_pbc} -\title{Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{plot.variable}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{partial_iris} - from a randomForestSR[C] for the \code{iris} data set. -\item \code{partial_Boston} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{partial_pbc} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -}} -\format{\code{\link[randomForestSRC]{plot.variable}}} -\description{ -Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{plot.variable}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{partial_iris} - from a randomForestSR[C] for the \code{iris} data set. -\item \code{partial_Boston} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{partial_pbc} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -} -} -\details{ -Constructing partial plot data with the randomForestsSRC::plot.variable function are -computationally expensive. 
We cache \code{\link[randomForestSRC]{plot.variable}} objects -to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -(see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) - -For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -(see \code{\link{rfsrc_data}}), then calculate the partial plot data with -\code{\link[randomForestSRC]{plot.variable}} function, setting \code{partial=TRUE}. Each data set is -built with the \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version -listed in the \code{ggRandomForests} DESCRIPTION file. - -\itemize{ -\item \code{partial_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -the measurements in centimeters of the variables sepal length and width and -petal length and width, respectively, for 50 flowers from each of 3 species -of iris. Build a classification random forest for predicting the species (setosa, -versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). - -\item \code{partial_Boston} - The \code{Boston} housing values in suburbs of Boston from the -\code{MASS} package. Build a regression random forest for predicting medv (median home -values) on 13 covariates and 506 observations. - -\item \code{partial_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -package. Build a survival random forest for time-to-event death data with 17 covariates and -312 observations (remaining 106 observations are held out). 
-} -} -\examples{ -\dontrun{ -#--------------------------------------------------------------------- -# iris data - classification random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_iris, package="ggRandomForests") - -# The plot.variable call - partial_iris <- plot.variable(rfsrc_iris, - partial=TRUE, show.plots=FALSE) - -# plot the forest partial plots -gg_dta <- gg_partial(partial_iris) -plot(gg_dta, panel=TRUE) - -#--------------------------------------------------------------------- -# MASS::Boston data - regression random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_Boston, package="ggRandomForests") - -# The plot.variable call -partial_Boston <- plot.variable(rfsrc_Boston, - partial=TRUE, show.plots = FALSE ) - -# plot the forest partial plots -gg_dta <- gg_partial(partial_Boston) -plot(gg_dta, panel=TRUE) - -#--------------------------------------------------------------------- -# randomForestSRC::pbc data - survival random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_pbc, package="ggRandomForests") - -# The plot.variable call - -# survival requires a time point specification. -# for the pbc data, we want 1, 3 and 5 year survival. -partial_pbc <- lapply(c(1,3,5), function(tm){ - plot.variable(rfsrc_pbc, surv.type = "surv", - time = tm, - xvar.names = xvar, - partial = TRUE, - show.plots = FALSE) - }) - -# plot the forest partial plots -gg_dta <- gg_partial(partial_pbc) -plot(gg_dta) -} - -} -\references{ -#--------------------- - randomForestSRC ---------------------- - -Ishwaran H. and Kogalur U.B. (2014). Random Forests for -Survival, Regression and Classification (RF-SRC), R package -version 1.5.5. - -Ishwaran H. and Kogalur U.B. (2007). Random survival forests -for R. 
R News 7(2), 25-31. - -Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -(2008). Random survival forests. Ann. Appl. Statist. 2(3), -841-860. - -#--------------------- - Boston data set ---------------------- - - Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying - Influential Data and Sources of Collinearity. New York: Wiley. - -Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." - J. Environ. Economics and Management 5: 81-102. - -#--------------------- - Iris data set ---------------------- - -Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -Wadsworth \& Brooks/Cole. (has iris3 as iris.) - -Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -Annals of Eugenics, 7, Part II, 179-188. - -Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -of the American Iris Society, 59, 2-5. - -#--------------------- - pbc data set ---------------------- - -Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -New York: Wiley. - -T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -Springer-Verlag, New York. ISBN: 0-387-98784-3. 
-} -\seealso{ -\code{iris} \code{MASS::Boston} -\code{\link[randomForestSRC]{pbc}} -\code{\link[randomForestSRC]{plot.variable}} -\code{\link{rfsrc_data}} - \code{\link{cache_rfsrc_datasets}} - \code{\link{gg_partial}} - \code{\link{plot.gg_partial}} -} -\keyword{datasets} - diff --git a/man/partial_surface_data.Rd b/man/partial_surface_data.Rd deleted file mode 100644 index 22cccb50..00000000 --- a/man/partial_surface_data.Rd +++ /dev/null @@ -1,163 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/partial_surface_data.R -\docType{data} -\name{partial_surface_data} -\alias{partial_Boston_surf} -\alias{partial_pbc_surf} -\alias{partial_pbc_time} -\alias{partial_surface_data} -\title{Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{plot.variable}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{partial_Boston_surf} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{partial_pbc_surf} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -\item \code{partial_pbc_time} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -}} -\format{list of \code{\link[randomForestSRC]{plot.variable}} objects} -\description{ -Cached \code{\link[randomForestSRC]{plot.variable}} objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{plot.variable}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{partial_Boston_surf} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). 
-\item \code{partial_pbc_surf} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -\item \code{partial_pbc_time} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -} -} -\details{ -Constructing partial plot data with the randomForestsSRC::plot.variable function are -computationally expensive. We cache \code{\link[randomForestSRC]{plot.variable}} objects -to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -(see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) - -For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -(see \code{\link{rfsrc_data}}), then calculate the partial plot data with -\code{\link[randomForestSRC]{plot.variable}} function, setting \code{partial=TRUE}. Each data set is -built with the \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version -listed in the \code{ggRandomForests} DESCRIPTION file. - -\itemize{ -\item \code{partial_Boston} - The \code{Boston} housing values in suburbs of Boston from the -\code{MASS} package. Build a regression random forest for predicting medv (median home -values) on 13 covariates and 506 observations. - -\item \code{partial_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -package. Build a survival random forest for time-to-event death data with 17 covariates and -312 observations (remaining 106 observations are held out). 
-} -} -\examples{ -\dontrun{ -#--------------------------------------------------------------------- -# MASS::Boston data - regression random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_Boston, package="ggRandomForests") - -# The plot.variable call -partial_Boston <- plot.variable(rfsrc_Boston, - partial=TRUE, show.plots = FALSE ) - -# plot the forest partial plots -gg_dta <- gg_partial(partial_Boston) -plot(gg_dta, panel=TRUE) - -#--------------------------------------------------------------------- -# randomForestSRC::pbc data - survival random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_pbc, package="ggRandomForests") - -# Restrict the time of interest to less than 5 years. -time_pts <- rfsrc_pbc$time.interest[which(rfsrc_pbc$time.interest<=5)] - -# Find the 50 points in time, evenly space along the distribution of -# event times for a series of partial dependence curves -time_cts <-quantile_pts(time_pts, groups = 50) - -# Generate the gg_partial_coplot data object -system.time(partial_pbc_time <- lapply(time_cts, function(ct){ - plot.variable(rfsrc_pbc, xvar = "bili", time = ct, - npts = 50, show.plots = FALSE, - partial = TRUE, surv.type="surv") - })) -# user system elapsed -# 2561.313 81.446 2641.707 - -# Find the quantile points to create 50 cut points -alb_partial_pts <-quantile_pts(rfsrc_pbc$xvar$albumin, groups = 50) - -system.time(partial_pbc_surf <- lapply(alb_partial_pts, function(ct){ - rfsrc_pbc$xvar$albumin <- ct - plot.variable(rfsrc_pbc, xvar = "bili", time = 1, - npts = 50, show.plots = FALSE, - partial = TRUE, surv.type="surv") - })) -# user system elapsed -# 2547.482 91.978 2671.870 - -} - -} -\references{ -#--------------------- - randomForestSRC ---------------------- - -Ishwaran H. and Kogalur U.B. (2014). 
Random Forests for -Survival, Regression and Classification (RF-SRC), R package -version 1.5.5. - -Ishwaran H. and Kogalur U.B. (2007). Random survival forests -for R. R News 7(2), 25-31. - -Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -(2008). Random survival forests. Ann. Appl. Statist. 2(3), -841-860. - -#--------------------- - Boston data set ---------------------- - - Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying - Influential Data and Sources of Collinearity. New York: Wiley. - -Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." - J. Environ. Economics and Management 5: 81-102. - -#--------------------- - pbc data set ---------------------- - -Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -New York: Wiley. - -T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -Springer-Verlag, New York. ISBN: 0-387-98784-3. -} -\seealso{ -\code{\link[MASS]{Boston}} -\code{\link[randomForestSRC]{pbc}} -\code{\link[randomForestSRC]{plot.variable}} -\code{\link{rfsrc_data}} - \code{\link{cache_rfsrc_datasets}} - \code{\link{gg_partial}} - \code{\link{plot.gg_partial}} -} -\keyword{datasets} - diff --git a/man/print.gg_minimal_depth.Rd b/man/print.gg_minimal_depth.Rd index 0e48f632..d479dc65 100644 --- a/man/print.gg_minimal_depth.Rd +++ b/man/print.gg_minimal_depth.Rd @@ -18,6 +18,7 @@ Print a \code{\link{gg_minimal_depth}} object. 
## ------------------------------------------------------------ ## classification example ## ------------------------------------------------------------ +\dontrun{ ## You can build a randomForest # rfsrc_iris <- rfsrc(Species ~ ., data = iris) # varsel_iris <- var.select(rfsrc_iris) @@ -27,7 +28,7 @@ data(varsel_iris, package="ggRandomForests") # Get a data.frame containing minimaldepth measures gg_dta <- gg_minimal_depth(varsel_iris) print(gg_dta) - +} ## ------------------------------------------------------------ ## regression example ## ------------------------------------------------------------ @@ -42,7 +43,7 @@ print(gg_dta) # To nicely print a rfsrc::var.select output... print(varsel_airq) } - +\dontrun{ # ... or load a cached randomForestSRC object data(varsel_Boston, package="ggRandomForests") @@ -52,6 +53,6 @@ print(gg_dta) # To nicely print a rfsrc::var.select output... print(varsel_Boston) - +} } diff --git a/man/quantile_pts.Rd b/man/quantile_pts.Rd index 2e6f42c3..b655d259 100644 --- a/man/quantile_pts.Rd +++ b/man/quantile_pts.Rd @@ -26,6 +26,7 @@ The output can be passed directly into the breaks argument of the \code{cut} function for creating groups for coplots. } \examples{ +\dontrun{ data(rfsrc_Boston) # To create 6 intervals, we want 7 points. 
@@ -36,7 +37,7 @@ rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups=6, intervals=TRUE) rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks=rm_pts) summary(rm_grp) - +} } \seealso{ \code{cut} \code{\link{gg_partial_coplot}} diff --git a/man/rfsrc_data.Rd b/man/rfsrc_data.Rd deleted file mode 100644 index d35ffa40..00000000 --- a/man/rfsrc_data.Rd +++ /dev/null @@ -1,174 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rfsrc_data.R -\docType{data} -\name{rfsrc_data} -\alias{rfsrc_Boston} -\alias{rfsrc_data} -\alias{rfsrc_iris} -\alias{rfsrc_pbc} -\alias{rfsrc_pbc_test} -\title{Cached \code{\link[randomForestSRC]{rfsrc}} objects for examples, diagnostics and vignettes.} -\format{\code{\link[randomForestSRC]{rfsrc}} object} -\description{ -Data sets storing \code{\link[randomForestSRC]{rfsrc}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{rfsrc_iris} - randomForestSR[C] for the \code{iris} data set. -\item \code{rfsrc_Boston} - randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{rfsrc_pbc} - randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -} -} -\details{ -Constructing random forests are computationally expensive. -We cache \code{\link[randomForestSRC]{rfsrc}} objects to improve the \code{ggRandomForests} -examples, diagnostics and vignettes run times. -(see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) - -For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}}. Tuning parameters used -in each case are documented in the examples. Each data set is built with the -\code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version listed -in the \code{ggRandomForests} DESCRIPTION file. 
- -\itemize{ -\item \code{rfsrc_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -the measurements in centimeters of the variables sepal length and width and -petal length and width, respectively, for 50 flowers from each of 3 species -of iris. Build a classification random forest for predicting the species (setosa, -versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). - -\item \code{rfsrc_Boston} - The \code{Boston} housing values in suburbs of Boston from the -\code{MASS} package. Build a regression random forest for predicting medv (median home -values) on 13 covariates and 506 observations. - -\item \code{rfsrc_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -package. Build a survival random forest for time-to-event death data with 17 covariates and -312 observations (remaining 106 observations are held out). -} -} -\examples{ -\dontrun{ -#--------------------------------------------------------------------- -# iris data - classification random forest -#--------------------------------------------------------------------- -# rfsrc grow call -rfsrc_iris <- rfsrc(Species ~., data = iris) - -# plot the forest generalization error convergence -gg_dta <- gg_error(rfsrc_iris) -plot(gg_dta) - -# Plot the forest predictions -gg_dta <- gg_rfsrc(rfsrc_iris) -plot(gg_dta) - -#--------------------------------------------------------------------- -# MASS::Boston data - regression random forest -#--------------------------------------------------------------------- -# Load the data... 
-data(Boston, package="MASS") -Boston$chas <- as.logical(Boston$chas) - -# rfsrc grow call -rfsrc_Boston <- rfsrc(medv~., data=Boston) - -# plot the forest generalization error convergence -gg_dta <- gg_error(rfsrc_Boston) -plot(gg_dta) - -# Plot the forest predictions -gg_dta <- gg_rfsrc(rfsrc_Boston) -plot(gg_dta) - -#--------------------------------------------------------------------- -# randomForestSRC::pbc data - survival random forest -#--------------------------------------------------------------------- -# Load the data... -# For simplicity here. We do a bit of data tidying -# before running the stored random forest. -data(pbc, package="randomForestSRC") - -# Remove non-randomized cases -dta.train <- pbc[-which(is.na(pbc$treatment)),] - -# rfsrc grow call -rfsrc_pbc <- rfsrc(Surv(years, status) ~ ., dta.train, nsplit = 10, - na.action="na.impute") - -# plot the forest generalization error convergence -gg_dta <- gg_error(rfsrc_pbc) -plot(gg_dta) - -# Plot the forest predictions -gg_dta <- gg_rfsrc(rfsrc_pbc) -plot(gg_dta) - -} - -} -\references{ -#--------------------- - randomForestSRC ---------------------- - -Ishwaran H. and Kogalur U.B. (2014). Random Forests for -Survival, Regression and Classification (RF-SRC), R package -version 1.5.5. - -Ishwaran H. and Kogalur U.B. (2007). Random survival forests -for R. R News 7(2), 25-31. - -Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -(2008). Random survival forests. Ann. Appl. Statist. 2(3), -841-860. - -#--------------------- - Boston data set ---------------------- - - Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. Identifying - Influential Data and Sources of Collinearity. New York: Wiley. - -Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." - J. Environ. Economics and Management 5: 81-102. - -#--------------------- - Iris data set ---------------------- - -Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. 
-Wadsworth \& Brooks/Cole. (has iris3 as iris.) - -Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -Annals of Eugenics, 7, Part II, 179-188. - -Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -of the American Iris Society, 59, 2-5. - -#--------------------- - pbc data set ---------------------- - -Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -New York: Wiley. - -T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -Springer-Verlag, New York. ISBN: 0-387-98784-3. -} -\seealso{ -\code{iris} \code{\link[MASS]{Boston}} -\code{\link[randomForestSRC]{pbc}} - \code{\link[randomForestSRC]{rfsrc}} - \code{\link{cache_rfsrc_datasets}} - \code{\link{gg_rfsrc}} - \code{\link{plot.gg_rfsrc}} - \code{\link{gg_error}} - \code{\link{plot.gg_error}} -} -\keyword{datasets} - diff --git a/man/surface_matrix.Rd b/man/surface_matrix.Rd index c1d74c73..1cfa30eb 100644 --- a/man/surface_matrix.Rd +++ b/man/surface_matrix.Rd @@ -22,6 +22,7 @@ and extract and construct the x, y and z matrices from the provided \code{xvar} column names. } \examples{ +\dontrun{ ## From vignette(randomForestRegression, package="ggRandomForests") ## data(rfsrc_Boston) @@ -45,7 +46,7 @@ partial_surf$rm <- rm.tmp # Transform the gg_partial_coplot object into a list of three named matrices # for surface plotting with plot3D::surf3D srf <- surface_matrix(partial_surf, c("lstat", "rm", "yhat")) - +} \dontrun{ # surf3D is in the plot3D package. 
diff --git a/man/varsel_data.Rd b/man/varsel_data.Rd deleted file mode 100644 index 88b180f7..00000000 --- a/man/varsel_data.Rd +++ /dev/null @@ -1,175 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/varsel_data.R -\docType{data} -\name{varsel_data} -\alias{varsel_Boston} -\alias{varsel_data} -\alias{varsel_iris} -\alias{varsel_pbc} -\title{Cached \code{\link[randomForestSRC]{var.select}} objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{var.select}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{varsel_iris} - from a randomForestSR[C] for the \code{iris} data set. -\item \code{varsel_Boston} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{varsel_pbc} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -}} -\format{\code{\link[randomForestSRC]{var.select}} object} -\description{ -Cached \code{\link[randomForestSRC]{var.select}} objects for examples, -diagnostics and vignettes. - -Data sets storing \code{\link[randomForestSRC]{var.select}} objects corresponding to -training data according to the following naming convention: -\itemize{ -\item \code{varsel_iris} - from a randomForestSR[C] for the \code{iris} data set. -\item \code{varsel_Boston} - from a randomForestS[R]C for the \code{Boston} housing -data set (\code{MASS} package). -\item \code{varsel_pbc} - from a randomForest[S]RC for the \code{pbc} data set - (\code{randomForestSRC} package) -} -} -\details{ -Constructing minimal depth variable selection with the randomForestsSRC::var.select function -is computationally expensive. We cache \code{\link[randomForestSRC]{var.select}} objects -to improve the \code{ggRandomForests} examples, diagnostics and vignettes run times. -(see \code{\link{cache_rfsrc_datasets}} to rebuild a complete set of these data sets.) 
- -For each data set listed, we build a \code{\link[randomForestSRC]{rfsrc}} -(see \code{\link{rfsrc_data}}), then calculate the minimal depth variable selection with -\code{\link[randomForestSRC]{var.select}} function, setting \code{method="md"}. Each data set is -built with the \code{\link{cache_rfsrc_datasets}} with the \code{randomForestSRC} version -listed in the \code{ggRandomForests} DESCRIPTION file. - -\itemize{ -\item \code{varsel_iris} - The famous (Fisher's or Anderson's) \code{iris} data set gives -the measurements in centimeters of the variables sepal length and width and -petal length and width, respectively, for 50 flowers from each of 3 species -of iris. Build a classification random forest for predicting the species (setosa, -versicolor, and virginica) on 5 variables (columns) and 150 observations (rows). - -\item \code{varsel_Boston} - The \code{Boston} housing values in suburbs of Boston from the -\code{MASS} package. Build a regression random forest for predicting medv (median home -values) on 13 covariates and 506 observations. - -\item \code{varsel_pbc} - The \code{pbc} data from the Mayo Clinic trial in primary biliary -cirrhosis (PBC) of the liver conducted between 1974 and 1984. A total of 424 PBC patients, -referred to Mayo Clinic during that ten-year interval, met eligibility criteria for the -randomized placebo controlled trial of the drug D-penicillamine. 312 cases participated in -the randomized trial and contain largely complete data. Data from the \code{randomForestSRC} -package. Build a survival random forest for time-to-event death data with 17 covariates and -312 observations (remaining 106 observations are held out). 
- -} -} -\examples{ -\dontrun{ -#--------------------------------------------------------------------- -# iris data - classification random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_iris, package="ggRandomForests") - -# The var.select call - varsel_iris <- var.select(rfsrc_iris) - -# plot the forestminimal depth ranking -gg_dta <- gg_minimal_depth(varsel_iris) -plot(gg_dta) - - -#--------------------------------------------------------------------- -# MASS::Boston data - regression random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_Boston, package="ggRandomForests") - -# The var.select call -varsel_Boston <- var.select(rfsrc_Boston) - -# plot the forestminimal depth ranking -gg_dta <- gg_minimal_depth(varsel_Boston) -plot(gg_dta) - -#--------------------------------------------------------------------- -# randomForestSRC::pbc data - survival random forest -#--------------------------------------------------------------------- -# load the rfsrc object from the cached data -data(rfsrc_pbc, package="ggRandomForests") - -# The var.select call -varsel_pbc <- var.select(rfsrc_pbc) - -# plot the forestminimal depth ranking -gg_dta <- gg_minimal_depth(varsel_pbc) -plot(gg_dta) - -} - -} -\references{ -#--------------------- - randomForestSRC ---------------------- - -Ishwaran H. and Kogalur U.B. (2014). Random Forests for -Survival, Regression and Classification (RF-SRC), R package -version 1.5.5. - -Ishwaran H. and Kogalur U.B. (2007). Random survival forests -for R. R News 7(2), 25-31. - -Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. -(2008). Random survival forests. Ann. Appl. Statist. 2(3), -841-860. - -#--------------------- - Boston data set ---------------------- - - Belsley, D.A., E. Kuh, and R.E. Welsch. 1980. Regression Diagnostics. 
Identifying - Influential Data and Sources of Collinearity. New York: Wiley. - -Harrison, D., and D.L. Rubinfeld. 1978. "Hedonic Prices and the Demand for Clean Air." - J. Environ. Economics and Management 5: 81-102. - -#--------------------- - Iris data set ---------------------- - -Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. -Wadsworth \& Brooks/Cole. (has iris3 as iris.) - -Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. -Annals of Eugenics, 7, Part II, 179-188. - -Anderson, Edgar (1935). The irises of the Gaspe Peninsula, Bulletin -of the American Iris Society, 59, 2-5. - -#--------------------- - pbc data set ---------------------- - -Flemming T.R and Harrington D.P., (1991) Counting Processes and Survival Analysis. -New York: Wiley. - -T Therneau and P Grambsch (2000), Modeling Survival Data: Extending the Cox Model, -Springer-Verlag, New York. ISBN: 0-387-98784-3. -} -\seealso{ -\code{iris} \code{\link[MASS]{Boston}} -\code{\link[randomForestSRC]{pbc}} -\code{\link[randomForestSRC]{var.select}} -\code{\link{rfsrc_data}} - \code{\link{cache_rfsrc_datasets}} - \code{\link{gg_minimal_depth}} - \code{\link{plot.gg_minimal_depth}} - \code{\link{gg_minimal_vimp}} - \code{\link{plot.gg_minimal_vimp}} -} -\keyword{datasets} - diff --git a/tests/test-all.R b/tests/test-all.R deleted file mode 100644 index eeb017bf..00000000 --- a/tests/test-all.R +++ /dev/null @@ -1,3 +0,0 @@ -library(testthat) - -test_check("ggRandomForests") diff --git a/tests/testthat/test_cache_rfsrc_datasets.R b/tests/testthat/test_cache_rfsrc_datasets.R deleted file mode 100644 index 6c3c0061..00000000 --- a/tests/testthat/test_cache_rfsrc_datasets.R +++ /dev/null @@ -1,18 +0,0 @@ -# testthat for gg_error function -context("cache_rfsrc_dataset tests") - -test_that("cache_rfsrc_dataset",{ - - # # Check the default set of data - # expect_output(cache_rfsrc_datasets(test=TRUE), - # "iris: randomForest") - # - # # If we have a bad path... 
- # expect_error(cache_rfsrc_datasets(pth="nothing")) - # - # # If we want the alternative sets - # expect_output(cache_rfsrc_datasets(set=c("airq"), - # test=TRUE), - # "airq: randomForest") - # # -}) diff --git a/tests/testthat/test_gg_error.R b/tests/testthat/test_gg_error.R deleted file mode 100644 index ea532b42..00000000 --- a/tests/testthat/test_gg_error.R +++ /dev/null @@ -1,205 +0,0 @@ -# testthat for gg_error function -context("gg_error tests") - -test_that("gg_error.rfsrc classifications",{ - - ## Load the cached forest - data(rfsrc_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the forest family - expect_match(rfsrc_iris$family, "class") - - ## Create the correct gg_error object - gg_dta <- gg_error(rfsrc_iris) - - # Test object type - expect_is(gg_dta, "gg_error") - - # Test classification dimensions - expect_equal(dim(gg_dta)[1], dim(rfsrc_iris$err.rate)[1]) - expect_equal(dim(gg_dta)[2], dim(rfsrc_iris$err.rate)[2] + 1) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(as.matrix(gg_dta[, -which(colnames(gg_dta) == "ntree")]), rfsrc_iris$err.rate) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # "Incorrect object type: Expects a gg_error object" - expect_error(gg_error(gg_plt)) - expect_error(gg_error.rfsrc(gg_plt)) - rfsrc_iris$err.rate <- NULL - expect_error(gg_error(rfsrc_iris)) - -}) - - -test_that("gg_error.randomForest classifications",{ - - ## Load the cached forest - rf_iris <- randomForest::randomForest(Species ~., - data = iris) - - # Test the cached forest type - expect_is(rf_iris, "randomForest") - - # Test the forest family - expect_match(rf_iris$type, "classification") - - ## Create the correct gg_error object - gg_dta <- gg_error(rf_iris) - - # Test object type - expect_is(gg_dta, "gg_error") - - # Test classification dimensions - expect_equal(dim(gg_dta)[1], dim(rf_iris$err.rate)[1]) - expect_equal(dim(gg_dta)[2], dim(rf_iris$err.rate)[2] + 1) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(as.matrix(gg_dta[, -which(colnames(gg_dta) == "ntree")]), rf_iris$err.rate) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # "Incorrect object type: Expects a gg_error object" - expect_error(gg_error(gg_plt)) - expect_error(gg_error.randomForest(gg_plt)) - rf_iris$err.rate <- NULL - expect_error(gg_error(rf_iris)) - -}) - -test_that("gg_error survival", { - ## Load the cached forest - data(rfsrc_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - - # Test the forest family - expect_match(rfsrc_pbc$family, "surv") - - ## Create the correct gg_error object - gg_dta <- gg_error(rfsrc_pbc) - - # Test object type - expect_is(gg_dta, "gg_error") - - # Test classification dimensions - expect_equal(dim(gg_dta)[1], length(rfsrc_pbc$err.rate)) - expect_equal(dim(gg_dta)[2], 2) - - # Test data is correctly pulled from randomForest obect. - tmp <- c(gg_dta[,1]) - expect_equivalent(tmp, rfsrc_pbc$err.rate) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - expect_error(gg_error(gg_plt)) - # "Incorrect object type: Expects a gg_error object" -}) - -test_that("gg_error regression",{ - ## Load the cached forest - data(rfsrc_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - # Test the forest family - expect_match(rfsrc_Boston$family, "regr") - - ## Create the correct gg_error object - gg_dta <- gg_error(rfsrc_Boston) - - # Test object type - expect_is(gg_dta, "gg_error") - - # Test classification dimensions - expect_equal(nrow(gg_dta), length(rfsrc_Boston$err.rate)) - expect_equal(ncol(gg_dta), 2) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(c(gg_dta[,1]), rfsrc_Boston$err.rate) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_error(rfsrc_Boston) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Test the exception for input - expect_error(gg_error(gg_plt)) - - ## Create the correct gg_error object - # gg_dta <- gg_error(rfsrc_Boston, training=TRUE) - - # Test object type -# expect_is(gg_dta, "gg_error") -}) - - -test_that("gg_error regression",{ - ## Load the cached forest - data(Boston, package="MASS") - - Boston$chas <- as.logical(Boston$chas) - - rf_Boston <- randomForest::randomForest(medv~., data=Boston) - # Test the cached forest type - expect_is(rf_Boston, "randomForest") - - # Test the forest family - expect_match(rf_Boston$type, "regression") - - ## Create the correct gg_error object - gg_dta <- gg_error(rf_Boston) - - # Test object type - expect_is(gg_dta, "gg_error") - - # Test classification dimensions - expect_equal(nrow(gg_dta), length(rf_Boston$mse)) - expect_equal(ncol(gg_dta), 2) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(c(gg_dta[,1]), rf_Boston$mse) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Test the exception for input - expect_error(gg_error(gg_plt)) - - ## Create the correct gg_error object - # gg_dta <- gg_error(rf_Boston, training=TRUE) - - # Test object type - # expect_is(gg_dta, "gg_error") -}) diff --git a/tests/testthat/test_gg_interaction.R b/tests/testthat/test_gg_interaction.R deleted file mode 100644 index ec7f470d..00000000 --- a/tests/testthat/test_gg_interaction.R +++ /dev/null @@ -1,158 +0,0 @@ -# testthat for gg_interaction function -context("gg_interaction tests") - -test_that("gg_interaction classifications",{ - - ## Load the cached forest - data(interaction_iris, package="ggRandomForests") - - # Test the cached interaction structure - expect_is(interaction_iris, "matrix") - - ## Create the correct gg_interaction object - gg_dta <- gg_interaction(interaction_iris) - - # Test object type - expect_is(gg_dta, "gg_interaction") - - # Test classification dimensions - expect_equal(dim(gg_dta), dim(interaction_iris)) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(as.matrix(gg_dta), interaction_iris) - - ## Test plotting the gg_interaction object - gg_plt <- plot.gg_interaction(gg_dta, xvar="Petal.Width") - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # This one should fail with a variable not found message - expect_error(plot.gg_interaction(gg_dta, xvar="Petal")) - - # "Incorrect object type: Expects a gg_interaction object" - ## Test plotting the gg_interaction object - gg_plt <- plot.gg_interaction(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - -}) - - -test_that("gg_interaction survival",{ - - data(pbc, package = "randomForestSRC") - - ## Load the cached forest - data(interaction_pbc, package="ggRandomForests") - - # Test the cached interaction structure - expect_is(interaction_pbc, "matrix") - - ## Create the correct gg_interaction object - gg_dta <- gg_interaction(interaction_pbc) - - # Test object type - expect_is(gg_dta, "gg_interaction") - - # Test classification dimensions - expect_equal(dim(gg_dta), dim(interaction_pbc)) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(as.matrix(gg_dta), interaction_pbc) - - ## Test plotting the gg_interaction object - gg_plt <- plot.gg_interaction(gg_dta, xvar="bili") - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # "Incorrect object type: Expects a gg_interaction object" - - labels <- c("event indicator (F = censor, T = death)", - "Treament (DPCA, Placebo)", - "age in years", - "Female", - "Asictes", - "Hepatomegaly", - "Spiders", - "Edema", - "serum bilirubin (mg/dl)", - "serum cholesterol (mg/dl)", - "albumin (gm/dl)", - "urine copper (ug/day)", - "alkaline phosphatase (U/liter)", - "SGOT (U/ml)", - "triglicerides (mg/dl)", - "platelets per cubic ml/1000", - "prothrombin time (sec)", - "histologic stage", - "survival time (years)") - - dta.labs <- data.frame(cbind(names = colnames(pbc), label = labels)) - - st.labs <- as.character(dta.labs$label) - names(st.labs) <- rownames(dta.labs) - gg_plt <- plot.gg_interaction(gg_dta, lbls=st.labs) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") -}) - -test_that("gg_interaction regression",{ - ## Load the cached forest - data(interaction_Boston, package="ggRandomForests") - - # Test the cached interaction structure - expect_is(interaction_Boston, "matrix") - - ## Create the correct gg_interaction object - gg_dta <- gg_interaction(interaction_Boston) - - # Test object type - expect_is(gg_dta, "gg_interaction") - - # Test classification dimensions - expect_equal(dim(gg_dta), dim(interaction_Boston)) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(as.matrix(gg_dta), interaction_Boston) - - ## Test plotting the gg_interaction object - gg_plt <- plot.gg_interaction(gg_dta, xvar = "rm") - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # We really only want to run this one when we're developing - # data(rfsrc_Boston, package="ggRandomForests") - # - # expect_warning(gg_dta <- gg_interaction(rfsrc_Boston, - # xvar.names=rfsrc_Boston$xvar.names[1:2])) - # - # expect_error(gg_interaction(gg_dta)) - # -}) - -test_that("gg_interaction exceptions",{ - ## Load the cached forest - data(rfsrc_Boston, package="ggRandomForests") - - # Test the cached interaction structure - expect_is(rfsrc_Boston, "rfsrc") - - # This one costs a lot of time in calculating the interaction matrix. - # ## Create the correct gg_interaction object - # expect_warning(gg_dta <- gg_interaction(rfsrc_Boston)) - # - # # Test object type - # expect_is(gg_dta, "gg_interaction") - # - data(interaction_Boston, package="ggRandomForests") - # Test the cached interaction structure - expect_is(interaction_Boston, "matrix") - - interaction_Boston <- interaction_Boston[-2,] - expect_error(gg_interaction(interaction_Boston)) -}) diff --git a/tests/testthat/test_gg_minimal_depth.R b/tests/testthat/test_gg_minimal_depth.R deleted file mode 100644 index 26c9e746..00000000 --- a/tests/testthat/test_gg_minimal_depth.R +++ /dev/null @@ -1,177 +0,0 @@ -# testthat for gg_minimal_depth function -context("gg_minimal_depth tests") - -test_that("gg_minimal_depth classifications",{ - ## Load the cached forest - data(varsel_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_iris, "list") - - # Test the forest family - expect_false(is.null(varsel_iris$md.obj)) - - ## Create the correct gg_error object - gg_dta <- gg_minimal_depth(varsel_iris) - - # Test object type - expect_is(gg_dta, "gg_minimal_depth") - - # Test varselect is the same - 
expect_equivalent(gg_dta$varselect[,-which(colnames(gg_dta$varselect) == "names")], - varsel_iris$varselect) -# -# expect_is(gg_dta$threshold - mean(gg_dta$varselect$depth) < 1.e-6 ) - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - expect_error(gg_minimal_depth(gg_plt)) - expect_error(gg_minimal_depth.rfsrc(gg_plt)) -}) - - -test_that("gg_minimal_depth survival",{ - ## Load the cached forest - data(varsel_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_pbc, "list") - - ## Create the correct gg_error object - gg_dta <- gg_minimal_depth(varsel_pbc) - - # Test object type - expect_is(gg_dta, "gg_minimal_depth") - - - # Test varselect is the same - expect_equivalent(gg_dta$varselect[, -which(colnames(gg_dta$varselect) == "names")], - varsel_pbc$varselect) - - ## Test plotting the gg_error object - gg_plt <- plot.gg_minimal_depth(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - gg_plt <- plot(gg_dta, nvar=12) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") -}) - -test_that("gg_minimal_depth regression",{ - ## Load the cached forest - data(varsel_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_Boston, "list") - - ## Create the correct gg_error object - gg_dta <- gg_minimal_depth(varsel_Boston) - - # Test object type - expect_is(gg_dta, "gg_minimal_depth") - - # Test varselect is the same - expect_equivalent(gg_dta$varselect[, -which(colnames(gg_dta$varselect) == "names")], - varsel_Boston$varselect) - - ## Test plotting the gg_error object - gg_plt <- plot.gg_minimal_depth(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Create the correct gg_minimal_depth object - gg_dta <- gg_minimal_depth(varsel_Boston) - # Test object type - expect_is(gg_dta, "gg_minimal_depth") - - data(Boston, package="MASS") - - cls <- sapply(Boston, class) 
- # - lbls <- - #crim - c("Crime rate by town.", - # zn - "Proportion of residential land zoned for lots over 25,000 sq.ft.", - # indus - "Proportion of non-retail business acres per town.", - # chas - "Charles River (tract bounds river).", - # nox - "Nitrogen oxides concentration (10 ppm).", - # rm - "Number of rooms per dwelling.", - # age - "Proportion of units built prior to 1940.", - # dis - "Distances to Boston employment center.", - # rad - "Accessibility to highways.", - # tax - "Property-tax rate per $10,000.", - # ptratio - "Pupil-teacher ratio by town.", - # black - "Proportion of blacks by town.", - # lstat - "Lower status of the population (percent).", - # medv - "Median value of homes ($1000s).") - - # Build a table for data description - dta.labs <- data.frame(cbind(Variable=names(cls), Description=lbls, type=cls)) - - # Build a named vector for labeling figures later/ - st.labs <- as.character(dta.labs$Description) - names(st.labs) <- names(cls) - - ## Test plotting the gg_error object - gg_plt <- plot.gg_minimal_depth(gg_dta, lbls=st.labs, selection=TRUE) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") -}) - -test_that("gg_minimal_depth exceptions",{ - data(varsel_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_Boston, "list") - - vsel <- varsel_Boston - vsel$varselect <- NULL - - expect_error(gg_minimal_depth(vsel)) - - vsel$threshold <- NULL - expect_error(gg_minimal_depth(vsel)) - - expect_output(print(gg_minimal_depth(varsel_Boston)), - "gg_minimal_depth", ignore.case = TRUE) - - - vsel <- varsel_Boston - vsel$varselect$vimp <- NULL - gg_dta <- gg_minimal_depth(vsel) - - expect_is(gg_dta, "gg_minimal_depth") - - expect_is(plot(gg_dta, type="rank"), "ggplot") - - # data(rfsrc_Boston, package="ggRandomForests") - # expect_output(gg_dta <- gg_minimal_depth(rfsrc_Boston, fast=TRUE), - # "minimal depth variable selection") - # expect_is(gg_dta, "gg_minimal_depth") - # - # 
expect_output(gg_plt <- plot.gg_minimal_depth(rfsrc_Boston, fast=TRUE), - # "minimal depth variable selection") - # expect_error(gg_minimal_depth(gg_plt)) - -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_minimal_vimp.R b/tests/testthat/test_gg_minimal_vimp.R deleted file mode 100644 index 47df4995..00000000 --- a/tests/testthat/test_gg_minimal_vimp.R +++ /dev/null @@ -1,155 +0,0 @@ -# testthat for gg_minimal_vimp function -context("gg_minimal_vimp tests") - -test_that("gg_minimal_vimp classifications",{ - ## Load the cached forest - data(varsel_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_iris, "list") - - # Test the forest family - expect_false(is.null(varsel_iris$md.obj)) - - ## Create the correct gg_error object - ggrf.obj <- gg_minimal_vimp(varsel_iris) - - # Test object type - expect_is(ggrf.obj, "gg_minimal_vimp") - - # Test varselect is the same - expect_equivalent(dim(ggrf.obj)[1], dim(varsel_iris$varselect)[1]) - expect_equivalent(dim(ggrf.obj)[2], 4) - - ## Test plotting the gg_error object - gg.obj <- plot.gg_minimal_vimp(ggrf.obj) - - # Test return is s ggplot object - expect_is(gg.obj, "ggplot") -}) - - -test_that("gg_minimal_vimp survival",{ - ## Load the cached forest - data(varsel_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_pbc, "list") - - ## Create the correct gg_error object - ggrf.obj <- gg_minimal_vimp(varsel_pbc) - - # Test object type - expect_is(ggrf.obj, "gg_minimal_vimp") - - - # Test varselect is the same - expect_equivalent(dim(ggrf.obj)[1], dim(varsel_pbc$varselect)[1]) - expect_equivalent(dim(ggrf.obj)[2], 4) - - ## Test plotting the gg_error object - gg.obj <- plot.gg_minimal_vimp(ggrf.obj) - - # Test return is s ggplot object - expect_is(gg.obj, "ggplot") -}) - -test_that("gg_minimal_vimp regression",{ - ## Load the cached forest - data(varsel_Boston, package="ggRandomForests") - - # Test the cached forest type - 
expect_is(varsel_Boston, "list") - - ## Create the correct gg_error object - ggrf.obj <- gg_minimal_vimp(varsel_Boston) - - # Test object type - expect_is(ggrf.obj, "gg_minimal_vimp") - - - # Test varselect is the same - expect_equivalent(dim(ggrf.obj)[1], dim(varsel_Boston$varselect)[1]) - expect_equivalent(dim(ggrf.obj)[2], 4) - - ## Test plotting the gg_error object - gg.obj <- plot.gg_minimal_vimp(ggrf.obj) - - # Test return is s ggplot object - expect_is(gg.obj, "ggplot") - - data(Boston, package="MASS") - - cls <- sapply(Boston, class) - # - lbls <- - #crim - c("Crime rate by town.", - # zn - "Proportion of residential land zoned for lots over 25,000 sq.ft.", - # indus - "Proportion of non-retail business acres per town.", - # chas - "Charles River (tract bounds river).", - # nox - "Nitrogen oxides concentration (10 ppm).", - # rm - "Number of rooms per dwelling.", - # age - "Proportion of units built prior to 1940.", - # dis - "Distances to Boston employment center.", - # rad - "Accessibility to highways.", - # tax - "Property-tax rate per $10,000.", - # ptratio - "Pupil-teacher ratio by town.", - # black - "Proportion of blacks by town.", - # lstat - "Lower status of the population (percent).", - # medv - "Median value of homes ($1000s).") - - # Build a table for data description - dta.labs <- data.frame(cbind(Variable=names(cls), Description=lbls, type=cls)) - - # Build a named vector for labeling figures later/ - st.labs <- as.character(dta.labs$Description) - names(st.labs) <- names(cls) - - ## Test plotting the gg_error object - gg_plt <- plot.gg_minimal_vimp(varsel_Boston, lbls=st.labs) - expect_is(gg_plt, "ggplot") - -}) - - -test_that("gg_minimal_vimp exceptions",{ - data(varsel_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(varsel_Boston, "list") - - vsel <- varsel_Boston - vsel$varselect <- NULL - - expect_error(gg_minimal_vimp(vsel)) - - vsel$threshold <- NULL - expect_error(gg_minimal_vimp(vsel)) - - vsel <- 
varsel_Boston - vsel$varselect$vimp <- NULL - expect_error(gg_minimal_vimp(vsel)) - expect_error(plot.gg_minimal_vimp(vsel)) - - # data(rfsrc_Boston, package="ggRandomForests") - # expect_output(gg_dta <- gg_minimal_vimp(rfsrc_Boston, fast=TRUE), - # "minimal depth variable selection") - # expect_is(gg_dta, "gg_minimal_vimp") - # gg_plt <- plot.gg_minimal_vimp(gg_dta) - # expect_error(gg_minimal_depth(gg_plt)) - -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_partial.R b/tests/testthat/test_gg_partial.R deleted file mode 100644 index 6de0a551..00000000 --- a/tests/testthat/test_gg_partial.R +++ /dev/null @@ -1,223 +0,0 @@ -# testthat for gg_partial function -context("gg_partial tests") - -test_that("gg_partial classifications",{ - ## Load the cached forest - data(rfsrc_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the forest family - expect_equal(rfsrc_iris$family, "class") - - # Load saved partial plot data. - data(partial_iris, package="ggRandomForests") - - expect_equivalent(length(partial_iris$pData), length(rfsrc_iris$xvar.names)) - - ## Create the correct gg_error object - gg_dta <- gg_partial(partial_iris) - - # Test object type - expect_is(gg_dta, "gg_partial_list") - - # Test varselect is the same - #expect_equivalent(select(gg_dta$varselect, -names), rfsrc_iris$importance) - - ## Test plotting the gg_error object - gg_plt <- plot.gg_partial(gg_dta[[2]]) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_partial_list(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt[[1]], "ggplot") - - expect_equivalent(length(gg_plt),length(partial_iris$pData) ) -}) - - -test_that("gg_partial survival",{ - ## Load the cached forest - data(rfsrc_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - - ## Get the partial data. 
- data(partial_pbc, package="ggRandomForests") - - ## Create the correct gg_error object - gg_dta <- gg_partial(partial_pbc[[1]]) - - # Test object type - expect_is(gg_dta, "gg_partial_list") - - ## Test plotting the gg_data object - gg_plt <- plot(gg_dta[[1]]) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt[[1]], "ggplot") - - expect_equivalent(length(gg_plt) , length(gg_dta)) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta, panel=TRUE) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - data(rfsrc_pbc, package="ggRandomForests") - data("varsel_pbc", package="ggRandomForests") - - # Data generation - ggrf <- gg_variable(rfsrc_pbc, time = c(1, 3), - time.labels = c("1 Year", "3 Years")) - - # Plot the bilirubin variable dependence plot - gg_plt <- plot(ggrf, xvar = "bili", alpha = .3) - - gg_plt <- gg_plt+ geom_smooth(se=.95) - - - xvar <- varsel_pbc$topvars - xvar.cat <- c("edema", "stage") - xvar <- xvar[-which(xvar %in% xvar.cat)] - - # plot the next 5 continuous variable dependence plots. 
- gg_plt <- plot(ggrf, xvar = xvar[2:6], panel = TRUE) - - gg_plt <- gg_plt + geom_smooth(se = FALSE, alpha = .3, - method = "glm", formula = y~poly(x,2)) - - expect_warning(gg_plt <- plot(ggrf, xvar = xvar.cat, panel=TRUE)) -}) - -test_that("gg_partial regression",{ - ## Load the cached forest - data(rfsrc_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - ## Create the correct gg_error object - data(partial_Boston, package="ggRandomForests") - gg_dta <- gg_partial(partial_Boston) - - # Test object type - expect_is(gg_dta, "gg_partial_list") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_partial(gg_dta[[1]]) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_partial_list(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "list") - - expect_equivalent(length(gg_plt) , length(gg_dta)) - - # gg_partial exceptions - expect_error(gg_partial(gg_plt)) - - # Remove all but one partial data. - partial_Boston$xvar.names <- "lstat" - partial_Boston$nvar <- 1 - for(ind in length(partial_Boston$pData):2){ - partial_Boston$pData[[ind]] <- NULL - } - gg_dta <- gg_partial(partial_Boston) - - # Test object type - expect_is(gg_dta, "gg_partial") - - # generate a list of gg_partial objects, one per xvar. 
- expect_error(gg_p <- gg_partial(gg_dta), "gg_partial") - - expect_is(plot(gg_dta, error="bars"), "ggplot") - expect_is(plot(gg_dta, error="none"), "ggplot") - expect_is(plot(gg_dta, error="lines"), "ggplot") - expect_is(plot(gg_dta, error="shade"), "ggplot") - # Test object type - - expect_is(plot(gg_dta), "ggplot") - gg_plt <- plot(gg_dta, error="shade")+ geom_smooth(se=.95) - -}) - -test_that("gg_partial combine",{ - - # Load a set of plot.variable partial plot data - data(partial_pbc) - - # A list of 2 plot.variable objects - expect_is(partial_pbc, "list") - expect_gt(length(partial_pbc), 1) - - for(ind in 1:length(partial_pbc)){ - expect_is(partial_pbc[[ind]], "rfsrc") - expect_is(partial_pbc[[ind]], "plot.variable") - expect_is(partial_pbc[[ind]], "surv") - } - - # Create gg_partial objects - gg_prtl <- lapply(partial_pbc, gg_partial) - for(ind in 1:length(partial_pbc)){ - expect_is(gg_prtl[[ind]], "gg_partial_list") - } - - # Combine the objects to get multiple time curves - # along variables on a single figure. - ggpart <- combine.gg_partial(gg_prtl[[1]], gg_prtl[[2]], - lbls = c("30 day", "6 month")) - expect_is(ggpart, "gg_partial_list") - - # We should have at least 5 - expect_gt(length(ggpart), 5) - - # Plot each figure separately - gg_plt <- plot(ggpart) - expect_is(gg_plt, "list") - expect_gt(length(gg_plt), 5) - expect_equal(length(gg_plt), length(ggpart)) - - for(ind in 1:length(gg_plt)){ - expect_is(gg_plt[[ind]], "ggplot") - } - - # Get the continuous data for a panel of continuous plots. - ggcont <- ggpart - - ggcont$celltype <- ggcont$trt <- ggcont$prior <- NULL - expect_gt(length(ggcont), 5 - 3) - - gg_plt <- plot(ggcont, panel=TRUE) - expect_is(gg_plt, "ggplot") - # And the categorical for a panel of categorical plots. 
- ggpart$karno <- ggpart$diagtime <- ggpart$age <- NULL - expect_gt(length(ggpart), 5 - 3) - - gg_plt <- plot(ggpart, panel=TRUE) - expect_is(gg_plt, "ggplot") - - # Test coverage, auto labels - ggpart <- combine.gg_partial(gg_prtl[[1]], gg_prtl[[2]]) - expect_is(ggpart, "gg_partial_list") - - expect_error(combine.gg_partial(gg_prtl)) - expect_error(combine.gg_partial(gg_prtl, gg_prtl)) -}) diff --git a/tests/testthat/test_gg_partial_coplot.R b/tests/testthat/test_gg_partial_coplot.R deleted file mode 100644 index 8d2f5f3e..00000000 --- a/tests/testthat/test_gg_partial_coplot.R +++ /dev/null @@ -1,45 +0,0 @@ - -# testthat for gg_partial_coplot function -context("gg_partial_coplot tests") -test_that("gg_error classifications",{ - - data(Boston, package="MASS") - - # Unless we are on the same version as Travis-CI, - # we need to build rather than cache the rfsrc - rfsrc_Boston <- rfsrc(medv~., data=Boston, - importance="none", - nsplit=5) - # fast.restore can be added after randomForestSRC V1.6 release - - # Find the rm variable points to create 6 intervals of roughly - # equal size population - rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups=3, intervals=TRUE) - - # Pass these variable points to create the 6 (factor) intervals - rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks=rm_pts) - - # This is the expensive part. 
- partial_coplot_Boston <- gg_partial_coplot(rfsrc_Boston, xvar="lstat", - groups=rm_grp, - show.plots=FALSE, - npts=5) - expect_is(partial_coplot_Boston, "gg_partial_coplot") - - expect_equal(ncol(partial_coplot_Boston), 3) - - expect_equal(length(unique(partial_coplot_Boston$group)), 3) - - expect_error(gg_partial_coplot(partial_coplot_Boston, xvar="lstat", - groups=rm_grp, - npts=5)) - expect_error(gg_partial_coplot(rfsrc_Boston, xvar="lstat", - npts=5)) - rfsrc_Boston$forest <- NULL - expect_error(gg_partial_coplot(rfsrc_Boston, xvar="lstat", - groups=rm_grp, - show.plots=FALSE, - npts=5)) - - -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_rfsrc.R b/tests/testthat/test_gg_rfsrc.R deleted file mode 100644 index afdfb318..00000000 --- a/tests/testthat/test_gg_rfsrc.R +++ /dev/null @@ -1,191 +0,0 @@ -# testthat for gg_rfsrc function -context("gg_rfsrc tests") - -test_that("gg_rfsrc classifications",{ - ## Load the cached forest - data(rfsrc_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the forest family - expect_is(rfsrc_iris, "class") - - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_iris) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - - # Test classification dimensions - expect_equal(nrow(gg_dta), nrow(rfsrc_iris$predicted.oob)) - expect_equal(ncol(gg_dta), ncol(rfsrc_iris$predicted.oob) + 1) - - # Test data is correctly pulled from randomForest obect. 
- expect_equivalent(as.matrix(gg_dta[, -which(colnames(gg_dta) == "y")]), - rfsrc_iris$predicted.oob) - - ## Test plotting the gg_error object - gg_plt <- plot.gg_rfsrc(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_iris, oob=FALSE) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - - # Test classification dimensions - expect_equal(nrow(gg_dta), nrow(rfsrc_iris$predicted)) - expect_equal(ncol(gg_dta), ncol(rfsrc_iris$predicted) + 1) - - # Test data is correctly pulled from randomForest obect. - expect_equivalent(as.matrix(gg_dta[, -which(colnames(gg_dta) == "y")]), - rfsrc_iris$predicted) -}) - - -test_that("gg_rfsrc survival",{ - ## Load the cached forest - data(rfsrc_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - - # Test the forest family - expect_match(rfsrc_pbc$family, "surv") - - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_pbc) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "surv") - - # Test classification dimensions - ## Test plotting the gg_error object - gg_plt <- plot.gg_rfsrc(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_pbc, oob=FALSE) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "surv") - # Test classification dimensions - ## Test plotting the gg_error object - gg_plt <- plot.gg_rfsrc(gg_dta, alpha=.4) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Test classification dimensions - - gg_dta <- gg_rfsrc(rfsrc_pbc, by="treatment") - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "surv") - - ## Create the correct gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - - gg_dta <- gg_rfsrc(rfsrc_pbc,conf.int=.68) - - # 
Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "surv") - - # Test multiple conf intervals - gg_dta <- gg_rfsrc(rfsrc_pbc,conf.int=c(.025, .975), bs.sample=100) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "surv") - - ## Create the correct gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Test prediction - ## Load the cached forest - # Predict survival for 106 patients not in randomized trial - data(rfsrc_pbc_test, package="ggRandomForests") - # Print prediction summary - expect_is(gg_dta <- gg_rfsrc(rfsrc_pbc_test), "gg_rfsrc") - - # Test for group "by" name exists - expect_error(gg_rfsrc(rfsrc_pbc, by="trt")) - # And it's a vector or factor (not a number) - expect_error(gg_rfsrc(rfsrc_pbc, by=3)) - - # Test confidence intervals - -}) - -test_that("gg_rfsrc regression",{ - ## Load the cached forest - data(rfsrc_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - # Test the forest family - expect_match(rfsrc_Boston$family, "regr") - - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_Boston) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "regr") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_rfsrc(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_Boston, oob=FALSE) - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - - # Test classification dimensions - ## Create the correct gg_error object - gg_dta <- gg_rfsrc(rfsrc_Boston, by="chas") - - # Test object type - expect_is(gg_dta, "gg_rfsrc") - expect_is(gg_dta, "regr") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_rfsrc(gg_dta) - - # Test data is correctly pulled from randomForest obect. 
- # Predicted values - rfsrc_Boston$family <- "test" - expect_error(gg_rfsrc(rfsrc_Boston)) - - # Test exceptions - # Is it an rfsrc object? - expect_error(gg_rfsrc(gg_plt)) - - # Does it contain the forest? - rfsrc_Boston$forest <- NULL - expect_error(gg_rfsrc(rfsrc_Boston)) - -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_roc.R b/tests/testthat/test_gg_roc.R deleted file mode 100644 index 63425211..00000000 --- a/tests/testthat/test_gg_roc.R +++ /dev/null @@ -1,161 +0,0 @@ -# testthat for gg_roc function -context("gg_roc tests") - -test_that("gg_roc classifications",{ - - ## Load the cached forest - data(rfsrc_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the forest family - expect_match(rfsrc_iris$family, "class") - - ## Create the correct gg_roc object - which.outcome <- 1 - gg_dta <- gg_roc(rfsrc_iris, which.outcome) - - # Test object type - expect_is(gg_dta, "gg_roc") - - # Test classification dimensions - expect_equal(nrow(gg_dta), - length(unique(rfsrc_iris$predicted.oob[,which.outcome])) + 1) - expect_equal(ncol(gg_dta), 3) - - # Test data is correctly pulled from randomForest obect. - unts <- sort(unique(rfsrc_iris$predicted.oob[,which.outcome])) - expect_equivalent(gg_dta$pct, c(0,unts[-length(unts)],1)) - - ## Test plotting the gg_roc object - gg.obj <- plot.gg_roc(gg_dta) - - # Test return is s ggplot object - expect_is(gg.obj, "ggplot") - - # Try test set prediction. - gg_dta <- gg_roc(rfsrc_iris, which.outcome, oob=FALSE) - # Try test set prediction. - gg_plt <- plot.gg_roc(rfsrc_iris) - - # Test object type - expect_is(gg_dta, "gg_roc") - - # Test classification dimensions - expect_equal(nrow(gg_dta), - length(unique(rfsrc_iris$predicted[,which.outcome])) + 1) - expect_equal(ncol(gg_dta), 3) - - # Test data is correctly pulled from randomForest obect. 
- unts <- sort(unique(rfsrc_iris$predicted[,which.outcome])) - expect_equivalent(gg_dta$pct, c(0,unts[-length(unts)],1)) - - ## Test plotting the gg_roc object - gg.obj <- plot.gg_roc(gg_dta) - - # Test return is s ggplot object - expect_is(gg.obj, "ggplot") - - expect_is(plot.gg_roc(rfsrc_iris), "ggplot") -}) - - -test_that("gg_roc survival",{ - ## Load the cached forest - data(rfsrc_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - - # Test the forest family - expect_match(rfsrc_pbc$family, "surv") - - ## Create the correct gg_roc object - expect_error(gg_roc(rfsrc_pbc)) - -}) - -test_that("gg_roc regression",{ - ## Load the cached forest - data(rfsrc_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - # Test the forest family - expect_match(rfsrc_Boston$family, "regr") - - ## Create the correct gg_roc object - expect_error(gg_roc(rfsrc_Boston)) - expect_error(plot.gg_roc(rfsrc_Boston)) - -}) - -test_that("calc_roc",{ - data(rfsrc_iris) - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the forest family - expect_match(rfsrc_iris$family, "class") - - gg_dta <- calc_roc.rfsrc(rfsrc_iris, - rfsrc_iris$yvar, - which.outcome=1, oob=TRUE) - - # Test the cached forest type - expect_is(gg_dta, "data.frame") - - expect_equal(ncol(gg_dta), 3) - expect_equal(nrow(gg_dta), length(unique(rfsrc_iris$predicted.oob[,1])) + 1) - - expect_error(calc_roc.rfsrc(rfsrc_iris, - rfsrc_iris$yvar, - which.outcome="all")) - # Test oob=FALSE - gg_dta <- calc_roc.rfsrc(rfsrc_iris, - rfsrc_iris$yvar, - which.outcome=1, oob=FALSE) - - # Test the cached forest type - expect_is(gg_dta, "data.frame") - - expect_equal(ncol(gg_dta), 3) - - # test the auc calculator - auc <- calc_auc(gg_dta) - expect_true(auc > .9) - expect_true(auc <= 1) - # The second outcome. 
- gg_dta <- calc_roc.rfsrc(rfsrc_iris, - rfsrc_iris$yvar, - which.outcome=2, oob=TRUE) - - # Test the cached forest type - expect_is(gg_dta, "data.frame") - - expect_equal(ncol(gg_dta), 3) - expect_equal(nrow(gg_dta), length(unique(rfsrc_iris$predicted.oob[,2])) + 1) - - # test the auc calculator - auc <- calc_auc(gg_dta) - expect_true(auc > .9) - expect_true(auc <= 1) - # and the third... - gg_dta <- calc_roc.rfsrc(rfsrc_iris, - rfsrc_iris$yvar, - which.outcome=3, oob=TRUE) - - # Test the cached forest type - expect_is(gg_dta, "data.frame") - - expect_equal(ncol(gg_dta), 3) - expect_equal(nrow(gg_dta),length(unique(rfsrc_iris$predicted.oob[,3])) + 1) - - # test the auc calculator - auc <- calc_auc(gg_dta) - expect_true(auc > .9) - expect_true(auc <= 1) -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_survival.R b/tests/testthat/test_gg_survival.R deleted file mode 100644 index 23b2f838..00000000 --- a/tests/testthat/test_gg_survival.R +++ /dev/null @@ -1,91 +0,0 @@ -# testthat for gg_survival function -context("gg_survival tests") - -test_that("gg_survival classifications",{ - expect_error(gg_survival(data=iris)) -}) - - -test_that("gg_survival survival",{ - # ## Load the cached forest - data(pbc, package="randomForestSRC") - - # Test the cached forest type - expect_is(pbc, "data.frame") - - # Test object type - gg_dta <- gg_survival(interval = "days", - censor = "status", - by = "treatment", - data = pbc, - conf.int = .95) - - expect_is(gg_dta, "gg_survival") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_survival(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - expect_is(plot(gg_dta, error="bars"), "ggplot") - expect_is(plot(gg_dta, error="none"), "ggplot") - expect_is(plot(gg_dta, error="lines"), "ggplot") - expect_is(plot(gg_dta, type="surv"), "ggplot") - expect_is(plot(gg_dta, type="cum_haz"), "ggplot") - expect_is(plot(gg_dta, type="density"), "ggplot") - expect_is(plot(gg_dta, type="mid_int"), 
"ggplot") - expect_is(plot(gg_dta, type="life"), "ggplot") - expect_is(plot(gg_dta, type="hazard"), "ggplot") - expect_is(plot(gg_dta, type="proplife"), "ggplot") - # Test object type - gg_dta <- gg_survival(interval = "days", - censor = "status", - by = "treatment", - data = pbc, - conf.int = .95, - type="nelson") - - expect_is(gg_dta, "gg_survival") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_survival(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - - # Test object type - gg_dta <- gg_survival(interval = "days", - censor = "status", - data = pbc, - conf.int = .95) - - expect_is(gg_dta, "gg_survival") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_survival(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - expect_is(plot(gg_dta, error="bars"), "ggplot") - expect_is(plot(gg_dta, error="none"), "ggplot") - expect_is(plot(gg_dta, error="lines"), "ggplot") - expect_is(plot(gg_dta, type="surv"), "ggplot") - expect_is(plot(gg_dta, type="cum_haz"), "ggplot") - expect_is(plot(gg_dta, type="density"), "ggplot") - expect_is(plot(gg_dta, type="mid_int"), "ggplot") - expect_is(plot(gg_dta, type="life"), "ggplot") - expect_is(plot(gg_dta, type="hazard"), "ggplot") - expect_is(plot(gg_dta, type="proplife"), "ggplot") - -}) - -test_that("gg_survival regression",{ - ## Load the data - data(Boston, package="MASS") - - ## Create the correct gg_error object - expect_error(gg_survival(data=Boston)) -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_variable.R b/tests/testthat/test_gg_variable.R deleted file mode 100644 index ec9f9026..00000000 --- a/tests/testthat/test_gg_variable.R +++ /dev/null @@ -1,111 +0,0 @@ -# testthat for gg_variable function -context("gg_variable tests") - -test_that("gg_variable classifications",{ - ## Load the cached forest - data(rfsrc_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the 
forest family - expect_equal(rfsrc_iris$family, "class") - - ## Create the correct gg_error object - gg_dta <- gg_variable(rfsrc_iris) - - # Test object type - expect_is(gg_dta, "gg_variable") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_variable(gg_dta, xvar = "Petal.Width") - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_variable(gg_dta, xvar = rfsrc_iris$xvar.names ) - - # Test return is s ggplot object - expect_is(gg_plt, "list") - expect_equal(length(gg_plt), length(rfsrc_iris$xvar.names)) - for(ind in 1:length(rfsrc_iris$xvar.names)) - expect_is(gg_plt[[ind]], "ggplot") - ## Test plotting the gg_error object - gg_plt <- plot.gg_variable(gg_dta, xvar = rfsrc_iris$xvar.names, - panel=TRUE) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") -}) - - -test_that("gg_variable survival",{ - ## Load the cached forest - data(rfsrc_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - - ## Create the correct gg_error object - gg_dta <- gg_variable(rfsrc_pbc, time=.25) - - # Test object type - expect_is(gg_dta, "gg_variable") - - ## Test plotting the gg_variable object - gg_plt <- plot.gg_variable(gg_dta, xvar="age") - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - - ## Test plotting the gg_variable object - gg_plt <- plot.gg_variable(gg_dta, xvar=rfsrc_pbc$xvar.names) - - # Test return is s ggplot object - expect_is(gg_plt, "list") - expect_equal(length(gg_plt), length(rfsrc_pbc$xvar.names)) - - - for(ind in 1:length(rfsrc_pbc$xvar.names)) - expect_is(gg_plt[[ind]], "ggplot") - - - ## Test plotting the gg_error object - expect_warning(gg_plt <- plot.gg_variable(gg_dta, xvar = rfsrc_pbc$xvar.names, - panel=TRUE) - ) - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - -}) - -test_that("gg_variable regression",{ - ## Load the cached forest - data(rfsrc_Boston, 
package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - ## Create the correct gg_error object - gg_dta <- gg_variable(rfsrc_Boston) - - # Test object type - expect_is(gg_dta, "gg_variable") - - ## Test plotting the gg_error object - gg_plt <- plot.gg_variable(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "list") - expect_equal(length(gg_plt), length(rfsrc_Boston$xvar.names)) - for(ind in 1:length(rfsrc_Boston$xvar.names)) - expect_is(gg_plt[[ind]], "ggplot") - - - ## Test plotting the gg_error object - expect_warning(gg_plt <- plot.gg_variable(gg_dta, panel=TRUE)) - expect_is(gg_plt, "ggplot") - -}) \ No newline at end of file diff --git a/tests/testthat/test_gg_vimp.R b/tests/testthat/test_gg_vimp.R deleted file mode 100644 index d74e931b..00000000 --- a/tests/testthat/test_gg_vimp.R +++ /dev/null @@ -1,219 +0,0 @@ -# testthat for gg_vimp function -context("gg_vimp tests") - -test_that("gg_vimp classifications", { - ## Load the cached forest - data(rfsrc_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - # Test the forest family - expect_equal(rfsrc_iris$family, "class") - - ## Create the correct gg_error object - gg_dta <- gg_vimp(rfsrc_iris) - - # Test object type - expect_is(gg_dta, "gg_vimp") - - # Test varselect is the same - #expect_equivalent(select(gg_dta$varselect, -names), rfsrc_iris$importance) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Grab only one class... by number. - gg_dta <- gg_vimp(rfsrc_iris, which.outcome=2) - - # Test object type - expect_is(gg_dta, "gg_vimp") - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - # Grab only one class... by number - for the overall model. 
- gg_dta <- gg_vimp(rfsrc_iris, which.outcome=0) - - # Test object type - expect_is(gg_dta, "gg_vimp") - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - # Grab only one class... by name - for the overall model. - gg_dta <- gg_vimp(rfsrc_iris, which.outcome="all") - - # Test object type - expect_is(gg_dta, "gg_vimp") - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - # Grab only one class... by name - for the overall model. - gg_dta <- gg_vimp(rfsrc_iris, which.outcome="setosa") - - # Test object type - expect_is(gg_dta, "gg_vimp") - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - # Grab only one class... by name - that doesn't exist. - expect_error(gg_vimp(rfsrc_iris, which.outcome="nothing special")) - - # Grab only one class... by number - that doesn't exist. 
- expect_error(gg_vimp(rfsrc_iris, which.outcome=200)) - - ## Single class/ - iris2 <- iris - iris2$spec <- factor(as.character(iris2$Species) == "setosa") - iris2 <- iris2[,-which(colnames(iris2) == "Species")] - - rf <- rfsrc(spec~., iris2, importance=TRUE) - - gg_dta <- gg_vimp(rf) - - expect_is(gg_dta, "gg_vimp") - - # Test passing in the wrong object - expect_error(gg_vimp(gg_dta)) - expect_error(gg_vimp.rfsrc(gg_dta)) - -}) - - -test_that("gg_vimp survival",{ - ## Load the cached forest - data(rfsrc_pbc, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - - ## Create the correct gg_error object - gg_dta <- gg_vimp(rfsrc_pbc) - - # Test object type - expect_is(gg_dta, "gg_vimp") - - # Test varselect is the same - expect_equal(gg_dta$vimp, as.vector(sort(rfsrc_pbc$importance, decreasing=TRUE))) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta, nvar=5) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - expect_is(plot(gg_dta, relative=TRUE), "ggplot") - - # Test cutting the size down - expect_is(gg_dta <- gg_vimp(rfsrc_pbc, nvar=10), "gg_vimp") - expect_equal(nrow(gg_dta), 10) - expect_is(plot(gg_dta), "ggplot") - - # Test the relative vimp output and plotting - expect_is(gg_dta <- gg_vimp(rfsrc_pbc, relative=TRUE), "gg_vimp") - expect_is(plot(gg_dta), "ggplot") - - expect_is(gg_dta <- gg_vimp(rfsrc_pbc, nvar=10, relative=TRUE), "gg_vimp") - expect_is(plot(gg_dta), "ggplot") - - # Test importance calculations. 
- # If the forest does not have importance - rfsrc_pbc$importance <- NULL - expect_warning(gg_dta <- gg_vimp(rfsrc_pbc)) - expect_is(gg_dta, "gg_vimp") - expect_is(plot(gg_dta), "ggplot") - -}) - -test_that("gg_vimp regression",{ - ## Load the cached forest - data(rfsrc_Boston, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - ## Create the correct gg_error object - gg_dta <- gg_vimp(rfsrc_Boston) - - # Test object type - expect_is(gg_dta, "gg_vimp") - - # Test varselect is the same - expect_equal(gg_dta$vimp, as.vector(sort(rfsrc_Boston$importance, decreasing=TRUE))) - - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - ## Test plotting the gg_error object - gg_plt <- plot(gg_dta, relative=TRUE) - - # Test return is s ggplot object - expect_is(gg_plt, "ggplot") - - data(Boston, package="MASS") - - cls <- sapply(Boston, class) - # - lbls <- - #crim - c("Crime rate by town.", - # zn - "Proportion of residential land zoned for lots over 25,000 sq.ft.", - # indus - "Proportion of non-retail business acres per town.", - # chas - "Charles River (tract bounds river).", - # nox - "Nitrogen oxides concentration (10 ppm).", - # rm - "Number of rooms per dwelling.", - # age - "Proportion of units built prior to 1940.", - # dis - "Distances to Boston employment center.", - # rad - "Accessibility to highways.", - # tax - "Property-tax rate per $10,000.", - # ptratio - "Pupil-teacher ratio by town.", - # black - "Proportion of blacks by town.", - # lstat - "Lower status of the population (percent).", - # medv - "Median value of homes ($1000s).") - - # Build a table for data description - dta.labs <- data.frame(cbind(Variable=names(cls), Description=lbls, type=cls)) - - # Build a named vector for labeling figures later/ - st.labs <- as.character(dta.labs$Description) - names(st.labs) <- names(cls) - - ## Test plotting the rfsrc object - gg_plt <- 
plot.gg_vimp(rfsrc_Boston, lbls=st.labs, relative=TRUE, - bars=rfsrc_Boston$xvar.names) - expect_is(gg_plt, "ggplot") - - -}) diff --git a/tests/testthat/test_lint.R b/tests/testthat/test_lint.R deleted file mode 100644 index 6f01229b..00000000 --- a/tests/testthat/test_lint.R +++ /dev/null @@ -1,6 +0,0 @@ -if (requireNamespace("lintr", quietly = TRUE)) { - context("lints") -# test_that("Package Style", -# lintr::expect_lint_free() -# }) -} \ No newline at end of file diff --git a/tests/testthat/test_partial.rfsrc.R b/tests/testthat/test_partial.rfsrc.R deleted file mode 100644 index 543d0c8e..00000000 --- a/tests/testthat/test_partial.rfsrc.R +++ /dev/null @@ -1,153 +0,0 @@ -# testthat for gg_vimp function -context("partial.rfsrc tests") - -test_that("partial.rfsrc regression",{ - - ## Load the cached forest - #data(rfsrc_Boston, package="ggRandomForests") - data(Boston, package="MASS") - - Boston$chas <- as.logical(Boston$chas) - - rfsrc_Boston <- rfsrc(medv~., data=Boston, ntree=100) - # Test the cached forest type - expect_is(rfsrc_Boston, "rfsrc") - - xvar <- c("lstat","chas") - ## Create the correct object - gg_dta <- partial.rfsrc(rfsrc_Boston, - xvar.names = xvar, - npts=10) - expect_equal(gg_dta$xvar.names, xvar) - expect_equal(names(gg_dta$pData), xvar) - expect_equal(gg_dta$pData[[1]]$xvar.name, xvar[1]) - expect_equal(gg_dta$pData[[2]]$xvar.name, xvar[2]) - expect_equal(length(gg_dta$pData[[1]]$yhat),10) - expect_equal(length(gg_dta$pData[[2]]$yhat),length(unique(rfsrc_Boston$xvar$chas))) - - ## Correct npts spec. - gg_dta <- partial.rfsrc(rfsrc_Boston, - xvar.names = c("lstat","chas"), - npts=-1) - - ## Number of vars spec. 
- gg_dta <- partial.rfsrc(rfsrc_Boston, - nvar = 2, - npts=3) - - ## subset by row numbers - gg_dta <- partial.rfsrc(rfsrc_Boston, - xvar.names = c("rm"), - subset=which(rfsrc_Boston$xvar$chas == 1), - npts=3) - ## subset by logicals - gg_dta <- partial.rfsrc(rfsrc_Boston, - xvar.names = c("rm"), - subset=rfsrc_Boston$xvar$chas == 1, - npts=3) - ##incorrect subset - expect_error(partial.rfsrc(rfsrc_Boston, - xvar.names = c("rm"), - subset=FALSE, - npts=10)) -}) - -test_that("partial.rfsrc survival",{ - ## Load the cached forest - #data(rfsrc_pbc, package="ggRandomForests") - data(pbc, package="randomForestSRC") - - # For whatever reason, the age variable is in days... makes no sense to me - for(ind in 1:dim(pbc)[2]){ - if(!is.factor(pbc[,ind])){ - if(length(unique(pbc[which(!is.na(pbc[,ind])),ind])) <= 2) { - if(sum(range(pbc[,ind],na.rm=TRUE) == c(0,1)) == 2){ - pbc[,ind] <- as.logical(pbc[,ind]) - } - } - }else{ - if(length(unique(pbc[which(!is.na(pbc[,ind])),ind])) <= 2) { - if(sum(sort(unique(pbc[,ind])) == c(0,1)) == 2){ - pbc[,ind] <- as.logical(pbc[,ind]) - } - if(sum(sort(unique(pbc[,ind])) == c(FALSE, TRUE)) == 2){ - pbc[,ind] <- as.logical(pbc[,ind]) - } - } - } - if(!is.logical(pbc[, ind]) & - length(unique(pbc[which(!is.na(pbc[,ind])),ind])) <= 5) { - pbc[,ind] <- factor(pbc[,ind]) - } - } - # Convert age to years - pbc$age <- pbc$age / 364.24 - - pbc$years <- pbc$days / 364.24 - pbc <- pbc[, -which(colnames(pbc) == "days")] - pbc$treatment <- as.numeric(pbc$treatment) - pbc$treatment[which(pbc$treatment == 1)] <- "DPCA" - pbc$treatment[which(pbc$treatment == 2)] <- "placebo" - pbc$treatment <- factor(pbc$treatment) - - dta.train <- pbc[-which(is.na(pbc$treatment)),] - # Create a test set from the remaining patients - pbc.test <- pbc[which(is.na(pbc$treatment)),] - rfsrc_pbc <- rfsrc(Surv(years, status) ~ ., dta.train, nsplit = 10, - na.action="na.impute", ntree=100) - - # Test the cached forest type - expect_is(rfsrc_pbc, "rfsrc") - xvar <- c("age", 
"copper") - ## Create the correct object - gg_dta <- partial.rfsrc(rfsrc_pbc, - xvar.names = xvar, - npts=10, surv.type="surv") - - expect_equal(gg_dta$xvar.names, xvar) - expect_equal(names(gg_dta$pData), xvar) - expect_equal(gg_dta$pData[[1]]$xvar.name, xvar[1]) - expect_equal(gg_dta$pData[[2]]$xvar.name, xvar[2]) - expect_equal(length(gg_dta$pData[[1]]$yhat),10) - - - # pretend we have an unsupervised forest - rfsrc_pbc$family <- "unsupv" - expect_error(partial.rfsrc(rfsrc_pbc, - npts=10)) -}) - -test_that("partial.rfsrc classification",{ - ## Load the cached forest - #data(rfsrc_iris, package="ggRandomForests") - rfsrc_iris <- rfsrc(Species ~., data = iris, ntree=100) - ## Load the cached forest - data(partial_iris, package="ggRandomForests") - - # Test the cached forest type - expect_is(rfsrc_iris, "rfsrc") - - ## Create the correct object - gg_dta <- partial.rfsrc(rfsrc_iris, - npts=10) - - ## Create the correct object - gg_dta <- partial.rfsrc(rfsrc_iris, - npts=10, which.outcome="versicolor") - - ## Create the correct object - gg_dta <- partial.rfsrc(rfsrc_iris, - npts=10, which.outcome=2) - # Wrong "rfsrc" type - expect_error(partial.rfsrc(partial_iris, - npts=10)) - - # No "forest" stored in rfsrc object - rfsrc_iris$forest <- NULL - expect_error(partial.rfsrc(rfsrc_iris, - npts=10)) - - # Incorrect xvar name. - expect_error(partial.rfsrc(rfsrc_iris,xvar.names = "lmstat", - npts=10)) -}) \ No newline at end of file diff --git a/tests/testthat/test_quantile_pts.R b/tests/testthat/test_quantile_pts.R deleted file mode 100644 index f6297eb9..00000000 --- a/tests/testthat/test_quantile_pts.R +++ /dev/null @@ -1,33 +0,0 @@ -# testthat for quantile_pts function -context("quantile_pts tests") - -test_that("cutting a vector at evenly space points",{ - data(rfsrc_Boston) - - # To create 6 intervals, we want 7 points. 
- # quantile_pts will find balanced intervals - rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups=6, intervals=TRUE) - - expect_is(rm_pts, "numeric") - expect_equal(length(rm_pts), 6 + 1) - - # When calculating intervals, we subtract 1.e-7 from the min value - expect_true(abs(min(rfsrc_Boston$xvar$rm) - min(rm_pts)) > 0) - - # Use cut to create the intervals - rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks=rm_pts) - expect_is(rm_grp, "factor") - expect_equal(length(rm_grp), length(rfsrc_Boston$xvar$rm)) - expect_equal(length(levels(rm_grp)), length(rm_pts) - 1) - - rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups=6) - - expect_is(rm_pts, "numeric") - expect_equal(length(rm_pts), 6) - # When calculating intervals, we subtract 1.e-7 from the min value - expect_equal(min(rfsrc_Boston$xvar$rm), min(rm_pts), tolerance = 1.e-7) - - # Test the number of points for lots of groups. - rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups=nrow(rfsrc_Boston$xvar) + 2) - expect_equal(length(rm_pts), length(unique(rfsrc_Boston$xvar$rm))) -}) diff --git a/tests/testthat/test_shift.R b/tests/testthat/test_shift.R deleted file mode 100644 index 5503e2f0..00000000 --- a/tests/testthat/test_shift.R +++ /dev/null @@ -1,10 +0,0 @@ -# testthat for shift function -context("shift tests") - -test_that("lead or lag a vector",{ - expect_that(shift(1:10, 2),is_identical_to(c(3:10, NA, NA))) - expect_that(shift(1:10, -2), is_identical_to(c(NA, NA, 1:8))) - expect_that(shift(1:10, 0), is_identical_to(1:10)) - expect_that(shift(1:10, 0), is_identical_to(1:10)) - expect_that(shift(1:10, 1:2), is_identical_to(cbind(c(2:10, NA),c(3:10, NA, NA)))) -}) \ No newline at end of file diff --git a/tests/testthat/test_surface_matrix.R b/tests/testthat/test_surface_matrix.R deleted file mode 100644 index 372d8249..00000000 --- a/tests/testthat/test_surface_matrix.R +++ /dev/null @@ -1,47 +0,0 @@ -# testthat for surface_matrix function -context("surface_matrix tests") - -test_that("cutting a vector at evenly 
space points",{ - - # Load the stored rfsrc and partial coplot data. - data(rfsrc_Boston) - rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups=50) - - ## From vignette(randomForestRegression, package="ggRandomForests") - ## - # Load the stored partial coplot data. - data(partial_Boston_surf) - - # Instead of groups, we want the raw rm point values, - # To make the dimensions match, we need to repeat the values - # for each of the 50 points in the lstat direction - rm.tmp <- do.call(c,lapply(rm_pts, - function(grp){ - rep(grp, length(partial_Boston_surf)) - })) - - # Convert the list of plot.variable output to - partial_surf <- do.call(rbind,lapply(partial_Boston_surf, gg_partial)) - - # attach the data to the gg_partial_coplot - partial_surf$rm <- rm.tmp - - # Transform the gg_partial_coplot object into a list of three named matrices - # for surface plotting with plot3D::surf3D - expect_warning(srf <- surface_matrix(partial_surf, c("lstat", "rm", "yhat"))) - - # a list, - expect_is(srf, "list") - - # with 3 dimensions - expect_equal(length(srf), 3) - - expect_equal(nrow(srf[[1]]), 50) - expect_equal(ncol(srf[[1]]), length(partial_Boston_surf)) - - expect_equal(nrow(srf[[1]]), nrow(srf[[2]])) - expect_equal(nrow(srf[[1]]), nrow(srf[[3]])) - expect_equal(ncol(srf[[1]]), ncol(srf[[2]])) - expect_equal(ncol(srf[[1]]), ncol(srf[[3]])) - -}) \ No newline at end of file diff --git a/vignettes/fig-rfs/rfs-albumin-bili-1.pdf b/vignettes/fig-rfs/rfs-albumin-bili-1.pdf deleted file mode 100644 index 85d58e61..00000000 Binary files a/vignettes/fig-rfs/rfs-albumin-bili-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-albumin-coplot-1.pdf b/vignettes/fig-rfs/rfs-albumin-coplot-1.pdf deleted file mode 100644 index 4bb086f3..00000000 Binary files a/vignettes/fig-rfs/rfs-albumin-coplot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-bili-albumin-1.pdf b/vignettes/fig-rfs/rfs-bili-albumin-1.pdf deleted file mode 100644 index 2ac490d9..00000000 Binary 
files a/vignettes/fig-rfs/rfs-bili-albumin-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-bili-coplot-1.pdf b/vignettes/fig-rfs/rfs-bili-coplot-1.pdf deleted file mode 100644 index b68990a0..00000000 Binary files a/vignettes/fig-rfs/rfs-bili-coplot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-categoricalEDA-1.pdf b/vignettes/fig-rfs/rfs-categoricalEDA-1.pdf deleted file mode 100644 index 85725c93..00000000 Binary files a/vignettes/fig-rfs/rfs-categoricalEDA-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-continuousEDA-1.pdf b/vignettes/fig-rfs/rfs-continuousEDA-1.pdf deleted file mode 100644 index 74d49492..00000000 Binary files a/vignettes/fig-rfs/rfs-continuousEDA-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-coplot_bilirubin-1.pdf b/vignettes/fig-rfs/rfs-coplot_bilirubin-1.pdf deleted file mode 100644 index b00c8e57..00000000 Binary files a/vignettes/fig-rfs/rfs-coplot_bilirubin-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-depthVimp-1.pdf b/vignettes/fig-rfs/rfs-depthVimp-1.pdf deleted file mode 100644 index 464bb6cb..00000000 Binary files a/vignettes/fig-rfs/rfs-depthVimp-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-errorPlot-1.pdf b/vignettes/fig-rfs/rfs-errorPlot-1.pdf deleted file mode 100644 index 7f4831dc..00000000 Binary files a/vignettes/fig-rfs/rfs-errorPlot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-gg_survival-bili-1.pdf b/vignettes/fig-rfs/rfs-gg_survival-bili-1.pdf deleted file mode 100644 index 86ef5492..00000000 Binary files a/vignettes/fig-rfs/rfs-gg_survival-bili-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-mindepth-plot-1.pdf b/vignettes/fig-rfs/rfs-mindepth-plot-1.pdf deleted file mode 100644 index 541991a8..00000000 Binary files a/vignettes/fig-rfs/rfs-mindepth-plot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-pbc-partial-edema-1.pdf b/vignettes/fig-rfs/rfs-pbc-partial-edema-1.pdf deleted file 
mode 100644 index ac56217b..00000000 Binary files a/vignettes/fig-rfs/rfs-pbc-partial-edema-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-pbc-partial-panel-1.pdf b/vignettes/fig-rfs/rfs-pbc-partial-panel-1.pdf deleted file mode 100644 index c8c2c4b3..00000000 Binary files a/vignettes/fig-rfs/rfs-pbc-partial-panel-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-plot_gg_cum_hazard-1.pdf b/vignettes/fig-rfs/rfs-plot_gg_cum_hazard-1.pdf deleted file mode 100644 index 57ddd999..00000000 Binary files a/vignettes/fig-rfs/rfs-plot_gg_cum_hazard-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-plot_gg_survival-1.pdf b/vignettes/fig-rfs/rfs-plot_gg_survival-1.pdf deleted file mode 100644 index 4c6ee2d6..00000000 Binary files a/vignettes/fig-rfs/rfs-plot_gg_survival-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-predictPlot-1.pdf b/vignettes/fig-rfs/rfs-predictPlot-1.pdf deleted file mode 100644 index 3c940dff..00000000 Binary files a/vignettes/fig-rfs/rfs-predictPlot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-rf-vimp-1.pdf b/vignettes/fig-rfs/rfs-rf-vimp-1.pdf deleted file mode 100644 index f0d0fd84..00000000 Binary files a/vignettes/fig-rfs/rfs-rf-vimp-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-rfsrc-mean2-1.pdf b/vignettes/fig-rfs/rfs-rfsrc-mean2-1.pdf deleted file mode 100644 index 37159f48..00000000 Binary files a/vignettes/fig-rfs/rfs-rfsrc-mean2-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-rfsrc-plot-1.pdf b/vignettes/fig-rfs/rfs-rfsrc-plot-1.pdf deleted file mode 100644 index f0bf4928..00000000 Binary files a/vignettes/fig-rfs/rfs-rfsrc-plot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-rfsrc-plot3Mnth-1.pdf b/vignettes/fig-rfs/rfs-rfsrc-plot3Mnth-1.pdf deleted file mode 100644 index 0eaad4df..00000000 Binary files a/vignettes/fig-rfs/rfs-rfsrc-plot3Mnth-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-surface3d-1.pdf 
b/vignettes/fig-rfs/rfs-surface3d-1.pdf deleted file mode 100644 index d3f1349b..00000000 Binary files a/vignettes/fig-rfs/rfs-surface3d-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-timeSurface3d-1.pdf b/vignettes/fig-rfs/rfs-timeSurface3d-1.pdf deleted file mode 100644 index df700423..00000000 Binary files a/vignettes/fig-rfs/rfs-timeSurface3d-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-var_dep-1.pdf b/vignettes/fig-rfs/rfs-var_dep-1.pdf deleted file mode 100644 index d67c2d04..00000000 Binary files a/vignettes/fig-rfs/rfs-var_dep-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-variable-plot-1.pdf b/vignettes/fig-rfs/rfs-variable-plot-1.pdf deleted file mode 100644 index f5206b4a..00000000 Binary files a/vignettes/fig-rfs/rfs-variable-plot-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-variable-plotCat-1.pdf b/vignettes/fig-rfs/rfs-variable-plotCat-1.pdf deleted file mode 100644 index 5bbafa6f..00000000 Binary files a/vignettes/fig-rfs/rfs-variable-plotCat-1.pdf and /dev/null differ diff --git a/vignettes/fig-rfs/rfs-variable-plotbili-1.pdf b/vignettes/fig-rfs/rfs-variable-plotbili-1.pdf deleted file mode 100644 index ff0a14f7..00000000 Binary files a/vignettes/fig-rfs/rfs-variable-plotbili-1.pdf and /dev/null differ diff --git a/vignettes/ggRandomForests.bib b/vignettes/ggRandomForests.bib deleted file mode 100644 index e3147ca1..00000000 --- a/vignettes/ggRandomForests.bib +++ /dev/null @@ -1,429 +0,0 @@ -@article{yoon:2010, -author={Yoon, Dustin Y. and Smedira, Nicholas G. and Nowicki, Edward R. and Hoercher, Katherine J. and Rajeswaran, Jeevanantham and Blackstone, Eugene H. and Lytle, Bruce W.}, -title={Decision support in surgical management of ischemic cardiomyopathy}, -journal={The Journal of Thoracic and Cardiovascular Surgery}, -volume=139, -Number=2, -pages={283--293}, -month=Feb, -year=2010 -} - -@article{Ishwaran:2008, - author = "Ishwaran, Hemant and Kogalur, Udaya B. 
and Blackstone, Eugene H. and Lauer, Michael S.", -journal = "The Annals of Applied Statistics", -number = "3", -pages = {841--860}, -title = {{Random Survival Forests}}, -volume = {2}, -year = {2008} -} - -@article{Ishwaran:2010a, - author = "Ishwaran, Hemant and Kogalur, Udaya B.", -journal = "Statistics and Probability Letters", -pages = {1056--1064}, -title = {{Consistency of Random Survival Forests}}, -volume = {80}, -year = {2010} -} - -@book{fleming:1991, -author = {Thomas R. Fleming and David P. Harrington}, -title = {Counting Processes and Survival Analysis.}, -publisher={John Wiley \& Sons, New York}, -year={1991} -} - -@article{Ishwaran:2007, -author = {Ishwaran, Hemant}, -journal = {Electronic Journal of Statistics}, -pages = {519--537}, -title = {{Variable Importance in Binary Regression Trees and Forests}}, -volume = {1}, -year = {2007} -} - -@Article{Ishwaran:2007a, - author="Ishwaran, Hemant and Kogalur, Udaya B.", - year="2007", - title={{Random Survival Forests for \proglang{R}}}, - journal= "\proglang{R} News", - volume="7", - issue="2", - pages="25--31" - } - -@Manual{rcore, - title = {\proglang{R}: A Language and Environment for Statistical Computing}, - author = {{\proglang{R} Core Team}}, - organization = {\proglang{R} Foundation for Statistical Computing}, - address = {Vienna, Austria}, - year = {2014}, - url = {http://www.R-project.org/}, - } - -@Article{ ehrlinger:2012a, - AUTHOR = {\myname{Ehrlinger, John} and Ishwaran, Hemant}, - TITLE = {Characterizing {$L_2$}Boosting}, - JOURNAL = {Ann. 
Statist.}, - FJOURNAL = {Annals of Statistics}, - YEAR = {2012}, - VOLUME = {40}, - NUMBER = {2}, - PAGES = {1074--1101}, - ISSN = {0090-5364}, - DOI = {10.1214/12-AOS997}, - ARXIV = {1207.5367}, - SICI = {0090-5364(2012)40:2<1074:CB>2.0.CO;2-M}, -} - -@Article{ ehrlinger:2012b, -title = "{Prediction Error Bagging Enhancements.}", -journal="In preparation", -author = {\myname{Ehrlinger, John} and Ishwaran, Hemant} -} - -@book{cart:1984, -address = {Monterey, CA}, -author = {Breiman, L and Friedman, Jerome H and Olshen, R and Stone, C}, -publisher = {Wadsworth and Brooks}, -title = {{Classification and Regression Trees}}, -year = {1984} -} - -@article{Breiman:2001, -author = {Breiman, Leo}, -journal = {Machine Learning}, -number = {1}, -pages = {5--32}, -publisher = {Kluwer Academic Publishers, Boston}, -title = {{Random Forests}}, -volume = {45}, -year = {2001} -} - -@article{Breiman:twoCultures:2001, -author = {Breiman, Leo}, -journal = {Statistical Science}, -number = {3}, -pages = {199--231}, -title = {{Statistical Modeling: The Two Cultures}}, -volume = {16}, -year = {2001} -} - -@techreport{BreimanOOB:1996e, -author = {Breiman, L}, -institution = {Statistics Department, University of California,Berkeley, CA. 
94708}, -title = {{Out--Of--Bag Estimation}}, -url ={https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf}, -year = {1996} -} - -@article{Breiman:1996, -author = {Breiman, L}, -journal = {Machine Learning}, -pages = {123--140}, -title = {{Bagging Predictors}}, -volume = {26}, -year = {1996} -} - -@article{efron1993introduction, -author = {Efron, B and Tibshirani, R J}, -journal = {New York}, -title = {{An introduction to the bootstrap Chapman and Hall}}, -volume = {436}, -year = {1993} -} - - @Article{Liaw:2002, - title = {Classification and Regression by \pkg{randomForest}}, - author = {Liaw, Andy and Wiener, Matthew}, - journal = {\proglang{R} News}, - year = {2002}, - volume = {2}, - number = {3}, - pages = {18--22}, - } - - @Book{Wickham:2009, - author = {Wickham, Hadley }, - title = {\pkg{ggplot2}: Elegant Graphics for Data Analysis}, - publisher = {Springer-Verlag}, - address = {New York}, - year = {2009}, - isbn = {978-0-387-98140-6}, - URLcomment = {http://had.co.nz/ggplot2/book} - } - -@Article{Ishwaran:2010, -author={Ishwaran, Hemant and Kogalur, Udaya B. and Gorodeski, Eiran Z. and Minn, Andy J. and Lauer, Michael S.}, -year ={2010}, -title={High--Dimensional Variable Selection for Survival Data.}, -journal={J. Amer. Statist. Assoc.}, -volume={105}, -pages={205--217} -} - -@Article{Ishwaran:2011, -author={Ishwaran, Hemant and Kogalur, Udaya B. and Chen, Xi and Minn, Andy J. }, -year ={2011}, -title={Random Survival Forests for High--Dimensional Data.}, -journal={Statist. Anal. 
Data Mining}, -volume={4}, -pages={115--132} -} - -@misc{Ishwaran:RFSRC:2014, - title = {{Random Forests for Survival, Regression and Classification (RF-SRC), \proglang{R} package version 1.6.}}, - author = {Ishwaran, Hemant and Kogalur, Udaya B.}, - year = {2014}, - url = {http://CRAN.R-project.org/package=randomForestSRC} -} - -@Manual{Xie:2015, - title = {\pkg{knitr}: A General-Purpose Package for Dynamic Report Generation in \proglang{R}}, - author = {Yihui Xie}, - year = {2015}, - note = {\proglang{R} package version 1.9}, - url = {http://yihui.name/knitr/} -} - -@Book{Xie:2013, - title = {Dynamic Documents with \proglang{R} and \pkg{knitr}}, - author = {Yihui Xie}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2013}, - note = {ISBN 978-1482203530}, - url = {http://yihui.name/knitr/}, -} - -@InCollection{Xie:2014, - booktitle = {Implementing Reproducible Computational Research}, - editor = {Victoria Stodden and Friedrich Leisch and Roger D. Peng}, - title = {\pkg{knitr}: A Comprehensive Tool for Reproducible Research in \proglang{R}}, - author = {Yihui Xie}, - publisher = {Chapman and Hall/CRC}, - year = {2014}, - note = {ISBN 978-1466561595}, - url = {http://www.crcpress.com/product/isbn/9781466561595}, -} - -@Book{xie:2013a, -author={Yihui Xie}, -year={2013}, -title={{Dynamic Documents with \proglang{R} and \pkg{knitr}}}, -publisher={Chapman and Hall/CRC}, -isbn={978-1482203530} -} - -@incollection{xie:2013b, -author={Yihui Xie}, -year={2013}, -title={{\pkg{knitr}: A Comprehensive Tool for Reproducible Research in \proglang{R}}}, -editors={Victoria Stodden, Friedrich Leisch and Roger D. 
Peng}, -booktitle={Implementing Reproducible Computational Research.}, -publisher={Chapman and Hall/CRC}, -isbn={978-1466561595} -} - -@Article{ Breiman:1998, - author = "Leo Breiman", - title = {{Arcing Classifier (with Discussion and a Rejoinder by the Author)}}, - year = "1998", - journal = "The Annals of Statistics", - volume = "26", - number = "3", - pages = "801--849" -} - -@article{Breiman:2011, - abstract = {{There are two cultures in the use of statistical modeling to reach conclusions from data. One assumes that the data are generated by a given stochastic data model. The other uses algorithmic models and treats the data mechanism as unknown. The statistical community has been committed to the almost exclusive use of data models. This commitment has led to irrelevant theory, questionable conclusions, and has kept statisticians from working on a large range of interesting current problems. Algorithmic modeling, both in theory and practice, has developed rapidly in fields outside statistics. It can be used both on large complex data sets and as a more accurate and informative alternative to data modeling on smaller data sets. If our goal as a field is to use data to solve problems, then we need to move away from exclusive dependence on data models and adopt a more diverse set of tools.}}, - author = {Breiman, Leo}, - citeulike-article-id = {7796484}, - citeulike-linkout-0 = {http://dx.doi.org/10.1214/ss/1009213726}, - doi = {10.1214/ss/1009213726}, - issn = {0883-4237}, - journal = {Statistical Science}, - keywords = {modeling}, - month = aug, - number = {3}, - pages = {199--231}, - posted-at = {2011-11-17 17:29:35}, - priority = {3}, - title = {{Statistical Modeling: The Two Cultures (with comments and a rejoinder by the author)}}, - url = {http://dx.doi.org/10.1214/ss/1009213726}, - volume = {16}, - year = {2001} -} - -@Book{ bootstrap:1994, - abstract = "{Statistics is a subject of many uses and surprisingly few effective practitioners. 
The traditional road to statistical knowledge is blocked, for most, by a formidable wall of mathematics. The approach in An Introduction to the Bootstrap avoids that wall. It arms scientists and engineers, as well as statisticians, with the computational techniques they need to analyze and understand complicated data sets.}", - author = "Bradley Efron and Robert Tibshirani", - isbn = "0412042312", - month = "May", - priority = "4", - publisher = "{Chapman \& Hall/CRC}", - title = "An Introduction to the Bootstrap", - year = "1994" -} - -@ARTICLE{Friedman:2000, - author = {Jerome H. Friedman}, - title = {Greedy Function Approximation: A Gradient Boosting Machine}, - journal = {Annals of Statistics}, - year = {2000}, - volume = {29}, - pages = {1189--1232} -} - -@article{Breiman01statisticalmodeling:, - author = {Leo Breiman}, - title = {Statistical modeling: The two cultures}, - journal = {Statistical Science}, - year = {2001} -} - -@article{cleveland:1979, -author={Cleveland, William S.}, -year={1979}, -title={Robust Locally Weighted Regression and Smoothing Scatterplots}, -journal={Journal of the American Statistical Association}, -volume={74}, -number={368}, -pages={829-836} -} - -@article{cleveland:1981, -author={Cleveland, William S.}, -year={1981}, -title={{LOWESS: A Program for Smoothing Scatterplots by Robust Locally Weighted Regression}}, -journal={The American Statistician}, -volume={35}, -number={1}, -pages={54} -} - -@article{cleveland:1988, -author={Cleveland, William S. and Devlin, Susan J.}, -year={1988}, -title={{Locally-Weighted Regression: An Approach to Regression Analysis by Local Fitting}}, -journal={Journal of the American Statistical Association}, -volume={83}, -number={403}, -pages={596-610} -} - -@book{chambers:1992, -author={Chambers, J. M.}, -year={1992}, -title={Statistical Models in \proglang{S}}, -editors={J. M. Chambers and T. J. Hastie}, -publisher={Wadsworth {\&} Brooks/Cole} -} - -@book{Becker:1988, -author={Becker, R. A. 
and Chambers, J. M. and Wilks, A. R.}, -year={1988}, -title={The New S Language.}, -publisher={Wadsworth {\&} Brooks/Cole} -} - -@book{cleveland:1993, -author={Cleveland, William S.}, -year={1993}, -title={Visualizing Data}, -publisher={Summit Press} -} - -@Book{StatisticalLearning:2009, - author = "Trevor Hastie and Robert Tibshirani and Jerome H. Friedman", - booktitle = "The Elements of Statistical Learning", - edition = "Second", - isbn = "978-0-387-84857-0", - month = aug, - publisher = "Springer-Verlag", - address = {New York}, - title = "The Elements of Statistical Learning: Data Mining, Inference, and Prediction", - year = "2009" -} - -@book{Tukey:1977, -author="Tukey, John W.", -year="1977", -title="Exploratory Data Analysis", -publisher="Pearson"} - - -@article{Harrison:1978, -author="Harrison, D. and Rubinfeld, D.L.", -year="1978", -title="Hedonic prices and the demand for clean air", -journal="J. Environ. Economics and Management", -volume="5", -pages="81–102" -} - -@book{Belsley:1980, -author="Belsley, D.A. and Kuh, E. and Welsch, R.E.", -year="1980", -title="Regression Diagnostics. Identifying Influential Data and Sources of Collinearity.", -publisher="John Wiley \& Sons, New York" -} - -@article{Rubin:1976, -author={Rubin, D.B.}, -journal={Biometrika}, -year={1976}, -title={Inference and Missing Data.}, -volume={63}, -pages={581-592} -} - -@Book{mass:2002, - title = {Modern Applied Statistics with \proglang{S}}, - author = {W. N. Venables and B. D. 
Ripley}, - publisher = {Springer-Verlag}, - edition = {Fourth}, - address = {New York}, - year = {2002}, - note = {ISBN 0-387-95457-0}, - url = {http://www.stats.ox.ac.uk/pub/MASS4}, - } - - @Manual{shiny:2015, - title = {\pkg{shiny}: Web Application Framework for \proglang{R}}, - author = {Winston Chang and Joe Cheng and JJ Allaire and Yihui Xie and Jonathan McPherson}, - year = {2015}, - note = {\proglang{R} package version 0.11.1}, - url = {http://CRAN.R-project.org/package=shiny}, - } - - @Manual{rcolorbrewer:2014, - title = {\pkg{RColorBrewer}: ColorBrewer Palettes}, - author = {Erich Neuwirth}, - year = {2014}, - note = {\proglang{R} package version 1.1-2}, - url = {http://CRAN.R-project.org/package=RColorBrewer}, - } - @Manual{plot3D:2014, - title = {\pkg{plot3D}: Plotting multi-dimensional data.}, - author = {Karline Soetaert}, - year = {2014}, - note = {R package version 1.0-2}, - url = {http://CRAN.R-project.org/package=plot3D}, - } - - -@article{blackstone:1986, - Author = {Blackstone, Eugene H. and Naftel, David C. and Turner, Malcolm E.}, - Journal = {Journal of the American Statistical Association}, - Number = {395}, - Pages = {615-624}, - Title = {The Decomposition of Time-Varying Hazard into Phases, Each Incorporating a Separate Stream of Concomitant Information}, - Volume = {81}, - Year = {1986} - } diff --git a/vignettes/randomForestSRC-Survival.Rnw b/vignettes/randomForestSRC-Survival.Rnw deleted file mode 100755 index d9476fc1..00000000 --- a/vignettes/randomForestSRC-Survival.Rnw +++ /dev/null @@ -1,1243 +0,0 @@ -\documentclass[nojss]{jss} - -\usepackage{setspace} -% \usepackage[sc]{mathpazo} -\usepackage{amsmath} -% \setcounter{secnumdepth}{2} -% \setcounter{tocdepth}{2} -%\usepackage{colortbl} - -\usepackage{xcolor} -\usepackage{booktabs} - -% For kable tables, these help. 
-\usepackage{floatrow} -\floatsetup[table]{capposition=bottom} -\floatplacement{table}{htb} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% declarations for jss.cls %% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%\VignetteEngine{knitr::knitr} -%\VignetteIndexEntry{randomForestSRC-Survival} -%\VignetteIndexEntry{ggRandomForests: Survival with random forest} -%\VignetteKeywords{random forest, survival, VIMP, minimal depth} -%\VignetteDepends{ggRandomForests} -%\VignettePackage{ggRandomForests} - -%% almost as usual -\author{John Ehrlinger \and Jeevanantham Rajeswaran \and Eugene H. Blackstone\\Cleveland Clinic } - -\title{\pkg{ggRandomForests}: Exploring Random Forest Survival} - -%% for pretty printing and a nice hypersummary also set: -\Plainauthor{Ehrlinger, Rajeswaran and Blackstone} %% comma-separated -\Plaintitle{ggRandomForests: Exploring Random Forest Survival} %% without formatting -\Shorttitle{Exploring Random Forest Survival} - -%% an abstract and keywords -\Abstract{ -Random forest~\citep{Breiman:2001} (RF) is a non-parametric statistical method requiring no distributional assumptions on covariate relation to the response. RF is a robust, nonlinear technique that optimizes predictive accuracy by fitting an ensemble of trees to stabilize model estimates. Random survival forests (RSF)~\citep{Ishwaran:2007a, Ishwaran:2008} are an extension of Breiman's RF techniques allowing efficient non-parametric analysis of time to event data. The \pkg{randomForestSRC} package~\citep{Ishwaran:RFSRC:2014} is a unified treatment of Breiman's random forest for survival, regression and classification problems. - -Predictive accuracy makes RF an attractive alternative to parametric models, though complexity and interpretability of the forest hinder wider application of the method. We introduce the \pkg{ggRandomForests} package, tools for visually understand random forest models grown in \proglang{R}~\citep{rcore} with the \pkg{randomForestSRC} package. 
The \pkg{ggRandomForests} package is structured to extract intermediate data objects from \pkg{randomForestSRC} objects and generate figures using the \pkg{ggplot2}~\citep{Wickham:2009} graphics package. - -This document is structured as a tutorial for building random forest for survival with the \pkg{randomForestSRC} package and using the \pkg{ggRandomForests} package for investigating how the forest is constructed. We analyse the Primary Biliary Cirrhosis of the liver data from a clinical trial at the Mayo Clinic~\citep{fleming:1991}. We demonstrate random forest variable selection using Variable Importance (VIMP)~\citep{Breiman:2001} and Minimal Depth~\citep{Ishwaran:2010}, a property derived from the construction of each tree within the forest. We will also demonstrate the use of variable dependence and partial dependence plots~\citep{Friedman:2000} to aid in the interpretation of RSF results. We then examine variable interactions between covariates using conditional variable dependence plots. Our aim is to demonstrate the strength of using Random Forest methods for both prediction and information retrieval, specifically in time to event data settings. 
-} -\Keywords{random forest, survival, VIMP, minimal depth, \proglang{R}, \pkg{randomForestSRC}} -\Plainkeywords{random forest, survival, VIMP, minimal depth, R, randomForestSRC} -%% at least one keyword must be supplied - -%% publication information -%% NOTE: Typically, this can be left commented and will be filled out by the technical editor -%% \Volume{13} -%% \Issue{9} -%% \Month{September} -%% \Year{2004} -\Submitdate{2015-04-06} -%% \Acceptdate{2004-09-29} - -%% The address of (at least) one author should be given -%% in the following format: -\Address{ -John Ehrlinger\\ -Quantitative Health Sciences\\ -Lerner Research Institute\\ -Cleveland Clinic\\ -9500 Euclid Ave\\ -Cleveland, Ohio 44195\\ -% Telephone: + 41/0/44634-4643 \\ -% Fax: + 41/0/44634-4386 \\ -E-mail: \email{john.ehrlinger@gmail.com}\\ -URL: \url{https://github.com/ehrlinger/ggRandomForests} -} - -%% It is also possible to add a telephone and fax number -%% before the e-mail in the following format: -%% Telephone: + 43/1/31336-5053 -%% Fax: + 43/1/31336-734 - -%% for those who use Sweave please include the following line (with % symbols): -%% need no \usepackage{Sweave.sty} - -%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -<>= -## Not displayed ## -library("knitr") -knitr::render_sweave() -# set global chunk options for knitr. 
These can be changed in the header for each individual R code chunk -opts_chunk$set(fig.path = 'fig-rfs/rfs-', - fig.align = 'center', - fig.pos = "!htb", - fig.show = 'hold', - fig.height = 3, - fig.width = 4, - size = 'footnotesize', - prompt = TRUE, - highlight = FALSE, - comment = NA, - echo = FALSE, # Change this to TRUE if you want to see all the code examples - results = FALSE, - message = FALSE, - warning = FALSE, - error = FALSE, - dev = 'pdf', prompt = TRUE) - -# Setup the R environment -options(object.size = Inf, expressions = 100000, memory = Inf, - replace.assign = TRUE, width = 90, prompt = "R> ") -options(mc.cores = 1, rf.cores = 0, stringsAsFactors = FALSE) -@ - -\begin{document} -%\doublespacing - -% ----------------------------------------------------- -\section{Introduction} \label{S:introduction} -% ----------------------------------------------------- - -Random forest~\citep{Breiman:2001} (RF) is a non-parametric statistical method which requires no distributional assumptions on covariate relation to the response. RF is a robust, nonlinear technique that optimizes predictive accuracy by fitting an ensemble of trees to stabilize model estimates. Random Survival Forest (RSF)~\citep{Ishwaran:2007a,Ishwaran:2008} is an extension of Breiman's RF techniques to survival settings, allowing efficient non-parametric analysis of time to event data. The \pkg{randomForestSRC} package~\citep[\url{http://CRAN.R-project.org/package=randomForestSRC}]{Ishwaran:RFSRC:2014} is a unified treatment of Breiman's random forest for survival, regression and classification problems. - -Predictive accuracy make RF an attractive alternative to parametric models, though complexity and interpretability of the forest hinder wider application of the method. We introduce the \pkg{ggRandomForests} package (\url{http://CRAN.R-project.org/package=ggRandomForests}) for visually exploring random forest models. 
The \pkg{ggRandomForests} package is structured to extract intermediate data objects from \pkg{randomForestSRC} objects and generate figures using the \pkg{ggplot2} graphics package~\citep[\url{http://CRAN.R-project.org/package=ggplot2}]{Wickham:2009}. - -Many of the figures created by the \pkg{ggRandomForests} package are also available directly from within the \pkg{randomForestSRC} package. However, \pkg{ggRandomForests} offers the following advantages: -\begin{itemize} -\item Separation of data and figures: \pkg{ggRandomForests} contains functions that operate on either the \code{rfsrc} forest object directly, or on the output from \pkg{randomForestSRC} post processing functions (e.g., \code{plot.variable}, \code{var.select}) to generate intermediate \pkg{ggRandomForests} data objects. \pkg{ggRandomForests} functions are provided to further process these objects and plot results using the \pkg{ggplot2} graphics package. Alternatively, users can use these data objects for their own custom plotting or analysis operations. - -\item Each data object/figure is a single, self-contained unit. This allows simple modification and manipulation of the data or \code{ggplot} objects to meet users' specific needs and requirements. - -\item We chose to use the \pkg{ggplot2} package for our figures for flexibility in modifying the output. Each \pkg{ggRandomForests} plot function returns either a single \code{ggplot} object, or a \code{list} of \code{ggplot} objects, allowing the use of additional \pkg{ggplot2} functions to modify and customize the final figures. -\end{itemize} - -This document is structured as a tutorial for using the \pkg{randomForestSRC} package for building and post-processing random survival forest models and using the \pkg{ggRandomForests} package for understanding how the forest is constructed.
In this tutorial, we will build a random survival forest for the primary biliary cirrhosis (PBC) of the liver data set~\citep{fleming:1991}, available in the \pkg{randomForestSRC} package. - -In Section~\ref{S:data} we introduce the \code{pbc} data set and summarize the proportional hazards analysis of this data from Chapter 4 of~\cite{fleming:1991}. In Section~\ref{S:rfsrc}, we describe how to grow a random survival forest with the \pkg{randomForestSRC} package. Random forest is not a parsimonious method, but uses all variables available in the data set to construct the response predictor. We demonstrate random forest variable selection techniques (Section~\ref{S:variableselection}) using Variable Importance (VIMP)~\citep{Breiman:2001} in Section~\ref{S:vimp} and Minimal Depth~\citep{Ishwaran:2010} in Section~\ref{S:minimalDepth}. We then compare both methods with variables used in the~\cite{fleming:1991} model. - -Once we have an idea of which variables we are most interested in, we use dependence plots~\citep{Friedman:2000} (Section~\ref{S:dependence}) to understand how these variables are related to the response. Variable dependence (Section~\ref{S:variabledependence}) plots give us an idea of the overall trend of a variable/response relation, while partial dependence plots (Section~\ref{S:partialdependence}) show us the risk adjusted relation by averaging out the effects of other variables. Dependence plots often show strongly non-linear variable/response relations that are not easily obtained through parametric modeling. - -We then graphically examine forest variable interactions with the use of variable and partial dependence conditioning plots (coplots)~\citep{chambers:1992,cleveland:1993} (Section~\ref{S:coplots}) and close with concluding remarks in Section~\ref{S:conclusion}. 
- -<>= -################## Load packages ################## -library("ggplot2") # Graphics engine -library("RColorBrewer") # Nice color palettes -library("plot3D") # for 3d surfaces. -library("dplyr") # Better data manipulations -library("parallel") # mclapply for multicore processing - -# Analysis packages. -library("randomForestSRC") # random forest for survival, regression and - # classification -library("ggRandomForests") # ggplot2 random forest figures (This!) - -################ Default Settings ################## -theme_set(theme_bw()) # A ggplot2 theme with white background - -## Set open circle for censored, and x for events -event.marks <- c(1, 4) -event.labels <- c(FALSE, TRUE) - -## We want red for death events, so reorder this set. -strCol <- brewer.pal(3, "Set1")[c(2,1,3)] -@ - -\section{Data summary: primary biliary cirrhosis (PBC) data set}\label{S:data} - -The \emph{primary biliary cirrhosis} of the liver (PBC) study consists of 424 PBC patients referred to Mayo Clinic between 1974 and 1984 who met eligibility criteria for a randomized placebo controlled trial of the drug D-penicillamine (DPCA). The data is described in~\cite[Chapter 0.2]{fleming:1991} and a partial likelihood model (Cox proportional hazards) is developed in Chapter 4.4. The \code{pbc} data set, included in the \pkg{randomForestSRC} package, contains 418 observations, of which 312 patients participated in the randomized trial~\cite[Appendix D]{fleming:1991}. -<>= -data("pbc", package = "randomForestSRC") -@ - -<>= -library("tidyr") # Transforming wide data into long data (gather) - -## Not displayed ## -## Set modes correctly. For binary variables: transform to logical -## Check for range of 0, 1 -## There is probably a better way to do this. 
-for(ind in 1:dim(pbc)[2]){ - if(!is.factor(pbc[, ind])){ - if(length(unique(pbc[which(!is.na(pbc[, ind])), ind]))<= 2) { - if(sum(range(pbc[, ind], na.rm = TRUE) == c(0, 1)) == 2){ - pbc[, ind] <- as.logical(pbc[, ind]) - } - } - }else{ - if(length(unique(pbc[which(!is.na(pbc[, ind])), ind]))<= 2) { - if(sum(sort(unique(pbc[, ind])) == c(0, 1)) == 2){ - pbc[, ind] <- as.logical(pbc[, ind]) - } - if(sum(sort(unique(pbc[, ind])) == c(FALSE, TRUE)) == 2){ - pbc[, ind] <- as.logical(pbc[, ind]) - } - } - } - if(!is.logical(pbc[, ind]) & - length(unique(pbc[which(!is.na(pbc[, ind])), ind]))<= 5) { - pbc[, ind] <- factor(pbc[, ind]) - } -} -# Convert age to years -pbc$age <- pbc$age/364.24 -pbc$years <- pbc$days/364.24 -pbc <- pbc %>% select(-days) -pbc$treatment <- as.numeric(pbc$treatment) -pbc$treatment[which(pbc$treatment == 1)] <- "DPCA" -pbc$treatment[which(pbc$treatment == 2)] <- "placebo" -pbc$treatment <- factor(pbc$treatment) - -cls <- sapply(pbc, class) - -labels <- c("Event (F = censor, T = death)", - "Treament (DPCA, Placebo)", - "Age (years)", - "Female = T", - "Presence of Asictes", - "Presence of Hepatomegaly", - "Presence of Spiders", - "Edema (0, 0.5, 1)", - "Serum Bilirubin (mg/dl)", - "Serum Cholesterol (mg/dl)", - "Albumin (gm/dl)", - "Urine Copper (ug/day)", - "Alkaline Phosphatase (U/liter)", - "SGOT (U/ml)", - "Triglicerides (mg/dl)", - "Platelets per cubic ml/1000", - "Prothrombin time (sec)", - "Histologic Stage", - "Time (years)") - -dta.labs <- data.frame(cbind(names = colnames(pbc), label = labels, type = cls)) -# Put the "years" variable on top. -dta.labs <- rbind(dta.labs[nrow(dta.labs),], dta.labs[-nrow(dta.labs),]) - -st.labs <- as.character(dta.labs$label) -names(st.labs) <- rownames(dta.labs) -@ - -For this analysis, we modify some of the data for better formatting of our results. Since the data contains about 12 years of follow up, we prefer using \code{years} instead of \code{days} to describe survival. 
We also convert the \code{age} variable to years, and the \code{treatment} variable to a factor containing levels of \code{c("DPCA", "placebo")}. The variable names, type and description are given in Table~\ref{T:dataLabs}. - -<>= -## Not displayed ## -# create a data dictionary table -tmp <- dta.labs -colnames(tmp) <- c("Variable name", "Description", "Type") -kable(tmp, - #format="latex", - caption = "\\label{T:dataLabs}\\code{pbc} data set variable dictionary.", - row.names = FALSE, - booktabs=TRUE) -@ - -\subsection{Exploratory data analysis}\label{S:eda} - -It is good practice to view your data before beginning analysis. Exploratory Data Analysis (EDA)~\cite{Tukey:1977} will help you to understand the data, and find outliers, missing values and other data anomalies within each variable before getting deep into the analysis. To this end, we use \pkg{ggplot2} figures with the \code{facet_wrap} function to create two sets of panel plots, one of histograms for categorical variables (Figure~\ref{fig:categoricalEDA}), and another of scatter plots for continuous variables (Figure~\ref{fig:continuousEDA}). Variables are plotted along a continuous variable on the X-axis to separate the individual observations. - -<>= -## Not displayed ## -# Use tidyr::gather to transform the data into long format. -cnt <- c(which(cls == "numeric" ), which(cls == "integer")) -fct <- setdiff(1:ncol(pbc), cnt) # The complement of numeric/integers. -fct <- c(fct, which(colnames(pbc) == "years")) -dta <- suppressWarnings(gather(pbc[,fct], variable, value, -years)) - -# plot panels for each covariate colored by the logical chas variable. 
-ggplot(dta, aes(x = years, fill = value)) + - geom_histogram(color = "black", binwidth = 1) + - labs(y = "", x = st.labs["years"]) + - scale_fill_brewer(palette="RdBu",na.value = "white" ) + - facet_wrap(~variable, scales = "free_y", nrow = 2) + - theme(legend.position = "none") -@ - -In categorical EDA plots (Figure~\ref{fig:categoricalEDA}), we are looking for patterns of missing data (white portion of bars). We often use surgical date for our X-axis variable to look for possible periods of low enrollment. There is not a comparable variable available in the \code{pbc} data set, so instead we used follow up time (\code{years}). Another reasonable choice may have been to use the patient \code{age} variable for the X-axis. The important quality of the selected variable is to spread the observations out to aid in finding data anomalies. - -<>= -## Not displayed ## -# Use tidyr::gather to transform the data into long format. -cnt <- c(cnt, which(colnames(pbc) == "status")) -dta <- gather(pbc[,cnt], variable, value, -years, -status) - -# plot panels for each covariate colored by the logical chas variable. -ggplot(dta %>% filter(!is.na(value)), - aes(x = years, y = value, color = status, shape = status)) + - geom_point(alpha = 0.4) + - geom_rug(data = dta[which(is.na(dta$value)),], color = "grey50") + - labs(y = "", x = st.labs["years"], color = "Death", shape = "Death") + - scale_color_manual(values = strCol) + - scale_shape_manual(values = event.marks) + - facet_wrap(~variable, scales = "free_y", ncol = 4) + - theme(legend.position = c(0.8, 0.2)) -@ - -In continuous data EDA plots (Figure~\ref{fig:continuousEDA}), we are looking for missingness (rug marks) and extreme or non-physical values. For survival settings, we color and shape the points as red `x's to indicate events, and blue circles to indicate censored observation. - -Extreme value examples are evident in a few of the variables in Figure~\ref{fig:continuousEDA}. 
We are typically looking for values that are outside of the biological range. This is often caused by measurements recorded in differing units, which can sometimes be corrected algorithmically. Since we can not ask the original investigator to clarify these values in this particular study, we will continue without modifying the data. - -<>= -## Not displayed ## -# create a missing data table -pbc.trial <- pbc %>% filter(!is.na(treatment)) -st <- apply(pbc,2, function(rw){sum(is.na(rw))}) -st.t <- apply(pbc.trial,2, function(rw){sum(is.na(rw))}) -st <- data.frame(cbind(full = st, trial = st.t)) -st <- st[which(st$full>0),] -colnames(st) <- c("pbc", "pbc.trial") - -kable(st, - format="latex", - caption = "\\label{T:missing}Missing value counts in \\code{pbc} data set and pbc clinical trial observations (\\code{pbc.trial}).", - digits = 3, - booktabs=TRUE) -@ - -Both EDA figures indicate the \code{pbc} data set contains quite a bit of missing data. Table~\ref{T:missing} shows the number of missing values in each variable of the \code{pbc} data set. Of the \Sexpr{ncol(pbc)} variables in the data, \Sexpr{nrow(st)} have missing values. The \code{pbc} column details variables with missing data in the full \code{pbc} data set, though there are \Sexpr{st["treatment", "full"]} patients that were not randomized into the trial. If we restrict the data to the trial only, most of the missing values are also removed, leaving only \Sexpr{sum(st$pbc.trial>0)} variables with missing values. Therefore, we will focus on the \Sexpr{nrow(pbc.trial)} observations from the clinical trial for the remainder of this document. We will discuss how \pkg{randomForestSRC} handles missing values in Section~\ref{S:imputation}. - -\subsection[PBC Model Summary]{\cite{fleming:1991} Model Summary (\code{gg\_survival})} - -We conclude the data set investigation with a summary of~\cite{fleming:1991} model results from Chapter 4.4. 
We start by generating Kaplan--Meier (KM) survival estimates comparing the treatment groups of DPCA and placebo. We use the \pkg{ggRandomForests} \code{gg_survival} function to generate these estimates from the data set as follows. - -<>= -# Create the trial and test data sets. -pbc.trial <- pbc %>% filter(!is.na(treatment)) -pbc.test <- pbc %>% filter(is.na(treatment)) - -# Create the gg_survival object -gg_dta <- gg_survival(interval = "years", - censor = "status", - by = "treatment", - data = pbc.trial, - conf.int = 0.95) -@ - -% \begin{CodeInput} -% R> pbc.trial <- pbc %>% filter(!is.na(treatment)) -% R> pbc.test <- pbc %>% filter(is.na(treatment)) -% R> -% R> gg_dta <- gg_survival(interval = "years", censor = "status", -% + by = "treatment", data = pbc.trial, -% + conf.int = 0.95) -% \end{CodeInput} -The code block reduces the \code{pbc} data set to the \code{pbc.trial} data set, which only includes observations from the clinical trial. The remaining observations are stored in the \code{pbc.test} data set for later use. The \pkg{ggRandomForests} package is designed to use a two-step process in figure generation. The first step is data generation, where we store a \code{gg_survival} data object in the \code{gg_dta} object. The \code{gg_survival} function uses the \code{data} set, follow up \code{interval}, \code{censor} indicator and an optional grouping argument (\code{by}). By default \code{gg_survival} also calculates a $95\%$ confidence band, which we can control with the \code{conf.int} argument. - -In the figure generation step, we use the \pkg{ggRandomForests} plot routine \code{plot.gg_survival} as shown in the following code block. The \code{plot.gg_survival} function uses the \code{gg_dta} data object to plot the survival estimate curves for each group and corresponding confidence interval ribbons.
We have used additional \pkg{ggplot2} commands to modify the axis and legend labels (\code{labs}), the legend location (\code{theme}) and control the plot range of the y-axis (\code{coord_cartesian}) for this figure. - -<>= -plot(gg_dta) + - labs(y = "Survival Probability", x = "Observation Time (years)", - color = "Treatment", fill = "Treatment") + - theme(legend.position = c(0.2, 0.2)) + - coord_cartesian(y = c(0, 1.01)) -@ -The \code{gg_survival} plot of Figure~\ref{fig:plot_gg_survival} is analogous to~\cite{fleming:1991} Figure 0.2.3 and Figure 4.4.1, showing there is little difference between the treatment and control groups. - -The \code{gg_survival} function generates a variety of time-to-event estimates, including the cumulative hazard. The following code block creates a cumulative hazard plot~\cite[Figure 0.2.1]{fleming:1991} in Figure~\ref{fig:plot_gg_cum_hazard} using the same data object generated by the original \code{gg_survival} function call. The red \code{DPCA} line is directly comparable to Figure 0.2.1; we have added the cumulative hazard estimates for the \code{placebo} population in blue. - -<>= -plot(gg_dta, type = "cum_haz") + - labs(y = "Cumulative Hazard", x = "Observation Time (years)", - color = "Treatment", fill = "Treatment") + - theme(legend.position = c(0.2, 0.8)) + - coord_cartesian(ylim = c(-0.02, 1.22)) -@ - -In Figure~\ref{fig:plot_gg_survival}, we demonstrated grouping on the categorical variable (\code{treatment}). To demonstrate plotting grouped survival on a continuous variable, we examine KM estimates of survival within stratified groups of bilirubin measures. The groupings are obtained directly from~\cite{fleming:1991} Figure 4.4.2, where they presented univariate model results of predicting survival on a function of bilirubin. - -We set up the \code{bili} groups on a temporary data set (\code{pbc.bili}) using the \code{cut} function with intervals matching the reference figure.
For this example we combine the data generation and plot steps into a single line of code. The \code{error} argument of the \code{plot.gg_survival} function is used to control display of the confidence bands. We suppress the intervals for this figure with \code{error = "none"} and again modify the plot display with \pkg{ggplot2} commands to generate Figure~\ref{fig:gg_survival-bili}. - -<>= -pbc.bili <- pbc.trial -pbc.bili$bili_grp <- cut(pbc.bili$bili, breaks = c(0, 0.8, 1.3, 3.4, 29)) - -plot(gg_survival(interval = "years", censor = "status", by = "bili_grp", - data = pbc.bili), error = "none") + - labs(y = "Survival Probability", x = "Observation Time (years)", - color = "Bilirubin") -@ - -In Chapter 4,~\cite{fleming:1991} use partial likelihood methods to build a linear model with log transformations on some variables. We summarize the final, biologically reasonable model in Table~\ref{T:FHmodel} for later comparison with our random forest results. - -<>= -## Not displayed ## -# Create a table summarizing the ph model from fleming and harrington 1991 -fleming.table <- data.frame(matrix(ncol = 3, nrow = 5)) -rownames(fleming.table) <- - c("Age", "log(Albumin)", "log(Bilirubin)", "Edema", "log(Prothrombin Time)") -colnames(fleming.table) <- c("Coef.", "Std. Err.", "Z stat.") -fleming.table[,1] <- c(0.0333, -3.0553,0.8792, 0.7847, 3.0157) -fleming.table[,2] <- c(0.00866, 0.72408,0.09873,0.29913,1.02380) -fleming.table[,3] <- c(3.84,-4.22,8.9,2.62,2.95) - -kable(fleming.table, - format="latex", - caption = "\\label{T:FHmodel}\\code{pbc} proportional hazards model summary of 312 randomized cases in \\code{pbc.trial} data set. ~\\citep[Table 4.4.3c]{fleming:1991} ", - digits = 3, - booktabs=TRUE) -@ - -\section{Random survival forest}\label{S:rfsrc} - -A Random Forest~\citep{Breiman:2001} is grown by \emph{bagging}~\citep{Breiman:1996} a collection of \emph{classification and regression trees} (CART)~\citep{cart:1984}. 
The method uses a set of $B$ \emph{bootstrap}~\citep{bootstrap:1994} samples, growing an independent tree model on each sub-sample of the population. Each tree is grown by recursively partitioning the population based on optimization of a \emph{split rule} over the $p$-dimensional covariate space. At each split, a subset of $m \le p$ candidate variables is tested for the split rule optimization, dividing each node into two daughter nodes. Each daughter node is then split again until the process reaches the \emph{stopping criteria} of either \emph{node purity} or \emph{node member size}, which defines the set of \emph{terminal (unsplit) nodes} for the tree. In regression trees, node impurity is measured by mean squared error, whereas in classification problems, the Gini index is used~\citep{Friedman:2000}. - -Random forest sorts each training set observation into one unique terminal node per tree. Tree estimates for each observation are constructed at each terminal node, among the terminal node members. The Random Forest estimate for each observation is then calculated by aggregating the terminal node results across the collection of $B$ trees, either by averaging (regression) or by voting (classification). - -Random Survival Forests~\citep{Ishwaran:2007, Ishwaran:2008} (RSF) are an extension of Random Forest to analyze right censored, time to event data. A forest of survival trees is grown using a log-rank splitting rule to select the optimal candidate variables. Survival estimates for each observation are constructed with a Kaplan--Meier (KM) estimator within each terminal node, at each event time. - -Random Survival Forests adaptively discover nonlinear effects and interactions and are fully nonparametric. Averaging over many trees enables RSF to approximate complex survival functions, including non-proportional hazards, while maintaining low prediction error.
\cite{Ishwaran:2010a} showed that RSF is uniformly consistent and that survival forests have a uniform approximating property in finite-sample settings, a property not possessed by individual survival trees. - -The \pkg{randomForestSRC} \code{rfsrc} function call grows the forest, determining the type of forest by the response supplied in the \code{formula} argument. In the following code block, we grow a random forest for survival, by passing a survival (\code{Surv}) object to the forest. The forest uses all remaining variables in the \code{pbc.trial} data set to generate the RSF survival model. - -<>= -rfsrc_pbc <- rfsrc(Surv(years, status) ~ ., data = pbc.trial, - nsplit = 10, na.action = "na.impute") -@ - -<>= -# in reality, we use data caching to make vignette -# compilation quicker. The rfsrc_pbc forest is stored -# as a ggRandomForests data sets -# -# This code block produces the R output from the -# rfsrc grow block above. We set the chunk argument -# "echo=FALSE" above so this code does not show up -# in the manuscript. -data("rfsrc_pbc", package = "ggRandomForests") -rfsrc_pbc -@ - -The \code{print.rfsrc} function returns information on how the random forest was grown. Here the \code{family = "surv"} forest has \code{ntree = 1000} trees (the default \code{ntree} argument). The forest selected from \code{ceil}$(\sqrt{p=17}) = 5$ randomly selected candidate variables for splitting at each node, stopping when a terminal node contained three or fewer observations. For continuous variables, we used a random logrank split rule, which randomly selects from \code{nsplit = 10} split point values, instead of optimizing over all possible values. - -\subsection[Generalization Error]{Generalization error (\code{gg\_error})} - -One advantage of random forest is a built in generalization error estimate. Each bootstrap sample selects approximately $63.2\%$ of the population on average. 
The remaining $36.8\%$ of observations, the Out-of-Bag~\citep{BreimanOOB:1996e} (OOB) sample, can be used as a hold out test set for each tree. An OOB prediction error estimate can be calculated for each observation by predicting the response over the set of trees which were not trained with that particular observation. Out-of-Bag prediction error estimates have been shown to be nearly identical to $n$--fold cross validation estimates~\citep{StatisticalLearning:2009}. This feature of random forest allows us to obtain both model fit and validation in one pass of the algorithm. - -The \code{gg_error} function operates on the random forest (\code{rfsrc_pbc}) object to extract the error estimates as a function of the number of trees in the forest. The following code block first creates a \code{gg_error} data object, then uses the \code{plot.gg_error} function to create a \code{ggplot} object for display in a single line of code. - -<>= -plot(gg_error(rfsrc_pbc)) + coord_cartesian(ylim = c(0.09, 0.31)) -@ - -The \code{gg_error} plot of Figure~\ref{fig:errorPlot} demonstrates that it does not take a large number of trees to stabilize the forest prediction error estimate. However, to ensure that each variable has enough of a chance to be included in the forest prediction process, we do want to create a rather large random forest of trees. - -\subsection[Prediction]{Training Set Prediction (\code{gg\_rfsrc})}\label{S:prediction} - -The \code{gg_rfsrc} function extracts the OOB prediction estimates from the random forest. This code block executes the data extraction and plotting in one line, since we are not interested in holding the prediction estimates for later reuse. Each of the \pkg{ggRandomForests} plot commands return \code{ggplot} objects, which we can also store for modification or reuse later in the analysis (\code{ggRFsrc} object). Note that we again use additional \pkg{ggplot2} commands to modify the display of the plot object. 
- -<>= -ggRFsrc <- plot(gg_rfsrc(rfsrc_pbc), alpha = 0.2) + - scale_color_manual(values = strCol) + - theme(legend.position = "none") + - labs(y = "Survival Probability", x = "Time (years)") + - coord_cartesian(ylim = c(-0.01, 1.01)) -show(ggRFsrc) -@ - -The \code{gg_rfsrc} plot of Figure~\ref{fig:rfsrc-plot} shows the predicted survival from our RSF model. Each line represents a single patient in the training data set, where censored patients are colored blue, and patients who have experienced the event (death) are colored in red. We extend all predicted survival curves to the longest follow up time (12 years), regardless of the actual length of a patient's follow up time. - -Interpretation of general survival properties from Figure~\ref{fig:rfsrc-plot} is difficult because of the number of curves displayed. To get more interpretable results, it is preferable to plot a summary of the survival results. The following code block compares the predicted survival between treatment groups, as we did in Figure~\ref{fig:plot_gg_survival}. -<>= -plot(gg_rfsrc(rfsrc_pbc, by = "treatment")) + - theme(legend.position = c(0.2, 0.2)) + - labs(y = "Survival Probability", x = "Time (years)") + - coord_cartesian(ylim = c(-0.01, 1.01)) -@ - -The \code{gg_rfsrc} plot of Figure~\ref{fig:rfsrc-mean2} shows the median survival with a $95\%$ shaded confidence band for the \code{DPCA} group in red, and the \code{placebo} group in blue. When calling \code{gg_rfsrc} with either a \code{by} argument or a \code{conf.int} argument, the function calculates a bootstrap confidence interval around the median survival line. By default, the function will calculate the \code{conf.int=0.95} confidence interval, with the number of \code{bs.samples} equal to the number of observations. 
- -\subsection{Random forest imputation}\label{S:imputation} - -There are two modeling issues when dealing with missing data values: ``How does the algorithm build a model when values are missing from the training data?'', and ``How does the algorithm predict a response when values are missing from the test data?''. The standard procedure for linear models is to either remove or impute the missing data values before modelling. Removing the missingness is done by either removing the variable with missing values (column wise) or removing the observations (row wise). Removal is a simple solution, but may bias results when either observations or variables are scarce. - -The \pkg{randomForestSRC} package imputes missing values using \emph{adaptive tree imputation}~\citep{Ishwaran:2008}. Rather than impute missing values before growing the forest, the algorithm takes a ``just--in--time'' approach. At each node split, the set of \code{mtry} candidate variables is checked for missing values. Missing values are then imputed by randomly drawing values from non-missing data within the node. The split-statistic is then calculated on observations that were not missing values. The imputed values are used to sort observations into the subsequent daughter nodes and then discarded before the next split occurs. The process is repeated until the stopping criteria is reached and all observations are sorted into terminal nodes. - -A final imputation step can be used to fill in missing values from within the terminal nodes. This step uses a process similar to the previous imputation but uses the OOB non-missing terminal node data for the random draws. These values are aggregated (averaging for continuous variables, voting for categorical variables) over the \code{ntree} trees in the forest to estimate an imputed data set. By default, the missing values are not filled into the training data, but are available within the forest object for later use if desired. 
- -Adaptive tree imputation still requires the missing at random assumptions~\citep{Rubin:1976}. At each imputation step, the random forest assumes that similar observations are grouped together within each node. The random draws used to fill in missing data do not bias the split rule, but only sort observations similar in non-missing data into like nodes. An additional feature of this approach is the ability of predicting on test set observations with missing values. - -\subsection{Test set predictions} - -The strength of adaptive tree imputation becomes clear when doing prediction on test set observations. If we want to predict survival for patients that did not participate in the trial using the model we created in Section~\ref{S:rfsrc}, we need to somehow account for the missing values detailed in Table~\ref{T:missing}. - -The \code{predict.rfsrc} call takes the forest object (\code{rfsrc_pbc}), and the test data set (\code{pbc_test}) and returns a predicted survival using the same forest imputation method for missing values within the test data set (\code{na.action="na.impute"}). -<>= -rfsrc_pbc_test <- predict(rfsrc_pbc, newdata = pbc.test, - na.action = "na.impute") -@ - -<>= -# Predict survival for 106 patients not in randomized trial -data("rfsrc_pbc_test", package="ggRandomForests") -# Print prediction summary -rfsrc_pbc_test -@ - -The forest summary indicates there are 106 test set observations with 36 deaths and the predicted error rate is $19.1\%$. We plot the predicted survival just as we did the training set estimates. 
- -<>= -plot(gg_rfsrc(rfsrc_pbc_test), alpha=.2) + - scale_color_manual(values = strCol) + - theme(legend.position = "none") + - labs(y = "Survival Probability", x = "Time (years)") + - coord_cartesian(ylim = c(-0.01, 1.01)) -@ -The \code{gg_rfsrc} plot of Figure~\ref{fig:predictPlot} shows the test set predictions, similar to the training set predictions in Figure~\ref{fig:rfsrc-plot}, though with fewer patients the survival curves do not cover the same area of the figure. It is important to note that because Figure~\ref{fig:rfsrc-plot} is constructed with OOB estimates, the survival results are comparable as estimates from unseen observations in Figure~\ref{fig:predictPlot}. - -\section{Variable selection}\label{S:variableselection} - -Random forest is not a parsimonious method, but uses all variables available in the data set to construct the response predictor. Also, unlike parametric models, random forest does not require the explicit specification of the functional form of covariates to the response. Therefore there is no explicit $p$-value/significance test for variable selection with a random forest model. Instead, RF ascertains which variables contribute to the prediction through the split rule optimization, optimally choosing variables which separate observations. - -The typical goal of a random forest analysis is to build a \emph{prediction} model, in contrast to extracting \emph{information} regarding the underlying process~\citep{Breiman:twoCultures:2001}. There is not usually much care given in how variables are included into the training data set. Since the goal is prediction, investigators often include the ``kitchen sink'' if it can help. - -In contrast, in survival settings we are typically also interested in how we can possibly improve the outcome of interest. To achieve this, for understandable inference, it is important to avoid both duplication and transformations of variables whenever possible when building our data sets.
Duplication of variables, including multiple measures of a similar covariate, can reduce or mask the importance of the covariate. Transformations can also mask importance as well as make interpretation of the inference results difficult to impossible. - -In this section, we explore two separate approaches to investigate the RF variable selection process: Variable Importance (Section~\ref{S:vimp}), a property related to variable misspecification, and Minimal Depth (Section~\ref{S:minimalDepth}), a property derived from the construction of the trees within the forest. - -\subsection[Variable Importance]{Variable Importance (\code{gg\_vimp})}\label{S:vimp} - -\emph{Variable importance} (VIMP) was originally defined in CART using a measure involving surrogate variables (see Chapter 5 of~\cite{cart:1984}). The most popular VIMP method uses a prediction error approach involving ``noising-up'' each variable in turn. VIMP for a variable $x_v$ is the difference between prediction error when $x_v$ is randomly permuted, compared to prediction error under the observed values~\citep{Breiman:2001,Liaw:2002,Ishwaran:2007,Ishwaran:2008}. - -Since VIMP is the difference in OOB prediction error before and after permutation, a large VIMP value indicates that misspecification detracts from the predictive accuracy in the forest. VIMP close to zero indicates the variable contributes nothing to predictive accuracy, and negative values indicate the predictive accuracy \emph{improves} when the variable is misspecified. In the latter case, we assume noise is more informative than the true variable. As such, we ignore variables with negative and near zero values of VIMP, relying on large positive values to indicate that the predictive power of the forest is dependent on those variables. - -The \code{gg_vimp} function extracts VIMP measures for each of the variables used to grow the forest.
The \code{plot.gg_vimp} function shows the variables, in VIMP rank order, labeled with the named vector in the \code{lbls} argument. - -<>= -plot(gg_vimp(rfsrc_pbc), lbls = st.labs) + - theme(legend.position = c(0.8, 0.2)) + - labs(fill = "VIMP > 0") -@ -<>= -## calculate for document -ggda <- gg_vimp(rfsrc_pbc) -@ -The \code{gg_vimp} plot of Figure~\ref{fig:rf-vimp} details VIMP ranking for the \code{pbc.trial} baseline variables, from the largest (\Sexpr{gsub(" \\(mg/dl\\)", "",st.labs[as.character(ggda$vars)[1]])}) at the top, to smallest (\Sexpr{gsub(" \\(mg/dl\\)", "",st.labs[as.character(ggda$vars)[nrow(ggda)]], )}) at the bottom. VIMP measures are shown using bars to compare the scale of the error increase under permutation and colored by the sign of the measure (red for negative values). Note that four of the five highest ranking variables by VIMP match those selected by the~\cite{fleming:1991} model listed in Table~\ref{T:FHmodel}, with urine copper (2) ranking higher than age (8). We will return to this in Section~\ref{S:modelSelection}. - -\subsection[Minimal Depth]{Minimal Depth (\code{gg\_minimal\_depth})}\label{S:minimalDepth} - -In VIMP, prognostic risk factors are determined by testing the forest prediction under alternative data settings, ranking the most important variables according to their impact on predictive ability of the forest. An alternative method uses inspection of the forest construction to rank variables. \emph{Minimal depth}~\citep{Ishwaran:2010, Ishwaran:2011} assumes that variables with high impact on the prediction are those that most frequently split nodes nearest to the root node, where they partition the largest samples of the population. - -Within each tree, node levels are numbered based on their relative distance to the root of the tree (with the root at 0). Minimal depth measures important risk factors by averaging the depth of the first split for each variable over all trees within the forest. 
The assumption in the metric is that smaller minimal depth values indicate the variable separates large groups of observations, and therefore has a large impact on the forest prediction. - -In general, to select variables according to VIMP, we examine the VIMP values, looking for some point along the ranking where there is a large difference in VIMP measures. Given minimal depth is a quantitative property of the forest construction, \cite{Ishwaran:2010} also derive an analytic threshold for evidence of variable impact. A simple optimistic threshold rule uses the mean of the minimal depth distribution, classifying variables with minimal depth lower than this threshold as important in forest prediction. - -The \pkg{randomForestSRC} \code{var.select} function uses the minimal depth methodology for variable selection, returning an object with both minimal depth and vimp measures. The \pkg{ggRandomForests} \code{gg_minimal_depth} function is analogous to the \code{gg_vimp} function. Variables are ranked from most important at the top (minimal depth measure), to least at the bottom (maximal minimal depth). - -<>= -varsel_pbc <- var.select(rfsrc_pbc) -gg_md <- gg_minimal_depth(varsel_pbc, lbls = st.labs) -print(gg_md) -@ - -<>= -data("varsel_pbc", package = "ggRandomForests") -gg_md <- gg_minimal_depth(varsel_pbc) -gg_md -@ - -The \code{gg_minimal_depth} summary mostly reproduces the output from the \code{var.select} function from the \pkg{randomForestSRC} package. We report the minimal depth threshold (\code{threshold} \Sexpr{round(gg_md$md.obj$threshold, digits=3)}) and the number of variables with depth below that threshold (\code{model size} \Sexpr{gg_md$modelsize}). We also list a table of the top (\Sexpr{gg_md$modelsize}) selected variables, in minimal depth rank order with the associated VIMP measures. 
The minimal depth numbers indicate that \code{bili} tends to split between the first and second node level, and the next three variables (\code{albumin}, \code{copper}, \code{prothrombin}) split between the second and third levels on average. - -<>= -plot(gg_md, lbls = st.labs) -@ - -The \code{gg_minimal_depth} plot of Figure~\ref{fig:mindepth-plot} is similar to the \code{gg_vimp} plot in Figure~\ref{fig:rf-vimp}, ranking variables from most important at the top (minimal depth measure), to least at the bottom (maximal minimal depth). The vertical dashed line indicates the minimal depth threshold where smaller minimal depth values indicate higher importance and larger values indicate lower importance. - -\subsection{Variable selection comparison}\label{S:modelSelection} - -Since the VIMP and Minimal Depth measures use different criteria, we expect the variable ranking to be somewhat different. We use \code{gg_minimal_vimp} function to compare rankings between minimal depth and VIMP in Figure~\ref{fig:depthVimp}. - -<>= -plot(gg_minimal_vimp(gg_md), lbls = st.labs) + - theme(legend.position=c(0.8, 0.2)) -@ - -The points along the red dashed line indicate where the measures are in agreement. Points above the red dashed line are ranked higher by VIMP than by minimal depth, indicating the variables are more sensitive to misspecification. Those below the line have a higher minimal depth ranking, indicating they are better at dividing large portions of the population. The further the points are from the line, the more the discrepancy between measures. - -<>= -fleming.table$nm <- c("age","albumin", "bili","edema", "prothrombin") -fh.model <- data.frame(cbind(names = fleming.table$nm, - FH = order(abs(fleming.table$`Z stat.`), - decreasing = TRUE), - Variable=rownames(fleming.table), - Coeff=fleming.table$Coef. 
- )) -gg_v <- gg_vimp(rfsrc_pbc) -gg_v$rank <- 1:nrow(gg_v) -rownames(gg_v) <- gg_v$vars -md <- data.frame(cbind(names=gg_md$topvars)) -md$rank <- 1:nrow(md) -rownames(md) <- gg_md$topvars -md$vimp <- gg_v[rownames(md),]$rank - -md <- left_join(md, fh.model, by = "names") -md <- md[,c(1, 4, 2,3)] -colnames(md) <- c("Variable", "FH","Min depth", "VIMP" ) -kable(md, - format="latex", - caption = "\\label{T:modelComp}Comparison of variable selection criteria. Minimal depth ranking, VIMP ranking and ~\\cite{fleming:1991} (FH) proportional hazards model ranked according to \\code{abs(Z stat)} from Table~\\ref{T:FHmodel}.", - align=c("l", "r","r","r"), - digits = 3, - row.names = FALSE, - booktabs=TRUE) -@ - - -We examine the ranking of the different variable selection methods further in Table~\ref{T:modelComp}. We can use the Z statistic from Table~\ref{T:FHmodel} to rank variables selected in the~\cite{fleming:1991} model to compare with variables selected by minimal depth and VIMP. The table is constructed by taking the \Sexpr{nrow(gg_md)} top ranked minimal depth variables (below the selection threshold) and matching the VIMP ranking and~\cite{fleming:1991} model transforms. We see all three methods indicate a strong relation of serum bilirubin to survival, and overall, the minimal depth and VIMP rankings agree reasonably well with the~\cite{fleming:1991} model. - -The minimal depth selection process reduced the number of variables of interest from~\Sexpr{ncol(pbc)-2} to \Sexpr{length(varsel_pbc$topvars)}, which is still a rather large subset of interest. An obvious selection set is to examine the five variables selected by~\cite{fleming:1991}. Combining the Minimal Depth and~\cite{fleming:1991} model, there may be evidence to keep the top 7 variables. Though minimal depth does not indicate the \code{edema} variable is very interesting, VIMP ranking does agree with the proportional hazards model, indicating we might not want to remove the \code{edema} variable. 
Both minimal depth and VIMP suggest including \code{copper}, a measure associated with liver disease. - -Regarding the \code{chol} variable, recall the missing data summary of Table~\ref{T:missing}. In the trial data set, there were 28 observations missing \code{chol} values. The forest imputation randomly sorts observations with missing values into daughter nodes when using the \code{chol} variable, which is also how \pkg{randomForestSRC} calculates VIMP. We therefore expect low values for VIMP when a variable has a reasonable number of missing values. - -Restricting our remaining analysis to the five~\cite{fleming:1991} variables, plus \code{copper}, retains the biological sense of this analysis. We will now examine how these six variables are related to survival using variable dependence methods to determine the direction of the effect and verify that the log transforms used by~\cite{fleming:1991} are appropriate. - -\section{Variable dependence}\label{S:dependence} - -As random forest is not parsimonious, we have used minimal depth and VIMP to reduce the number of variables to a manageable subset. Once we have an idea of which variables contribute most to the predictive accuracy of the forest, we would like to know how the response depends on these variables. - -Although often characterized as a \emph{black box} method, the forest predictor is a function of the predictor variables $\hat{f}_{RF} = f(x).$ We use graphical methods to examine the forest predicted response dependency on covariates. We again have two options: variable dependence plots (Section~\ref{S:variabledependence}) are quick and easy to generate, and partial dependence plots (Section~\ref{S:partialdependence}) are more computationally intensive but give us a risk adjusted look at variable dependence.
- -\subsection[Variable Dependence]{Variable Dependence (\code{gg\_variable})}\label{S:variabledependence} - -\emph{Variable dependence} plots show the predicted response relative to a covariate of interest, with each training set observation represented by a point on the plot. Interpretation of variable dependence plots can only be in general terms, as point predictions are a function of all covariates in that particular observation. - -Variable dependence is straightforward to calculate, involving only getting the predicted response for each observation. In survival settings, we must account for the additional dimension of time. We plot the response at specific time points of interest, for example survival at 1 or 3 years. -<>= -ggRFsrc + geom_vline(aes(xintercept = 1), linetype = "dashed") + - geom_vline(aes(xintercept = 3), linetype = "dashed") + - coord_cartesian(xlim = c(0, 5)) -@ -The \code{gg_rfsrc} plot of Figure~\ref{fig:rfsrc-plot3Mnth} is identical to Figure~\ref{fig:rfsrc-plot} (stored in the \code{ggRFsrc} variable) with the addition of vertical dashed lines at the 1 and 3 year survival times. A variable dependence plot is generated from the predicted response value of each survival curve at the intersecting time line plotted against covariate value for that observation. This can be visualized as taking a slice of the predicted response at each time line, and spreading the resulting points out along the variable of interest. - -The \code{gg_variable} function extracts the training set variables and the predicted OOB response from \code{rfsrc} and \code{predict} objects. In the following code block, we store the \code{gg_variable} data object for later use (\code{gg_v}), as all remaining variable dependence plots can be constructed from this object.
-<>= -gg_v <- gg_variable(rfsrc_pbc, time = c(1, 3), - time.labels = c("1 Year", "3 Years")) - -plot(gg_v, xvar = "bili", alpha = 0.4) + #, se=FALSE - labs(y = "Survival", x = st.labs["bili"]) + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - coord_cartesian(ylim = c(-0.01, 1.01)) -@ -The \code{gg_variable} plot of Figure~\ref{fig:variable-plotbili} shows variable dependence for the Serum Bilirubin (\code{bili}) variable. Again, censored cases are shown as blue circles and events are indicated by the red `x' symbols. Each predicted point is dependent on the full combination of all other covariates, not only on the covariate displayed in the dependence plot. The smooth loess line~\citep{cleveland:1981, cleveland:1988} indicates the trend of the prediction over the change in the variable. - -Examination of Figure~\ref{fig:variable-plotbili} indicates most of the cases are grouped in the lower end of \code{bili} values. We also see that most of the higher values experienced an event. The ``normal'' range of Bilirubin is from 0.3 to 1.9 mg/dL, indicating the distribution from our population is well outside the normal range. These values make biological sense considering Bilirubin is a pigment created in the liver, the organ affected by the PBC disease. The figure also shows that the risk of death increases as time progresses. The risk at 3 years is much greater than that at 1 year for patients with high Bilirubin values compared to those with values closer to the normal range. - -The \code{plot.gg_variable} function call operates on the \code{gg_variable} object controlled by the list of variables of interest in the \code{xvar} argument. By default, the \code{plot.gg_variable} function returns a list of \code{ggplot} objects, one figure for each variable named in \code{xvar}.
The remaining arguments are passed to internal \pkg{ggplot2} functions controlling the display of the figure. The \code{se} argument is passed to the internal call to \code{geom_smooth} for fitting smooth lines to the data. The \code{alpha} argument lightens the coloring points in the \code{geom_point} call, making it easier to see point over plotting. We also demonstrate modification of the plot labels using the \code{labs} function and point attributes with the \code{scale_} functions. - -An additional \code{plot.gg_variable} argument (\code{panel = TRUE}) can be used to combine multiple variable dependence plots into a single figure. In the following code block, we plot the remaining continuous variables of interest found in Section~\ref{S:modelSelection}. -<>= -xvar <- c("bili", "albumin", "copper", "prothrombin", "age") -xvar.cat <- c("edema") - -plot(gg_v, xvar = xvar[-1], panel = TRUE, alpha = 0.4) + #se = FALSE, span=1 - labs(y = "Survival") + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - coord_cartesian(ylim = c(-0.05, 1.05)) -@ -The \code{gg_variable} plot in Figure~\ref{fig:variable-plot} displays a panel of the remaining continuous variable dependence plots. The panels are sorted in the order of variables in the \code{xvar} argument and include a smooth loess line~\citep{cleveland:1981,cleveland:1988} to indicate the trend of the prediction dependence over the covariate values. The \code{se=FALSE} argument turns off the loess confidence band, and the \code{span=1} argument controls the degree of smoothing. - -The figures indicate that survival increases with \code{albumin} level, and decreases with \code{bili}, \code{copper}, \code{prothrombin} and \code{age}. Note the extreme value of \code{prothrombin} (> 16) influences the loess curve more than other points, which would make it a candidate for further investigation. 
- -We expect survival at 3 years to be lower than at 1 year. However, comparing the two time plots for each variable does indicate a difference in response relation for \code{bili}, \code{copper} and \code{prothrombin}. The added risk for high levels of these variables at 3 years indicates a non-proportional hazards response. The similarity between the time curves for \code{albumin} and \code{age} indicates the effect of these variables is constant over the disease progression. - -There is not a convenient method to panel scatter plots and boxplots together, so we recommend creating panel plots for each variable type separately. We plot the categorical variable (\code{edema}) in Figure~\ref{fig:variable-plotCat} separately from the continuous variables in Figure~\ref{fig:variable-plot}. - -<>= -plot(gg_v, xvar = xvar.cat, alpha = 0.4) + labs(y = "Survival") + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - coord_cartesian(ylim = c(-0.01, 1.02)) -@ -The \code{gg_variable} plot of Figure~\ref{fig:variable-plotCat} for categorical variable dependence displays boxplots to examine the distribution of predicted values within each level of the variable. The points are plotted with a jitter to see the censored and event markers more clearly. The boxes are shown with horizontal bars indicating the median, 75th (top) and 25th (bottom) percentiles. Whiskers extend to 1.5 times the interquartile range. Points plotted beyond the whiskers are considered outliers. - -When using categorical variables with linear models, we use boolean dummy variables to indicate class membership. In the case of \code{edema}, we would probably create two logical variables for \code{edema = 0.5} (complex Edema presence indicator) and \code{edema = 1.0} (Edema with diuretics) contrasted with the \code{edema = 0} variable (no Edema).
Random Forest can use factor variables directly, separating the populations into homogeneous groups of \code{edema} at nodes that split on that variable. Figure~\ref{fig:variable-plotCat} indicates similar survival response distribution between 1 and 3 year when \code{edema = 1.0}. The distribution of predicted survival does seem to spread out more than for the other values, again indicating a possible non-proportional hazards response. - -\subsection[Partial Dependence]{Partial Dependence (\code{gg\_partial})}\label{S:partialdependence} - -\emph{Partial dependence} plots are a risk adjusted alternative to variable dependence. Partial plots are generated by integrating out the effects of variables beside the covariate of interest. The figures are constructed by selecting points evenly spaced along the distribution of the variable of interest. For each of these points ($X = x$), we calculate the average RF prediction over all remaining covariates in the training set by -\begin{equation} -\tilde{f}(x) = \frac{1}{n} \sum_{i = 1}^n \hat{f}(x, x_{i, o}), -\label{E:partial} -\end{equation} -where $\hat{f}$ is the predicted response from the random forest and $x_{i, o}$ is the value for all other covariates other than $X = x$ for observation $i$~\citep{Friedman:2000}. - -Generating partial dependence data is effectively averaging the response for a series of nomograms constructed for each observation by varying the variable of interest. The operation is computationally intensive, especially when there are a large number of observations. The default parameters for the \code{plot.variable} function generate partial dependence estimates at \code{npts = 25} points along the variable of interest. For each point of interest, the \code{plot.variable} function averages the \code{n} response predictions. This process is repeated for each of the variables of interest. - -For time to event data, we also have to deal with the additional time dimension, as with variable dependence. 
The following code block uses the \code{mclapply} function from the \pkg{parallel} package to run the \code{plot.variable} function for three time points (\code{time}=1, 3 and 5 years) in parallel. For RSF models, we calculate a risk adjusted survival estimates (\code{surv.type="surv"}), suppressing the internal base graphs (\code{show.plots = FALSE}) and store the point estimates in the \code{partial_pbc} \code{list}. -<>= -xvar <- c(xvar, xvar.cat) -partial_pbc <- mclapply(c(1,3,5), function(tm){ - plot.variable(rfsrc_pbc, surv.type = "surv", time = tm, xvar.names = xvar, - partial = TRUE, show.plots = FALSE) - }) -@ - -<>= -data("partial_pbc", package = "ggRandomForests") -xvar <- c(xvar, xvar.cat) -@ - -Because partial dependence data is collapsed onto the risk adjusted response, we can show multiple time curves on a single panel. The following code block converts the \code{plot.variable} output into a list of \code{gg_partial} objects, and then combines these data objects, with descriptive labels, along each variable of interest using the \code{combine.gg_partial} function. -<>= -gg_dta <- mclapply(partial_pbc, gg_partial) -pbc_ggpart <- combine.gg_partial(gg_dta[[1]], gg_dta[[2]], - lbls = c("1 Year", "3 Years")) -@ - -We then segregate the continuous and categorical variables, and generate a panel plot of all continuous variables in the \code{gg_partial} plot of Figure~\ref{fig:pbc-partial-panel}. The panels are ordered by minimal depth ranking. Since all variables are plotted on the same Y-axis scale, those that are strongly related to survival make other variables look flatter. The figures also confirm the strong non-linear contribution of these variables. Non-proportional hazard response is also evident in at least the \code{bili} and \code{copper} variables by noting the divergence of curves as time progresses. 
-<>= -ggpart <- pbc_ggpart -ggpart$edema <- NULL - -plot(ggpart, panel = TRUE) + #, se = FALSE - labs(x = "", y = "Survival", color = "Time", shape = "Time") + - theme(legend.position = c(0.8, 0.2)) + - coord_cartesian(ylim = c(25, 101)) -@ - -Categorical partial dependence is displayed as boxplots, similar to categorical variable dependence. Risk adjustment greatly reduces the spread of the response as expected, and may also move the mean response compared to the unadjusted results. The categorical \code{gg_partial} plot of Figure~\ref{fig:pbc-partial-edema} indicates that, adjusting for other variables, survival decreases with rising \code{edema} values. We also note that the risk adjusted distribution does spread out as we move further out in time. -<>= -ggplot(pbc_ggpart[["edema"]], aes(y=yhat, x=edema, col=group))+ - geom_boxplot(notch = TRUE, - outlier.shape = NA) + # panel=TRUE, - labs(x = "Edema", y = "Survival (%)", color="Time", shape="Time") + - theme(legend.position = c(0.2, 0.2)) + - coord_cartesian(ylim = c(25, 101)) -@ - -Partial dependence is an extrapolation operation. By averaging over a series of nomograms, the algorithm constructs observations for all values of the variable of interest, regardless of the relation with other variables. In contrast, variable dependence only uses observations from within the training set. A simple example would be for a model including BMI, weight and height. When examining partial dependence of BMI, the algorithm only manipulates BMI values, not height or weight values. The averaging operation is then confounded in two directions. First, dependence on height and weight is shared with BMI, making it difficult to see the true response dependence. Second, partial dependence is calculated over nomograms that cannot physically occur. For simple variable combinations, like BMI, it is not difficult to recognize this and modify the independent variable list to avoid these issues.
However, care must be taken when interpreting more complex biological variables. - -\subsection{Partial dependence as a function of time}\label{S:timeSurface} - -In the previous section, we calculated risk adjusted (partial) dependence at two time points (1 and 3 years). The selection of these points can be driven by biological times of interest (i.e., 1 year and 5 year survival in cancer studies) or by investigating time points of interest from a \code{gg_rfsrc} prediction plot. We typically restrict generating \code{gg_partial} plots to the variables of interest at two or three time points of interest due to computational constraints. - -It is instructive to see a more detailed map of the risk adjusted response to get a feel for interpreting partial and variable dependence plots. In Figure~\ref{fig:pbc-partial-panel}, we can visualize the two curves as extending into the plane of the page along a time axis. Filling in more partial dependence curves, it is possible to create a partial dependence surface. - -For this exercise, we will generate a series of 50 \code{gg_partial} plot curves for the \code{bili} variable. To fill the surface in, we also increased the number of points along the distribution of \code{bili} to \code{npts=50} to create a grid of $50 \times 50$ risk adjusted estimates of survival along time in one dimension and the \code{bili} variable in the second. - -<>= -# Restrict the time of interest to less than 5 years. -time_pts <- rfsrc_pbc$time.interest[which(rfsrc_pbc$time.interest<=5)] - -# Find the 50 points in time, evenly space along the distribution of -# event times for a series of partial dependence curves -time_cts <-quantile_pts(time_pts, groups = 50) - -# Load the stored partial coplot data. -data("partial_pbc_time") - -# We need to attach the time points of interest to our data. 
-time.tmp <- do.call(c,lapply(time_cts, - function(grp){rep(grp, 50)})) - -# Convert the list of plot.variable output to gg_partial -partial_time <- do.call(rbind,lapply(partial_pbc_time, gg_partial)) - -# attach the time data to the gg_partial_coplot -partial_time$time <- time.tmp - -# Modify the figure margins to make it larger -par(mai = c(0.5,0.55,0,0)) - -# Transform the gg_partial_coplot object into a list of three named matrices -# for surface plotting with plot3D::surf3D -srf <- surface_matrix(partial_time, c("time", "bili", "yhat")) - -# Generate the figure. -surf3D(x = srf$x, y = srf$y, z = srf$z, col = heat.colors(25), - colkey = FALSE, border = "black", bty = "b2", - shade = 0.5, expand = 0.5, theta=110, phi=15, - lighting = TRUE, lphi = -50, ticktype="detailed", - ylab = "Bilirubin", xlab = "Time", zlab = "Survival" -) - -# Extract the 1 and 3 year points. -# Find the indices of the points closest in time -t.pts <- sapply(c(1,3), function(pt){min(abs(srf$x - pt), na.rm=TRUE)}) -indx <- vector("list", length=2) -indx[[1]] <- which(abs(srf$x - 1) < t.pts[1]+1.e-5) -indx[[2]] <- which(abs(srf$x - 3) < t.pts[2]+1.e-5) - -# Generate curves along 1 and 3 year partial dependence -alt <- lapply(indx, function(ind){ - lines3D(x=srf$x[ind], y=srf$y[ind],z=srf$z[ind], - add=TRUE, col="blue", lwd=6) -}) -@ - -The \code{gg_partial} surface of Figure~\ref{fig:timeSurface3d} was constructed using the \code{surf3D} function from the \pkg{plot3D} package~\citep[\url{http://CRAN.R-project.org/package=plot3D}]{plot3D:2014}. Source code for generating this figure is shown in Appendix~\ref{A:TimeDomain}. - -The figure shows partial dependence of survival (Z-axis) as a function of \code{bili} over a five year follow up time period. Lines perpendicular to the Bilirubin axis are distributed along the \code{bili} variable. 
Lines parallel to the Bilirubin axis are taken at 50 training set event times, the first event after $t=0$ at the back to last event before $t=5$ years at the front. The distribution of the time lines is also evenly selected using the same procedure as selecting points for partial dependence curves. - -The 2500 estimated partial dependence points are joined together with a simple straight line interpolation to create the surface, colored according to the survival estimates (yellow close to 1, red for lower values) to aid the visualization of 3 dimensions on a 2 dimensional page. The blue lines in Figure~\ref{fig:timeSurface3d} correspond to the 1 and 3 year partial dependence, as shown in the \code{bili} panel of Figure~\ref{fig:pbc-partial-panel}. - -Viewed as a surface, we see how the partial dependence changes with time. For low values of \code{bili}, survival decreases at a constant rate. For higher values, the rate seems constant until somewhere near 2 years, where it increases rapidly before slowing again as we approach the 5 year point. -%' -%' \section{Variable Interactions}\label{S:interactions} -%' -%' We could stop with the results that our RF analysis has found these six variables to be important in predicting survival. Where the survival response is decreasing with increasing \code{bili}, \code{copper}, \code{prothrombin}, \code{age} and \code{edema} and increasing with increasing \code{albumin}. These results agree with the sign of the~\cite{fleming:1991} model coefficients shown in Table~\ref{T:FHmodel}. The \code{gg_partial} plot in Figure~\ref{fig:pbc-partial-panel} supports the \code{log} transform of \code{bili}, \code{albumin} and \code{prothrombin} and suggest a similar transform for including the \code{copper} variable in a proportional hazards model. The \code{age} variable does seem to have a more linear response than the other continuous variables, and using dummy variables for \code{edema} would preclude the need for a transformation. 
-%' -%' Using minimal depth, it is also possible to calculate measures of pairwise interactions among variables. Recall that minimal depth measure is defined by averaging the tree depth of variable $i$ relative to the root node. To detect interactions, this calculation can be modified to measure the minimal depth of a variable $j$ with respect to the maximal subtree for variable $i$~\citep{Ishwaran:2010,Ishwaran:2011}. -%' -%' The \code{randomForestSRC::find.interaction} function traverses the forest, calculating all pairwise minimal depth interactions, and returns a $p \times p$ matrix of interaction measures. The diagonal terms are normalized to the root node, and off diagonal terms are normalized measures of pairwise variable interaction. -%' -%' <>= -%' ggint <- gg_interaction(rfsrc_pbc) -%' @ -%' -%' <>= -%' data(interaction_pbc, package = "ggRandomForests") -%' ggint <- gg_interaction(interaction_pbc) -%' @ -%' -%' The \code{gg_interaction} function wraps the \code{find.interaction} matrix for use with the \pkg{ggRandomForests} plot and print functions. The \code{xvar} argument is used to restrict the variables of interest and the \code{panel = TRUE} argument displays the results in a single figure. -%' <>= -%' plot(ggint, xvar = xvar) -%' @ -%' -%' The \code{gg_interaction} plots in Figure~\ref{fig:interactionPanel} show interactions for the target variable (shown with the red cross) with interaction scores for all remaining variables. We expect the covariate with lowest minimal depth (\code{bili}) to be associated with almost all other variables, as it typically splits close to the root node, so viewed alone it may not be as informative as looking at a collection of interactive depth plots. Scanning across the panels, we see each successive target depth increasing, as expected. We also see the interactive variables increasing with increasing target depth. 
- -\section{Conditional dependence plots}\label{S:coplots} - -Conditioning plots (coplots)~\citep{chambers:1992,cleveland:1993} are a powerful visualization tool to efficiently study how a response depends on two or more variables~\citep{cleveland:1993}. The method allows us to view data by grouping observations on some conditional membership. The simplest example involves a categorical variable, where we plot our data conditional on class membership, for instance on groups of the \code{edema} variable. We can view a coplot as a stratified variable dependence plot, indicating trends in the RF prediction results within panels of group membership. - -Interactions with categorical data can be generated directly from variable dependence plots. Recall the variable dependence for bilirubin shown in Figure~\ref{fig:variable-plotbili}. We recreated the \code{gg_variable} plot in Figure~\ref{fig:var_dep}, modified by adding a linear smooth as we intend on segregating the data along conditional class membership. -<>= -# Get variable dependence at 1 year -ggvar <- gg_variable(rfsrc_pbc, time = 1) - -# For labeling coplot membership -ggvar$edema <- paste("edema = ", ggvar$edema, sep = "") - -# Plot with linear smooth (method argument) -var_dep <- plot(ggvar, xvar = "bili", - alpha = 0.5) + -# geom_smooth(method = "glm",se = FALSE) + - labs(y = "Survival", - x = st.labs["bili"]) + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - coord_cartesian(y = c(-.01,1.01)) - -var_dep -@ - -We can view the conditional dependence of survival against bilirubin, conditional on \code{edema} group membership (categorical variable) in Figure~\ref{fig:coplot_bilirubin} by reusing the saved \code{ggplot} object (\code{var_dep}) and adding a call to the \code{facet_grid} function. 
-<>= -var_dep + facet_grid(~edema) -@ - -Comparing Figure~\ref{fig:var_dep} with conditional panels of Figure~\ref{fig:coplot_bilirubin}, we see the overall response is similar to the \code{edema=0} response. The survival for \code{edema=0.5} is slightly lower, though the slope of the smooth indicates a similar relation to \code{bili}. The \code{edema=1} panel shows that the survival for this (smaller) group of patients is worse, but still follows the trend of decreasing with increasing \code{bili}. - -Conditional membership within a continuous variable requires stratification at some level. We can sometimes make this stratification along some feature of the variable, for instance a variable with integer values, or 5 or 10 year age group cohorts. However with our variables of interest, there are no logical stratification indications. Therefore we arbitrarily stratify our variables into 6 groups of roughly equal population size using the \code{quantile_pts} function. We pass the break points located by \code{quantile_pts} to the \code{cut} function to create grouping intervals, which we can then add to the \code{gg_variable} object before plotting with the \code{plot.gg_variable} function. This time we use the \code{facet_wrap} function to generate the panels by grouping interval, which automatically sorts the six panels into two rows of three panels each. - -<>= -# Find intervals with similar number of observations and create groups.
-albumin_cts <- quantile_pts(ggvar$albumin, groups = 6, intervals = TRUE) -ggvar$albumin_grp <- cut(ggvar$albumin, breaks = albumin_cts) - -# Adjust naming for facets -levels(ggvar$albumin_grp) <- paste("albumin =", levels(ggvar$albumin_grp)) - -plot(ggvar, xvar = "bili", alpha = 0.5) + #method = "glm", , se = FALSE - labs(y = "Survival", x = st.labs["bili"]) + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - facet_wrap(~albumin_grp) + - coord_cartesian(y = c(-.01,1.01)) -@ -The \code{gg_variable} coplot of Figure~\ref{fig:albumin-coplot} indicates that the effect of \code{bili} decreases conditional on membership within increasing \code{albumin} groups. To get a better feel for how the response depends on both these variables together, it is instructive to look at the compliment coplot of \code{albumin} conditional on membership in \code{bili} groups. We repeat the previous coplot process, predicted survival as a function of the \code{albumin} variable, conditional on membership within 6 groups \code{bili} intervals. As the code to create the coplot of Figure~\ref{fig:bili-coplot} is nearly identical to the code for creating Figure~\ref{fig:albumin-coplot}, we include the source code for this figure in Appendix~\ref{A:biliCoplot}. - -<>= -# Find intervals with similar number of observations. 
-bili_cts <-quantile_pts(ggvar$bili, groups = 6, intervals = TRUE) - -# We need to move the minimal value so we include that observation -bili_cts[1] <- bili_cts[1] - 1.e-7 - -# Create the conditional groups and add to the gg_variable object -bili_grp <- cut(ggvar$bili, breaks = bili_cts) -ggvar$bili_grp <- bili_grp - -# Adjust naming for facets -levels(ggvar$bili_grp) <- paste("bilirubin =", levels(bili_grp)) - -# plot.gg_variable -plot(ggvar, xvar = "albumin", alpha = 0.5) + -# method = "glm", se = FALSE) + - labs(y = "Survival", x = st.labs["albumin"]) + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - facet_wrap(~bili_grp) + - coord_cartesian(ylim = c(-0.01,1.01)) -@ -The \code{gg_variable} coplot of Figure~\ref{fig:bili-coplot} indicates the probability of survival increases with increasing \code{albumin} and increases within groups of increasing \code{bili}. - -Typically, conditional plots for continuous variables include overlapping intervals along the grouped variable~\citep{cleveland:1993}. We chose to use mutually exclusive continuous variable intervals for the following reasons: - \begin{itemize} - \item Simplicity - We can create the coplot figures directly from the \code{gg_variable} object by adding a conditional group column directly to the object. - - \item Interpretability - We find it easier to interpret and compare the panels if each observation is only in a single panel. - - \item Clarity - We prefer using more space for the data portion of the figures than typically displayed in the \code{coplot} function which requires the bar plot to present the overlapping segments. 
- \end{itemize} - -It is still possible to augment the \code{gg_variable} to include overlapping conditional membership with continuous variables by duplicating rows of the training set data within the \code{rfsrc$xvar} object, and then setting the conditional group membership as described. The \code{plot.gg_variable} function recipe above could be used to generate the panel plot, with panels ordered according to the factor levels of the grouping variable. We leave this as an exercise for the reader. - -\subsection[Partial dependence coplots]{Partial dependence coplots (\code{gg\_partial\_coplot})}\label{S:partialcoplots} - -By characterizing conditional plots as stratified variable dependence plots, the next logical step would be to generate an analogous conditional partial dependence plot. The process is similar to variable dependence coplots: first determine conditional group membership, then calculate the partial dependence estimates on each subgroup using the \code{plot.variable} function with a \code{subset} argument for each grouped interval. The \pkg{ggRandomForests} \code{gg_partial_coplot} function is a wrapper for generating conditional partial dependence data objects. Given a random forest (\code{rfsrc}) object and a \code{groups} vector for conditioning the training data set observations, \code{gg_partial_coplot} calls the \code{plot.variable} function on the training set observations conditional on \code{groups} membership. The function returns a \code{gg_partial_coplot} object, a subclass of the \code{gg_partial} object, which can be plotted with the \code{plot.gg_partial} function. - -The following code block will generate the data object for creating the partial dependence coplot of 1 year survival as a function of \code{bili} conditional on membership within the 6 groups of \code{albumin} intervals that we examined in Figure~\ref{fig:albumin-coplot}.
-<>= -partial_coplot_pbc <- gg_partial_coplot(rfsrc_pbc, xvar = "bili", - groups = ggvar$albumin_grp, - surv_type = "surv", - time = 1, - show.plots = FALSE) -@ - -<>= -# Load cached partial plot data -data("partial_coplot_pbc", package = "ggRandomForests") -@ - -<>= -ggplot(partial_coplot_pbc, aes(x=bili, y=yhat, col=group, shape=group)) + # - geom_smooth(se = FALSE) + - labs(x = st.labs["bili"], y = "Survival at 1 year (%)", - color = "albumin", shape = "albumin") + - coord_cartesian(y = c(49,101)) -@ -The \code{gg_partial_coplot} of Figure~\ref{fig:bili-albumin} shows point estimates of the risk adjusted survival as a function of \code{bili} conditional on group membership defined by \code{albumin} intervals. The figure is slightly different than the \code{gg_partial} plot of Figure~\ref{fig:pbc-partial-panel} as each set of partial dependence estimates is calculated over a subset of the training data. We again connect the point estimates with a Loess curve. - -For completeness, we construct the compliment coplot view of one year survival as a function of \code{albumin} conditional on \code{bili} interval group membership in Figure~\ref{fig:albumin-bili}. We list the source code for this figure in Appendix~\ref{A:biliPartialCoplot}. - -<>= -# Load cached partial plot data -data("partial_coplot_pbc2", package = "ggRandomForests") - -# Partial coplot -ggplot(partial_coplot_pbc2, aes(x=albumin, y=yhat, col=group, shape=group))+ - geom_smooth(se = FALSE) + - labs(x = st.labs["albumin"], y = "Survival at 1 year (%)", - color = "Bilirubin", shape = "Bilirubin") + - coord_cartesian(y = c(49,101)) -@ - -\subsection{Partial plot surfaces}\label{S:partialSurface} - -Just as in partial dependence, we can view the partial coplot curves as slices along a surface that could extend along an axis into the page. This visualization is made a bit difficult by our choice to select groups of similar population size, as the curves are not evenly spaced along the grouping variables. 
So, similar to the partial dependence surface we created along time in Section~\ref{S:timeSurface}, we can examine the relation of these two variables using a partial dependence surface. -A difficulty with conditional dependence for this exercise is the reduction of the sample sizes for calculating a coplot surface. So instead, we calculate the full partial dependence surface by generating 50 \code{albumin} values spaced evenly along the data distribution. For each value of \code{albumin}, we calculate the partial dependence on \code{bili} at \code{npts = 50} points with the \code{plot.variable} function. We generate the surface again using the \code{surf3D} function. - -<>= -# Find the quantile points to create 50 cut points -alb_partial_pts <-quantile_pts(ggvar$albumin, groups = 50) - -# Load the stored partial coplot data. -data("partial_pbc_surf") - -# Instead of groups, we want the raw albumin point values, -# To make the dimensions match, we need to repeat the values -# for each of the 50 points in the albumin direction -albumin.tmp <- do.call(c,lapply(alb_partial_pts, - function(grp){rep(grp, 50)})) - -# Convert the list of plot.variable output to -partial_surf <- do.call(rbind,lapply(partial_pbc_surf, gg_partial)) - -# attach the data to the gg_partial_coplot -partial_surf$albumin <- albumin.tmp - -# Modify the figure margins to make the figure larger -par(mai = c(0.5,.55,0,0)) - -# Transform the gg_partial_coplot object into a list of three named matrices -# for surface plotting with plot3D::surf3D -srf <- surface_matrix(partial_surf, c("bili", "albumin", "yhat")) - -# Generate the figure. 
-surf3D(x = srf$x, y = srf$y, z = srf$z, col = topo.colors(25), - colkey = FALSE, border = "black", bty = "b2", - shade = 0.5, expand = 0.5, theta=55, phi=15, - lighting = TRUE, lphi = -50, ticktype="detailed", - xlab = "Bilirubin", ylab = "Albumin", zlab = "Survival at 1 Year" - ) - -# Extract the albumin and bilirubin points -# Remove end points -bli <- bili_cts[-c(1,7)] -alb <- albumin_cts[-c(1,7)] - -# Find the indices of the points closest to split points -alb.pts <- lapply(alb, function(pt){min(abs(srf$y - pt), na.rm=TRUE)}) -bli.pts <- lapply(bli, function(pt){min(abs(srf$x - pt), na.rm=TRUE)}) - -indx.alb <- lapply(1:length(alb.pts), function(al){ - which(abs(srf$y - alb[al]) < alb.pts[[al]]+1.e-5)}) -indx.bli <- lapply(1:length(bli.pts), function(al){ - which(abs(srf$x - bli[al]) < bli.pts[[al]]+1.e-5)}) - -# Draw the lines -indx <- c(indx.alb, indx.bli) -st <- lapply(indx, function(ind){ - lines3D(x=srf$x[ind], - y=srf$y[ind], - z=srf$z[ind], - add=TRUE, col="blue", lwd=6)}) -@ -The partial dependence surface of Figure~\ref{fig:surface3d} shows partial dependence of 1 year survival on the Z-axis against values of Bilirubin and Albumin. We again use linear interpolation between the 2500 estimates, and color the surface by the response. Here blue corresponds to lower and yellow to higher risk adjusted survival. The blue lines are placed at the cut points between groups of \code{albumin} and \code{bili} used in the partial coplots of Figures~\ref{fig:bili-albumin} and~\ref{fig:albumin-bili} respectively. - -To construct the partial coplot for groups of \code{albumin} in Figure~\ref{fig:bili-albumin}, we arbitrarily segmented the training set into 6 groups of equal membership size. The segments between blue lines parallel to the Bilirubin axis indicate where on the surface these observations are located. Similarly, the blues lines perpendicular to the Bilirubin axis segment observations into the 6 groups of \code{bili} intervals. 
Figure~\ref{fig:surface3d} indicates the arbitrary grouping for groups of \code{bili} in Figure~\ref{fig:albumin-bili}. - -The figure indicates that partial dependence at higher \code{albumin} levels is similar, which results in the over plotting seen in Figure~\ref{fig:bili-albumin}. The distribution is sparser at lower \code{albumin} levels, creating the larger area in lowest \code{albumin} values, where the partial dependence changes the most. - -\section{Conclusion}\label{S:conclusion} - -In this vignette, we have demonstrated the use of Random Survival Forest methods with the \pkg{ggRandomForests}~(\url{http://CRAN.R-project.org/package=ggRandomForests}) package. We have shown how to grow a random forest model and determine which variables contribute to the forest prediction accuracy using both VIMP and Minimal Depth measures. We outlined how to investigate variable associations with the response variable using variable dependence and the risk adjusted partial dependence plots. We've also explored variable interactions by using pairwise minimal depth interactions and directly viewed these interactions using variable dependence coplots and partial dependence coplots. Along the way, we've demonstrated the use of additional commands from the \pkg{ggplot2} package~\citep[\url{http://CRAN.R-project.org/package=ggplot2}]{Wickham:2009} for modifying and customizing plots from \pkg{ggRandomForests} functions. - -% ----------------------------------------------------- -\section{Computational details} -% ----------------------------------------------------- - -This document is a package vignette for the \pkg{ggRandomForests} package for ``Visually Exploring Random Forests'' (\url{http://CRAN.R-project.org/package=ggRandomForests}).
The \pkg{ggRandomForests} package is designed for use with the \pkg{randomForestSRC} package~\citep[\url{http://CRAN.R-project.org/package=randomForestSRC}]{Ishwaran:RFSRC:2014} for growing survival, regression and classification random forest models and uses the \pkg{ggplot2} package~\citep[\url{http://CRAN.R-project.org/package=ggplot2}]{Wickham:2009} for plotting diagnostic and variable association results. \pkg{ggRandomForests} is structured to extract data objects from \pkg{randomForestSRC} objects and provides functions for printing and plotting these objects. - -The vignette is a tutorial for using the \pkg{ggRandomForests} package with the \pkg{randomForestSRC} package for building and post-processing random survival forests. In this tutorial, we explore a random forest for survival model constructed for the primary biliary cirrhosis (PBC) of the liver data set~\citep{fleming:1991}, available in the \pkg{randomForestSRC} package. We grow a random survival forest and demonstrate how \pkg{ggRandomForests} can be used when determining how the survival response depends on predictive variables within the model. The tutorial demonstrates the design and usage of many of \pkg{ggRandomForests} functions and features and also how to modify and customize the resulting \code{ggplot} graphic objects along the way. - -The vignette is written in \LaTeX using the \pkg{knitr} package~\citep[\url{http://CRAN.R-project.org/package=knitr}]{Xie:2015, Xie:2014,Xie:2013}, which facilitates weaving \proglang{R}~\citep{rcore} code, results and figures into document text. - -This vignette is available within the \pkg{ggRandomForests} package on the Comprehensive R Archive Network (CRAN)~\citep[\url{http://cran.r-project.org}]{rcore}. 
Once the package has been installed, the vignette can be viewed directly from within \proglang{R} with the following command: -<>= -vignette("randomForestSRC-Survival", package = "ggRandomForests") -@ - -A development version of the \pkg{ggRandomForests} package is also available on GitHub (\url{https://github.com}). We invite comments, feature requests and bug reports for this package at \url{https://github.com/ehrlinger/ggRandomForests}. - - -\section*{Acknowledgement} -This work was supported in part by the National Institutes of Health grant R01-HL103552-01A1. - -\singlespacing -\bibliography{ggRandomForests} - - -%%\end{document} - - -%\doublespacing -%\newpage - -\appendix -\section{Source Code} -Throughout this document, we have listed all \proglang{R} source code to create the figures included here with a few exceptions. For completeness, we include the missing code blocks in this appendix. The code blocks are included here in order of appearance in the document. - -\subsection{Partial Dependence in Time Dimension}\label{A:TimeDomain} - -The surface plot of~\ref{S:timeSurface} demonstrates how partial dependence curves relate to the survival curves. This code block is the \proglang{R} source code for creating Figure~\ref{fig:timeSurface3d}. - -<>= -# Restrict the time of interest to less than 5 years. -time_pts <- rfsrc_pbc$time.interest[which(rfsrc_pbc$time.interest<=5)] - -# Find the 50 points in time, evenly space along the distribution of -# event times for a series of partial dependence curves -time_cts <-quantile_pts(time_pts, groups = 50, intervals = TRUE) - -# Load stored data from the package. -# See ?partial_pbc_time for how this data was generated. -# -# Time surfaces are created with the partial.rfsrc command -# partial_pbc_time <- partial.rfsrc(rfsrc_pbc, xvar = "bili",sav -# npts = 50, show.plots = FALSE, -# surv.type="surv") -# -load(partial_pbc_time, package="ggRandomForests") - -# We need to attach the time points of interest to our data. 
-time.tmp <- do.call(c,lapply(time_cts, - function(grp){rep(grp, 50)})) - -# Convert the list of plot.variable output to gg_partial -partial_time <- do.call(rbind,lapply(partial_pbc_time, gg_partial)) - -# attach the time data to the gg_partial_coplot -partial_time$time <- time.tmp - -# Modify the figure margins to make it larger -par(mai = c(0,0.3,0,0)) - -# Transform the gg_partial_coplot object into a list of three named matrices -# for surface plotting with plot3D::surf3D -srf <- surface_matrix(partial_time, c("time", "bili", "yhat")) - -# Generate the figure. -surf3D(x = srf$x, y = srf$y, z = srf$z, col = heat.colors(25), - colkey = FALSE, border = "black", bty = "b2", - shade = 0.5, expand = 0.5, theta=110,phi=15, - lighting = TRUE, lphi = -50, - ylab = "Bilirubin", xlab = "Time", zlab = "Survival" -) - -# Extract the 1 and 3 year points. -# Find the indices of the points closest in time -t.pts <- sapply(c(1,3), function(pt){min(abs(srf$x - pt), na.rm=TRUE)}) -# Extract the 1 and 3 year points. -# Find the indices of the points closest in time -t.pts <- sapply(c(1,3), function(pt){min(abs(srf$x - pt), na.rm=TRUE)}) -indx <- vector("list", length=2) -indx[[1]] <- which(abs(srf$x - 1) < t.pts[1]+1.e-5) -indx[[2]] <- which(abs(srf$x - 3) < t.pts[2]+1.e-5) - -# Generate curves along 1 and 3 year partial dependence -alt <- lapply(indx, function(ind){ - lines3D(x=srf$x[ind], y=srf$y[ind],z=srf$z[ind], - add=TRUE, col="blue", lwd=6) - }) -@ - -\subsection{Bilirubin Coplot}\label{A:biliCoplot} - -In Section~\ref{S:coplots}, we generate variable dependence coplots for the \code{bili} variable conditional on grouping on intervals of the \code{albumin} variable, and the complimentary \code{albumin} variable conditional on grouping on intervals of the \code{bili} variable. We include the source code for Figure~\ref{fig:albumin-coplot} in the document. 
Since the code is nearly identical for the later case, we include the source code for generating Figure~\ref{fig:bili-coplot} here. - -<>= -# Find intervals with similar number of observations. -bili_cts <-quantile_pts(ggvar$bili, groups = 6, intervals = TRUE) - -# We need to move the minimal value so we include that observation -bili_cts[1] <- bili_cts[1] - 1.e-7 - -# Create the conditional groups and add to the gg_variable object -ggvar$bili_grp <- cut(ggvar$bili, breaks = bili_cts) - -# Adjust naming for facets -levels(ggvar$bili_grp) <- paste("bilirubin = ",levels(ggvar$bili_grp), sep = "") - -# plot.gg_variable -plot(ggvar[-which(is.na(ggvar$albumin)),], xvar = "albumin", - method = "glm", alpha = 0.5, se = FALSE) + - labs(y = "Survival", x = st.labs["albumin"]) + - theme(legend.position = "none") + - scale_color_manual(values = strCol, labels = event.labels) + - scale_shape_manual(values = event.marks, labels = event.labels) + - facet_wrap(~bili_grp) + - coord_cartesian(ylim = c(-0.01, 1.01)) -@ - -\subsection{Bilirubin Partial Coplot}\label{A:biliPartialCoplot} -Similar to variable dependence coplots, In Section~\ref{S:partialcoplots}, we compare the partial dependence coplots for the same \code{albumin} and \code{bili} variable groupings. Again, the source code for Figure~\ref{fig:bili-albumin} is nearly identical to the source code for generating Figure~\ref{fig:albumin-bili}. We include the partial dependence coplot source code for the \code{albumin} variable conditional on grouping on intervals of the \code{bili} variable. 
- -<>= -partial_coplot_pbc2 <- gg_partial_coplot(rfsrc_pbc, xvar = "albumin", - groups = bili_grp, - surv_type = "surv", - time = 1, - show.plots = FALSE) - - -# Stored in -# data(partial_coplot_pbc2, package = "ggRandomForests") - -plot(partial_coplot_pbc2, se = FALSE) + - labs(x = st.labs["albumin"], y = "Survival at 1 year (%)", - color = "Bilirubin", shape = "Bilirubin") + - scale_color_brewer(palette = "Set2") + - coord_cartesian(y = c(49,101)) -@ - -\subsection{Partial Dependence in Multiple Variable Dimensions}\label{A:variableDomain} -In Section~\ref{S:partialSurface}, we generate a partial dependence surface of one year survival dependence on both \code{bili} and \code{albumin} variables. We include the Source code for generating Figure~\ref{fig:surface3d} here. - -<>= -# Find the quantile points to create 50 cut points -alb_partial_pts <-quantile_pts(ggvar$albumin, groups = 50) - -# Load the stored partial coplot data. -# See ?partial_pbc_surf for how this data was generated. -# -# partial_pbc_surf <- lapply(alb_partial_pts, function(ct){ -# rfsrc_pbc$xvar$albumin <- ct -# plot.variable(rfsrc_pbc, xvar = "bili", time = 1, -# npts = 50, show.plots = FALSE, -# partial = TRUE, surv.type="surv") -# }) -# -data("partial_pbc_surf") - -# Instead of groups, we want the raw albumin point values, -# To make the dimensions match, we need to repeat the values -# for each of the 50 points in the albumin direction -albumin.tmp <- do.call(c,lapply(alb_partial_pts, - function(grp){rep(grp, 50)})) - -# Convert the list of plot.variable output to -partial_surf <- do.call(rbind,lapply(partial_pbc_surf, gg_partial)) - -# attach the data to the gg_partial_coplot -partial_surf$albumin <- albumin.tmp - -# Modify the figure margins to make the figure larger -par(mai = c(0,.3,0,0)) - -# Transform the gg_partial_coplot object into a list of three named matrices -# for surface plotting with plot3D::surf3D -srf <- surface_matrix(partial_surf, c("bili", "albumin", "yhat")) - -# 
Generate the figure. -surf3D(x = srf$x, y = srf$y, z = srf$z, col = topo.colors(25), - colkey = FALSE, border = "black", bty = "b2", - shade = 0.5, expand = 0.5, theta=55, phi=15, - lighting = TRUE, lphi = -50, - xlab = "Bilirubin", ylab = "Albumin", zlab = "Survival at 1 Year" - ) - -# Extract the albumin and bilirubin points -# Remove end points -bli <- bili_cts[-c(1,7)] -alb <- albumin_cts[-c(1,7)] - -# Find the indices of the points closest to split points -alb.pts <- lapply(alb, function(pt){min(abs(srf$y - pt), na.rm=TRUE)}) -bli.pts <- lapply(bli, function(pt){min(abs(srf$x - pt), na.rm=TRUE)}) - -indx.alb <- lapply(1:length(alb.pts), function(al){ - which(abs(srf$y - alb[al]) < alb.pts[[al]]+1.e-5)}) -indx.bli <- lapply(1:length(bli.pts), function(al){ - which(abs(srf$x - bli[al]) < bli.pts[[al]]+1.e-5)}) - -# Draw the lines -indx <- c(indx.alb, indx.bli) -st <- lapply(indx, function(ind){ - lines3D(x=srf$x[ind], - y=srf$y[ind], - z=srf$z[ind], - add=TRUE, col="blue", lwd=6)}) -@ - -\end{document} \ No newline at end of file diff --git a/vignettes/randomForestSRC-Survival.pdf b/vignettes/randomForestSRC-Survival.pdf deleted file mode 100644 index aac7e7d3..00000000 Binary files a/vignettes/randomForestSRC-Survival.pdf and /dev/null differ