From df821a47bc3a790e7e189b89dc2da67347022691 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 16:28:20 +0900 Subject: [PATCH 01/26] Added the `methods` package for S4 classes to `DESCRIPTION`. --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index deb7122..5786568 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,8 @@ Depends: R (>= 4.0.2), Imports: glmnet (>= 4.0.2), - assertthat + assertthat, + methods Suggests: testthat, knitr, From e2a8d8bb820a197e917c6d70a6b7c40f91ec35c3 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 16:33:15 +0900 Subject: [PATCH 02/26] Fixed arguments names of generic methods, which are inconsistent with original methods. --- R/coef-aglm.R | 10 ++++++++-- R/deviance-aglm.R | 10 ++++++++-- R/plot-aglm.R | 10 ++++++++-- R/predict-aglm.R | 10 ++++++++-- R/print-aglm.R | 10 ++++++++-- R/residuals-aglm.R | 13 ++++++++++--- man/coef.AccurateGLM.Rd | 4 ++-- man/deviance.AccurateGLM.Rd | 4 ++-- man/plot.AccurateGLM.Rd | 4 ++-- man/predict.AccurateGLM.Rd | 4 ++-- man/print.AccurateGLM.Rd | 4 ++-- man/residuals.AccurateGLM.Rd | 7 ++++--- 12 files changed, 64 insertions(+), 26 deletions(-) diff --git a/R/coef-aglm.R b/R/coef-aglm.R index faf97e1..cec2e3c 100644 --- a/R/coef-aglm.R +++ b/R/coef-aglm.R @@ -3,14 +3,20 @@ #' Extract coefficients from an AccurateGLM object. #' -#' @param model An AccurateGLM object. +#' @param object An AccurateGLM object. #' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. #' Default is the entire sequence used to create the model. #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @importFrom assertthat assert_that #' @export -coef.AccurateGLM <- function(model, index=NULL, name=NULL, s=NULL, exact=FALSE, ...) { +coef.AccurateGLM <- function(object, index=NULL, name=NULL, s=NULL, exact=FALSE, ...) 
{ + # It's necessary to use same names for some arguments as the original methods, + # because devtools::check() issues warnings when using inconsistent names. + # As a result, we sometimes should accept uncomfortable argument names, + # but still have rights to use preferable names internally. + model <- object + coefs <- coef(model@backend_models[[1]], s, exact, ...) # If `name` is set, `index` is overwritten. diff --git a/R/deviance-aglm.R b/R/deviance-aglm.R index c554070..fdb3029 100644 --- a/R/deviance-aglm.R +++ b/R/deviance-aglm.R @@ -3,10 +3,16 @@ #' Extract the deviance from an AccurateGLM object. #' -#' @param model An AccurateGLM object. +#' @param object An AccurateGLM object. #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @export -deviance.AccurateGLM <- function(model, ...) { +deviance.AccurateGLM <- function(object, ...) { + # It's necessary to use same names for some arguments as the original methods, + # because devtools::check() issues warnings when using inconsistent names. + # As a result, we sometimes should accept uncomfortable argument names, + # but still have rights to use preferable names internally. + model <- object + return(deviance(model@backend_models[[1]], ...)) } diff --git a/R/plot-aglm.R b/R/plot-aglm.R index 0391915..7fc5445 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -3,7 +3,7 @@ #' Plot coefficients from an AccurateGLM object #' -#' @param model An AccurateGLM object. +#' @param x An AccurateGLM object. #' @param vars An integer or character vectors (indices or names) specifying which variables should be plotted. #' @param verbose If TRUE, outputs details. #' @param s A numeric value specifying lambda value at which plotting is required. 
@@ -29,7 +29,7 @@ #' #' @export #' @importFrom assertthat assert_that -plot.AccurateGLM <- function(model, +plot.AccurateGLM <- function(x, vars=NULL, verbose=TRUE, s=NULL, @@ -42,6 +42,12 @@ plot.AccurateGLM <- function(model, main="", add_rug=FALSE, ...) { + # It's necessary to use same names for some arguments as the original methods, + # because devtools::check() issues warnings when using inconsistent names. + # As a result, we sometimes should accept uncomfortable argument names, + # but still have rights to use preferable names internally. + model <- x + nvars <- length(model@vars_info) if (is.null(vars)) { diff --git a/R/predict-aglm.R b/R/predict-aglm.R index 25bed4a..fa8734e 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -3,7 +3,7 @@ #' Make predictions from a fitted AccurateGLM #' -#' @param model An AccurateGLM object. +#' @param object An AccurateGLM object. #' @param newx An input matrix or data.frame used for predictions. #' @param type Type of prediction required. #' * Type `"link"` gives the linear predictors for `"binomial"`, `"poisson"` models, and for `"gaussian"` models it gives the fitted values. @@ -19,13 +19,19 @@ #' @export #' @importFrom assertthat assert_that #' @importFrom glmnet predict.glmnet -predict.AccurateGLM <- function(model, +predict.AccurateGLM <- function(object, newx=NULL, s=NULL, type=c("link","response","coefficients","nonzero","class"), exact=FALSE, newoffset, ...) { + # It's necessary to use same names for some arguments as the original methods, + # because devtools::check() issues warnings when using inconsistent names. + # As a result, we sometimes should accept uncomfortable argument names, + # but still have rights to use preferable names internally. 
+ model <- object + # Check and set `type` type <- match.arg(type) diff --git a/R/print-aglm.R b/R/print-aglm.R index f06b1c9..ff376f3 100644 --- a/R/print-aglm.R +++ b/R/print-aglm.R @@ -3,11 +3,17 @@ #' Print an AccurateGLM object #' -#' @param model An AccurateGLM object. +#' @param x An AccurateGLM object. #' @param digits Significant digits in printout. #' @param ... Other arguments are passed directly to `print` functions of `model@backend_models`. #' #' @export -print.AccurateGLM <- function(model, digits=max(3, getOption("digits") - 3), ...) { +print.AccurateGLM <- function(x, digits=max(3, getOption("digits") - 3), ...) { + # It's necessary to use same names for some arguments as the original methods, + # because devtools::check() issues warnings when using inconsistent names. + # As a result, we sometimes should accept uncomfortable argument names, + # but still have rights to use preferable names internally. + model <- x + print(model@backend_models[[1]], digits, ...) } diff --git a/R/residuals-aglm.R b/R/residuals-aglm.R index b4aa28d..b260b67 100644 --- a/R/residuals-aglm.R +++ b/R/residuals-aglm.R @@ -3,7 +3,7 @@ #' Calculate residuals for AGLM model #' -#' @param model An AccurateGLM object. +#' @param object An AccurateGLM object. #' @param type Type of prediction required. #' * Type `"working"` Working residuals. #' * Type `"pearson"` Pearson residuals. @@ -14,13 +14,20 @@ #' #' @export #' @importFrom assertthat assert_that -residuals.AccurateGLM <- function(model, +residuals.AccurateGLM <- function(object, x=NULL, y=NULL, offset=NULL, weights=NULL, type=c("working", "pearson", "deviance"), - s=NULL) { + s=NULL, + ...) { + # It's necessary to use same names for some arguments as the original methods, + # because devtools::check() issues warnings when using inconsistent names. + # As a result, we sometimes should accept uncomfortable argument names, + # but still have rights to use preferable names internally. 
+ model <- object + # Check and set `type` type <- match.arg(type) diff --git a/man/coef.AccurateGLM.Rd b/man/coef.AccurateGLM.Rd index 9681342..9117eff 100644 --- a/man/coef.AccurateGLM.Rd +++ b/man/coef.AccurateGLM.Rd @@ -4,10 +4,10 @@ \alias{coef.AccurateGLM} \title{Extract coefficients from an AccurateGLM object.} \usage{ -\method{coef}{AccurateGLM}(model, index = NULL, name = NULL, s = NULL, exact = FALSE, ...) +\method{coef}{AccurateGLM}(object, index = NULL, name = NULL, s = NULL, exact = FALSE, ...) } \arguments{ -\item{model}{An AccurateGLM object.} +\item{object}{An AccurateGLM object.} \item{s}{Value(s) of the penalty parameter \code{lambda} at which predictions are required. Default is the entire sequence used to create the model.} diff --git a/man/deviance.AccurateGLM.Rd b/man/deviance.AccurateGLM.Rd index b708168..bb0b462 100644 --- a/man/deviance.AccurateGLM.Rd +++ b/man/deviance.AccurateGLM.Rd @@ -4,10 +4,10 @@ \alias{deviance.AccurateGLM} \title{Extract the deviance from an AccurateGLM object.} \usage{ -\method{deviance}{AccurateGLM}(model, ...) +\method{deviance}{AccurateGLM}(object, ...) 
} \arguments{ -\item{model}{An AccurateGLM object.} +\item{object}{An AccurateGLM object.} \item{...}{Other arguments are passed directly to \code{deviance} functions of \code{model@backend_models}.} } diff --git a/man/plot.AccurateGLM.Rd b/man/plot.AccurateGLM.Rd index 9534f54..23c1aab 100644 --- a/man/plot.AccurateGLM.Rd +++ b/man/plot.AccurateGLM.Rd @@ -5,7 +5,7 @@ \title{Plot coefficients from an AccurateGLM object} \usage{ \method{plot}{AccurateGLM}( - model, + x, vars = NULL, verbose = TRUE, s = NULL, @@ -21,7 +21,7 @@ ) } \arguments{ -\item{model}{An AccurateGLM object.} +\item{x}{An AccurateGLM object.} \item{vars}{An integer or character vectors (indices or names) specifying which variables should be plotted.} diff --git a/man/predict.AccurateGLM.Rd b/man/predict.AccurateGLM.Rd index dd2a2bd..88faa56 100644 --- a/man/predict.AccurateGLM.Rd +++ b/man/predict.AccurateGLM.Rd @@ -5,7 +5,7 @@ \title{Make predictions from a fitted AccurateGLM} \usage{ \method{predict}{AccurateGLM}( - model, + object, newx = NULL, s = NULL, type = c("link", "response", "coefficients", "nonzero", "class"), @@ -15,7 +15,7 @@ ) } \arguments{ -\item{model}{An AccurateGLM object.} +\item{object}{An AccurateGLM object.} \item{newx}{An input matrix or data.frame used for predictions.} diff --git a/man/print.AccurateGLM.Rd b/man/print.AccurateGLM.Rd index 9916bdd..8c04bec 100644 --- a/man/print.AccurateGLM.Rd +++ b/man/print.AccurateGLM.Rd @@ -4,10 +4,10 @@ \alias{print.AccurateGLM} \title{Print an AccurateGLM object} \usage{ -\method{print}{AccurateGLM}(model, digits = max(3, getOption("digits") - 3), ...) +\method{print}{AccurateGLM}(x, digits = max(3, getOption("digits") - 3), ...) 
} \arguments{ -\item{model}{An AccurateGLM object.} +\item{x}{An AccurateGLM object.} \item{digits}{Significant digits in printout.} diff --git a/man/residuals.AccurateGLM.Rd b/man/residuals.AccurateGLM.Rd index 26aba4b..2d12d9f 100644 --- a/man/residuals.AccurateGLM.Rd +++ b/man/residuals.AccurateGLM.Rd @@ -5,17 +5,18 @@ \title{Calculate residuals for AGLM model} \usage{ \method{residuals}{AccurateGLM}( - model, + object, x = NULL, y = NULL, offset = NULL, weights = NULL, type = c("working", "pearson", "deviance"), - s = NULL + s = NULL, + ... ) } \arguments{ -\item{model}{An AccurateGLM object.} +\item{object}{An AccurateGLM object.} \item{type}{Type of prediction required. \itemize{ From 31f8ae59b1967c0ac5e5d6482b9d8771a0c82893 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 17:01:52 +0900 Subject: [PATCH 03/26] Fixed R codes for possible problems suggested by devtools::check(). --- NAMESPACE | 20 ++++++++++++++++++++ R/aglm-input.R | 12 +++++++----- R/aglm.R | 1 + R/binning.R | 1 + R/coef-aglm.R | 1 + R/cv-aglm.R | 1 + R/cva-aglm.R | 1 + R/deviance-aglm.R | 1 + R/plot-aglm.R | 16 ++++++++++++++++ R/predict-aglm.R | 2 ++ R/residuals-aglm.R | 2 ++ 11 files changed, 53 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f32014d..1102321 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,3 +22,23 @@ importFrom(assertthat,assert_that) importFrom(glmnet,cv.glmnet) importFrom(glmnet,glmnet) importFrom(glmnet,predict.glmnet) +importFrom(grDevices,devAskNewPage) +importFrom(graphics,barplot) +importFrom(graphics,boxplot) +importFrom(graphics,lines) +importFrom(graphics,mtext) +importFrom(graphics,par) +importFrom(graphics,points) +importFrom(graphics,rug) +importFrom(methods,new) +importFrom(stats,IQR) +importFrom(stats,coef) +importFrom(stats,deviance) +importFrom(stats,getCall) +importFrom(stats,ksmooth) +importFrom(stats,predict) +importFrom(stats,quantile) +importFrom(stats,residuals) +importFrom(stats,smooth.spline) 
+importFrom(utils,flush.console) +importFrom(utils,str) diff --git a/R/aglm-input.R b/R/aglm-input.R index a774b31..346d70b 100644 --- a/R/aglm-input.R +++ b/R/aglm-input.R @@ -12,6 +12,7 @@ setClass("AGLM_Input", #' Create a new AGLM_Input object #' @importFrom assertthat assert_that +#' @importFrom methods new newInput <- function(x, qualitative_vars_UD_only=NULL, qualitative_vars_both=NULL, @@ -86,11 +87,11 @@ newInput <- function(x, cl <- class(idxs_or_names) idxs <- seq(length(var_names)) if (cl == "integer") { - is_hit <- function(idx) {return(idx %in% idxs_or_names)} - idxs <- idxs[sapply(idxs, is_hit)] + is_hit_i <- function(idx) {return(idx %in% idxs_or_names)} + idxs <- idxs[sapply(idxs, is_hit_i)] } else if (cl == "character") { - is_hit <- function(var_name) {return(var_name %in% idxs_or_names)} - idxs <- idxs[sapply(var_names, is_hit)] + is_hit_c <- function(var_name) {return(var_name %in% idxs_or_names)} + idxs <- idxs[sapply(var_names, is_hit_c)] } else { assert_that(FALSE, msg="qualitative_vars_UD_only, qualitative_vars_both, qualitative_vars_both, quantitative_vars should be integer or character vectors.") } @@ -309,6 +310,7 @@ getMatrixRepresentationByVector <- function(raw_vec, var_info, drop_OD=FALSE) { return(z) } +#' @importFrom assertthat assert_that getMatrixRepresentation <- function(x, idx, drop_OD=FALSE) { var_info <- x@vars_info[[idx]] z <- NULL @@ -347,7 +349,7 @@ getMatrixRepresentation <- function(x, idx, drop_OD=FALSE) { } colnames(z) <- nm } else { - assert_true(FALSE) # never expects to come here + assert_that(FALSE) # never expects to come here } return(z) diff --git a/R/aglm.R b/R/aglm.R index 8c7e4e0..a543af4 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -33,6 +33,7 @@ #' @export #' @importFrom assertthat assert_that #' @importFrom glmnet glmnet +#' @importFrom methods new aglm <- function(x, y, qualitative_vars_UD_only=NULL, qualitative_vars_both=NULL, diff --git a/R/binning.R b/R/binning.R index 49dfa94..6c97e6d 100644 --- 
a/R/binning.R +++ b/R/binning.R @@ -37,6 +37,7 @@ createEqualWidthBins <- function(left, right, nbin){ #' #' @export #' @importFrom assertthat assert_that +#' @importFrom stats quantile createEqualFreqBins <- function(x_vec, nbin.max) { nbin.max <- as.integer(nbin.max) assert_that(nbin.max > 1 & length(x_vec) > 0) diff --git a/R/coef-aglm.R b/R/coef-aglm.R index cec2e3c..2af61c0 100644 --- a/R/coef-aglm.R +++ b/R/coef-aglm.R @@ -9,6 +9,7 @@ #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @importFrom assertthat assert_that +#' @importFrom stats coef #' @export coef.AccurateGLM <- function(object, index=NULL, name=NULL, s=NULL, exact=FALSE, ...) { # It's necessary to use same names for some arguments as the original methods, diff --git a/R/cv-aglm.R b/R/cv-aglm.R index 0803429..03e417f 100644 --- a/R/cv-aglm.R +++ b/R/cv-aglm.R @@ -33,6 +33,7 @@ #' @export #' @importFrom assertthat assert_that #' @importFrom glmnet cv.glmnet +#' @importFrom methods new cv.aglm <- function(x, y, qualitative_vars_UD_only=NULL, qualitative_vars_both=NULL, diff --git a/R/cva-aglm.R b/R/cva-aglm.R index c67c39e..4fd6f49 100644 --- a/R/cva-aglm.R +++ b/R/cva-aglm.R @@ -16,6 +16,7 @@ #' #' @export #' @importFrom assertthat assert_that +#' @importFrom methods new cva.aglm <- function(x, y, alpha=seq(0, 1, len=11)^3, nfolds=10, diff --git a/R/deviance-aglm.R b/R/deviance-aglm.R index fdb3029..4e108e3 100644 --- a/R/deviance-aglm.R +++ b/R/deviance-aglm.R @@ -7,6 +7,7 @@ #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @export +#' @importFrom stats deviance deviance.AccurateGLM <- function(object, ...) { # It's necessary to use same names for some arguments as the original methods, # because devtools::check() issues warnings when using inconsistent names. 
diff --git a/R/plot-aglm.R b/R/plot-aglm.R index 7fc5445..901db61 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -29,6 +29,22 @@ #' #' @export #' @importFrom assertthat assert_that +#' @importFrom utils str +#' @importFrom utils flush.console +#' @importFrom stats getCall +#' @importFrom stats residuals +#' @importFrom stats coef +#' @importFrom stats IQR +#' @importFrom stats smooth.spline +#' @importFrom stats ksmooth +#' @importFrom graphics par +#' @importFrom graphics points +#' @importFrom graphics lines +#' @importFrom graphics rug +#' @importFrom graphics mtext +#' @importFrom graphics boxplot +#' @importFrom graphics barplot +#' @importFrom grDevices devAskNewPage plot.AccurateGLM <- function(x, vars=NULL, verbose=TRUE, diff --git a/R/predict-aglm.R b/R/predict-aglm.R index fa8734e..580c456 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -19,6 +19,8 @@ #' @export #' @importFrom assertthat assert_that #' @importFrom glmnet predict.glmnet +#' @importFrom methods new +#' @importFrom stats predict predict.AccurateGLM <- function(object, newx=NULL, s=NULL, diff --git a/R/residuals-aglm.R b/R/residuals-aglm.R index b260b67..57fc297 100644 --- a/R/residuals-aglm.R +++ b/R/residuals-aglm.R @@ -14,6 +14,8 @@ #' #' @export #' @importFrom assertthat assert_that +#' @importFrom stats predict +#' @importFrom stats getCall residuals.AccurateGLM <- function(object, x=NULL, y=NULL, From 3abde2eb566b9b45eb220a6f384f03ed3d1923b4 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 17:19:29 +0900 Subject: [PATCH 04/26] Fixed code/documentation mismathces suggested by devtools::check(). 
--- R/accurate-glm.R | 20 ++++++++++++++++---- man/AccurateGLM-class.Rd | 28 +++++++++++++++++++++++++--- man/CVA_AccurateGLM-class.Rd | 10 ++++++---- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/R/accurate-glm.R b/R/accurate-glm.R index 9ab99d4..d9cd575 100644 --- a/R/accurate-glm.R +++ b/R/accurate-glm.R @@ -2,12 +2,23 @@ # written by Kenji Kondo @ 2019/1/2 -#' S4 class for fitted AGLM +#' S4 class for fitted AGLM, used as a result of aglm() or cv.aglm() #' #' @slot backend_models Internally used model objects to be passed to backend functions. #' Currently glmnet is used as a backend and this slot holding a glmnet object. #' @slot vars_info A list of list. Each element of `vars_info` represents one predictor variable and contains various informations of it. -#' @slot others slots for holding cross-validation results +#' @slot lambda The values of `lambda` used in the fits. +#' @slot cvm The mean cross-validated error. +#' @slot cvsd The estimate of standarded error of `cvm`. +#' @slot cvup The upper curve as `cvm + cvsd`. +#' @slot cvlo The lower curve as `cvm - cvsd`. +#' @slot nzero The number of non-zero coefficients at each lambda. +#' @slot name A text string indicating type of measure (for plotting purposes). +#' @slot lambda.min The value of `lambda` that gives minimum `cvm`. +#' @slot lambda.1se The largest value of `lambda` such that error is within 1 standard error of the minimum. +#' @slot fit.preval If `keep=TRUE`, this is the array of prevalidated fits. Some entries can be NA, if that and subsequent values of lambda are not reached for that fold. +#' @slot foldid An integer vector of values between 1 and nfold identifying what fold each observation is in. +#' @slot call An object of class call, which is used to record how cva.aglm() is called. 
#' #' @export setClass("AccurateGLM", @@ -26,14 +37,15 @@ setClass("AccurateGLM", foldid="integer", call="ANY")) -#' S4 class for the result of cva.aglm function +#' S4 class for a result of cva.aglm() #' #' @slot models_list Results of cv.glmnet() for all the values of alpha. #' @slot alpha A numeric values specifying alpha values to be examined. #' @slot nfolds An integer value specifying the number of folds. -#' @slot alpha.min The alpha value which achieves the minimum loss. #' @slot alpha.min.index An integer value specifying the index of `alpha.min` in `alpha`. +#' @slot alpha.min The alpha value which achieves the minimum loss. #' @slot lambda.min The lambda value which achieves the minimum loss, when combined with `alpha.min`. +#' @slot call An object of class call, which is used to record how cva.aglm() is called. #' #' @export setClass("CVA_AccurateGLM", diff --git a/man/AccurateGLM-class.Rd b/man/AccurateGLM-class.Rd index a7b43f8..985d2f7 100644 --- a/man/AccurateGLM-class.Rd +++ b/man/AccurateGLM-class.Rd @@ -3,9 +3,9 @@ \docType{class} \name{AccurateGLM-class} \alias{AccurateGLM-class} -\title{S4 class for fitted AGLM} +\title{S4 class for fitted AGLM, used as a result of aglm() or cv.aglm()} \description{ -S4 class for fitted AGLM +S4 class for fitted AGLM, used as a result of aglm() or cv.aglm() } \section{Slots}{ @@ -15,6 +15,28 @@ Currently glmnet is used as a backend and this slot holding a glmnet object.} \item{\code{vars_info}}{A list of list. 
Each element of \code{vars_info} represents one predictor variable and contains various informations of it.} -\item{\code{others}}{slots for holding cross-validation results} +\item{\code{lambda}}{The values of \code{lambda} used in the fits.} + +\item{\code{cvm}}{The mean cross-validated error.} + +\item{\code{cvsd}}{The estimate of standarded error of \code{cvm}.} + +\item{\code{cvup}}{The upper curve as \code{cvm + cvsd}.} + +\item{\code{cvlo}}{The lower curve as \code{cvm - cvsd}.} + +\item{\code{nzero}}{The number of non-zero coefficients at each lambda.} + +\item{\code{name}}{A text string indicating type of measure (for plotting purposes).} + +\item{\code{lambda.min}}{The value of \code{lambda} that gives minimum \code{cvm}.} + +\item{\code{lambda.1se}}{The largest value of \code{lambda} such that error is within 1 standard error of the minimum.} + +\item{\code{fit.preval}}{If \code{keep=TRUE}, this is the array of prevalidated fits. Some entries can be NA, if that and subsequent values of lambda are not reached for that fold.} + +\item{\code{foldid}}{An integer vector of values between 1 and nfold identifying what fold each observation is in.} + +\item{\code{call}}{An object of class call, which is used to record how cva.aglm() is called.} }} diff --git a/man/CVA_AccurateGLM-class.Rd b/man/CVA_AccurateGLM-class.Rd index 31e17b8..e9306d0 100644 --- a/man/CVA_AccurateGLM-class.Rd +++ b/man/CVA_AccurateGLM-class.Rd @@ -3,9 +3,9 @@ \docType{class} \name{CVA_AccurateGLM-class} \alias{CVA_AccurateGLM-class} -\title{S4 class for the result of cva.aglm function} +\title{S4 class for a result of cva.aglm()} \description{ -S4 class for the result of cva.aglm function +S4 class for a result of cva.aglm() } \section{Slots}{ @@ -16,10 +16,12 @@ S4 class for the result of cva.aglm function \item{\code{nfolds}}{An integer value specifying the number of folds.} -\item{\code{alpha.min}}{The alpha value which achieves the minimum loss.} - \item{\code{alpha.min.index}}{An 
integer value specifying the index of \code{alpha.min} in \code{alpha}.} +\item{\code{alpha.min}}{The alpha value which achieves the minimum loss.} + \item{\code{lambda.min}}{The lambda value which achieves the minimum loss, when combined with \code{alpha.min}.} + +\item{\code{call}}{An object of class call, which is used to record how cva.aglm() is called.} }} From 2f2a0bcf3925881c078cf9334d95ee98208cb52f Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 17:27:33 +0900 Subject: [PATCH 05/26] Added some github-related files to .Rbuildignore. --- .Rbuildignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 8312324..0b2a61e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,3 +3,5 @@ ^aglm\.Rcheck$ ^aglm.*\.tar\.gz$ ^aglm.*\.tgz$ +^LICENSE\.md$ +^\.github$ From 62efa89cfdf3812ef80648f6bf4a47331957f9db Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 17:55:53 +0900 Subject: [PATCH 06/26] Fixed undocumented arguments suggested by devtools::check(). 
--- R/aglm-input.R | 2 +- R/coef-aglm.R | 3 +++ R/cv-aglm.R | 1 + R/plot-aglm.R | 1 + R/predict-aglm.R | 4 ++++ R/residuals-aglm.R | 9 +++++++++ man/coef.AccurateGLM.Rd | 6 ++++++ man/cv.aglm.Rd | 2 ++ man/newInput.Rd | 26 -------------------------- man/plot.AccurateGLM.Rd | 2 ++ man/predict.AccurateGLM.Rd | 7 +++++++ man/residuals.AccurateGLM.Rd | 14 ++++++++++++++ 12 files changed, 50 insertions(+), 27 deletions(-) delete mode 100644 man/newInput.Rd diff --git a/R/aglm-input.R b/R/aglm-input.R index 346d70b..1ba8d9e 100644 --- a/R/aglm-input.R +++ b/R/aglm-input.R @@ -10,7 +10,7 @@ setClass("AGLM_Input", representation=representation(vars_info="list", data="data.frame")) -#' Create a new AGLM_Input object +# An inner-use function for creating a new AGLM_Input object #' @importFrom assertthat assert_that #' @importFrom methods new newInput <- function(x, diff --git a/R/coef-aglm.R b/R/coef-aglm.R index 2af61c0..b243b1d 100644 --- a/R/coef-aglm.R +++ b/R/coef-aglm.R @@ -4,8 +4,11 @@ #' Extract coefficients from an AccurateGLM object. #' #' @param object An AccurateGLM object. +#' @param index An integer vector of variable indices whose coefficients should be returned. +#' @param name An character vector of variable names whose coefficients should be returned. #' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. #' Default is the entire sequence used to create the model. +#' @param exact Same as coef.glmnet(). #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @importFrom assertthat assert_that diff --git a/R/cv-aglm.R b/R/cv-aglm.R index 03e417f..8f6ea1a 100644 --- a/R/cv-aglm.R +++ b/R/cv-aglm.R @@ -26,6 +26,7 @@ #' @param nbin.max a maximum number of bins which is automatically generated. Only used when `breaks` is not set. 
#' @param bins_list A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order. #' @param bins_names A list of column name or column index, each name or index of which specifies which column of `x` is binned used with an element of `bins_list` in the same position. +#' @param keep Same as cv.glmnet(). #' @param ... Other arguments are passed directly to backend (currently cv.glmnet() is used), and if not given, backend API's default values are used to call backend functions. #' #' @return Result of cross-validation. diff --git a/R/plot-aglm.R b/R/plot-aglm.R index 901db61..c42c13e 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -26,6 +26,7 @@ #' @param only_plot If `TRUE`, the function set no graphical parameters and no title. #' @param main A character value which indicates titles of panels. #' @param add_rug A boolean value which indicates draw rugplot for quantitative variables. +#' @param ... Other arguments are currently not used. #' #' @export #' @importFrom assertthat assert_that diff --git a/R/predict-aglm.R b/R/predict-aglm.R index 580c456..59f3757 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -5,6 +5,8 @@ #' #' @param object An AccurateGLM object. #' @param newx An input matrix or data.frame used for predictions. +#' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. +#' Default is the entire sequence used to create the model. #' @param type Type of prediction required. #' * Type `"link"` gives the linear predictors for `"binomial"`, `"poisson"` models, and for `"gaussian"` models it gives the fitted values. #' * Type `"response"` gives the fitted probabilities for `"binomial"`, fitted mean for `"poisson"`, and for `"gaussian"` models it is equivalent to type `"link"`. @@ -12,6 +14,8 @@ #' Note that for `"binomial"` models, results are returned only for the class corresponding to the second level of the factor response. 
#' * Type `"class"` applies only to `"binomial"`, and produces the class label corresponding to the maximum probability. #' * Type `"nonzero"` returns a list of the indices of the nonzero coefficients for each value of `s`. +#' @param exact Same as predict.glmnet(). +#' @param newoffset If an offset is used in the fit, then one must be supplied for making predictions (except for type="coefficients" or type="nonzero"). #' @param ... Other arguments are passed directly to backend (currently glmnet() is used), and if not given, backend API's deault values are used. #' #' @return The object returned depends on type. diff --git a/R/residuals-aglm.R b/R/residuals-aglm.R index 57fc297..d0065a9 100644 --- a/R/residuals-aglm.R +++ b/R/residuals-aglm.R @@ -4,11 +4,20 @@ #' Calculate residuals for AGLM model #' #' @param object An AccurateGLM object. +#' @param x An input matrix or data.frame used for predictions in residual calculations. +#' If not given, `x` used for fitting the model is used. +#' @param y A numeric vector used as true target values in residual calculations. +#' If not given, `y` used for fitting the model is used. +#' @param offset A numeric offset values used for predictions in residual calculations. +#' If not given, `offset` used for fitting the model is used. +#' @param weights A numeric weight values, corresponding with exposure size. +#' If not given, `weights` used for fitting the model is used. #' @param type Type of prediction required. #' * Type `"working"` Working residuals. #' * Type `"pearson"` Pearson residuals. #' * Type `"deviance"` Devian residuals. #' @param s A numeric value specifying lambda value at which plotting is required. +#' @param ... Other arguments are currently not used. #' #' @return The object returned depends on type. 
#' diff --git a/man/coef.AccurateGLM.Rd b/man/coef.AccurateGLM.Rd index 9117eff..2fdf14c 100644 --- a/man/coef.AccurateGLM.Rd +++ b/man/coef.AccurateGLM.Rd @@ -9,9 +9,15 @@ \arguments{ \item{object}{An AccurateGLM object.} +\item{index}{An integer vector of variable indices whose coefficients should be returned.} + +\item{name}{An character vector of variable names whose coefficients should be returned.} + \item{s}{Value(s) of the penalty parameter \code{lambda} at which predictions are required. Default is the entire sequence used to create the model.} +\item{exact}{Same as coef.glmnet().} + \item{...}{Other arguments are passed directly to \code{deviance} functions of \code{model@backend_models}.} } \description{ diff --git a/man/cv.aglm.Rd b/man/cv.aglm.Rd index 1af20c7..fd8458f 100644 --- a/man/cv.aglm.Rd +++ b/man/cv.aglm.Rd @@ -68,6 +68,8 @@ values are constructed. Choose "C"(default) or "J". \item{family}{Response type. Currently "gaussian", "binomial", and "poisson" are supported.} +\item{keep}{Same as cv.glmnet().} + \item{...}{Other arguments are passed directly to backend (currently cv.glmnet() is used), and if not given, backend API's default values are used to call backend functions.} } \value{ diff --git a/man/newInput.Rd b/man/newInput.Rd deleted file mode 100644 index 14a481e..0000000 --- a/man/newInput.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aglm-input.R -\name{newInput} -\alias{newInput} -\title{Create a new AGLM_Input object} -\usage{ -newInput( - x, - qualitative_vars_UD_only = NULL, - qualitative_vars_both = NULL, - qualitative_vars_OD_only = NULL, - quantitative_vars = NULL, - use_LVar = FALSE, - extrapolation = "default", - add_linear_columns = TRUE, - add_OD_columns_of_qualitatives = TRUE, - add_interaction_columns = TRUE, - OD_type_of_quantitatives = "C", - nbin.max = NULL, - bins_list = NULL, - bins_names = NULL -) -} -\description{ -Create a new AGLM_Input object -} 
diff --git a/man/plot.AccurateGLM.Rd b/man/plot.AccurateGLM.Rd index 23c1aab..16931e3 100644 --- a/man/plot.AccurateGLM.Rd +++ b/man/plot.AccurateGLM.Rd @@ -55,6 +55,8 @@ The default value is \code{TRUE}.} \item{main}{A character value which indicates titles of panels.} \item{add_rug}{A boolean value which indicates draw rugplot for quantitative variables.} + +\item{...}{Other arguments are currently not used.} } \description{ Plot coefficients from an AccurateGLM object diff --git a/man/predict.AccurateGLM.Rd b/man/predict.AccurateGLM.Rd index 88faa56..1647626 100644 --- a/man/predict.AccurateGLM.Rd +++ b/man/predict.AccurateGLM.Rd @@ -19,6 +19,9 @@ \item{newx}{An input matrix or data.frame used for predictions.} +\item{s}{Value(s) of the penalty parameter \code{lambda} at which predictions are required. +Default is the entire sequence used to create the model.} + \item{type}{Type of prediction required. \itemize{ \item Type \code{"link"} gives the linear predictors for \code{"binomial"}, \code{"poisson"} models, and for \code{"gaussian"} models it gives the fitted values. @@ -29,6 +32,10 @@ Note that for \code{"binomial"} models, results are returned only for the class \item Type \code{"nonzero"} returns a list of the indices of the nonzero coefficients for each value of \code{s}. }} +\item{exact}{Same as predict.glmnet().} + +\item{newoffset}{If an offset is used in the fit, then one must be supplied for making predictions (except for type="coefficients" or type="nonzero").} + \item{...}{Other arguments are passed directly to backend (currently glmnet() is used), and if not given, backend API's deault values are used.} } \value{ diff --git a/man/residuals.AccurateGLM.Rd b/man/residuals.AccurateGLM.Rd index 2d12d9f..9e027fb 100644 --- a/man/residuals.AccurateGLM.Rd +++ b/man/residuals.AccurateGLM.Rd @@ -18,6 +18,18 @@ \arguments{ \item{object}{An AccurateGLM object.} +\item{x}{An input matrix or data.frame used for predictions in residual calculations. 
+If not given, \code{x} used for fitting the model is used.} + +\item{y}{A numeric vector used as true target values in residual calculations. +If not given, \code{y} used for fitting the model is used.} + +\item{offset}{Numeric offset values used for predictions in residual calculations. +If not given, \code{offset} used for fitting the model is used.} + +\item{weights}{Numeric weight values, corresponding to exposure size. +If not given, \code{weights} used for fitting the model is used.} + \item{type}{Type of prediction required. \itemize{ \item Type \code{"working"} Working residuals. @@ -26,6 +38,8 @@ }} \item{s}{A numeric value specifying lambda value at which plotting is required.} + +\item{...}{Other arguments are currently not used.} } \value{ The object returned depends on type. From 311395559ee8d5509b8b157f05d862d43b661c1d Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Tue, 23 Feb 2021 18:32:49 +0900 Subject: [PATCH 07/26] Small fixes to avoid devtools::check() warnings. --- .Rbuildignore | 1 + DESCRIPTION | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index 0b2a61e..d05fc86 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,4 @@ ^aglm.*\.tgz$ ^LICENSE\.md$ ^\.github$ +^examples/* diff --git a/DESCRIPTION b/DESCRIPTION index 5786568..7953aec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ LazyData: true RoxygenNote: 7.1.1 Roxygen: list(markdown = TRUE) Depends: - R (>= 4.0.2), + R (>= 4.0.0), Imports: glmnet (>= 4.0.2), assertthat, From 9666b44d065fd77ad8d82942e31a55a1f3756def Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Wed, 24 Feb 2021 21:34:29 +0900 Subject: [PATCH 08/26] Fixed documents to avoid error suggested by devtools::spell_check(). 
--- DESCRIPTION | 1 + R/accurate-glm.R | 20 ++++++++++---------- R/aglm.R | 16 ++++++++-------- R/binning.R | 2 +- R/coef-aglm.R | 6 +++--- R/cv-aglm.R | 20 ++++++++++---------- R/cva-aglm.R | 4 ++-- R/deviance-aglm.R | 4 ++-- R/get-dummies.R | 14 +++++++------- R/plot-aglm.R | 14 +++++++------- R/predict-aglm.R | 8 ++++---- R/print-aglm.R | 4 ++-- R/residuals-aglm.R | 4 ++-- README.md | 4 ++-- man/AccurateGLM-class.Rd | 16 ++++++++-------- man/CVA_AccurateGLM-class.Rd | 8 ++++---- man/aglm.Rd | 16 ++++++++-------- man/coef.AccurateGLM.Rd | 8 ++++---- man/cv.aglm.Rd | 20 ++++++++++---------- man/cva.aglm.Rd | 4 ++-- man/deviance.AccurateGLM.Rd | 6 +++--- man/executeBinning.Rd | 2 +- man/getLVarMatForOneVec.Rd | 2 +- man/getODummyMatForOneVec.Rd | 8 ++++---- man/getUDummyMatForOneVec.Rd | 4 ++-- man/plot.AccurateGLM.Rd | 16 ++++++++-------- man/predict.AccurateGLM.Rd | 10 +++++----- man/print.AccurateGLM.Rd | 6 +++--- man/residuals.AccurateGLM.Rd | 4 ++-- 29 files changed, 126 insertions(+), 125 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7953aec..305f380 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,6 +8,7 @@ Description: A handy tool for actuarial modeling, which is designed to achieve b AGLM is based on GLM but customized by expert actuaries for areas which require not only accuracy but also accountability. License: GPL-2 Encoding: UTF-8 +Language: en-US LazyData: true RoxygenNote: 7.1.1 Roxygen: list(markdown = TRUE) diff --git a/R/accurate-glm.R b/R/accurate-glm.R index d9cd575..c786707 100644 --- a/R/accurate-glm.R +++ b/R/accurate-glm.R @@ -2,23 +2,23 @@ # written by Kenji Kondo @ 2019/1/2 -#' S4 class for fitted AGLM, used as a result of aglm() or cv.aglm() +#' S4 class for fitted AGLM, used as a result of `aglm()` or `cv.aglm()` #' #' @slot backend_models Internally used model objects to be passed to backend functions. -#' Currently glmnet is used as a backend and this slot holding a glmnet object. -#' @slot vars_info A list of list. 
Each element of `vars_info` represents one predictor variable and contains various informations of it. +#' Currently `glmnet` is used as a backend and this slot holding a `glmnet` object. +#' @slot vars_info A list of list. Each element of `vars_info` represents one predictor variable and contains various information of it. #' @slot lambda The values of `lambda` used in the fits. #' @slot cvm The mean cross-validated error. -#' @slot cvsd The estimate of standarded error of `cvm`. +#' @slot cvsd The estimate of standard error of `cvm`. #' @slot cvup The upper curve as `cvm + cvsd`. #' @slot cvlo The lower curve as `cvm - cvsd`. #' @slot nzero The number of non-zero coefficients at each lambda. #' @slot name A text string indicating type of measure (for plotting purposes). #' @slot lambda.min The value of `lambda` that gives minimum `cvm`. #' @slot lambda.1se The largest value of `lambda` such that error is within 1 standard error of the minimum. -#' @slot fit.preval If `keep=TRUE`, this is the array of prevalidated fits. Some entries can be NA, if that and subsequent values of lambda are not reached for that fold. -#' @slot foldid An integer vector of values between 1 and nfold identifying what fold each observation is in. -#' @slot call An object of class call, which is used to record how cva.aglm() is called. +#' @slot fit.preval If `keep=TRUE`, this is the array of previously prevalidated fits. Some entries can be NA, if that and subsequent values of lambda are not reached for that fold. +#' @slot foldid An integer vector of values between 1 and `nfold` identifying what fold each observation is in. +#' @slot call An object of class call, which is used to record how `cva.aglm()` is called. #' #' @export setClass("AccurateGLM", @@ -37,15 +37,15 @@ setClass("AccurateGLM", foldid="integer", call="ANY")) -#' S4 class for a result of cva.aglm() +#' S4 class for a result of `cva.aglm()` #' -#' @slot models_list Results of cv.glmnet() for all the values of alpha. 
+#' @slot models_list Results of `cv.glmnet()` for all the values of alpha. #' @slot alpha A numeric values specifying alpha values to be examined. #' @slot nfolds An integer value specifying the number of folds. #' @slot alpha.min.index An integer value specifying the index of `alpha.min` in `alpha`. #' @slot alpha.min The alpha value which achieves the minimum loss. #' @slot lambda.min The lambda value which achieves the minimum loss, when combined with `alpha.min`. -#' @slot call An object of class call, which is used to record how cva.aglm() is called. +#' @slot call An object of class call, which is used to record how `cva.aglm()` is called. #' #' @export setClass("CVA_AccurateGLM", diff --git a/R/aglm.R b/R/aglm.R index a543af4..806ed26 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -9,7 +9,7 @@ #' @param qualitative_vars_UD_only A list of indices or names for specifying which columns are qualitative and need only U-dummy representations. #' @param qualitative_vars_both A list of indices or names for specifying which columns are qualitative and need both U-dummy and O-dummy representations. #' @param qualitative_vars_OD_only A list of indices or names for specifying which columns are qualitative and need only O-dummy representations. -#' @param quantitative_vars A list of indices or names for specyfying which columns are quantitative. +#' @param quantitative_vars A list of indices or names for specifying which columns are quantitative. #' @param use_LVar A boolean value which indicates whether this function uses L-variable representations or not. #' @param extrapolation A character value which indicates how contribution curves outside bins are extrapolated. #' * "default": No extrapolations. @@ -18,17 +18,17 @@ #' @param add_OD_columns_of_qualitatives A boolean value which indicates whether this function use O-dummy representations for qualitative and ordinal variables or not. 
#' @param add_interaction_columns A boolean value which indicates whether this function uses intersection effects or not. #' @param OD_type_of_quantitatives A character value which indicates how O-dummy matrices of quantitative -#' values are constructed. Choose "C"(default) or "J". -#' * "C": Continuous-type dummies, which result continuous contribution curves. -#' * "J": Jump-type dummies, which result contribution curves with jumps. -#' * "N": No use of O-dummies -#' @param family Response type. Currently "gaussian", "binomial", and "poisson" are supported. +#' values are constructed. Choose `"C"`(default) or `"J"`. +#' * `"C"`: Continuous-type dummies, which result continuous contribution curves. +#' * `"J"`: Jump-type dummies, which result contribution curves with jumps. +#' * `"N"`: No use of O-dummies +#' @param family Response type. Currently `"gaussian"`, `"binomial"`, and `"poisson"` are supported. #' @param nbin.max a maximum number of bins which is automatically generated. Only used when `breaks` is not set. #' @param bins_list A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order. #' @param bins_names A list of column name or column index, each name or index of which specifies which column of `x` is binned used with an element of `bins_list` in the same position. -#' @param ... Other arguments are passed directly to backend (currently glmnet() is used), and if not given, backend API's default values are used to call backend functions. +#' @param ... Other arguments are passed directly to backend (currently `glmnet()` is used), and if not given, default values of the backend API are used to call backend functions. 
#' -#' @return An AccurateGLM object, fitted to the data (x, y) +#' @return An `AccurateGLM` object, fitted to the data `(x, y)` #' #' @export #' @importFrom assertthat assert_that diff --git a/R/binning.R b/R/binning.R index 6c97e6d..b9734a7 100644 --- a/R/binning.R +++ b/R/binning.R @@ -66,7 +66,7 @@ createEqualFreqBins <- function(x_vec, nbin.max) { #' #' @return a list which has two members `labels` and `breaks`. #' * `labels`: an integer vector of `length(x_vec)`. -#' `(labels[i]==k)` indicates the i-th element of x_vec is in the k-th bin. +#' `(labels[i]==k)` indicates the `i-th` element of `x_vec` is in the `k-th` bin. #' * `breaks`: a numeric vector which indicates the boundaries of bins, of length (number of bins - 1). #' #' @export diff --git a/R/coef-aglm.R b/R/coef-aglm.R index b243b1d..80700d4 100644 --- a/R/coef-aglm.R +++ b/R/coef-aglm.R @@ -1,14 +1,14 @@ # calculate deviances for AGLM # written by Kenji Kondo @ 2019/1/3 -#' Extract coefficients from an AccurateGLM object. +#' Extract coefficients from an `AccurateGLM` object. #' -#' @param object An AccurateGLM object. +#' @param object An `AccurateGLM` object. #' @param index An integer vector of variable indices whose coefficients should be returned. #' @param name An character vector of variable names whose coefficients should be returned. #' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. #' Default is the entire sequence used to create the model. -#' @param exact Same as coef.glmnet(). +#' @param exact Same as `coef.glmnet()`. #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @importFrom assertthat assert_that diff --git a/R/cv-aglm.R b/R/cv-aglm.R index 8f6ea1a..49e43b5 100644 --- a/R/cv-aglm.R +++ b/R/cv-aglm.R @@ -9,25 +9,25 @@ #' @param qualitative_vars_UD_only A list of indices or names for specifying which columns are qualitative and need only U-dummy representations. 
#' @param qualitative_vars_both A list of indices or names for specifying which columns are qualitative and need both U-dummy and O-dummy representations. #' @param qualitative_vars_OD_only A list of indices or names for specifying which columns are qualitative and need only O-dummy representations. -#' @param quantitative_vars A list of indices or names for specyfying which columns are quantitative. +#' @param quantitative_vars A list of indices or names for specifying which columns are quantitative. #' @param use_LVar A boolean value which indicates whether this function uses L-variable representations or not. #' @param extrapolation A character value which indicates how contribution curves outside bins are extrapolated. -#' * "default": No extrapolations. -#' * "flat": Extrapolates with flat lines. +#' * `"default"`: No extrapolations. +#' * `"flat"`: Extrapolates with flat lines. #' @param add_linear_columns A boolean value which indicates whether this function uses linear effects or not. #' @param add_OD_columns_of_qualitatives A boolean value which indicates whether this function use O-dummy representations for qualitative and ordinal variables or not. #' @param add_interaction_columns A boolean value which indicates whether this function uses interaction effects or not. #' @param OD_type_of_quantitatives A character value which indicates how O-dummy matrices of quantitative -#' values are constructed. Choose "C"(default) or "J". -#' * "C": Continuous-type dummies, which result continuous contribution curves. -#' * "J": Jump-type dummies, which result contribution curves with jumps. -#' * "N": No use of O-dummies -#' @param family Response type. Currently "gaussian", "binomial", and "poisson" are supported. +#' values are constructed. Choose `"C"`(default) or `"J"`. +#' * `"C"`: Continuous-type dummies, which result continuous contribution curves. +#' * `"J"`: Jump-type dummies, which result contribution curves with jumps. 
+#' * `"N"`: No use of O-dummies +#' @param family Response type. Currently `"gaussian"`, `"binomial"`, and `"poisson"` are supported. #' @param nbin.max a maximum number of bins which is automatically generated. Only used when `breaks` is not set. #' @param bins_list A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order. #' @param bins_names A list of column name or column index, each name or index of which specifies which column of `x` is binned used with an element of `bins_list` in the same position. -#' @param keep Same as cv.glmnet(). -#' @param ... Other arguments are passed directly to backend (currently cv.glmnet() is used), and if not given, backend API's default values are used to call backend functions. +#' @param keep Same as `cv.glmnet()`. +#' @param ... Other arguments are passed directly to backend (currently `cv.glmnet()` is used), and if not given, default values of the backend API are used to call backend functions. #' #' @return Result of cross-validation. #' diff --git a/R/cva-aglm.R b/R/cva-aglm.R index 4fd6f49..094f90e 100644 --- a/R/cva-aglm.R +++ b/R/cva-aglm.R @@ -8,9 +8,9 @@ #' @param y An integer or numeric vector which represents response variable. #' @param alpha A numeric vector which represents alpha values to be examined. #' @param nfolds The number of folds. The default value is 10. -#' @param foldid An integer vector of values between 1 and nfold identifying what fold each observation is in. +#' @param foldid An integer vector of values between 1 and `nfold` identifying what fold each observation is in. #' @param parallel.alpha not implemented yet. -#' @param ... Other arguments are passed directly to cv.aglm(). +#' @param ... Other arguments are passed directly to `cv.aglm()`. #' #' @return Result of cross-validation. See `CVA_AccurateGLM`'s document for more details. 
#' diff --git a/R/deviance-aglm.R b/R/deviance-aglm.R index 4e108e3..ecd1121 100644 --- a/R/deviance-aglm.R +++ b/R/deviance-aglm.R @@ -1,9 +1,9 @@ # calculate deviances for AGLM # written by Kenji Kondo @ 2019/1/3 -#' Extract the deviance from an AccurateGLM object. +#' Extract the deviance from an `AccurateGLM` object. #' -#' @param object An AccurateGLM object. +#' @param object An `AccurateGLM` object. #' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @export diff --git a/R/get-dummies.R b/R/get-dummies.R index 9665920..0bc4a98 100644 --- a/R/get-dummies.R +++ b/R/get-dummies.R @@ -6,8 +6,8 @@ #' @param x_vec A non-numeric vector to be converted into dummy matrix. #' @param levels A character vector indicates unique set of possible values. #' If NULL, all the unique values of `x_vec` are used. -#' @param drop_last A boolean value. If TRUE, the last column of dummy matrix is dropped to avoid colinear -#' @param only_info A bboolean value. If TRUE, actual creation of dummy matrix is omitted. +#' @param drop_last A logical value. If TRUE, the last column of dummy matrix is dropped. +#' @param only_info A logical value. If TRUE, actual creation of dummy matrix is omitted. #' #' @return a list with two members `levels` and `dummy_mat`. #' * `levels`: Same as input @@ -47,11 +47,11 @@ getUDummyMatForOneVec <- function(x_vec, levels=NULL, drop_last=TRUE, only_info= #' @param breaks A numeric vector which indicates the boundaries of bins, of length (number of bins + 1). #' If NULL, evenly cut bins are automatically generated and used. #' @param nbin.max A maximum number of bins which is used. Only used when `breaks` is not set. -#' @param only_info A boolean value. If TRUE, actual creation of dummy matrix is omitted. -#' @param dummy_type A character value. Choose "C"(default) or "J". For integer or numeric `x_vec`, +#' @param only_info A logical value. If TRUE, actual creation of dummy matrix is omitted. 
+#' @param dummy_type A character value. Choose `"C"`(default) or `"J"`. For integer or numeric `x_vec`, #' `dummy_type="C"` is used as default. Otherwise, `dummy_type="J"` is used as default. -#' * "C": Continuous-type dummies, which result continuous contribution curves. -#' * "J": Jum-type dummies, which result contribution curves with jumps. +#' * `"C"`: Continuous-type dummies, which result continuous contribution curves. +#' * `"J"`: Jump-type dummies, which result contribution curves with jumps. #' #' @return a list with two members `breaks` and `dummy_mat`. #' * `breaks`: Same as input @@ -103,7 +103,7 @@ getODummyMatForOneVec <- function(x_vec, breaks=NULL, nbin.max=100, only_info=FA #' @param breaks A numeric vector which indicates the boundaries of bins, of length (number of bins + 1). #' If NULL, evenly cut bins are automatically generated and used. #' @param nbin.max A maximum number of bins which is used. Only used when `breaks` is not set. -#' @param only_info A boolean value. If TRUE, actual creation of dummy matrix is omitted. +#' @param only_info A logical value. If TRUE, actual creation of dummy matrix is omitted. #' #' @return a list with two members `breaks` and `dummy_mat`. #' * `breaks`: Same as input diff --git a/R/plot-aglm.R b/R/plot-aglm.R index c42c13e..3a4bc45 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -1,31 +1,31 @@ # plotting function for AGLM # written by Kenji Kondo @ 2019/1/3 -#' Plot coefficients from an AccurateGLM object +#' Plot coefficients from an `AccurateGLM` object #' -#' @param x An AccurateGLM object. +#' @param x An `AccurateGLM` object. #' @param vars An integer or character vectors (indices or names) specifying which variables should be plotted. #' @param verbose If TRUE, outputs details. #' @param s A numeric value specifying lambda value at which plotting is required. 
#' Note that this function can't plot for multiple lambda values, so it allows only #' single `s` value (which means `model` is trained with multiple lambda values and plot with one of them), #' or `s=NULL` (which means `model` is trained with single lambda value and plot with that value). -#' @param resid A boolean value which indicates to plot residuals, +#' @param resid A logical value which indicates to plot residuals, #' or a character value which indicates residual type to be plotted (see the help of `residuals.AccurateGLM()`), #' or a numerical vector which indicates residual values to be plotted. #' Note that working residuals are used in the first case with `resid=TRUE`. -#' @param smooth_resid A boolean value which indicates whether draws smoothing lines of residuals or not, +#' @param smooth_resid A logical value which indicates whether draws smoothing lines of residuals or not, #' or a character value which is one of options below: #' * `"both"` draws both balls and smoothing lines. #' * `"smooth_only"` draws only smoothing line. #' Note that smoothing lines are only drawn for quantitative variables. #' The default value is `TRUE`. #' @param smooth_resid_fun A function to be used to smooth partial residual values. -#' @param ask A boolean value which indicates ask if go to next plot. -#' @param layout A pair of integer values which indicates how many plots are drawn rawwise and columnwise respectively, +#' @param ask A logical value which indicates ask if go to next plot. +#' @param layout A pair of integer values which indicates how many plots are drawn row-wise and column-wise respectively, #' @param only_plot If `TRUE`, the function set no graphical parameters and no title. #' @param main A character value which indicates titles of panels. -#' @param add_rug A boolean value which indicates draw rugplot for quantitative variables. +#' @param add_rug A logical value which indicates draw rug plot for quantitative variables. #' @param ... 
Other arguments are currently not used. #' #' @export diff --git a/R/predict-aglm.R b/R/predict-aglm.R index 59f3757..3cefecf 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -1,9 +1,9 @@ # predicting function for AGLM model # written by Kenji Kondo @ 2019/1/3 -#' Make predictions from a fitted AccurateGLM +#' Make predictions from a fitted `AccurateGLM` #' -#' @param object An AccurateGLM object. +#' @param object An `AccurateGLM` object. #' @param newx An input matrix or data.frame used for predictions. #' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. #' Default is the entire sequence used to create the model. @@ -14,9 +14,9 @@ #' Note that for `"binomial"` models, results are returned only for the class corresponding to the second level of the factor response. #' * Type `"class"` applies only to `"binomial"`, and produces the class label corresponding to the maximum probability. #' * Type `"nonzero"` returns a list of the indices of the nonzero coefficients for each value of `s`. -#' @param exact Same as predict.glmnet(). +#' @param exact Same as `predict.glmnet()`. #' @param newoffset If an offset is used in the fit, then one must be supplied for making predictions (except for type="coefficients" or type="nonzero"). -#' @param ... Other arguments are passed directly to backend (currently glmnet() is used), and if not given, backend API's deault values are used. +#' @param ... Other arguments are passed directly to backend (currently `glmnet()` is used), and if not given, default values of the backend API are used. #' #' @return The object returned depends on type. #' diff --git a/R/print-aglm.R b/R/print-aglm.R index ff376f3..0d13c60 100644 --- a/R/print-aglm.R +++ b/R/print-aglm.R @@ -1,9 +1,9 @@ # printing function for AGLM # witten by Kenji Kondo @ 2019/1/3 -#' Print an AccurateGLM object +#' Print an `AccurateGLM` object #' -#' @param x An AccurateGLM object. +#' @param x An `AccurateGLM` object. 
#' @param digits Significant digits in printout. #' @param ... Other arguments are passed directly to `print` functions of `model@backend_models`. #' diff --git a/R/residuals-aglm.R b/R/residuals-aglm.R index d0065a9..9e3ffa5 100644 --- a/R/residuals-aglm.R +++ b/R/residuals-aglm.R @@ -3,7 +3,7 @@ #' Calculate residuals for AGLM model #' -#' @param object An AccurateGLM object. +#' @param object An `AccurateGLM` object. #' @param x An input matrix or data.frame used for predictions in residual calculations. #' If not given, `x` used for fitting the model is used. #' @param y A numeric vector used as true target values in residual calculations. @@ -15,7 +15,7 @@ #' @param type Type of prediction required. #' * Type `"working"` Working residuals. #' * Type `"pearson"` Pearson residuals. -#' * Type `"deviance"` Devian residuals. +#' * Type `"deviance"` Deviance residuals. #' @param s A numeric value specifying lambda value at which plotting is required. #' @param ... Other arguments are currently not used. #' diff --git a/README.md b/README.md index 142641a..e325075 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ devtools::install_github("kkondo1981/aglm") ``` *Note:* -Now aglm requires the newest version of R and glmnet, so please update versions of your local packages before installation. +Now `aglm` requires the newest version of `R` and `glmnet`, so please update versions of your local packages before installation. ### Usage See https://github.com/kkondo1981/aglm/tree/master/examples @@ -17,4 +17,4 @@ See https://github.com/kkondo1981/aglm/tree/master/examples ### Release note #### version 0.3.2 -- Fixed to use R 4.0 and glmnet 4.0. +- Fixed to use `R` 4.0 and `glmnet` 4.0. 
diff --git a/man/AccurateGLM-class.Rd b/man/AccurateGLM-class.Rd index 985d2f7..b645d1d 100644 --- a/man/AccurateGLM-class.Rd +++ b/man/AccurateGLM-class.Rd @@ -3,23 +3,23 @@ \docType{class} \name{AccurateGLM-class} \alias{AccurateGLM-class} -\title{S4 class for fitted AGLM, used as a result of aglm() or cv.aglm()} +\title{S4 class for fitted AGLM, used as a result of \code{aglm()} or \code{cv.aglm()}} \description{ -S4 class for fitted AGLM, used as a result of aglm() or cv.aglm() +S4 class for fitted AGLM, used as a result of \code{aglm()} or \code{cv.aglm()} } \section{Slots}{ \describe{ \item{\code{backend_models}}{Internally used model objects to be passed to backend functions. -Currently glmnet is used as a backend and this slot holding a glmnet object.} +Currently \code{glmnet} is used as a backend and this slot holding a \code{glmnet} object.} -\item{\code{vars_info}}{A list of list. Each element of \code{vars_info} represents one predictor variable and contains various informations of it.} +\item{\code{vars_info}}{A list of list. Each element of \code{vars_info} represents one predictor variable and contains various information of it.} \item{\code{lambda}}{The values of \code{lambda} used in the fits.} \item{\code{cvm}}{The mean cross-validated error.} -\item{\code{cvsd}}{The estimate of standarded error of \code{cvm}.} +\item{\code{cvsd}}{The estimate of standard error of \code{cvm}.} \item{\code{cvup}}{The upper curve as \code{cvm + cvsd}.} @@ -33,10 +33,10 @@ Currently glmnet is used as a backend and this slot holding a glmnet object.} \item{\code{lambda.1se}}{The largest value of \code{lambda} such that error is within 1 standard error of the minimum.} -\item{\code{fit.preval}}{If \code{keep=TRUE}, this is the array of prevalidated fits. Some entries can be NA, if that and subsequent values of lambda are not reached for that fold.} +\item{\code{fit.preval}}{If \code{keep=TRUE}, this is the array of previously prevalidated fits. 
Some entries can be NA, if that and subsequent values of lambda are not reached for that fold.} -\item{\code{foldid}}{An integer vector of values between 1 and nfold identifying what fold each observation is in.} +\item{\code{foldid}}{An integer vector of values between 1 and \code{nfold} identifying what fold each observation is in.} -\item{\code{call}}{An object of class call, which is used to record how cva.aglm() is called.} +\item{\code{call}}{An object of class call, which is used to record how \code{cva.aglm()} is called.} }} diff --git a/man/CVA_AccurateGLM-class.Rd b/man/CVA_AccurateGLM-class.Rd index e9306d0..57cf220 100644 --- a/man/CVA_AccurateGLM-class.Rd +++ b/man/CVA_AccurateGLM-class.Rd @@ -3,14 +3,14 @@ \docType{class} \name{CVA_AccurateGLM-class} \alias{CVA_AccurateGLM-class} -\title{S4 class for a result of cva.aglm()} +\title{S4 class for a result of \code{cva.aglm()}} \description{ -S4 class for a result of cva.aglm() +S4 class for a result of \code{cva.aglm()} } \section{Slots}{ \describe{ -\item{\code{models_list}}{Results of cv.glmnet() for all the values of alpha.} +\item{\code{models_list}}{Results of \code{cv.glmnet()} for all the values of alpha.} \item{\code{alpha}}{A numeric values specifying alpha values to be examined.} @@ -22,6 +22,6 @@ S4 class for a result of cva.aglm() \item{\code{lambda.min}}{The lambda value which achieves the minimum loss, when combined with \code{alpha.min}.} -\item{\code{call}}{An object of class call, which is used to record how cva.aglm() is called.} +\item{\code{call}}{An object of class call, which is used to record how \code{cva.aglm()} is called.} }} diff --git a/man/aglm.Rd b/man/aglm.Rd index 99ff46e..8dd37a7 100644 --- a/man/aglm.Rd +++ b/man/aglm.Rd @@ -35,7 +35,7 @@ aglm( \item{qualitative_vars_OD_only}{A list of indices or names for specifying which columns are qualitative and need only O-dummy representations.} -\item{quantitative_vars}{A list of indices or names for specyfying which columns are 
quantitative.} +\item{quantitative_vars}{A list of indices or names for specifying which columns are quantitative.} \item{use_LVar}{A boolean value which indicates whether this function uses L-variable representations or not.} @@ -52,11 +52,11 @@ aglm( \item{add_interaction_columns}{A boolean value which indicates whether this function uses intersection effects or not.} \item{OD_type_of_quantitatives}{A character value which indicates how O-dummy matrices of quantitative -values are constructed. Choose "C"(default) or "J". +values are constructed. Choose \code{"C"}(default) or \code{"J"}. \itemize{ -\item "C": Continuous-type dummies, which result continuous contribution curves. -\item "J": Jump-type dummies, which result contribution curves with jumps. -\item "N": No use of O-dummies +\item \code{"C"}: Continuous-type dummies, which result continuous contribution curves. +\item \code{"J"}: Jump-type dummies, which result contribution curves with jumps. +\item \code{"N"}: No use of O-dummies }} \item{nbin.max}{a maximum number of bins which is automatically generated. Only used when \code{breaks} is not set.} @@ -65,12 +65,12 @@ values are constructed. Choose "C"(default) or "J". \item{bins_names}{A list of column name or column index, each name or index of which specifies which column of \code{x} is binned used with an element of \code{bins_list} in the same position.} -\item{family}{Response type. Currently "gaussian", "binomial", and "poisson" are supported.} +\item{family}{Response type. 
Currently \code{"gaussian"}, \code{"binomial"}, and \code{"poisson"} are supported.} -\item{...}{Other arguments are passed directly to backend (currently glmnet() is used), and if not given, backend API's default values are used to call backend functions.} +\item{...}{Other arguments are passed directly to backend (currently \code{glmnet()} is used), and if not given, default values of the backend API are used to call backend functions.} } \value{ -An AccurateGLM object, fitted to the data (x, y) +An \code{AccurateGLM} object, fitted to the data \verb{(x, y)} } \description{ fit an AGLM model diff --git a/man/coef.AccurateGLM.Rd b/man/coef.AccurateGLM.Rd index 2fdf14c..542c7e9 100644 --- a/man/coef.AccurateGLM.Rd +++ b/man/coef.AccurateGLM.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/coef-aglm.R \name{coef.AccurateGLM} \alias{coef.AccurateGLM} -\title{Extract coefficients from an AccurateGLM object.} +\title{Extract coefficients from an \code{AccurateGLM} object.} \usage{ \method{coef}{AccurateGLM}(object, index = NULL, name = NULL, s = NULL, exact = FALSE, ...) } \arguments{ -\item{object}{An AccurateGLM object.} +\item{object}{An \code{AccurateGLM} object.} \item{index}{An integer vector of variable indices whose coefficients should be returned.} @@ -16,10 +16,10 @@ \item{s}{Value(s) of the penalty parameter \code{lambda} at which predictions are required. Default is the entire sequence used to create the model.} -\item{exact}{Same as coef.glmnet().} +\item{exact}{Same as \code{coef.glmnet()}.} \item{...}{Other arguments are passed directly to \code{deviance} functions of \code{model@backend_models}.} } \description{ -Extract coefficients from an AccurateGLM object. +Extract coefficients from an \code{AccurateGLM} object. 
} diff --git a/man/cv.aglm.Rd b/man/cv.aglm.Rd index fd8458f..feb05c8 100644 --- a/man/cv.aglm.Rd +++ b/man/cv.aglm.Rd @@ -36,14 +36,14 @@ cv.aglm( \item{qualitative_vars_OD_only}{A list of indices or names for specifying which columns are qualitative and need only O-dummy representations.} -\item{quantitative_vars}{A list of indices or names for specyfying which columns are quantitative.} +\item{quantitative_vars}{A list of indices or names for specifying which columns are quantitative.} \item{use_LVar}{A boolean value which indicates whether this function uses L-variable representations or not.} \item{extrapolation}{A character value which indicates how contribution curves outside bins are extrapolated. \itemize{ -\item "default": No extrapolations. -\item "flat": Extrapolates with flat lines. +\item \code{"default"}: No extrapolations. +\item \code{"flat"}: Extrapolates with flat lines. }} \item{add_linear_columns}{A boolean value which indicates whether this function uses linear effects or not.} @@ -53,11 +53,11 @@ cv.aglm( \item{add_interaction_columns}{A boolean value which indicates whether this function uses interaction effects or not.} \item{OD_type_of_quantitatives}{A character value which indicates how O-dummy matrices of quantitative -values are constructed. Choose "C"(default) or "J". +values are constructed. Choose \code{"C"}(default) or \code{"J"}. \itemize{ -\item "C": Continuous-type dummies, which result continuous contribution curves. -\item "J": Jump-type dummies, which result contribution curves with jumps. -\item "N": No use of O-dummies +\item \code{"C"}: Continuous-type dummies, which result continuous contribution curves. +\item \code{"J"}: Jump-type dummies, which result contribution curves with jumps. +\item \code{"N"}: No use of O-dummies }} \item{nbin.max}{a maximum number of bins which is automatically generated. Only used when \code{breaks} is not set.} @@ -66,11 +66,11 @@ values are constructed. Choose "C"(default) or "J". 
\item{bins_names}{A list of column name or column index, each name or index of which specifies which column of \code{x} is binned used with an element of \code{bins_list} in the same position.} -\item{family}{Response type. Currently "gaussian", "binomial", and "poisson" are supported.} +\item{family}{Response type. Currently \code{"gaussian"}, \code{"binomial"}, and \code{"poisson"} are supported.} -\item{keep}{Same as cv.glmnet().} +\item{keep}{Same as \code{cv.glmnet()}.} -\item{...}{Other arguments are passed directly to backend (currently cv.glmnet() is used), and if not given, backend API's default values are used to call backend functions.} +\item{...}{Other arguments are passed directly to backend (currently \code{cv.glmnet()} is used), and if not given, default values of the backend API are used to call backend functions.} } \value{ Result of cross-validation. diff --git a/man/cva.aglm.Rd b/man/cva.aglm.Rd index d6b0b68..1db062e 100644 --- a/man/cva.aglm.Rd +++ b/man/cva.aglm.Rd @@ -23,11 +23,11 @@ cva.aglm( \item{nfolds}{The number of folds. The default value is 10.} -\item{foldid}{An integer vector of values between 1 and nfold identifying what fold each observation is in.} +\item{foldid}{An integer vector of values between 1 and \code{nfold} identifying what fold each observation is in.} \item{parallel.alpha}{not implemented yet.} -\item{...}{Other arguments are passed directly to cv.aglm().} +\item{...}{Other arguments are passed directly to \code{cv.aglm()}.} } \value{ Result of cross-validation. See \code{CVA_AccurateGLM}'s document for more details. 
diff --git a/man/deviance.AccurateGLM.Rd b/man/deviance.AccurateGLM.Rd index bb0b462..d9b8c69 100644 --- a/man/deviance.AccurateGLM.Rd +++ b/man/deviance.AccurateGLM.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/deviance-aglm.R \name{deviance.AccurateGLM} \alias{deviance.AccurateGLM} -\title{Extract the deviance from an AccurateGLM object.} +\title{Extract the deviance from an \code{AccurateGLM} object.} \usage{ \method{deviance}{AccurateGLM}(object, ...) } \arguments{ -\item{object}{An AccurateGLM object.} +\item{object}{An \code{AccurateGLM} object.} \item{...}{Other arguments are passed directly to \code{deviance} functions of \code{model@backend_models}.} } \description{ -Extract the deviance from an AccurateGLM object. +Extract the deviance from an \code{AccurateGLM} object. } diff --git a/man/executeBinning.Rd b/man/executeBinning.Rd index 578cac6..d62912e 100644 --- a/man/executeBinning.Rd +++ b/man/executeBinning.Rd @@ -21,7 +21,7 @@ Ignore if \code{breaks} is set.} a list which has two members \code{labels} and \code{breaks}. \itemize{ \item \code{labels}: an integer vector of \code{length(x_vec)}. -\code{(labels[i]==k)} indicates the i-th element of x_vec is in the k-th bin. +\code{(labels[i]==k)} indicates the \code{i-th} element of \code{x_vec} is in the \code{k-th} bin. \item \code{breaks}: a numeric vector which indicates the boundaries of bins, of length (number of bins - 1). } } diff --git a/man/getLVarMatForOneVec.Rd b/man/getLVarMatForOneVec.Rd index b5c0922..76ad39f 100644 --- a/man/getLVarMatForOneVec.Rd +++ b/man/getLVarMatForOneVec.Rd @@ -14,7 +14,7 @@ If NULL, evenly cut bins are automatically generated and used.} \item{nbin.max}{A maximum number of bins which is used. Only used when \code{breaks} is not set.} -\item{only_info}{A boolean value. If TRUE, actual creation of dummy matrix is omitted.} +\item{only_info}{A logical value. 
If TRUE, actual creation of dummy matrix is omitted.} } \value{ a list with two members \code{breaks} and \code{dummy_mat}. diff --git a/man/getODummyMatForOneVec.Rd b/man/getODummyMatForOneVec.Rd index 1cacde3..75556e1 100644 --- a/man/getODummyMatForOneVec.Rd +++ b/man/getODummyMatForOneVec.Rd @@ -20,13 +20,13 @@ If NULL, evenly cut bins are automatically generated and used.} \item{nbin.max}{A maximum number of bins which is used. Only used when \code{breaks} is not set.} -\item{only_info}{A boolean value. If TRUE, actual creation of dummy matrix is omitted.} +\item{only_info}{A logical value. If TRUE, actual creation of dummy matrix is omitted.} -\item{dummy_type}{A character value. Choose "C"(default) or "J". For integer or numeric \code{x_vec}, +\item{dummy_type}{A character value. Choose \code{"C"}(default) or \code{"J"}. For integer or numeric \code{x_vec}, \code{dummy_type="C"} is used as default. Otherwise, \code{dummy_type="J"} is used as default. \itemize{ -\item "C": Continuous-type dummies, which result continuous contribution curves. -\item "J": Jum-type dummies, which result contribution curves with jumps. +\item \code{"C"}: Continuous-type dummies, which result continuous contribution curves. +\item \code{"J"}: Jump-type dummies, which result contribution curves with jumps. }} } \value{ diff --git a/man/getUDummyMatForOneVec.Rd b/man/getUDummyMatForOneVec.Rd index 793c67d..1cf94ad 100644 --- a/man/getUDummyMatForOneVec.Rd +++ b/man/getUDummyMatForOneVec.Rd @@ -17,9 +17,9 @@ getUDummyMatForOneVec( \item{levels}{A character vector indicates unique set of possible values. If NULL, all the unique values of \code{x_vec} are used.} -\item{drop_last}{A boolean value. If TRUE, the last column of dummy matrix is dropped to avoid colinear} +\item{drop_last}{A logical value. If TRUE, the last column of dummy matrix is dropped.} -\item{only_info}{A bboolean value. If TRUE, actual creation of dummy matrix is omitted.} +\item{only_info}{A logical value. 
If TRUE, actual creation of dummy matrix is omitted.} } \value{ a list with two members \code{levels} and \code{dummy_mat}. diff --git a/man/plot.AccurateGLM.Rd b/man/plot.AccurateGLM.Rd index 16931e3..5a394a9 100644 --- a/man/plot.AccurateGLM.Rd +++ b/man/plot.AccurateGLM.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/plot-aglm.R \name{plot.AccurateGLM} \alias{plot.AccurateGLM} -\title{Plot coefficients from an AccurateGLM object} +\title{Plot coefficients from an \code{AccurateGLM} object} \usage{ \method{plot}{AccurateGLM}( x, @@ -21,7 +21,7 @@ ) } \arguments{ -\item{x}{An AccurateGLM object.} +\item{x}{An \code{AccurateGLM} object.} \item{vars}{An integer or character vectors (indices or names) specifying which variables should be plotted.} @@ -32,12 +32,12 @@ Note that this function can't plot for multiple lambda values, so it allows only single \code{s} value (which means \code{model} is trained with multiple lambda values and plot with one of them), or \code{s=NULL} (which means \code{model} is trained with single lambda value and plot with that value).} -\item{resid}{A boolean value which indicates to plot residuals, +\item{resid}{A logical value which indicates to plot residuals, or a character value which indicates residual type to be plotted (see the help of \code{residuals.AccurateGLM()}), or a numerical vector which indicates residual values to be plotted. Note that working residuals are used in the first case with \code{resid=TRUE}.} -\item{smooth_resid}{A boolean value which indicates whether draws smoothing lines of residuals or not, +\item{smooth_resid}{A logical value which indicates whether draws smoothing lines of residuals or not, or a character value which is one of options below: * \code{"both"} draws both balls and smoothing lines. * \code{"smooth_only"} draws only smoothing line. 
@@ -46,18 +46,18 @@ The default value is \code{TRUE}.} \item{smooth_resid_fun}{A function to be used to smooth partial residual values.} -\item{ask}{A boolean value which indicates ask if go to next plot.} +\item{ask}{A logical value which indicates ask if go to next plot.} -\item{layout}{A pair of integer values which indicates how many plots are drawn rawwise and columnwise respectively,} +\item{layout}{A pair of integer values which indicates how many plots are drawn row-wise and column-wise respectively,} \item{only_plot}{If \code{TRUE}, the function set no graphical parameters and no title.} \item{main}{A character value which indicates titles of panels.} -\item{add_rug}{A boolean value which indicates draw rugplot for quantitative variables.} +\item{add_rug}{A logical value which indicates draw rug plot for quantitative variables.} \item{...}{Other arguments are currently not used.} } \description{ -Plot coefficients from an AccurateGLM object +Plot coefficients from an \code{AccurateGLM} object } diff --git a/man/predict.AccurateGLM.Rd b/man/predict.AccurateGLM.Rd index 1647626..057b374 100644 --- a/man/predict.AccurateGLM.Rd +++ b/man/predict.AccurateGLM.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/predict-aglm.R \name{predict.AccurateGLM} \alias{predict.AccurateGLM} -\title{Make predictions from a fitted AccurateGLM} +\title{Make predictions from a fitted \code{AccurateGLM}} \usage{ \method{predict}{AccurateGLM}( object, @@ -15,7 +15,7 @@ ) } \arguments{ -\item{object}{An AccurateGLM object.} +\item{object}{An \code{AccurateGLM} object.} \item{newx}{An input matrix or data.frame used for predictions.} @@ -32,15 +32,15 @@ Note that for \code{"binomial"} models, results are returned only for the class \item Type \code{"nonzero"} returns a list of the indices of the nonzero coefficients for each value of \code{s}. 
}} -\item{exact}{Same as predict.glmnet().} +\item{exact}{Same as \code{predict.glmnet()}.} \item{newoffset}{If an offset is used in the fit, then one must be supplied for making predictions (except for type="coefficients" or type="nonzero").} -\item{...}{Other arguments are passed directly to backend (currently glmnet() is used), and if not given, backend API's deault values are used.} +\item{...}{Other arguments are passed directly to backend (currently \code{glmnet()} is used), and if not given, default values of the backend API are used.} } \value{ The object returned depends on type. } \description{ -Make predictions from a fitted AccurateGLM +Make predictions from a fitted \code{AccurateGLM} } diff --git a/man/print.AccurateGLM.Rd b/man/print.AccurateGLM.Rd index 8c04bec..30fda75 100644 --- a/man/print.AccurateGLM.Rd +++ b/man/print.AccurateGLM.Rd @@ -2,17 +2,17 @@ % Please edit documentation in R/print-aglm.R \name{print.AccurateGLM} \alias{print.AccurateGLM} -\title{Print an AccurateGLM object} +\title{Print an \code{AccurateGLM} object} \usage{ \method{print}{AccurateGLM}(x, digits = max(3, getOption("digits") - 3), ...) } \arguments{ -\item{x}{An AccurateGLM object.} +\item{x}{An \code{AccurateGLM} object.} \item{digits}{Significant digits in printout.} \item{...}{Other arguments are passed directly to \code{print} functions of \code{model@backend_models}.} } \description{ -Print an AccurateGLM object +Print an \code{AccurateGLM} object } diff --git a/man/residuals.AccurateGLM.Rd b/man/residuals.AccurateGLM.Rd index 9e027fb..c737ef4 100644 --- a/man/residuals.AccurateGLM.Rd +++ b/man/residuals.AccurateGLM.Rd @@ -16,7 +16,7 @@ ) } \arguments{ -\item{object}{An AccurateGLM object.} +\item{object}{An \code{AccurateGLM} object.} \item{x}{An input matrix or data.frame used for predictions in residual calculations. 
If not given, \code{x} used for fitting the model is used.} @@ -34,7 +34,7 @@ If not given, \code{weights} used for fitting the model is used.} \itemize{ \item Type \code{"working"} Working residuals. \item Type \code{"pearson"} Pearson residuals. -\item Type \code{"deviance"} Devian residuals. +\item Type \code{"deviance"} Deviance residuals. }} \item{s}{A numeric value specifying lambda value at which plotting is required.} From 2bcb35ea4ea378e384f9da3eadb2a7ce9d92de0b Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sat, 29 May 2021 17:16:21 +0900 Subject: [PATCH 09/26] little fixes. --- DESCRIPTION | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 305f380..a88abbf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,13 @@ Package: aglm Type: Package -Title: Accurate Generalized Linear Model (AGLM) +Title: Accurate Generalized Linear Model Version: 0.3.2 Author: Kenji Kondo, Kazuhisa Takahashi, others Maintainer: Kenji Kondo -Description: A handy tool for actuarial modeling, which is designed to achieve both accuracy and accountability. - AGLM is based on GLM but customized by expert actuaries for areas which require not only accuracy but also accountability. +Description: A handy tool for actuarial modeling based on GLM. License: GPL-2 Encoding: UTF-8 Language: en-US -LazyData: true RoxygenNote: 7.1.1 Roxygen: list(markdown = TRUE) Depends: @@ -23,4 +21,3 @@ Suggests: knitr, rmarkdown, MASS -VignetteBuilder: knitr From d37a32d589b10dc0f445e85ee21e89a9aa65f0bb Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sat, 29 May 2021 21:01:20 +0900 Subject: [PATCH 10/26] Fix documents for `aglm.R`. 
--- R/aglm.R | 101 ++++++++++++++++++++++++++++++++++++---------------- man/aglm.Rd | 71 ++++++++++++++++++++---------------- 2 files changed, 111 insertions(+), 61 deletions(-) diff --git a/R/aglm.R b/R/aglm.R index 806ed26..06a1bb6 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -1,34 +1,73 @@ -# fitting function for AGLM model -# written by Kenji Kondo @ 2019/1/1 - - -#' fit an AGLM model -#' -#' @param x An input matrix or data.frame to be fitted. -#' @param y An integer or numeric vector which represents response variable. -#' @param qualitative_vars_UD_only A list of indices or names for specifying which columns are qualitative and need only U-dummy representations. -#' @param qualitative_vars_both A list of indices or names for specifying which columns are qualitative and need both U-dummy and O-dummy representations. -#' @param qualitative_vars_OD_only A list of indices or names for specifying which columns are qualitative and need only O-dummy representations. -#' @param quantitative_vars A list of indices or names for specifying which columns are quantitative. -#' @param use_LVar A boolean value which indicates whether this function uses L-variable representations or not. -#' @param extrapolation A character value which indicates how contribution curves outside bins are extrapolated. -#' * "default": No extrapolations. -#' * "flat": Extrapolates with flat lines. -#' @param add_linear_columns A boolean value which indicates whether this function uses linear effects or not. -#' @param add_OD_columns_of_qualitatives A boolean value which indicates whether this function use O-dummy representations for qualitative and ordinal variables or not. -#' @param add_interaction_columns A boolean value which indicates whether this function uses intersection effects or not. -#' @param OD_type_of_quantitatives A character value which indicates how O-dummy matrices of quantitative -#' values are constructed. Choose `"C"`(default) or `"J"`. 
-#' * `"C"`: Continuous-type dummies, which result continuous contribution curves. -#' * `"J"`: Jump-type dummies, which result contribution curves with jumps. -#' * `"N"`: No use of O-dummies -#' @param family Response type. Currently `"gaussian"`, `"binomial"`, and `"poisson"` are supported. -#' @param nbin.max a maximum number of bins which is automatically generated. Only used when `breaks` is not set. -#' @param bins_list A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order. -#' @param bins_names A list of column name or column index, each name or index of which specifies which column of `x` is binned used with an element of `bins_list` in the same position. -#' @param ... Other arguments are passed directly to backend (currently `glmnet()` is used), and if not given, default values of the backend API are used to call backend functions. -#' -#' @return An `AccurateGLM` object, fitted to the data `(x, y)` +#' Fit an AGLM model +#' +#' @param x +#' A design matrix. +#' Usually a `data.frame` object is expected, but a `matrix` object is fine if all columns are of a same class. +#' Each column may have one of the following classes, and `aglm` will automatically determine how to handle it: +#' * `numeric`: interpreted as a quantitative variable. `aglm` performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies). +#' * `factor` (unordered) or `logical` : interpreted as a qualitative variable without order. `aglm` creates dummy variables suitable for unordered values (named U-dummies). +#' * `ordered`: interpreted as a qualitative variable with order. `aglm` creates both O-dummies and U-dummies. +#' These dummy variables are added to `x` and form a larger matrix, which is used internally as an actual design matrix. 
+#' If you need to change the default behavior, use the following options: `qualitative_vars_UD_only`, `qualitative_vars_both`, `qualitative_vars_OD_only`, and `quantitative_vars`. +#' +#' @param y +#' A response variable. +#' +#' @param qualitative_vars_UD_only +#' Used to change the default behavior of `aglm` for given variables. +#' Variables specified by this parameter are considered as qualitative variables and only U-dummies are created as auxiliary columns. +#' This parameter may have one of the following classes: +#' * `integer`: specifying variables by index. +#' * `character`: specifying variables by name. +#' +#' @param qualitative_vars_both +#' Same as `qualitative_vars_UD_only`, except that both O-dummies and U-dummies are created for specified variables. +#' +#' @param qualitative_vars_OD_only +#' Same as `qualitative_vars_UD_only`, except that only O-dummies are created for specified variables. +#' +#' @param quantitative_vars +#' Same as `qualitative_vars_UD_only`, except that specified variables are considered as quantitative variables. +#' +#' @param use_LVar +#' Used to get predictions without jumps by `aglm`. +#' By default, `aglm` uses O-dummies as the representation of a quantitative variable, but the resulting linear combination forms a step function and has a jump at each break of binning. +#' To avoid this, set `use_LVar=TRUE` to use a different type of auxiliary variable (named L-Variables), such that the linear combination forms a piecewise linear function and is continuous. +#' +#' @param extrapolation +#' Used to control values of linear combination for quantitative variables, outside where the data exists. +#' Currently, this parameter is useful only when `use_LVar=TRUE`, where values of a linear combination outside the data are extended based on the slope of the edges of the region where the data exists. +#' You can set `extrapolation="flat"` to get constant values outside the data instead.
+#' +#' @param add_linear_columns +#' By default, for quantitative variables, `aglm` expands them by adding dummies and the original columns, i.e. the linear effects, remain in the resulting model. +#' You can set `add_linear_columns=FALSE` to drop linear effects. +#' +#' @param add_OD_columns_of_qualitatives +#' Set to `FALSE` if you do not want to use O-dummies for qualitative variables with order (usually, columns with `ordered` class). +#' +#' @param add_interaction_columns +#' If this parameter is set to `TRUE`, `aglm` creates an additional auxiliary variable `x_i * x_j` for each pair `(x_i, x_j)` of variables. +#' +#' @param OD_type_of_quantitatives +#' Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated). +#' +#' @param family +#' A `family` object or a string representing the type of the error distribution. +#' Currently `aglm` supports `gaussian`, `binomial`, and `poisson`. +#' +#' @param nbin.max +#' An integer representing the maximum number of bins when `aglm` performs binning for quantitative variables. +#' +#' @param bins_list +#' Used to set custom bins for variables with O-dummies. +#' +#' @param bins_names +#' Used to set custom bins for variables with O-dummies. +#' +#' @param ... Other arguments are passed directly when calling `glmnet()`. +#' +#' @return A model object fitted to the data. #' #' @export #' @importFrom assertthat assert_that diff --git a/man/aglm.Rd b/man/aglm.Rd index 8dd37a7..914d817 100644 --- a/man/aglm.Rd +++ b/man/aglm.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/aglm.R \name{aglm} \alias{aglm} -\title{fit an AGLM model} +\title{Fit an AGLM model} \usage{ aglm( x, @@ -25,53 +25,64 @@ aglm( ) } \arguments{ -\item{x}{An input matrix or data.frame to be fitted.} +\item{x}{A design matrix. +Usually a \code{data.frame} object is expected, but a \code{matrix} object is fine if all columns are of a same class.
+Each column may have one of the following classes, and \code{aglm} will automatically determine how to handle it: +\itemize{ +\item \code{numeric}: interpreted as a quantitative variable. \code{aglm} performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies). +\item \code{factor} (unordered) or \code{logical} : interpreted as a qualitative variable without order. \code{aglm} creates dummy variables suitable for unordered values (named U-dummies). +\item \code{ordered}: interpreted as a qualitative variable with order. \code{aglm} creates both O-dummies and U-dummies. +These dummy variables are added to \code{x} and form a larger matrix, which is used internally as an actual design matrix. +If you need to change the default behavior, use the following options: \code{qualitative_vars_UD_only}, \code{qualitative_vars_both}, \code{qualitative_vars_OD_only}, and \code{quantitative_vars}. +}} -\item{y}{An integer or numeric vector which represents response variable.} +\item{y}{A response variable.} -\item{qualitative_vars_UD_only}{A list of indices or names for specifying which columns are qualitative and need only U-dummy representations.} +\item{qualitative_vars_UD_only}{Used to change the default behavior of \code{aglm} for given variables. +Variables specified by this parameter are considered as qualitative variables and only U-dummies are created as auxiliary columns. +This parameter may have one of the following classes: +\itemize{ +\item \code{integer}: specifying variables by index. +\item \code{character}: specifying variables by name. 
+}} -\item{qualitative_vars_both}{A list of indices or names for specifying which columns are qualitative and need both U-dummy and O-dummy representations.} +\item{qualitative_vars_both}{Same as \code{qualitative_vars_UD_only}, except that both O-dummies and U-dummies are created for specified variables.} -\item{qualitative_vars_OD_only}{A list of indices or names for specifying which columns are qualitative and need only O-dummy representations.} +\item{qualitative_vars_OD_only}{Same as \code{qualitative_vars_UD_only}, except that only O-dummies are created for specified variables.} -\item{quantitative_vars}{A list of indices or names for specifying which columns are quantitative.} +\item{quantitative_vars}{Same as \code{qualitative_vars_UD_only}, except that specified variables are considered as quantitative variables.} -\item{use_LVar}{A boolean value which indicates whether this function uses L-variable representations or not.} +\item{use_LVar}{Used to get predictions without jumps by \code{aglm}. +By default, \code{aglm} uses O-dummies as the representation of a quantitative variable, but the resulting linear combination forms a step function and has a jump at each break of binning. +To avoid this, set \code{use_LVar=TRUE} to use a different type of auxiliary variable (named L-Variables), such that the linear combination forms a piecewise linear function and is continuous.} -\item{extrapolation}{A character value which indicates how contribution curves outside bins are extrapolated. -\itemize{ -\item "default": No extrapolations. -\item "flat": Extrapolates with flat lines. -}} +\item{extrapolation}{Used to control values of linear combination for quantitative variables, outside where the data exists. +Currently, this parameter is useful only when \code{use_LVar=TRUE}, where values of a linear combination outside the data are extended based on the slope of the edges of the region where the data exists.
+You can set \code{extrapolation="flat"} to get constant values outside the data instead.} -\item{add_linear_columns}{A boolean value which indicates whether this function uses linear effects or not.} +\item{add_linear_columns}{By default, for quantitative variables, \code{aglm} expands them by adding dummies and the original columns, i.e. the linear effects, remain in the resulting model. +You can set \code{add_linear_columns=FALSE} to drop linear effects.} -\item{add_OD_columns_of_qualitatives}{A boolean value which indicates whether this function use O-dummy representations for qualitative and ordinal variables or not.} +\item{add_OD_columns_of_qualitatives}{Set to \code{FALSE} if you do not want to use O-dummies for qualitative variables with order (usually, columns with \code{ordered} class).} -\item{add_interaction_columns}{A boolean value which indicates whether this function uses intersection effects or not.} +\item{add_interaction_columns}{If this parameter is set to \code{TRUE}, \code{aglm} creates an additional auxiliary variable \code{x_i * x_j} for each pair \verb{(x_i, x_j)} of variables.} -\item{OD_type_of_quantitatives}{A character value which indicates how O-dummy matrices of quantitative -values are constructed. Choose \code{"C"}(default) or \code{"J"}. -\itemize{ -\item \code{"C"}: Continuous-type dummies, which result continuous contribution curves. -\item \code{"J"}: Jump-type dummies, which result contribution curves with jumps. -\item \code{"N"}: No use of O-dummies -}} +\item{OD_type_of_quantitatives}{Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).} -\item{nbin.max}{a maximum number of bins which is automatically generated.
Only used when \code{breaks} is not set.} +\item{nbin.max}{An integer representing the maximum number of bins when \code{aglm} performs binning for quantitative variables.} -\item{bins_list}{A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order.} +\item{bins_list}{Used to set custom bins for variables with O-dummies.} -\item{bins_names}{A list of column name or column index, each name or index of which specifies which column of \code{x} is binned used with an element of \code{bins_list} in the same position.} +\item{bins_names}{Used to set custom bins for variables with O-dummies.} -\item{family}{Response type. Currently \code{"gaussian"}, \code{"binomial"}, and \code{"poisson"} are supported.} +\item{family}{A \code{family} object or a string representing the type of the error distribution. +Currently \code{aglm} supports \code{gaussian}, \code{binomial}, and \code{poisson}.} -\item{...}{Other arguments are passed directly to backend (currently \code{glmnet()} is used), and if not given, default values of the backend API are used to call backend functions.} +\item{...}{Other arguments are passed directly when calling \code{glmnet()}.} } \value{ -An \code{AccurateGLM} object, fitted to the data \verb{(x, y)} +A model object fitted to the data. } \description{ -fit an AGLM model +Fit an AGLM model } From 7a6ea9b0568bdcaeb41549f5c8344ce98173dfd2 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 30 May 2021 22:20:50 +0900 Subject: [PATCH 11/26] Rewrite all docs.
--- DESCRIPTION | 5 +- NAMESPACE | 1 - R/accurate-glm.R | 56 +++++++++--------- R/aglm-input.R | 16 +----- R/aglm-package.R | 79 +++++++++++++++++++++++++ R/aglm.R | 26 ++++++++- R/binning.R | 49 ++++++++-------- R/coef-aglm.R | 35 ++++++++---- R/cv-aglm.R | 108 ++++++++++++++++++++++++----------- R/cva-aglm.R | 54 +++++++++++++----- R/deviance-aglm.R | 17 ++++-- R/get-dummies.R | 78 +++++++++++-------------- R/plot-aglm.R | 96 ++++++++++++++++++++++--------- R/predict-aglm.R | 54 ++++++++++++------ R/print-aglm.R | 21 ++++--- R/residuals-aglm.R | 66 ++++++++++++++------- man/AGLM_Input-class.Rd | 4 +- man/AccurateGLM-class.Rd | 36 ++++++------ man/CVA_AccurateGLM-class.Rd | 21 ++++--- man/aglm-package.Rd | 100 ++++++++++++++++++++++++++++++++ man/aglm.Rd | 24 ++++++-- man/coef.AccurateGLM.Rd | 21 ++++--- man/createEqualFreqBins.Rd | 15 +++-- man/createEqualWidthBins.Rd | 15 +++-- man/cv.aglm.Rd | 68 ++++++++++++---------- man/cva.aglm.Rd | 34 ++++++++--- man/deviance.AccurateGLM.Rd | 11 ++-- man/executeBinning.Rd | 24 ++++---- man/getDesignMatrix.Rd | 17 ------ man/getLVarMatForOneVec.Rd | 25 ++++---- man/getODummyMatForOneVec.Rd | 32 ++++------- man/getUDummyMatForOneVec.Rd | 28 ++++----- man/plot.AccurateGLM.Rd | 78 ++++++++++++++++--------- man/predict.AccurateGLM.Rd | 43 ++++++++------ man/print.AccurateGLM.Rd | 13 +++-- man/residuals.AccurateGLM.Rd | 45 +++++++++------ 36 files changed, 923 insertions(+), 492 deletions(-) create mode 100644 R/aglm-package.R create mode 100644 man/aglm-package.Rd delete mode 100644 man/getDesignMatrix.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a88abbf..7c716ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,4 +20,7 @@ Suggests: testthat, knitr, rmarkdown, - MASS + MASS, + mathjaxr +RdMacros: + mathjaxr diff --git a/NAMESPACE b/NAMESPACE index 1102321..51183a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,7 +12,6 @@ export(createEqualWidthBins) export(cv.aglm) export(cva.aglm) export(executeBinning) 
-export(getDesignMatrix) export(getLVarMatForOneVec) export(getODummyMatForOneVec) export(getUDummyMatForOneVec) diff --git a/R/accurate-glm.R b/R/accurate-glm.R index c786707..3e421a3 100644 --- a/R/accurate-glm.R +++ b/R/accurate-glm.R @@ -1,24 +1,21 @@ -# S4 class for fitted AGLM -# written by Kenji Kondo @ 2019/1/2 - - -#' S4 class for fitted AGLM, used as a result of `aglm()` or `cv.aglm()` +#' Class for results of `aglm()` and `cv.aglm()` +#' +#' @slot backend_models The fitted backend `glmnet` model is stored. +#' @slot vars_info A list, each of whose element is information of one variable. +#' @slot lambda Same as in the result of \link{cv.glmnet}. +#' @slot cvm Same as in the result of \link{cv.glmnet}. +#' @slot cvsd Same as in the result of \link{cv.glmnet}. +#' @slot cvup Same as in the result of \link{cv.glmnet}. +#' @slot cvlo Same as in the result of \link{cv.glmnet}. +#' @slot nzero Same as in the result of \link{cv.glmnet}. +#' @slot name Same as in the result of \link{cv.glmnet}. +#' @slot lambda.min Same as in the result of \link{cv.glmnet}. +#' @slot lambda.1se Same as in the result of \link{cv.glmnet}. +#' @slot fit.preval Same as in the result of \link{cv.glmnet}. +#' @slot foldid Same as in the result of \link{cv.glmnet}. +#' @slot call An object of class `call`, corresponding to the function call when this `AccurateGLM` object is created. #' -#' @slot backend_models Internally used model objects to be passed to backend functions. -#' Currently `glmnet` is used as a backend and this slot holding a `glmnet` object. -#' @slot vars_info A list of list. Each element of `vars_info` represents one predictor variable and contains various information of it. -#' @slot lambda The values of `lambda` used in the fits. -#' @slot cvm The mean cross-validated error. -#' @slot cvsd The estimate of standard error of `cvm`. -#' @slot cvup The upper curve as `cvm + cvsd`. -#' @slot cvlo The lower curve as `cvm - cvsd`. 
-#' @slot nzero The number of non-zero coefficients at each lambda. -#' @slot name A text string indicating type of measure (for plotting purposes). -#' @slot lambda.min The value of `lambda` that gives minimum `cvm`. -#' @slot lambda.1se The largest value of `lambda` such that error is within 1 standard error of the minimum. -#' @slot fit.preval If `keep=TRUE`, this is the array of previously prevalidated fits. Some entries can be NA, if that and subsequent values of lambda are not reached for that fold. -#' @slot foldid An integer vector of values between 1 and `nfold` identifying what fold each observation is in. -#' @slot call An object of class call, which is used to record how `cva.aglm()` is called. +#' @author Kenji Kondo #' #' @export setClass("AccurateGLM", @@ -37,15 +34,18 @@ setClass("AccurateGLM", foldid="integer", call="ANY")) -#' S4 class for a result of `cva.aglm()` + +#' Class for results of `cva.aglm()` +#' +#' @slot models_list A list consists of `cv.glmnet()`'s results for all \eqn{\alpha} values. +#' @slot alpha Same as in \link{cv.aglm}. +#' @slot nfolds Same as in \link{cv.aglm}. +#' @slot alpha.min.index The index of `alpha.min` in the vector `alpha`. +#' @slot alpha.min The \eqn{\alpha} value achieving the minimum loss among all the values of `alpha`. +#' @slot lambda.min The \eqn{\lambda} value achieving the minimum loss when \eqn{\alpha} is equal to `alpha.min`. +#' @slot call An object of class `call`, corresponding to the function call when this `CVA_AccurateGLM` object is created. #' -#' @slot models_list Results of `cv.glmnet()` for all the values of alpha. -#' @slot alpha A numeric values specifying alpha values to be examined. -#' @slot nfolds An integer value specifying the number of folds. -#' @slot alpha.min.index An integer value specifying the index of `alpha.min` in `alpha`. -#' @slot alpha.min The alpha value which achieves the minimum loss. 
-#' @slot lambda.min The lambda value which achieves the minimum loss, when combined with `alpha.min`. -#' @slot call An object of class call, which is used to record how `cva.aglm()` is called. +#' @author Kenji Kondo #' #' @export setClass("CVA_AccurateGLM", diff --git a/R/aglm-input.R b/R/aglm-input.R index 1ba8d9e..10e2056 100644 --- a/R/aglm-input.R +++ b/R/aglm-input.R @@ -1,11 +1,7 @@ -# handling inputs of AGLM model -# written by Kenji Kondo @ 2019/1/2 - - #' S4 class for input #' -#' @slot vars_info A list of list. Each element has some information of one feature. -#' @slot data A data.frame which contains original data itself. +#' @slot vars_info A list, each of whose elements is information of one variable. +#' @slot data The original data. setClass("AGLM_Input", representation=representation(vars_info="list", data="data.frame")) @@ -310,6 +306,7 @@ getMatrixRepresentationByVector <- function(raw_vec, var_info, drop_OD=FALSE) { return(z) } + #' @importFrom assertthat assert_that getMatrixRepresentation <- function(x, idx, drop_OD=FALSE) { var_info <- x@vars_info[[idx]] @@ -356,13 +353,6 @@ getMatrixRepresentation <- function(x, idx, drop_OD=FALSE) { } -#' Get design-matrix representation of AGLM_Input objects -#' -#' @param x An AGLM_Input object -#' -#' @return A data.frame which represents the matrix representation of `x`. -#' -#' @export #' @importFrom assertthat assert_that getDesignMatrix <- function(x) { # Check arguments diff --git a/R/aglm-package.R b/R/aglm-package.R new file mode 100644 index 0000000..f676c9c --- /dev/null +++ b/R/aglm-package.R @@ -0,0 +1,79 @@ +#' aglm: Accurate Generalized Linear Model +#' +#' Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applies +#' a sort of feature transformations using a discretization of numerical features and specific +#' coding methodologies of dummy variables.
More details can be found in +#' \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper}. +#' +#' The collection of functions provided by the `aglm` package has almost the same structure as the famous `glmnet` package, +#' so users familiar with the `glmnet` package will be able to handle it easily. +#' In fact, this structure is reasonable in implementation, because what the `aglm` package does is +#' applying appropriate transformations to the given data and passing it to the `glmnet` package as a backend. +#' +#' @section Fitting functions: +#' The `aglm` package provides three different fitting functions, depending on how users want to handle hyper-parameters of AGLM models. +#' +#' Because AGLM is based on regularized GLM, the regularization term of the loss function can be expressed as follows: +#' \loadmathjax +#' \mjsdeqn{ +#' R(\lbrace \beta_{jk} \rbrace; \lambda, \alpha) +#' = \lambda \left\lbrace +#' (1 - \alpha)\sum_{j=1}^{p} \sum_{k=1}^{m_j}|\beta_{jk}|^2 + \alpha \sum_{j=1}^{p} \sum_{k=1}^{m_j} |\beta_{jk}| +#' \right\rbrace, +#' } +#' where \eqn{\beta_{jk}} is the k-th coefficient of auxiliary variables for the j-th column in data, +#' \eqn{\alpha} is a weight which controls how L1 and L2 regularization terms are mixed, +#' and \eqn{\lambda} determines the strength of the regularization. +#' +#' Searching hyper-parameters \eqn{\alpha} and \eqn{\lambda} is often useful to get better results, but usually time-consuming. +#' That's why the `aglm` package provides three fitting functions with different strategies for specifying hyper-parameters as follows: +#' * \link{aglm}: A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s). +#' * \link{cv.aglm}: A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. +#' * \link{cva.aglm}: A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}.
+#' +#' Generally speaking, setting an appropriate \eqn{\lambda} is often important to get meaningful results, +#' and using `cv.aglm()` with default \eqn{\alpha=1} (LASSO) is usually enough. +#' Since `cva.aglm()` is much more time-consuming than `cv.aglm()`, it is better to use it only if particularly better results are needed. +#' +#' The following S4 classes are defined to store results of the fitting functions. +#' * \link{AccurateGLM-class}: A class for results of `aglm()` and `cv.aglm()` +#' * \link{CVA_AccurateGLM-class}: A class for results of `cva.aglm()` +#' +#' @section Using the fitted model: +#' Users can use models obtained from fitting functions in various ways, by passing them to the following functions: +#' * \link[=predict.AccurateGLM]{predict}: Make predictions for new data +#' * \link[=plot.AccurateGLM]{plot}: Plot contribution of each variable and residuals +#' * \link[=print.AccurateGLM]{print}: Display textual information of the model +#' * \link[=coef.AccurateGLM]{coef}: Get coefficients +#' * \link[=deviance.AccurateGLM]{deviance}: Get deviance +#' * \link[=residuals.AccurateGLM]{residuals}: Get residuals of various types +#' +#' We emphasize that `plot()` is particularly useful to understand the fitted model, +#' because it presents a visual representation of how variables in the original data are used by the model. +#' +#' @section Other functions: +#' The following functions are basically for internal use, but exported as utility functions for convenience. +#' +#' * Functions for creating feature vectors +#' * \link{getUDummyMatForOneVec} +#' * \link{getODummyMatForOneVec} +#' * \link{getLVarMatForOneVec} +#' * Functions for binning +#' * \link{createEqualWidthBins} +#' * \link{createEqualFreqBins} +#' * \link{executeBinning} +#' +#' +#' @author +#' * Kenji Kondo, +#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' +#' +#' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa.
(2020) +#' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +#' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +#' \emph{Actuarial Colloquium Paris 2020} +#' +#' @docType package +#' @name aglm-package +NULL diff --git a/R/aglm.R b/R/aglm.R index 06a1bb6..3d7d415 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -1,4 +1,7 @@ -#' Fit an AGLM model +#' Fit an AGLM model with no cross-validation +#' +#' A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s). +#' See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. #' #' @param x #' A design matrix. @@ -8,6 +11,7 @@ #' * `factor` (unordered) or `logical` : interpreted as a qualitative variable without order. `aglm` creates dummy variables suitable for unordered values (named U-dummies). #' * `ordered`: interpreted as a qualitative variable with order. `aglm` creates both O-dummies and U-dummies. #' These dummy variables are added to `x` and form a larger matrix, which is used internally as an actual design matrix. +#' #' If you need to change the default behavior, use the following options: `qualitative_vars_UD_only`, `qualitative_vars_both`, `qualitative_vars_OD_only`, and `quantitative_vars`. #' #' @param y @@ -65,9 +69,25 @@ #' @param bins_names #' Used to set custom bins for variables with O-dummies. #' -#' @param ... Other arguments are passed directly when calling `glmnet()`. +#' @param ... +#' Other arguments are passed directly when calling `glmnet()`. +#' +#' @return +#' A model object fitted to the data. +#' Functions such as `predict` and `plot` can be applied to the returned object. +#' See \link{AccurateGLM-class} for more details. +#' +#' +#' @author +#' * Kenji Kondo, +#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' +#' +#' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) +#' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +#' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +#' \emph{Actuarial Colloquium Paris 2020} #' -#' @return A model object fitted to the data. #' #' @export #' @importFrom assertthat assert_that diff --git a/R/binning.R b/R/binning.R index b9734a7..47d6ad7 100644 --- a/R/binning.R +++ b/R/binning.R @@ -1,20 +1,18 @@ -# utility functions for binning numerical data -# written by Kenji Kondo @ 2019/1/1 - - # Inner function used in executeBinning isBinningFeasible <- function(x_vec) { return(is.integer(x_vec) | is.numeric(x_vec) | is.ordered(x_vec)) } -#' Create bins by Equal Width Binning +#' Create bins (equal width binning) +#' +#' @param left The leftmost value of the interval to be binned. +#' @param right The rightmost value of the interval to be binned. +#' @param nbin The number of bins. #' -#' @param left left value of the original interval. -#' @param right right value of the original interval. -#' @param nbin number of bins to be created. +#' @return A numeric vector representing breaks obtained by binning. #' -#' @return a numeric vector which indicates the boundaries of bins, with `nbin` elements. +#' @author Kenji Kondo #' #' @export #' @importFrom assertthat assert_that @@ -27,13 +25,15 @@ createEqualWidthBins <- function(left, right, nbin){ } -#' Create bins by Equal Freq Binning +#' Create bins (equal frequency binning) #' -#' @param x_vec A reference integer or numeric or ordered vector to be binned. -#' @param nbin.max An integer value which indicates the maximum counts of bins. -#' Note that this function makes `min(nbin.max, length(x_vec))` counts of bins. +#' @param x_vec A numeric vector, whose quantiles are used as breaks. +#' @param nbin.max The maximum number of bins. #' -#' @return a numeric vector which indicates the boundaries of bins, with `nbin` elements. 
+#' @return A numeric vector representing breaks obtained by binning. +#' Note that the number of bins is equal to `min(nbin.max, length(x_vec))`. +#' +#' @author Kenji Kondo #' #' @export #' @importFrom assertthat assert_that @@ -55,19 +55,18 @@ createEqualFreqBins <- function(x_vec, nbin.max) { } -#' Execute binning for numerical data. +#' Binning the data to given bins. +#' +#' @param x_vec The data to be binned. +#' @param breaks A numeric vector representing breaks of bins (If `NULL`, automatically generated). +#' @param nbin.max The maximum number of bins (used only if `breaks=NULL`). +#' @param method `"freq"` for equal frequency binning or `"width"` for equal width binning (used only if `breaks=NULL`). #' -#' @param x_vec an integer or numeric or ordered vector, to be binned. -#' @param breaks a numeric vector which indicates the boundaries of bins, of length (number of bins - 1). -#' If NULL is set, bins are generated using the method which is specified by the `method` argument. -#' @param nbin.max a maximum number of bins which is generated by this function. Only used when `breaks` is not set. -#' @param method used for specifying binning method. "freq": equal freq binning (default), "width": equal width binning. -#' Ignore if `breaks` is set. +#' @return A list with the following fields: +#' * `labels`: An integer vector with same length as `x_vec`, where `labels[i]==k` means the i-th element of `x_vec` is in the k-th bin. +#' * `breaks`: Breaks of bins used for binning. #' -#' @return a list which has two members `labels` and `breaks`. -#' * `labels`: an integer vector of `length(x_vec)`. -#' `(labels[i]==k)` indicates the `i-th` element of `x_vec` is in the `k-th` bin. -#' * `breaks`: a numeric vector which indicates the boundaries of bins, of length (number of bins - 1). 
+#' @author Kenji Kondo #' #' @export #' @importFrom assertthat assert_that diff --git a/R/coef-aglm.R b/R/coef-aglm.R index 80700d4..f705e44 100644 --- a/R/coef-aglm.R +++ b/R/coef-aglm.R @@ -1,15 +1,28 @@ -# calculate deviances for AGLM -# written by Kenji Kondo @ 2019/1/3 - -#' Extract coefficients from an `AccurateGLM` object. +#' Get coefficients +#' +#' @param object +#' A model object obtained from `aglm()` or `cv.aglm()`. +#' +#' @param index +#' An integer value representing the index of variable whose coefficients are required. +#' +#' @param name +#' A string representing the name of variable whose coefficients are required. +#' Note that if both `index` and `name` are set, `index` is discarded. +#' +#' @param s +#' Same as in \link{coef.glmnet}. +#' +#' @param exact +#' Same as in \link{coef.glmnet}. +#' +#' @param ... +#' Other arguments are passed directly to `coef.glmnet()`. +#' +#' +#' @author +#' Kenji Kondo #' -#' @param object An `AccurateGLM` object. -#' @param index An integer vector of variable indices whose coefficients should be returned. -#' @param name An character vector of variable names whose coefficients should be returned. -#' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. -#' Default is the entire sequence used to create the model. -#' @param exact Same as `coef.glmnet()`. -#' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. #' #' @importFrom assertthat assert_that #' @importFrom stats coef diff --git a/R/cv-aglm.R b/R/cv-aglm.R index 49e43b5..b2d7cb8 100644 --- a/R/cv-aglm.R +++ b/R/cv-aglm.R @@ -1,35 +1,79 @@ -# cross-validation function for AGLM model -# written by Kenji Kondo @ 2019/2/24 - - -#' Cross-validation for AGLM models -#' -#' @param x An input matrix or data.frame to be fitted. -#' @param y An integer or numeric vector which represents response variable. 
-#' @param qualitative_vars_UD_only A list of indices or names for specifying which columns are qualitative and need only U-dummy representations. -#' @param qualitative_vars_both A list of indices or names for specifying which columns are qualitative and need both U-dummy and O-dummy representations. -#' @param qualitative_vars_OD_only A list of indices or names for specifying which columns are qualitative and need only O-dummy representations. -#' @param quantitative_vars A list of indices or names for specifying which columns are quantitative. -#' @param use_LVar A boolean value which indicates whether this function uses L-variable representations or not. -#' @param extrapolation A character value which indicates how contribution curves outside bins are extrapolated. -#' * `"default"`: No extrapolations. -#' * `"flat"`: Extrapolates with flat lines. -#' @param add_linear_columns A boolean value which indicates whether this function uses linear effects or not. -#' @param add_OD_columns_of_qualitatives A boolean value which indicates whether this function use O-dummy representations for qualitative and ordinal variables or not. -#' @param add_interaction_columns A boolean value which indicates whether this function uses interaction effects or not. -#' @param OD_type_of_quantitatives A character value which indicates how O-dummy matrices of quantitative -#' values are constructed. Choose `"C"`(default) or `"J"`. -#' * `"C"`: Continuous-type dummies, which result continuous contribution curves. -#' * `"J"`: Jump-type dummies, which result contribution curves with jumps. -#' * `"N"`: No use of O-dummies -#' @param family Response type. Currently `"gaussian"`, `"binomial"`, and `"poisson"` are supported. -#' @param nbin.max a maximum number of bins which is automatically generated. Only used when `breaks` is not set. 
-#' @param bins_list A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order. -#' @param bins_names A list of column name or column index, each name or index of which specifies which column of `x` is binned used with an element of `bins_list` in the same position. -#' @param keep Same as `cv.glmnet()`. -#' @param ... Other arguments are passed directly to backend (currently `cv.glmnet()` is used), and if not given, default values of the backend API are used to call backend functions. -#' -#' @return Result of cross-validation. +#' Fit an AGLM model with cross-validation for \eqn{\lambda} +#' +#' A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. +#' See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. +#' +#' @param x +#' A design matrix. +#' See \link{aglm} for more details. +#' +#' @param y +#' A response variable. +#' +#' @param qualitative_vars_UD_only +#' Same as in \link{aglm}. +#' +#' @param qualitative_vars_both +#' Same as in \link{aglm}. +#' +#' @param qualitative_vars_OD_only +#' Same as in \link{aglm}. +#' +#' @param quantitative_vars +#' Same as in \link{aglm}. +#' +#' @param use_LVar +#' Same as in \link{aglm}. +#' +#' @param extrapolation +#' Same as in \link{aglm}. +#' +#' @param add_linear_columns +#' Same as in \link{aglm}. +#' +#' @param add_OD_columns_of_qualitatives +#' Same as in \link{aglm}. +#' +#' @param add_interaction_columns +#' Same as in \link{aglm}. +#' +#' @param OD_type_of_quantitatives +#' Same as in \link{aglm}. +#' +#' @param nbin.max +#' Same as in \link{aglm}. +#' +#' @param bins_list +#' Same as in \link{aglm}. +#' +#' @param bins_names +#' Same as in \link{aglm}. +#' +#' @param family +#' Same as in \link{aglm}. +#' +#' @param keep +#' Set to `TRUE` if you need the `fit.preval` field in the returned value, as in `cv.glmnet()`. +#' +#' @param ... 
+#' Other arguments are passed directly when calling `cv.glmnet()`. +#' +#' @return +#' A model object fitted to the data with cross-validation results. +#' Functions such as `predict` and `plot` can be applied to the returned object, same as the result of `aglm()`. +#' See \link{AccurateGLM-class} for more details. +#' +#' +#' @author +#' * Kenji Kondo, +#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' +#' +#' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +#' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +#' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +#' \emph{Actuarial Colloquium Paris 2020} +#' #' #' @export #' @importFrom assertthat assert_that diff --git a/R/cva-aglm.R b/R/cva-aglm.R index 094f90e..4d702f5 100644 --- a/R/cva-aglm.R +++ b/R/cva-aglm.R @@ -1,18 +1,46 @@ -# cross-validation function for both alpha and lambda -# written by Kenji Kondo @ 2019/5/6 - - -#' Cross-validation for both alpha and lambda +#' Fit an AGLM model with cross-validation for both \eqn{\alpha} and \eqn{\lambda} +#' +#' A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}. +#' See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. +#' +#' @param x +#' A design matrix. +#' See \link{aglm} for more details. +#' +#' @param y +#' A response variable. +#' +#' @param alpha +#' A numeric vector representing \eqn{\alpha} values to be examined in cross-validation. +#' +#' @param nfolds +#' An integer value representing the number of folds. +#' +#' @param foldid +#' An integer vector with the same length as observations. +#' Each element should take a value from 1 to `nfolds`, identifying which fold it belongs to. +#' +#' @param parallel.alpha +#' (not used yet) +#' +#' @param ... +#' Other arguments are passed directly to `cv.aglm()`.
+#' +#' @return +#' An object storing fitted models and information of cross-validation. +#' See \link{CVA_AccurateGLM-class} for more details. +#' +#' +#' @author +#' * Kenji Kondo, +#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' #' -#' @param x An input matrix or data.frame to be fitted. -#' @param y An integer or numeric vector which represents response variable. -#' @param alpha A numeric vector which represents alpha values to be examined. -#' @param nfolds The number of folds. The default value is 10. -#' @param foldid An integer vector of values between 1 and `nfold` identifying what fold each observation is in. -#' @param parallel.alpha not implemented yet. -#' @param ... Other arguments are passed directly to `cv.aglm()`. +#' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +#' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +#' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +#' \emph{Actuarial Colloquium Paris 2020} #' -#' @return Result of cross-validation. See `CVA_AccurateGLM`'s document for more details. #' #' @export #' @importFrom assertthat assert_that diff --git a/R/deviance-aglm.R b/R/deviance-aglm.R index ecd1121..307d0ff 100644 --- a/R/deviance-aglm.R +++ b/R/deviance-aglm.R @@ -1,10 +1,15 @@ -# calculate deviances for AGLM -# written by Kenji Kondo @ 2019/1/3 - -#' Extract the deviance from an `AccurateGLM` object. +#' Get deviance +#' +#' @param object +#' A model object obtained from `aglm()` or `cv.aglm()`. +#' +#' @param ... +#' Other arguments are passed directly to `deviance.glmnet()`. +#' +#' +#' @author +#' Kenji Kondo #' -#' @param object An `AccurateGLM` object. -#' @param ... Other arguments are passed directly to `deviance` functions of `model@backend_models`. 
#' #' @export #' @importFrom stats deviance diff --git a/R/get-dummies.R b/R/get-dummies.R index 0bc4a98..8d0f0d9 100644 --- a/R/get-dummies.R +++ b/R/get-dummies.R @@ -1,20 +1,18 @@ -# utility functions for get dummy variables from various data -# written by Kenji Kondo @ 2019/1/1 - -#' Get U-dummy matrix for one-dimensional vector +#' Create a U-dummy matrix for one variable +#' +#' @param x_vec A vector representing original variable. +#' The class of `x_vec` should be one of `integer`, `character`, or `factor`. +#' @param levels A character vector representing values of `x_vec` used to create U-dummies. +#' If `NULL`, all the unique values of `x_vec` are used to create dummies. +#' @param drop_last If `TRUE`, the last column of the resulting matrix is dropped to avoid multicollinearity. +#' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned. #' -#' @param x_vec A non-numeric vector to be converted into dummy matrix. -#' @param levels A character vector indicates unique set of possible values. -#' If NULL, all the unique values of `x_vec` are used. -#' @param drop_last A logical value. If TRUE, the last column of dummy matrix is dropped. -#' @param only_info A logical value. If TRUE, actual creation of dummy matrix is omitted. +#' @return A list with the following fields: +#' * `levels`: Same as input. +#' * `drop_last`: Same as input. +#' * `dummy_mat`: The created U-dummy matrix (only if `only_info=FALSE`). #' -#' @return a list with two members `levels` and `dummy_mat`. -#' * `levels`: Same as input -#' * `drop_last`: Same as input -#' * `dummy_mat`: An integer matrix with size (length of `x_vec`, length of `levels` or minus 1 when `drop_last=TRUE`). -#' `dummy_mat[i, j]` is 1 if and only if `x_vec[i] == levels[j]`, and 0 otherwise. 
-#' Omitted if `only_info=TRUE` +#' @author Kenji Kondo #' #' @export #' @importFrom assertthat assert_that @@ -41,26 +39,19 @@ getUDummyMatForOneVec <- function(x_vec, levels=NULL, drop_last=TRUE, only_info= } -#' Get O-dummy matrix for one-dimensional vector +#' Create a O-dummy matrix for one variable #' -#' @param x_vec An integer or numeric vector to be converted into dummy matrix. -#' @param breaks A numeric vector which indicates the boundaries of bins, of length (number of bins + 1). -#' If NULL, evenly cut bins are automatically generated and used. -#' @param nbin.max A maximum number of bins which is used. Only used when `breaks` is not set. -#' @param only_info A logical value. If TRUE, actual creation of dummy matrix is omitted. -#' @param dummy_type A character value. Choose `"C"`(default) or `"J"`. For integer or numeric `x_vec`, -#' `dummy_type="C"` is used as default. Otherwise, `dummy_type="J"` is used as default. -#' * `"C"`: Continuous-type dummies, which result continuous contribution curves. -#' * `"J"`: Jump-type dummies, which result contribution curves with jumps. +#' @param x_vec A numeric vector representing original variable. +#' @param breaks A numeric vector representing breaks of bins (If `NULL`, automatically generated). +#' @param nbin.max The maximum number of bins (used only if `breaks=NULL`). +#' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned. +#' @param dummy_type Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated). #' -#' @return a list with two members `breaks` and `dummy_mat`. +#' @return a list with the following fields: #' * `breaks`: Same as input -#' * `dummy_mat`: An integer matrix with size (length of `x_vec`, length of `breaks` minus 1). -#' `dummy_mat[i, j]` is 1 if and only if `breaks[i] < x_vec[i] <= breaks[i+1]`, and 0 otherwise. 
-#' Note that, in case where `x_vec[i]` is outside of `(breaks[1], breaks[length(breaks)]]`, -#' `x_vec[i]` is considered to be in the first bin if `x_vec[i] <= breaks[1]`, and -#' be in the last bin if `x_vec[i] > breaks[length(breaks)]`. -#' Omitted if `only_info=TRUE` +#' * `dummy_mat`: The created O-dummy matrix (only if `only_info=FALSE`). +#' +#' @author Kenji Kondo #' #' @export #' @importFrom assertthat assert_that @@ -97,22 +88,19 @@ getODummyMatForOneVec <- function(x_vec, breaks=NULL, nbin.max=100, only_info=FA else return(list(breaks=binned_x$breaks, dummy_mat=dummy_mat)) } -#' Get L-variable matrix for one-dimensional vector + +#' Create L-variable matrix for one variable #' -#' @param x_vec An integer or numeric vector to be converted into dummy matrix. -#' @param breaks A numeric vector which indicates the boundaries of bins, of length (number of bins + 1). -#' If NULL, evenly cut bins are automatically generated and used. -#' @param nbin.max A maximum number of bins which is used. Only used when `breaks` is not set. -#' @param only_info A logical value. If TRUE, actual creation of dummy matrix is omitted. +#' @param x_vec A numeric vector representing original variable. +#' @param breaks A numeric vector representing breaks of bins (If `NULL`, automatically generated). +#' @param nbin.max The maximum number of bins (used only if `breaks=NULL`). +#' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned. #' -#' @return a list with two members `breaks` and `dummy_mat`. +#' @return a list with the following fields: #' * `breaks`: Same as input -#' * `dummy_mat`: An integer matrix with size (length of `x_vec`, length of `breaks` minus 1). -#' `dummy_mat[i, j]` is 1 if and only if `breaks[i] < x_vec[i] <= breaks[i+1]`, and 0 otherwise. 
-#' Note that, in case where `x_vec[i]` is outside of `(breaks[1], breaks[length(breaks)]]`, -#' `x_vec[i]` is considered to be in the first bin if `x_vec[i] <= breaks[1]`, and -#' be in the last bin if `x_vec[i] > breaks[length(breaks)]`. -#' Omitted if `only_info=TRUE` +#' * `dummy_mat`: The created L-variable matrix (only if `only_info=FALSE`). +#' +#' @author Kenji Kondo #' #' @export #' @importFrom assertthat assert_that diff --git a/R/plot-aglm.R b/R/plot-aglm.R index 3a4bc45..e781612 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -1,32 +1,72 @@ -# plotting function for AGLM -# written by Kenji Kondo @ 2019/1/3 - -#' Plot coefficients from an `AccurateGLM` object +#' Plot contribution of each variable and residuals +#' +#' @param x +#' A model object obtained from `aglm()` or `cv.aglm()`. +#' +#' @param vars +#' Used to specify variables to be plotted (`NULL` means all the variables). +#' This parameter may have one of the following classes: +#' * `integer`: specifying variables by index. +#' * `character`: specifying variables by name. +#' +#' @param verbose +#' Set to `FALSE` if textual outputs are not needed. +#' +#' @param s +#' A numeric value specifying \eqn{\lambda} at which plotting is required. +#' Note that plotting for multiple \eqn{\lambda}'s are not allowed and `s` always should be a single value. +#' When the model is trained with only a single \eqn{\lambda} value, just set it to `NULL` to plot for that value. +#' +#' @param resid +#' Used to display residuals in plots. +#' This parameter may have one of the following classes: +#' * `logical`(single value): If `TRUE`, working residuals are plotted. +#' * `character`(single value): type of residual to be plotted. See \link{residuals.AccurateGLM} for more details on types of residuals. +#' * `numerical`(vector): residual values to be plotted. +#' +#' @param smooth_resid +#' Used to display smoothing lines of residuals for quantitative variables. 
+#' This parameter may have one of the following classes: +#' * `logical`: If `TRUE`, smoothing lines are drawn. +#' * `character`: +#' * `smooth_resid="both"`: Balls and smoothing lines are drawn. +#' * `smooth_resid="smooth_only"`: Only smoothing lines are drawn. +#' +#' @param smooth_resid_fun +#' Set if users need custom smoothing functions. +#' +#' @param ask +#' By default, `plot()` stops and waits for input each time plotting for each variable is completed. +#' Users can set `ask=FALSE` to avoid this. +#' It is useful, for example, when using devices such as `bmp` to create image files. +#' +#' @param layout +#' Plotting multiple variables for each page is allowed. +#' To achieve this, set it to a pair of integers, which indicate the numbers of rows and columns, respectively. +#' +#' @param only_plot +#' Set to `TRUE` if no automatic graphical configurations are needed. +#' +#' @param main +#' Used to specify the title of plotting. +#' +#' @param add_rug +#' Set to `TRUE` for rug plots. +#' +#' @param ... +#' Other arguments are currently not used and just discarded. +#' +#' +#' @author +#' * Kenji Kondo, +#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' +#' +#' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +#' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +#' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +#' \emph{Actuarial Colloquium Paris 2020} #' -#' @param x An `AccurateGLM` object. -#' @param vars An integer or character vectors (indices or names) specifying which variables should be plotted. -#' @param verbose If TRUE, outputs details. -#' @param s A numeric value specifying lambda value at which plotting is required.
-#' Note that this function can't plot for multiple lambda values, so it allows only -#' single `s` value (which means `model` is trained with multiple lambda values and plot with one of them), -#' or `s=NULL` (which means `model` is trained with single lambda value and plot with that value). -#' @param resid A logical value which indicates to plot residuals, -#' or a character value which indicates residual type to be plotted (see the help of `residuals.AccurateGLM()`), -#' or a numerical vector which indicates residual values to be plotted. -#' Note that working residuals are used in the first case with `resid=TRUE`. -#' @param smooth_resid A logical value which indicates whether draws smoothing lines of residuals or not, -#' or a character value which is one of options below: -#' * `"both"` draws both balls and smoothing lines. -#' * `"smooth_only"` draws only smoothing line. -#' Note that smoothing lines are only drawn for quantitative variables. -#' The default value is `TRUE`. -#' @param smooth_resid_fun A function to be used to smooth partial residual values. -#' @param ask A logical value which indicates ask if go to next plot. -#' @param layout A pair of integer values which indicates how many plots are drawn row-wise and column-wise respectively, -#' @param only_plot If `TRUE`, the function set no graphical parameters and no title. -#' @param main A character value which indicates titles of panels. -#' @param add_rug A logical value which indicates draw rug plot for quantitative variables. -#' @param ... Other arguments are currently not used. #' #' @export #' @importFrom assertthat assert_that diff --git a/R/predict-aglm.R b/R/predict-aglm.R index 3cefecf..7bf58d2 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -1,24 +1,42 @@ -# predicting function for AGLM model -# written by Kenji Kondo @ 2019/1/3 +#' Make predictions for new data +#' +#' @param object +#' A model object obtained from `aglm()` or `cv.aglm()`. 
+#' +#' @param newx +#' A design matrix for new data. +#' See the description of `x` in \link{aglm} for more details. +#' +#' @param s +#' Same as in \link{predict.glmnet}. +#' +#' @param type +#' Same as in \link{predict.glmnet}. +#' +#' @param exact +#' Same as in \link{predict.glmnet}. +#' +#' @param newoffset +#' Same as in \link{predict.glmnet}. -#' Make predictions from a fitted `AccurateGLM` +#' @param ... +#' Other arguments are passed directly when calling `predict.glmnet()`. +#' +#' @return +#' The returned object depends on `type`. +#' See \link{predict.glmnet} for more details. +#' +#' +#' @author +#' * Kenji Kondo, +#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' #' -#' @param object An `AccurateGLM` object. -#' @param newx An input matrix or data.frame used for predictions. -#' @param s Value(s) of the penalty parameter `lambda` at which predictions are required. -#' Default is the entire sequence used to create the model. -#' @param type Type of prediction required. -#' * Type `"link"` gives the linear predictors for `"binomial"`, `"poisson"` models, and for `"gaussian"` models it gives the fitted values. -#' * Type `"response"` gives the fitted probabilities for `"binomial"`, fitted mean for `"poisson"`, and for `"gaussian"` models it is equivalent to type `"link"`. -#' * Type `"coefficients"` computes the coefficients at the requested values for `s`. -#' Note that for `"binomial"` models, results are returned only for the class corresponding to the second level of the factor response. -#' * Type `"class"` applies only to `"binomial"`, and produces the class label corresponding to the maximum probability. -#' * Type `"nonzero"` returns a list of the indices of the nonzero coefficients for each value of `s`. -#' @param exact Same as `predict.glmnet()`. -#' @param newoffset If an offset is used in the fit, then one must be supplied for making predictions (except for type="coefficients" or type="nonzero"). -#' @param ... 
Other arguments are passed directly to backend (currently `glmnet()` is used), and if not given, default values of the backend API are used. +#' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +#' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +#' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +#' \emph{Actuarial Colloquium Paris 2020} #' -#' @return The object returned depends on type. #' #' @export #' @importFrom assertthat assert_that diff --git a/R/print-aglm.R b/R/print-aglm.R index 0d13c60..52d0347 100644 --- a/R/print-aglm.R +++ b/R/print-aglm.R @@ -1,11 +1,18 @@ -# printing function for AGLM -# witten by Kenji Kondo @ 2019/1/3 - -#' Print an `AccurateGLM` object +#' Display textual information of the model +#' +#' @param x +#' A model object obtained from `aglm()` or `cv.aglm()`. +#' +#' @param digits +#' Used to control significant digits in printout. +#' +#' @param ... +#' Other arguments are passed directly to `print.glmnet()`. +#' +#' +#' @author +#' Kenji Kondo #' -#' @param x An `AccurateGLM` object. -#' @param digits Significant digits in printout. -#' @param ... Other arguments are passed directly to `print` functions of `model@backend_models`. #' #' @export print.AccurateGLM <- function(x, digits=max(3, getOption("digits") - 3), ...) { diff --git a/R/residuals-aglm.R b/R/residuals-aglm.R index 9e3ffa5..72de3fc 100644 --- a/R/residuals-aglm.R +++ b/R/residuals-aglm.R @@ -1,25 +1,51 @@ -# calculate residuals for AGLM model -# written by Kenji Kondo @ 2019/7/7 - -#' Calculate residuals for AGLM model +#' Get residuals of various types +#' +#' \loadmathjax +#' +#' @param object +#' A model object obtained from `aglm()` or `cv.aglm()`. +#' +#' @param x +#' A design matrix. +#' If not given, `x` for fitting is used. +#' +#' @param y +#' A response variable. +#' If not given, `y` for fitting is used. +#' +#' @param offset +#' An offset values. 
+#' If not given, `offset` for fitting is used. +#' +#' @param weights +#' Sample weights. +#' If not given, `weights` for fitting is used. +#' +#' @param type +#' A string representing type of residuals: +#' * `"working"` gets working residuals +#' \mjsdeqn{r^W_i = (y_i - \mu_i) \left(\frac{\partial \eta}{\partial \mu}\right)_{\mu=\mu_i},} +#' where \eqn{y_i} is a response value, \eqn{\mu} is the GLM mean, and \eqn{\eta=g(\mu)} with the link function \eqn{g}. +#' * `"pearson"` gets Pearson residuals +#' \mjsdeqn{r^P_i = \frac{y_i - \mu_i}{\sqrt{V(\mu_i)}},} +#' where \eqn{V} is the variance function. +#' * `"deviance"` gets deviance residuals +#' \mjsdeqn{r^D_i = {\rm sign}(y_i - \mu_i) \sqrt{d_i},} +#' where \eqn{d_i} is the contribution to deviance. +#' +#' @param s +#' A numeric value specifying \eqn{\lambda} at which residuals are calculated. +#' +#' @param ... +#' Other arguments are currently not used and just discarded. +#' +#' @return +#' A numeric vector representing calculated residuals. +#' #' -#' @param object An `AccurateGLM` object. -#' @param x An input matrix or data.frame used for predictions in residual calculations. -#' If not given, `x` used for fitting the model is used. -#' @param y A numeric vector used as true target values in residual calculations. -#' If not given, `y` used for fitting the model is used. -#' @param offset A numeric offset values used for predictions in residual calculations. -#' If not given, `offset` used for fitting the model is used. -#' @param weights A numeric weight values, corresponding with exposure size. -#' If not given, `weights` used for fitting the model is used. -#' @param type Type of prediction required. -#' * Type `"working"` Working residuals. -#' * Type `"pearson"` Pearson residuals. -#' * Type `"deviance"` Deviance residuals. -#' @param s A numeric value specifying lambda value at which plotting is required. -#' @param ... Other arguments are currently not used. 
+#' @author +#' Kenji Kondo #' -#' @return The object returned depends on type. #' #' @export #' @importFrom assertthat assert_that diff --git a/man/AGLM_Input-class.Rd b/man/AGLM_Input-class.Rd index 7ea4057..d19b468 100644 --- a/man/AGLM_Input-class.Rd +++ b/man/AGLM_Input-class.Rd @@ -10,8 +10,8 @@ S4 class for input \section{Slots}{ \describe{ -\item{\code{vars_info}}{A list of list. Each element has some information of one feature.} +\item{\code{vars_info}}{A list, each of whose element is information of one variable.} -\item{\code{data}}{A data.frame which contains original data itself.} +\item{\code{data}}{The original data.} }} diff --git a/man/AccurateGLM-class.Rd b/man/AccurateGLM-class.Rd index b645d1d..44a9640 100644 --- a/man/AccurateGLM-class.Rd +++ b/man/AccurateGLM-class.Rd @@ -3,40 +3,42 @@ \docType{class} \name{AccurateGLM-class} \alias{AccurateGLM-class} -\title{S4 class for fitted AGLM, used as a result of \code{aglm()} or \code{cv.aglm()}} +\title{Class for results of \code{aglm()} and \code{cv.aglm()}} \description{ -S4 class for fitted AGLM, used as a result of \code{aglm()} or \code{cv.aglm()} +Class for results of \code{aglm()} and \code{cv.aglm()} } \section{Slots}{ \describe{ -\item{\code{backend_models}}{Internally used model objects to be passed to backend functions. -Currently \code{glmnet} is used as a backend and this slot holding a \code{glmnet} object.} +\item{\code{backend_models}}{The fitted backend \code{glmnet} model is stored.} -\item{\code{vars_info}}{A list of list. 
Each element of \code{vars_info} represents one predictor variable and contains various information of it.} +\item{\code{vars_info}}{A list, each of whose element is information of one variable.} -\item{\code{lambda}}{The values of \code{lambda} used in the fits.} +\item{\code{lambda}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{cvm}}{The mean cross-validated error.} +\item{\code{cvm}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{cvsd}}{The estimate of standard error of \code{cvm}.} +\item{\code{cvsd}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{cvup}}{The upper curve as \code{cvm + cvsd}.} +\item{\code{cvup}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{cvlo}}{The lower curve as \code{cvm - cvsd}.} +\item{\code{cvlo}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{nzero}}{The number of non-zero coefficients at each lambda.} +\item{\code{nzero}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{name}}{A text string indicating type of measure (for plotting purposes).} +\item{\code{name}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{lambda.min}}{The value of \code{lambda} that gives minimum \code{cvm}.} +\item{\code{lambda.min}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{lambda.1se}}{The largest value of \code{lambda} such that error is within 1 standard error of the minimum.} +\item{\code{lambda.1se}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{fit.preval}}{If \code{keep=TRUE}, this is the array of previously prevalidated fits. 
Some entries can be NA, if that and subsequent values of lambda are not reached for that fold.} +\item{\code{fit.preval}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{foldid}}{An integer vector of values between 1 and \code{nfold} identifying what fold each observation is in.} +\item{\code{foldid}}{Same as in the result of \link{cv.glmnet}.} -\item{\code{call}}{An object of class call, which is used to record how \code{cva.aglm()} is called.} +\item{\code{call}}{An object of class \code{call}, corresponding to the function call when this \code{AccurateGLM} object is created.} }} +\author{ +Kenji Kondo +} diff --git a/man/CVA_AccurateGLM-class.Rd b/man/CVA_AccurateGLM-class.Rd index 57cf220..1d107e8 100644 --- a/man/CVA_AccurateGLM-class.Rd +++ b/man/CVA_AccurateGLM-class.Rd @@ -3,25 +3,28 @@ \docType{class} \name{CVA_AccurateGLM-class} \alias{CVA_AccurateGLM-class} -\title{S4 class for a result of \code{cva.aglm()}} +\title{Class for results of \code{cva.aglm()}} \description{ -S4 class for a result of \code{cva.aglm()} +Class for results of \code{cva.aglm()} } \section{Slots}{ \describe{ -\item{\code{models_list}}{Results of \code{cv.glmnet()} for all the values of alpha.} +\item{\code{models_list}}{A list consists of \code{cv.glmnet()}'s results for all \eqn{\alpha} values.} -\item{\code{alpha}}{A numeric values specifying alpha values to be examined.} +\item{\code{alpha}}{Same as in \link{cv.aglm}.} -\item{\code{nfolds}}{An integer value specifying the number of folds.} +\item{\code{nfolds}}{Same as in \link{cv.aglm}.} -\item{\code{alpha.min.index}}{An integer value specifying the index of \code{alpha.min} in \code{alpha}.} +\item{\code{alpha.min.index}}{The index of \code{alpha.min} in the vector \code{alpha}.} -\item{\code{alpha.min}}{The alpha value which achieves the minimum loss.} +\item{\code{alpha.min}}{The \eqn{\alpha} value achieving the minimum loss among all the values of \code{alpha}.} -\item{\code{lambda.min}}{The lambda value which 
achieves the minimum loss, when combined with \code{alpha.min}.} +\item{\code{lambda.min}}{The \eqn{\lambda} value achieving the minimum loss when \eqn{\alpha} is equal to \code{alpha.min}.} -\item{\code{call}}{An object of class call, which is used to record how \code{cva.aglm()} is called.} +\item{\code{call}}{An object of class \code{call}, corresponding to the function call when this \code{CVA_AccurateGLM} object is created.} }} +\author{ +Kenji Kondo +} diff --git a/man/aglm-package.Rd b/man/aglm-package.Rd new file mode 100644 index 0000000..c24617a --- /dev/null +++ b/man/aglm-package.Rd @@ -0,0 +1,100 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aglm-package.R +\docType{package} +\name{aglm-package} +\alias{aglm-package} +\title{aglm: Accurate Generalized Linear Model} +\description{ +Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying +a sort of feature transformations using a discretization of numerical features and specific +coding methodologies of dummy variables. More details can be found in +\href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper}. +} +\details{ +The collection of functions provided by the \code{aglm} package has almost the same structure as the famous \code{glmnet} package, +so users familiar with the \code{glmnet} package will be able to handle it easily. +In fact, this structure is reasonable in implementation, because what the \code{aglm} package does is +applying appropriate transformations to the given data and passing it to the \code{glmnet} package as a backend. +} +\section{Fitting functions}{ + +The \code{aglm} package provides three different fitting functions, depending on how users want to handle hyper-parameters of AGLM models. 
+ +Because AGLM is based on regularized GLM, the regularization term of the loss function can be expressed as follows: +\loadmathjax +\mjsdeqn{ +R(\lbrace \beta_{jk} \rbrace; \lambda, \alpha) += \lambda \left\lbrace +(1 - \alpha)\sum_{j=1}^{p} \sum_{k=1}^{m_j}|\beta_{jk}|^2 + \alpha \sum_{j=1}^{p} \sum_{k=1}^{m_j} |\beta_{jk}| +\right\rbrace, +} +where \eqn{\beta_{jk}} is the k-th coefficient of auxiliary variables for the j-th column in data, +\eqn{\alpha} is a weight which controls how L1 and L2 regularization terms are mixed, +and \eqn{\lambda} determines the strength of the regularization. + +Searching hyper-parameters \eqn{\alpha} and \eqn{\lambda} is often useful to get better results, but usually time-consuming. +That's why the \code{aglm} package provides three fitting functions with different strategies for specifying hyper-parameters as follows: +\itemize{ +\item \link{aglm}: A basic fitting function with given \eqn{\alpha} and \eqn{\lambda}(s). +\item \link{cv.aglm}: A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. +\item \link{cva.aglm}: A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}. +} + +Generally speaking, setting an appropriate \eqn{\lambda} is often important to get meaningful results, +and using \code{cv.aglm()} with default \eqn{\alpha=1} (LASSO) is usually enough. +Since \code{cva.aglm()} is much more time-consuming than \code{cv.aglm()}, it is better to use it only if particularly better results are needed. + +The following S4 classes are defined to store results of the fitting functions. 
+\itemize{ +\item \link{AccurateGLM-class}: A class for results of \code{aglm()} and \code{cv.aglm()} +\item \link{CVA_AccurateGLM-class}: A class for results of \code{cva.aglm()} +} +} + +\section{Using the fitted model}{ + +Users can use models obtained from fitting functions in various ways, by passing them to following functions: +\itemize{ +\item \link[=predict.AccurateGLM]{predict}: Make predictions for new data +\item \link[=plot.AccurateGLM]{plot}: Plot contribution of each variable and residuals +\item \link[=print.AccurateGLM]{print}: Display textual information of the model +\item \link[=coef.AccurateGLM]{coef}: Get coefficients +\item \link[=deviance.AccurateGLM]{deviance}: Get deviance +\item \link[=residuals.AccurateGLM]{residuals}: Get residuals of various types +} + +We emphasize that \code{plot()} is particularly useful to understand the fitted model, +because it presents a visual representation of how variables in the original data are used by the model. +} + +\section{Other functions}{ + +The following functions are basically for internal use, but exported as utility functions for convenience. +\itemize{ +\item Functions for creating feature vectors +\itemize{ +\item \link{getUDummyMatForOneVec} +\item \link{getODummyMatForOneVec} +\item \link{getLVarMatForOneVec} +} +\item Functions for binning +\itemize{ +\item \link{createEqualWidthBins} +\item \link{createEqualFreqBins} +\item \link{executeBinning} +} +} +} + +\references{ +Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) +\emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +\url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +\emph{Actuarial Colloquium Paris 2020} +} +\author{ +\itemize{ +\item Kenji Kondo, +\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +} +} diff --git a/man/aglm.Rd b/man/aglm.Rd index 914d817..23430b6 100644 --- a/man/aglm.Rd +++ b/man/aglm.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/aglm.R \name{aglm} \alias{aglm} -\title{Fit an AGLM model} +\title{Fit an AGLM model with no cross-validation} \usage{ aglm( x, @@ -33,8 +33,9 @@ Each column may have one of the following classes, and \code{aglm} will automati \item \code{factor} (unordered) or \code{logical} : interpreted as a qualitative variable without order. \code{aglm} creates dummy variables suitable for unordered values (named U-dummies). \item \code{ordered}: interpreted as a qualitative variable with order. \code{aglm} creates both O-dummies and U-dummies. These dummy variables are added to \code{x} and form a larger matrix, which is used internally as an actual design matrix. -If you need to change the default behavior, use the following options: \code{qualitative_vars_UD_only}, \code{qualitative_vars_both}, \code{qualitative_vars_OD_only}, and \code{quantitative_vars}. -}} +} + +If you need to change the default behavior, use the following options: \code{qualitative_vars_UD_only}, \code{qualitative_vars_both}, \code{qualitative_vars_OD_only}, and \code{quantitative_vars}.} \item{y}{A response variable.} @@ -82,7 +83,22 @@ Currently \code{aglm} supports \code{gaussian}, \code{binomial}, and \code{poiss } \value{ A model object fitted to the data. +Functions such as \code{predict} and \code{plot} can be applied to the returned object. +See \link{AccurateGLM-class} for more details. } \description{ -Fit an AGLM model +A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s). 
+See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. +} +\references{ +Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +\emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +\url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +\emph{Actuarial Colloquium Paris 2020} +} +\author{ +\itemize{ +\item Kenji Kondo, +\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +} } diff --git a/man/coef.AccurateGLM.Rd b/man/coef.AccurateGLM.Rd index 542c7e9..d006ac9 100644 --- a/man/coef.AccurateGLM.Rd +++ b/man/coef.AccurateGLM.Rd @@ -2,24 +2,27 @@ % Please edit documentation in R/coef-aglm.R \name{coef.AccurateGLM} \alias{coef.AccurateGLM} -\title{Extract coefficients from an \code{AccurateGLM} object.} +\title{Get coefficients} \usage{ \method{coef}{AccurateGLM}(object, index = NULL, name = NULL, s = NULL, exact = FALSE, ...) } \arguments{ -\item{object}{An \code{AccurateGLM} object.} +\item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.} -\item{index}{An integer vector of variable indices whose coefficients should be returned.} +\item{index}{An integer value representing the index of variable whose coefficients are required.} -\item{name}{An character vector of variable names whose coefficients should be returned.} +\item{name}{A string representing the name of variable whose coefficients are required. +Note that if both \code{index} and \code{name} are set, \code{index} is discarded.} -\item{s}{Value(s) of the penalty parameter \code{lambda} at which predictions are required. 
-Default is the entire sequence used to create the model.} +\item{s}{Same as in \link{coef.glmnet}.} -\item{exact}{Same as \code{coef.glmnet()}.} +\item{exact}{Same as in \link{coef.glmnet}.} -\item{...}{Other arguments are passed directly to \code{deviance} functions of \code{model@backend_models}.} +\item{...}{Other arguments are passed directly to \code{coef.glmnet()}.} } \description{ -Extract coefficients from an \code{AccurateGLM} object. +Get coefficients +} +\author{ +Kenji Kondo } diff --git a/man/createEqualFreqBins.Rd b/man/createEqualFreqBins.Rd index c33a2a4..4a49d35 100644 --- a/man/createEqualFreqBins.Rd +++ b/man/createEqualFreqBins.Rd @@ -2,19 +2,22 @@ % Please edit documentation in R/binning.R \name{createEqualFreqBins} \alias{createEqualFreqBins} -\title{Create bins by Equal Freq Binning} +\title{Create bins (equal frequency binning)} \usage{ createEqualFreqBins(x_vec, nbin.max) } \arguments{ -\item{x_vec}{A reference integer or numeric or ordered vector to be binned.} +\item{x_vec}{A numeric vector, whose quantiles are used as breaks.} -\item{nbin.max}{An integer value which indicates the maximum counts of bins. -Note that this function makes \code{min(nbin.max, length(x_vec))} counts of bins.} +\item{nbin.max}{The maximum number of bins.} } \value{ -a numeric vector which indicates the boundaries of bins, with \code{nbin} elements. +A numeric vector representing breaks obtained by binning. +Note that the number of bins is equal to \code{min(nbin.max, length(x_vec))}. 
} \description{ -Create bins by Equal Freq Binning +Create bins (equal frequency binning) +} +\author{ +Kenji Kondo } diff --git a/man/createEqualWidthBins.Rd b/man/createEqualWidthBins.Rd index 498ec1c..ea0e1d2 100644 --- a/man/createEqualWidthBins.Rd +++ b/man/createEqualWidthBins.Rd @@ -2,20 +2,23 @@ % Please edit documentation in R/binning.R \name{createEqualWidthBins} \alias{createEqualWidthBins} -\title{Create bins by Equal Width Binning} +\title{Create bins (equal width binning)} \usage{ createEqualWidthBins(left, right, nbin) } \arguments{ -\item{left}{left value of the original interval.} +\item{left}{The leftmost value of the interval to be binned.} -\item{right}{right value of the original interval.} +\item{right}{The rightmost value of the interval to be binned.} -\item{nbin}{number of bins to be created.} +\item{nbin}{The number of bins.} } \value{ -a numeric vector which indicates the boundaries of bins, with \code{nbin} elements. +A numeric vector representing breaks obtained by binning. } \description{ -Create bins by Equal Width Binning +Create bins (equal width binning) +} +\author{ +Kenji Kondo } diff --git a/man/cv.aglm.Rd b/man/cv.aglm.Rd index feb05c8..3215d83 100644 --- a/man/cv.aglm.Rd +++ b/man/cv.aglm.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/cv-aglm.R \name{cv.aglm} \alias{cv.aglm} -\title{Cross-validation for AGLM models} +\title{Fit an AGLM model with cross-validation for \eqn{\lambda}} \usage{ cv.aglm( x, @@ -26,55 +26,61 @@ cv.aglm( ) } \arguments{ -\item{x}{An input matrix or data.frame to be fitted.} +\item{x}{A design matrix. 
+See \link{aglm} for more details.} -\item{y}{An integer or numeric vector which represents response variable.} +\item{y}{A response variable.} -\item{qualitative_vars_UD_only}{A list of indices or names for specifying which columns are qualitative and need only U-dummy representations.} +\item{qualitative_vars_UD_only}{Same as in \link{aglm}.} -\item{qualitative_vars_both}{A list of indices or names for specifying which columns are qualitative and need both U-dummy and O-dummy representations.} +\item{qualitative_vars_both}{Same as in \link{aglm}.} -\item{qualitative_vars_OD_only}{A list of indices or names for specifying which columns are qualitative and need only O-dummy representations.} +\item{qualitative_vars_OD_only}{Same as in \link{aglm}.} -\item{quantitative_vars}{A list of indices or names for specifying which columns are quantitative.} +\item{quantitative_vars}{Same as in \link{aglm}.} -\item{use_LVar}{A boolean value which indicates whether this function uses L-variable representations or not.} +\item{use_LVar}{Same as in \link{aglm}.} -\item{extrapolation}{A character value which indicates how contribution curves outside bins are extrapolated. -\itemize{ -\item \code{"default"}: No extrapolations. -\item \code{"flat"}: Extrapolates with flat lines. 
-}} +\item{extrapolation}{Same as in \link{aglm}.} -\item{add_linear_columns}{A boolean value which indicates whether this function uses linear effects or not.} +\item{add_linear_columns}{Same as in \link{aglm}.} -\item{add_OD_columns_of_qualitatives}{A boolean value which indicates whether this function use O-dummy representations for qualitative and ordinal variables or not.} +\item{add_OD_columns_of_qualitatives}{Same as in \link{aglm}.} -\item{add_interaction_columns}{A boolean value which indicates whether this function uses interaction effects or not.} +\item{add_interaction_columns}{Same as in \link{aglm}.} -\item{OD_type_of_quantitatives}{A character value which indicates how O-dummy matrices of quantitative -values are constructed. Choose \code{"C"}(default) or \code{"J"}. -\itemize{ -\item \code{"C"}: Continuous-type dummies, which result continuous contribution curves. -\item \code{"J"}: Jump-type dummies, which result contribution curves with jumps. -\item \code{"N"}: No use of O-dummies -}} +\item{OD_type_of_quantitatives}{Same as in \link{aglm}.} -\item{nbin.max}{a maximum number of bins which is automatically generated. Only used when \code{breaks} is not set.} +\item{nbin.max}{Same as in \link{aglm}.} -\item{bins_list}{A list of numeric vectors, each element of which is used as breaks when binning of a quantitative variable or a qualitative variable with order.} +\item{bins_list}{Same as in \link{aglm}.} -\item{bins_names}{A list of column name or column index, each name or index of which specifies which column of \code{x} is binned used with an element of \code{bins_list} in the same position.} +\item{bins_names}{Same as in \link{aglm}.} -\item{family}{Response type. 
Currently \code{"gaussian"}, \code{"binomial"}, and \code{"poisson"} are supported.} +\item{family}{Same as in \link{aglm}.} -\item{keep}{Same as \code{cv.glmnet()}.} +\item{keep}{Set to \code{TRUE} if you need the \code{fit.preval} field in the returned value, as in \code{cv.glmnet()}.} -\item{...}{Other arguments are passed directly to backend (currently \code{cv.glmnet()} is used), and if not given, default values of the backend API are used to call backend functions.} +\item{...}{Other arguments are passed directly when calling \code{cv.glmnet()}.} } \value{ -Result of cross-validation. +A model object fitted to the data with cross-validation results. +Functions such as \code{predict} and \code{plot} can be applied to the returned object, same as the result of \code{aglm()}. +See \link{AccurateGLM-class} for more details. } \description{ -Cross-validation for AGLM models +A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. +See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. +} +\references{ +Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +\emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +\url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +\emph{Actuarial Colloquium Paris 2020} +} +\author{ +\itemize{ +\item Kenji Kondo, +\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +} } diff --git a/man/cva.aglm.Rd b/man/cva.aglm.Rd index 1db062e..0c26df9 100644 --- a/man/cva.aglm.Rd +++ b/man/cva.aglm.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/cva-aglm.R \name{cva.aglm} \alias{cva.aglm} -\title{Cross-validation for both alpha and lambda} +\title{Fit an AGLM model with cross-validation for both \eqn{\alpha} and \eqn{\lambda}} \usage{ cva.aglm( x, @@ -15,23 +15,39 @@ cva.aglm( ) } \arguments{ -\item{x}{An input matrix or data.frame to be fitted.} +\item{x}{A design matrix. 
+See \link{aglm} for more details.} -\item{y}{An integer or numeric vector which represents response variable.} +\item{y}{A response variable.} -\item{alpha}{A numeric vector which represents alpha values to be examined.} +\item{alpha}{A numeric vector representing \eqn{\alpha} values to be examined in cross-validation.} -\item{nfolds}{The number of folds. The default value is 10.} +\item{nfolds}{An integer value representing the number of folds.} -\item{foldid}{An integer vector of values between 1 and \code{nfold} identifying what fold each observation is in.} +\item{foldid}{An integer vector with the same length as observations. +Each element should take a value from 1 to \code{nfolds}, identifying which fold it belongs to.} -\item{parallel.alpha}{not implemented yet.} +\item{parallel.alpha}{(not used yet)} \item{...}{Other arguments are passed directly to \code{cv.aglm()}.} } \value{ -Result of cross-validation. See \code{CVA_AccurateGLM}'s document for more details. +An object storing fitted models and information of cross-validation. +See \link{CVA_AccurateGLM-class} for more details. } \description{ -Cross-validation for both alpha and lambda +A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}. +See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. +} +\references{ +Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) +\emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +\url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +\emph{Actuarial Colloquium Paris 2020} +} +\author{ +\itemize{ +\item Kenji Kondo, +\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +} } diff --git a/man/deviance.AccurateGLM.Rd b/man/deviance.AccurateGLM.Rd index d9b8c69..7bada40 100644 --- a/man/deviance.AccurateGLM.Rd +++ b/man/deviance.AccurateGLM.Rd @@ -2,15 +2,18 @@ % Please edit documentation in R/deviance-aglm.R \name{deviance.AccurateGLM} \alias{deviance.AccurateGLM} -\title{Extract the deviance from an \code{AccurateGLM} object.} +\title{Get deviance} \usage{ \method{deviance}{AccurateGLM}(object, ...) } \arguments{ -\item{object}{An \code{AccurateGLM} object.} +\item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.} -\item{...}{Other arguments are passed directly to \code{deviance} functions of \code{model@backend_models}.} +\item{...}{Other arguments are passed directly to \code{deviance.glmnet()}.} } \description{ -Extract the deviance from an \code{AccurateGLM} object. +Get deviance +} +\author{ +Kenji Kondo } diff --git a/man/executeBinning.Rd b/man/executeBinning.Rd index d62912e..92ef332 100644 --- a/man/executeBinning.Rd +++ b/man/executeBinning.Rd @@ -2,29 +2,29 @@ % Please edit documentation in R/binning.R \name{executeBinning} \alias{executeBinning} -\title{Execute binning for numerical data.} +\title{Binning the data to given bins.} \usage{ executeBinning(x_vec, breaks = NULL, nbin.max = 100, method = "freq") } \arguments{ -\item{x_vec}{an integer or numeric or ordered vector, to be binned.} +\item{x_vec}{The data to be binned.} -\item{breaks}{a numeric vector which indicates the boundaries of bins, of length (number of bins - 1). 
-If NULL is set, bins are generated using the method which is specified by the \code{method} argument.} +\item{breaks}{A numeric vector representing breaks of bins (If \code{NULL}, automatically generated).} -\item{nbin.max}{a maximum number of bins which is generated by this function. Only used when \code{breaks} is not set.} +\item{nbin.max}{The maximum number of bins (used only if \code{breaks=NULL}).} -\item{method}{used for specifying binning method. "freq": equal freq binning (default), "width": equal width binning. -Ignore if \code{breaks} is set.} +\item{method}{\code{"freq"} for equal frequency binning or \code{"width"} for equal width binning (used only if \code{breaks=NULL}).} } \value{ -a list which has two members \code{labels} and \code{breaks}. +A list with the following fields: \itemize{ -\item \code{labels}: an integer vector of \code{length(x_vec)}. -\code{(labels[i]==k)} indicates the \code{i-th} element of \code{x_vec} is in the \code{k-th} bin. -\item \code{breaks}: a numeric vector which indicates the boundaries of bins, of length (number of bins - 1). +\item \code{labels}: An integer vector with same length as \code{x_vec}, where \code{labels[i]==k} means the i-th element of \code{x_vec} is in the k-th bin. +\item \code{breaks}: Breaks of bins used for binning. } } \description{ -Execute binning for numerical data. +Binning the data to given bins. +} +\author{ +Kenji Kondo } diff --git a/man/getDesignMatrix.Rd b/man/getDesignMatrix.Rd deleted file mode 100644 index 5aefdcb..0000000 --- a/man/getDesignMatrix.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aglm-input.R -\name{getDesignMatrix} -\alias{getDesignMatrix} -\title{Get design-matrix representation of AGLM_Input objects} -\usage{ -getDesignMatrix(x) -} -\arguments{ -\item{x}{An AGLM_Input object} -} -\value{ -A data.frame which represents the matrix representation of \code{x}. 
-} -\description{ -Get design-matrix representation of AGLM_Input objects -} diff --git a/man/getLVarMatForOneVec.Rd b/man/getLVarMatForOneVec.Rd index 76ad39f..63f3dd2 100644 --- a/man/getLVarMatForOneVec.Rd +++ b/man/getLVarMatForOneVec.Rd @@ -2,32 +2,29 @@ % Please edit documentation in R/get-dummies.R \name{getLVarMatForOneVec} \alias{getLVarMatForOneVec} -\title{Get L-variable matrix for one-dimensional vector} +\title{Create L-variable matrix for one variable} \usage{ getLVarMatForOneVec(x_vec, breaks = NULL, nbin.max = 100, only_info = FALSE) } \arguments{ -\item{x_vec}{An integer or numeric vector to be converted into dummy matrix.} +\item{x_vec}{A numeric vector representing original variable.} -\item{breaks}{A numeric vector which indicates the boundaries of bins, of length (number of bins + 1). -If NULL, evenly cut bins are automatically generated and used.} +\item{breaks}{A numeric vector representing breaks of bins (If \code{NULL}, automatically generated).} -\item{nbin.max}{A maximum number of bins which is used. Only used when \code{breaks} is not set.} +\item{nbin.max}{The maximum number of bins (used only if \code{breaks=NULL}).} -\item{only_info}{A logical value. If TRUE, actual creation of dummy matrix is omitted.} +\item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.} } \value{ -a list with two members \code{breaks} and \code{dummy_mat}. +a list with the following fields: \itemize{ \item \code{breaks}: Same as input -\item \code{dummy_mat}: An integer matrix with size (length of \code{x_vec}, length of \code{breaks} minus 1). -\code{dummy_mat[i, j]} is 1 if and only if \verb{breaks[i] < x_vec[i] <= breaks[i+1]}, and 0 otherwise. -Note that, in case where \code{x_vec[i]} is outside of \verb{(breaks[1], breaks[length(breaks)]]}, -\code{x_vec[i]} is considered to be in the first bin if \code{x_vec[i] <= breaks[1]}, and -be in the last bin if \code{x_vec[i] > breaks[length(breaks)]}. 
-Omitted if \code{only_info=TRUE} +\item \code{dummy_mat}: The created L-variable matrix (only if \code{only_info=FALSE}). } } \description{ -Get L-variable matrix for one-dimensional vector +Create L-variable matrix for one variable +} +\author{ +Kenji Kondo } diff --git a/man/getODummyMatForOneVec.Rd b/man/getODummyMatForOneVec.Rd index 75556e1..c37df67 100644 --- a/man/getODummyMatForOneVec.Rd +++ b/man/getODummyMatForOneVec.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get-dummies.R \name{getODummyMatForOneVec} \alias{getODummyMatForOneVec} -\title{Get O-dummy matrix for one-dimensional vector} +\title{Create a O-dummy matrix for one variable} \usage{ getODummyMatForOneVec( x_vec, @@ -13,34 +13,26 @@ getODummyMatForOneVec( ) } \arguments{ -\item{x_vec}{An integer or numeric vector to be converted into dummy matrix.} +\item{x_vec}{A numeric vector representing original variable.} -\item{breaks}{A numeric vector which indicates the boundaries of bins, of length (number of bins + 1). -If NULL, evenly cut bins are automatically generated and used.} +\item{breaks}{A numeric vector representing breaks of bins (If \code{NULL}, automatically generated).} -\item{nbin.max}{A maximum number of bins which is used. Only used when \code{breaks} is not set.} +\item{nbin.max}{The maximum number of bins (used only if \code{breaks=NULL}).} -\item{only_info}{A logical value. If TRUE, actual creation of dummy matrix is omitted.} +\item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.} -\item{dummy_type}{A character value. Choose \code{"C"}(default) or \code{"J"}. For integer or numeric \code{x_vec}, -\code{dummy_type="C"} is used as default. Otherwise, \code{dummy_type="J"} is used as default. -\itemize{ -\item \code{"C"}: Continuous-type dummies, which result continuous contribution curves. -\item \code{"J"}: Jump-type dummies, which result contribution curves with jumps. 
-}} +\item{dummy_type}{Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).} } \value{ -a list with two members \code{breaks} and \code{dummy_mat}. +a list with the following fields: \itemize{ \item \code{breaks}: Same as input -\item \code{dummy_mat}: An integer matrix with size (length of \code{x_vec}, length of \code{breaks} minus 1). -\code{dummy_mat[i, j]} is 1 if and only if \verb{breaks[i] < x_vec[i] <= breaks[i+1]}, and 0 otherwise. -Note that, in case where \code{x_vec[i]} is outside of \verb{(breaks[1], breaks[length(breaks)]]}, -\code{x_vec[i]} is considered to be in the first bin if \code{x_vec[i] <= breaks[1]}, and -be in the last bin if \code{x_vec[i] > breaks[length(breaks)]}. -Omitted if \code{only_info=TRUE} +\item \code{dummy_mat}: The created O-dummy matrix (only if \code{only_info=FALSE}). } } \description{ -Get O-dummy matrix for one-dimensional vector +Create a O-dummy matrix for one variable +} +\author{ +Kenji Kondo } diff --git a/man/getUDummyMatForOneVec.Rd b/man/getUDummyMatForOneVec.Rd index 1cf94ad..8cbc176 100644 --- a/man/getUDummyMatForOneVec.Rd +++ b/man/getUDummyMatForOneVec.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get-dummies.R \name{getUDummyMatForOneVec} \alias{getUDummyMatForOneVec} -\title{Get U-dummy matrix for one-dimensional vector} +\title{Create a U-dummy matrix for one variable} \usage{ getUDummyMatForOneVec( x_vec, @@ -12,25 +12,27 @@ getUDummyMatForOneVec( ) } \arguments{ -\item{x_vec}{A non-numeric vector to be converted into dummy matrix.} +\item{x_vec}{A vector representing original variable. +The class of \code{x_vec} should be one of \code{integer}, \code{character}, or \code{factor}.} -\item{levels}{A character vector indicates unique set of possible values. -If NULL, all the unique values of \code{x_vec} are used.} +\item{levels}{A character vector representing values of \code{x_vec} used to create U-dummies. 
+If \code{NULL}, all the unique values of \code{x_vec} are used to create dummies.} -\item{drop_last}{A logical value. If TRUE, the last column of dummy matrix is dropped.} +\item{drop_last}{If \code{TRUE}, the last column of the resulting matrix is dropped to avoid multicollinearity.} -\item{only_info}{A logical value. If TRUE, actual creation of dummy matrix is omitted.} +\item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.} } \value{ -a list with two members \code{levels} and \code{dummy_mat}. +A list with the following fields: \itemize{ -\item \code{levels}: Same as input -\item \code{drop_last}: Same as input -\item \code{dummy_mat}: An integer matrix with size (length of \code{x_vec}, length of \code{levels} or minus 1 when \code{drop_last=TRUE}). -\code{dummy_mat[i, j]} is 1 if and only if \code{x_vec[i] == levels[j]}, and 0 otherwise. -Omitted if \code{only_info=TRUE} +\item \code{levels}: Same as input. +\item \code{drop_last}: Same as input. +\item \code{dummy_mat}: The created U-dummy matrix (only if \code{only_info=FALSE}). 
} } \description{ -Get U-dummy matrix for one-dimensional vector +Create a U-dummy matrix for one variable +} +\author{ +Kenji Kondo } diff --git a/man/plot.AccurateGLM.Rd b/man/plot.AccurateGLM.Rd index 5a394a9..ea78f1b 100644 --- a/man/plot.AccurateGLM.Rd +++ b/man/plot.AccurateGLM.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/plot-aglm.R \name{plot.AccurateGLM} \alias{plot.AccurateGLM} -\title{Plot coefficients from an \code{AccurateGLM} object} +\title{Plot contribution of each variable and residuals} \usage{ \method{plot}{AccurateGLM}( x, @@ -21,43 +21,69 @@ ) } \arguments{ -\item{x}{An \code{AccurateGLM} object.} +\item{x}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.} -\item{vars}{An integer or character vectors (indices or names) specifying which variables should be plotted.} +\item{vars}{Used to specify variables to be plotted (\code{NULL} means all the variables). +This parameter may have one of the following classes: +\itemize{ +\item \code{integer}: specifying variables by index. +\item \code{character}: specifying variables by name. +}} -\item{verbose}{If TRUE, outputs details.} +\item{verbose}{Set to \code{FALSE} if textual outputs are not needed.} -\item{s}{A numeric value specifying lambda value at which plotting is required. -Note that this function can't plot for multiple lambda values, so it allows only -single \code{s} value (which means \code{model} is trained with multiple lambda values and plot with one of them), -or \code{s=NULL} (which means \code{model} is trained with single lambda value and plot with that value).} +\item{s}{A numeric value specifying \eqn{\lambda} at which plotting is required. +Note that plotting for multiple \eqn{\lambda}'s are not allowed and \code{s} always should be a single value. 
+When the model is trained with only a single \eqn{\lambda} value, just set it to \code{NULL} to plot for that value.} -\item{resid}{A logical value which indicates to plot residuals, -or a character value which indicates residual type to be plotted (see the help of \code{residuals.AccurateGLM()}), -or a numerical vector which indicates residual values to be plotted. -Note that working residuals are used in the first case with \code{resid=TRUE}.} +\item{resid}{Used to display residuals in plots. +This parameter may have one of the following classes: +\itemize{ +\item \code{logical}(single value): If \code{TRUE}, working residuals are plotted. +\item \code{character}(single value): type of residual to be plotted. See \link{residuals.AccurateGLM} for more details on types of residuals. +\item \code{numerical}(vector): residual values to be plotted. +}} -\item{smooth_resid}{A logical value which indicates whether draws smoothing lines of residuals or not, -or a character value which is one of options below: -* \code{"both"} draws both balls and smoothing lines. -* \code{"smooth_only"} draws only smoothing line. -Note that smoothing lines are only drawn for quantitative variables. -The default value is \code{TRUE}.} +\item{smooth_resid}{Used to display smoothing lines of residuals for quantitative variables. +This parameter may have one of the following classes: +\itemize{ +\item \code{logical}: If \code{TRUE}, smoothing lines are drawn. +\item \code{character}: +\itemize{ +\item \code{smooth_resid="both"}: Balls and smoothing lines are drawn. +\item \code{smooth_resid="smooth_only"}: Only smoothing lines are drawn. +} +}} -\item{smooth_resid_fun}{A function to be used to smooth partial residual values.} +\item{smooth_resid_fun}{Set if users need custom smoothing functions.} -\item{ask}{A logical value which indicates ask if go to next plot.} +\item{ask}{By default, \code{plot()} stops and waits inputs each time plotting for each variable is completed. 
+Users can set \code{ask=FALSE} to avoid this. +It is useful, for example, when using devices such as \code{bmp} to create image files.} -\item{layout}{A pair of integer values which indicates how many plots are drawn row-wise and column-wise respectively,} +\item{layout}{Plotting multiple variables for each page is allowed. +To achieve this, set it to a pair of integers indicating the numbers of rows and columns, respectively.} -\item{only_plot}{If \code{TRUE}, the function set no graphical parameters and no title.} +\item{only_plot}{Set to \code{TRUE} if no automatic graphical configurations are needed.} -\item{main}{A character value which indicates titles of panels.} +\item{main}{Used to specify the title of plotting.} -\item{add_rug}{A logical value which indicates draw rug plot for quantitative variables.} +\item{add_rug}{Set to \code{TRUE} for rug plots.} -\item{...}{Other arguments are currently not used.} +\item{...}{Other arguments are currently not used and just discarded.} } \description{ -Plot coefficients from an \code{AccurateGLM} object +Plot contribution of each variable and residuals +} +\references{ +Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) +\emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +\url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +\emph{Actuarial Colloquium Paris 2020} +} +\author{ +\itemize{ +\item Kenji Kondo, +\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +} } diff --git a/man/predict.AccurateGLM.Rd b/man/predict.AccurateGLM.Rd index 057b374..121e5f6 100644 --- a/man/predict.AccurateGLM.Rd +++ b/man/predict.AccurateGLM.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/predict-aglm.R \name{predict.AccurateGLM} \alias{predict.AccurateGLM} -\title{Make predictions from a fitted \code{AccurateGLM}} +\title{Make predictions for new data} \usage{ \method{predict}{AccurateGLM}( object, @@ -15,32 +15,37 @@ ) } \arguments{ -\item{object}{An \code{AccurateGLM} object.} +\item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.} -\item{newx}{An input matrix or data.frame used for predictions.} +\item{newx}{A design matrix for new data. +See the description of \code{x} in \link{aglm} for more details.} -\item{s}{Value(s) of the penalty parameter \code{lambda} at which predictions are required. -Default is the entire sequence used to create the model.} +\item{s}{Same as in \link{predict.glmnet}.} -\item{type}{Type of prediction required. -\itemize{ -\item Type \code{"link"} gives the linear predictors for \code{"binomial"}, \code{"poisson"} models, and for \code{"gaussian"} models it gives the fitted values. -\item Type \code{"response"} gives the fitted probabilities for \code{"binomial"}, fitted mean for \code{"poisson"}, and for \code{"gaussian"} models it is equivalent to type \code{"link"}. -\item Type \code{"coefficients"} computes the coefficients at the requested values for \code{s}. -Note that for \code{"binomial"} models, results are returned only for the class corresponding to the second level of the factor response. 
-\item Type \code{"class"} applies only to \code{"binomial"}, and produces the class label corresponding to the maximum probability. -\item Type \code{"nonzero"} returns a list of the indices of the nonzero coefficients for each value of \code{s}. -}} +\item{type}{Same as in \link{predict.glmnet}.} -\item{exact}{Same as \code{predict.glmnet()}.} +\item{exact}{Same as in \link{predict.glmnet}.} -\item{newoffset}{If an offset is used in the fit, then one must be supplied for making predictions (except for type="coefficients" or type="nonzero").} +\item{newoffset}{Same as in \link{predict.glmnet}.} -\item{...}{Other arguments are passed directly to backend (currently \code{glmnet()} is used), and if not given, default values of the backend API are used.} +\item{...}{Other arguments are passed directly when calling \code{predict.glmnet()}.} } \value{ -The object returned depends on type. +The returned object depends on \code{type}. +See \link{predict.glmnet} for more details. } \description{ -Make predictions from a fitted \code{AccurateGLM} +Make predictions for new data +} +\references{ +Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) +\emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr +\url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr +\emph{Actuarial Colloquium Paris 2020} +} +\author{ +\itemize{ +\item Kenji Kondo, +\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +} } diff --git a/man/print.AccurateGLM.Rd b/man/print.AccurateGLM.Rd index 30fda75..fd7d096 100644 --- a/man/print.AccurateGLM.Rd +++ b/man/print.AccurateGLM.Rd @@ -2,17 +2,20 @@ % Please edit documentation in R/print-aglm.R \name{print.AccurateGLM} \alias{print.AccurateGLM} -\title{Print an \code{AccurateGLM} object} +\title{Display textual information of the model} \usage{ \method{print}{AccurateGLM}(x, digits = max(3, getOption("digits") - 3), ...) 
} \arguments{ -\item{x}{An \code{AccurateGLM} object.} +\item{x}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.} -\item{digits}{Significant digits in printout.} +\item{digits}{Used to control significant digits in printout.} -\item{...}{Other arguments are passed directly to \code{print} functions of \code{model@backend_models}.} +\item{...}{Other arguments are passed directly to \code{print.glmnet()}.} } \description{ -Print an \code{AccurateGLM} object +Display textual information of the model +} +\author{ +Kenji Kondo } diff --git a/man/residuals.AccurateGLM.Rd b/man/residuals.AccurateGLM.Rd index c737ef4..bdcc33f 100644 --- a/man/residuals.AccurateGLM.Rd +++ b/man/residuals.AccurateGLM.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/residuals-aglm.R \name{residuals.AccurateGLM} \alias{residuals.AccurateGLM} -\title{Calculate residuals for AGLM model} +\title{Get residuals of various types} \usage{ \method{residuals}{AccurateGLM}( object, @@ -16,34 +16,43 @@ ) } \arguments{ -\item{object}{An \code{AccurateGLM} object.} +\item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.} -\item{x}{An input matrix or data.frame used for predictions in residual calculations. -If not given, \code{x} used for fitting the model is used.} +\item{x}{A design matrix. +If not given, \code{x} for fitting is used.} -\item{y}{A numeric vector used as true target values in residual calculations. -If not given, \code{y} used for fitting the model is used.} +\item{y}{A response variable. +If not given, \code{y} for fitting is used.} -\item{offset}{A numeric offset values used for predictions in residual calculations. -If not given, \code{offset} used for fitting the model is used.} +\item{offset}{An offset values. +If not given, \code{offset} for fitting is used.} -\item{weights}{A numeric weight values, corresponding with exposure size. -If not given, \code{weights} used for fitting the model is used.} +\item{weights}{Sample weights. 
+If not given, \code{weights} for fitting is used.} -\item{type}{Type of prediction required. +\item{type}{A string representing type of residuals: \itemize{ -\item Type \code{"working"} Working residuals. -\item Type \code{"pearson"} Pearson residuals. -\item Type \code{"deviance"} Deviance residuals. +\item \code{"working"} get working residuals +\mjsdeqn{r^W_i = (y_i - \mu_i) \left(\frac{\partial \eta}{\partial \mu}\right)_{\mu=\mu_i},} +where \eqn{y_i} is a response value, \eqn{\mu} is GLM mean, and \eqn{\eta=g(\mu)} with the link function \eqn{g}. +\item \code{"pearson"} get Pearson residuals +\mjsdeqn{r^P_i = \frac{y_i - \mu_i}{\sqrt{V(\mu_i)}},} +where \eqn{V} is the variance function. +\item \code{"deviance"} get deviance residuals +\mjsdeqn{r^D_i = {\rm sign}(y_i - \mu_i) \sqrt{d_i},} +where \eqn{d_i} is the contribution to deviance. }} -\item{s}{A numeric value specifying lambda value at which plotting is required.} +\item{s}{A numeric value specifying \eqn{\lambda} at which residuals are calculated.} -\item{...}{Other arguments are currently not used.} +\item{...}{Other arguments are currently not used and just discarded.} } \value{ -The object returned depends on type. +A numeric vector representing calculated residuals. } \description{ -Calculate residuals for AGLM model +\loadmathjax +} +\author{ +Kenji Kondo } From c7ed69b63337afb03e0727bfb6d77610b9593db3 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 30 May 2021 23:40:03 +0900 Subject: [PATCH 12/26] Refactor examples and integrated into documents. 
--- R/aglm.R | 7 +- R/cv-aglm.R | 3 + R/cva-aglm.R | 3 + R/plot-aglm.R | 3 + R/predict-aglm.R | 3 + examples/LVar.R | 19 -- examples/aglm-1.R | 42 +++ examples/aglm-2.R | 28 ++ examples/cross-validation.R | 59 ---- examples/cv-aglm-1.R | 29 ++ examples/cva-aglm-1.R | 34 +++ examples/freMTPL2freq.rmd | 281 ------------------ ...trapolation.R => lvar-and-extrapolation.R} | 14 +- examples/nes96.R | 39 --- examples/plot-coefs.R | 31 -- examples/{boston.R => predict-and-plot-1.R} | 29 +- man/aglm.Rd | 115 ++++++- man/cv.aglm.Rd | 31 ++ man/cva.aglm.Rd | 36 +++ man/plot.AccurateGLM.Rd | 44 +++ man/predict.AccurateGLM.Rd | 44 +++ 21 files changed, 451 insertions(+), 443 deletions(-) delete mode 100644 examples/LVar.R create mode 100644 examples/aglm-1.R create mode 100644 examples/aglm-2.R delete mode 100644 examples/cross-validation.R create mode 100644 examples/cv-aglm-1.R create mode 100644 examples/cva-aglm-1.R delete mode 100644 examples/freMTPL2freq.rmd rename examples/{extrapolation.R => lvar-and-extrapolation.R} (73%) delete mode 100644 examples/nes96.R delete mode 100644 examples/plot-coefs.R rename examples/{boston.R => predict-and-plot-1.R} (52%) diff --git a/R/aglm.R b/R/aglm.R index 3d7d415..47b05b0 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -40,7 +40,7 @@ #' #' @param extrapolation #' Used to control values of linear combination for quantitative variables, outside where the data exists. -#' Currently, this parameter is useful only when `use_LVar=TRUE`, where values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists. +#' By default, values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists. #' You can set `extrapolation="flat"` to get constant values outside the data instead. #' #' @param add_linear_columns @@ -78,6 +78,11 @@ #' See \link{AccurateGLM-class} for more details. 
#' #' +#' @example examples/aglm-1.R +#' @example examples/aglm-2.R +#' @example examples/lvar-and-extrapolation.R +#' +#' #' @author #' * Kenji Kondo, #' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) diff --git a/R/cv-aglm.R b/R/cv-aglm.R index b2d7cb8..c51b5ab 100644 --- a/R/cv-aglm.R +++ b/R/cv-aglm.R @@ -64,6 +64,9 @@ #' See \link{AccurateGLM-class} for more details. #' #' +#' @example examples/cv-aglm-1.R +#' +#' #' @author #' * Kenji Kondo, #' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) diff --git a/R/cva-aglm.R b/R/cva-aglm.R index 4d702f5..10148df 100644 --- a/R/cva-aglm.R +++ b/R/cva-aglm.R @@ -31,6 +31,9 @@ #' See \link{CVA_AccurateGLM-class} for more details. #' #' +#' @example examples/cva-aglm-1.R +#' +#' #' @author #' * Kenji Kondo, #' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) diff --git a/R/plot-aglm.R b/R/plot-aglm.R index e781612..0dd3214 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -57,6 +57,9 @@ #' Other arguments are currently not used and just discarded. #' #' +#' @example examples/predict-and-plot-1.R +#' +#' #' @author #' * Kenji Kondo, #' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) diff --git a/R/predict-aglm.R b/R/predict-aglm.R index 7bf58d2..f7ebe29 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -27,6 +27,9 @@ #' See \link{predict.glmnet} for more details. #' #' +#' @example examples/predict-and-plot-1.R +#' +#' #' @author #' * Kenji Kondo, #' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) diff --git a/examples/LVar.R b/examples/LVar.R deleted file mode 100644 index eb746ee..0000000 --- a/examples/LVar.R +++ /dev/null @@ -1,19 +0,0 @@ -library(MASS) # For Boston -library(aglm) - -## Read data -xy <- Boston # xy is a data.frame to be processed. -colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. - -## Split data into train and test -n <- nrow(xy) # Sample size. 
-x <- xy[-ncol(xy)] -y <- xy$y - -## Select the best lambda by `cv.aglm()`, fixing `alpha=1` (LASSO) -cv.model <- cv.aglm(x, y, use_LVar=TRUE) -lambda.min <- cv.model@lambda.min -cat("lambda.min: ", lambda.min, "\n") - -## Plots coefs of cross-validated model -plot(cv.model, s=cv.model@lambda.min, resid=TRUE, add_rug=TRUE) diff --git a/examples/aglm-1.R b/examples/aglm-1.R new file mode 100644 index 0000000..813d808 --- /dev/null +++ b/examples/aglm-1.R @@ -0,0 +1,42 @@ + +#################### Gaussian case #################### + +library(MASS) # For Boston +library(aglm) + +## Read data +xy <- Boston # xy is a data.frame to be processed. +colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/4)) # ID numbers for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[-ncol(xy)] +y <- train$y +newx <- test[-ncol(xy)] +y_true <- test$y + +## Fit the model +model <- aglm(x, y) # alpha=1 (the default value) + +## Predict for various alpha and lambda +lambda <- 0.1 +y_pred <- predict(model, newx=newx, s=lambda) +rmse <- sqrt(mean((y_true - y_pred)^2)) +cat(sprintf("RMSE for lambda=%.2f: %.5f \n\n", lambda, rmse)) + +lambda <- 1.0 +y_pred <- predict(model, newx=newx, s=lambda) +rmse <- sqrt(mean((y_true - y_pred)^2)) +cat(sprintf("RMSE for lambda=%.2f: %.5f \n\n", lambda, rmse)) + +alpha <- 0 +model <- aglm(x, y, alpha=alpha) + +lambda <- 0.1 +y_pred <- predict(model, newx=newx, s=lambda) +rmse <- sqrt(mean((y_true - y_pred)^2)) +cat(sprintf("RMSE for alpha=%.2f and lambda=%.2f: %.5f \n\n", alpha, lambda, rmse)) diff --git a/examples/aglm-2.R b/examples/aglm-2.R new file mode 100644 index 0000000..0407f73 --- /dev/null +++ b/examples/aglm-2.R @@ -0,0 +1,28 @@ + +#################### Binomial case #################### + +library(aglm) +library(faraway) + +## Read data +xy <- nes96 + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/5)) # ID numbers for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +y <- train$vote +newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] + +## Fit the model +model <- aglm(x, y, family="binomial") + +## Make the confusion matrix +lambda <- 0.1 +y_true <- test$vote +y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))] + +print(table(y_true, y_pred)) diff --git a/examples/cross-validation.R b/examples/cross-validation.R deleted file mode 100644 index 9182997..0000000 --- a/examples/cross-validation.R +++ /dev/null @@ -1,59 +0,0 @@ -library(MASS) # For Boston -library(aglm) - -# Function to produce a data.frame of O-dummies -make.bins <- function(data, max.nbin = 100){ - temp <- apply(data, 2, function(x){as.vector(quantile(x, seq(0, 1, 1 / (min(max.nbin, length(x)) - 1))))}) - apply(temp, 2, unique) -} - -## Read data -xy <- Boston # xy is a data.frame to be processed. -colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. - -## Split data into train and test -n <- nrow(xy) # Sample size. -set.seed(2018) # For reproducibility. -test.id <- sample(n, round(n/4)) # ID numbders for test data. -test <- xy[test.id,] # test is the data.frame for testing. -train <- xy[-test.id,] # train is the data.frame for training. 
-x <- train[-ncol(xy)] -y <- train$y -newx <- test[-ncol(xy)] -y_true <- test$y - -## Create bins -bins_list <- make.bins(x[, colnames(x) != "chas"]) -bins_names <- colnames(x)[colnames(x) != "chas"] - -## Set chas and rad variables as factors -x$chas <- as.factor(x$chas) -x$rad <- as.ordered(x$rad) -newx$chas <- factor(newx$chas, levels=levels(x$chas)) -newx$rad <- ordered(newx$rad, levels=levels(x$rad)) - - -## Select the best lambda by `cv.aglm()`, fixing `alpha=1` (LASSO) -cv.model <- cv.aglm(x, y, bins_list=bins_list, bins_names=bins_names) -lambda.min <- cv.model@lambda.min -cat("lambda.min: ", lambda.min, "\n") - -# Predict y for newx -y_pred <- predict(cv.model, newx=newx, s="lambda.min") -cat("RMSE: ", sqrt(mean((y_true - y_pred)^2)), "\n") -plot(y_true, y_pred) - - -## Select the best (alpha, lambda) simultaneously by `cva.aglm()` -cva.model <- cva.aglm(x, y, bins_list=bins_list, bins_names=bins_names) - -alpha.min <- cva.model@alpha.min -lambda.min <- cva.model@lambda.min -cat("alpha.min: ", alpha.min, "\n") -cat("lambda.min: ", lambda.min, "\n") - -## Predict y for newx -best_model <- aglm(x, y, lambda=lambda.min, alpha=alpha.min,bins_list=bins_list, bins_names=bins_names) -y_pred <- predict(best_model, newx=newx) -cat("RMSE: ", sqrt(mean((y_true - y_pred)^2)), "\n") -plot(y_true, y_pred) diff --git a/examples/cv-aglm-1.R b/examples/cv-aglm-1.R new file mode 100644 index 0000000..b154d1c --- /dev/null +++ b/examples/cv-aglm-1.R @@ -0,0 +1,29 @@ + +#################### Cross-validation for lambda #################### + +library(aglm) +library(faraway) + +## Read data +xy <- nes96 + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/5)) # ID numbers for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +y <- train$vote +newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] + +## Fit the model +model <- cv.aglm(x, y, family="binomial") + +## Make the confusion matrix +lambda <- model@lambda.min +y_true <- test$vote +y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))] + +cat(sprintf("Confusion matrix for lambda=%.5f:\n", lambda)) +print(table(y_true, y_pred)) diff --git a/examples/cva-aglm-1.R b/examples/cva-aglm-1.R new file mode 100644 index 0000000..9e6158e --- /dev/null +++ b/examples/cva-aglm-1.R @@ -0,0 +1,34 @@ + +#################### Cross-validation for alpha and lambda #################### + +library(aglm) +library(faraway) + +## Read data +xy <- nes96 + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/5)) # ID numbers for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +y <- train$vote +newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] + +## Fit the model +cva_result <- cva.aglm(x, y, family="binomial") + +alpha <- cva_result@alpha.min +lambda <- cva_result@lambda.min + +mod_idx <- cva_result@alpha.min.index +model <- cva_result@models_list[[mod_idx]] + +## Make the confusion matrix +y_true <- test$vote +y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))] + +cat(sprintf("Confusion matrix for alpha=%.5f and lambda=%.5f:\n", alpha, lambda)) +print(table(y_true, y_pred)) diff --git a/examples/freMTPL2freq.rmd b/examples/freMTPL2freq.rmd deleted file mode 100644 index 52a48f2..0000000 --- a/examples/freMTPL2freq.rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: 'Exploring freMTPL2freq (French Motor Third-Party Liability Frequency datasete)' -author: "Kenji Kondo" -date: "`r Sys.Date()`" -output: - rmdformats::readthedown: - highlight: kate ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library(CASdatasets) -library(purrr) -library(tidyr) -library(dplyr) -library(ggplot2) -library(assertthat) -library(aglm) -par(ps=8) -set.seed(2020) # Set seed for reproducibility. -``` - - -# What is the data? - -## R help - -> In the two datasets freMTPL2freq, freMTPL2sev, risk features are collected for 677,991 motor third-part liability policies (observed mostly on one year). In addition, we have claim numbers > by policy as well as the corresponding claim amounts. freMTPL2freq contains the risk features and the claim number while freMTPL2sev contains the claim amount and the corresponding policy ID. - -## Format -* IDpol: The policy ID (used to link with the claims dataset). -* ClaimNb: Number of claims during the exposure period. -* Exposure: The period of exposure for a policy, in years. -* VehPower: The power of the car (ordered values). 
-* VehAge: The vehicle age, in years. -* DrivAge: The driver age, in years (in France, people can drive a car at 18). -* BonusMalus: Bonus/malus, between 50 and 350: <100 means bonus, >100 means malus in France. -* VehBrand: The car brand (unknown categories). -* VehGas: The car gas, Diesel or regular. -* Area: The density value of the city community where the car driver lives in: from "A" for rural area to "F" for urban centre. -* Density: The density of inhabitants (number of inhabitants per square-kilometer) of the city where the car driver lives in. -* Region: The policy region in France (based on the 1970-2015 classification). - -We will use `ClaimNb` as a response variable, `log(Exposure)` as offset, and others as explanatory variables in this analysis. - - -# Read the data - -## Original data - -```{r} -data("freMTPL2freq") -xy <- freMTPL2freq # rename as `freq` -head(xy, 5) -``` - - -## Preprocessing - -```{r} -xy <- xy[-1] # Discard policy ID's we don't use. -xy$ClaimNb <- as.integer(xy$ClaimNb) # Because it has `table` type by default. -xy$VehGas <- factor(xy$VehGas) # Convert strings into factor values. -xy$Area <- ordered(xy$Area) # Convert them into ordered factor values, because this variable has order as abovementioned. -xy$VehBrand <- factor(substr(xy$VehBrand, 1, 2)) # Cut brand names to the first 2 letters, to plot them neatly -xy$Region <- factor(substr(xy$Region, 1, 2)) # same as above -head(xy) -``` - -```{r} -x <- xy[-c(1:2)] -y <- xy$ClaimNb -off <- log(xy$Exposure) # use log(Exposure) as offset -``` - - -# Look over the data - -## Size of the data - -```{r} -dim(xy) -``` - - -## Missing values - -```{r} -colSums(is.na(xy)) -``` - -Now it is ensured that there are no missing values. - - -## Distribution of the response variable - -```{r} -table(y) -``` - -The response values seem to contain too large values as claim numbers in less than one year. 
-Because these values are possibly disturbing, so we truncate them to 4 as below (FYI, our treatment is same as that in https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3164764 ). - -```{r} -y[y > 4] <- 4 -table(y) -``` - - -## Distributions of offset - -```{r} -plot(density(off)) -``` - -Note that some values of offset is greater than 0, which means `Exposure > 1 yr`. -It is ambiguous whther these values ara by some errors or not, but we decided to remain them unchanged because such excess exposures seem to have some correlation with y values as we can see in boxplots below. - -```{r} -excess_exp <- exp(off[off > 0]) - 1 # excess exposures over than one year -yy <- y[off > 0] # corresponding y values - -ggplot(data.frame(yy, excess_exp), aes(x=as.factor(yy), y=excess_exp)) + - geom_boxplot() + - scale_y_log10() + - theme_bw() -``` - - -## Distributions of explanatory variables - -### Quantitative variables - -```{r} -x %>% # drop target variable - keep(is.numeric) %>% - gather() %>% - ggplot(aes(value)) + - facet_wrap(~ key, scales="free") + - geom_density() -``` - - -### Qualitative variables - -```{r} -# The lines below results a warning of "attributes are not identical across measure variables; they will be dropped.", but it's because we gather multiple factor columns with different set of levels and no problem. -xy %>% - discard(is.numeric) %>% - gather() %>% - ggplot(aes(value)) + - facet_wrap(~ key, scales="free") + - geom_bar() -``` - - -# Split the data into a train and test pair -```{r} -n <- nrow(xy) # Sample size. -test.id <- sample(n, round(n/4)) # ID numbders for test data. - -# a dataset for training -x_train <- x[-test.id,] -y_train <- y[-test.id] -off_train <- off[-test.id] - -# a dataset for evaluation -x_test <- x[test.id,] -y_test <- y[test.id] -off_test <- off[test.id] -``` - - -# AGLM with Poisson family - -## Choose hyperparameters - -AGLM use the elastic-net type penalty, and it has two hyperparameters $\alpha$ and $\lambda$. 
-In short, $\alpha$ is the elastic-net mixing parameter, with $0 \le \alpha \le 1$, and \lambda is the strongness of penalty. -If specific value of alpha and lambda is given, the penalty for a coefficient β is wirten as $\lambda \{(1-\alpha)/2||\beta||_2^2+\alpha||\beta||_1\}$. -Note that `alpha = 1` is the lasso penalty, and `alpha = 0` the ridge penalty. - -We can choose these hyperparameters by cross-validation using `cva.aglm()`. -An R code for this purpose is as below, but commented out it because this process is slightly time-consuming. - -```{r} -# cva.model <- cva.aglm(x_train, -# y_train, -# offset=off_train, -# family="poisson", -# nbin.max=40, # Reduce number of bins for computational reason (default is 100). -# trace.it=TRUE) -# alpha.min <- cva.model@alpha.min -# lambda.min <- cva.model@lambda.min -# -# cat("Chosen hyperparameters", "\n", -# "alpha.min: ", alpha.min, "\n", -# "lamda.min: ", lambda.min, "\n") -``` - -Instead, use precalculated values here. - -```{r} -alpha.min <- 1 -lambda.min <- 0.00005440846715217542766025726752587843293440528213977813720703125 -``` - - -## Fit a model to the train data - -```{r} -model_path <- "./freMTPL2freq_aglm.rda" -if (file.exists(model_path)) { # to save time at the 2nd run - load(file=model_path) -} else { - best_model <- aglm(x_train, - y_train, - offset=off_train, - family="poisson", - nbin.max=40, - lambda=lambda.min, - alpha=alpha.min, - trace.it=TRUE) - save(best_model, file=model_path) -} -``` - - -## Predict for the test data - -```{r} -y_pred <- predict(best_model, newx=x_test, newoffset=off_test, type="response") -``` - - -## Evaluate the prediction - -### Compare to y_true - -To see how nice the prediction is, we use boxplots of the predicted means(y_pred) against true values(y_true) as below. - -```{r} -ggplot(data.frame(y_test, y_pred), aes(x=as.factor(y_test), y=y_pred)) + - geom_boxplot() + - scale_y_log10() + # Use log scale because y_pred has positively-skewed distribution. 
- theme_bw() -``` - -Notice that medians of y_pred and y_true seem positively correlated (the case where `y_true=4` is an exception), which indicates some natures of occuring are actually captured in this model. - - -### Calculate a numerical measure - -Because it is difficult to understand how accurate our prediction only from boxplots (especially in poisson cases), we calculate test deviance as below. - -```{r} -dev_test <- 2 * mean(ifelse(y_test == 0, y_pred, y_test * log(y_test / y_pred) - y_test + y_pred)) -cat("Test deviance: ", dev_test, "\n") -``` - -The calculated deviance value can be used when comparing this model to other models. - - -### Understand the model - -We can also understand the model visually using the `plot()` function. -It draws component curves (say, link values against specific values of explanatory variables) and residuals for the train data. -In this case, we use deviance residuals (defferent from the default working residuals), and make it draw only smoothed lines of residuals, in order to get clearer plots. 
- -```{r} -plot(best_model, verbose=FALSE, ask=FALSE, - resid="deviance", smooth_resid="smooth_only", - add_rug=TRUE, - layout=c(2, 3)) -``` - - - - - - diff --git a/examples/extrapolation.R b/examples/lvar-and-extrapolation.R similarity index 73% rename from examples/extrapolation.R rename to examples/lvar-and-extrapolation.R index 8a730b6..c884795 100644 --- a/examples/extrapolation.R +++ b/examples/lvar-and-extrapolation.R @@ -1,3 +1,6 @@ + +#################### use_LVar and extrapolation #################### + library(MASS) # For Boston library(aglm) @@ -12,11 +15,12 @@ x_test <- seq(0.75, 3.25, length.out=101) y_test <- f(x_test) + rnorm(101, sd=sd) xy_test <- data.frame(x=x_test, y=y_test) -## Sample plot for extrapolation="default" -models <- c(cv.aglm(x, y, extrapolation="default"), - cv.aglm(x, y, extrapolation="flat"), - cv.aglm(x, y, use_LVar=TRUE, extrapolation="default"), - cv.aglm(x, y, use_LVar=TRUE, extrapolation="flat")) +## Plot +nbin.max <- 10 +models <- c(cv.aglm(x, y, use_LVar=FALSE, extrapolation="default", nbin.max=nbin.max), + cv.aglm(x, y, use_LVar=FALSE, extrapolation="flat", nbin.max=nbin.max), + cv.aglm(x, y, use_LVar=TRUE, extrapolation="default", nbin.max=nbin.max), + cv.aglm(x, y, use_LVar=TRUE, extrapolation="flat", nbin.max=nbin.max)) titles <- c("O-Dummies with extrapolation=\"default\"", "O-Dummies with extrapolation=\"flat\"", diff --git a/examples/nes96.R b/examples/nes96.R deleted file mode 100644 index c1fa9d8..0000000 --- a/examples/nes96.R +++ /dev/null @@ -1,39 +0,0 @@ -library(aglm) -library(faraway) - -## Read data -xy <- nes96 - -## Split data into train and test -n <- nrow(xy) # Sample size. -set.seed(2018) # For reproducibility. -test.id <- sample(n, round(n/5)) # ID numbders for test data. -test <- xy[test.id,] # test is the data.frame for testing. -train <- xy[-test.id,] # train is the data.frame for training. 
-x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] -y <- train$vote -x.new <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] -y.true <- test$vote - -## Fitting, Prediction with Cross Validation of lambda -cat("Prediction with cross validation of lambda:\n") -model.cv <- cv.aglm(x, y, family="binomial", add_interaction_columns=FALSE) -lambda.min <- model.cv@lambda.min -cat("lambda.min: ", lambda.min, "\n") -plot(model.cv, s=model.cv@lambda.min, verbose=FALSE) -y.pred.cv <- predict(model.cv, x.new, type="class") -print(table(y.true, y.pred.cv[, length(y.pred.cv[1, ])])) - -## Fitting, Prediction with Cross Validation of both lambda and alpha -cat("\n\nPrediction with cross validation of both lambda and alpha:\n") -model.cva <- cva.aglm(x, y, add_interaction_columns=FALSE) -lambda.min <- model.cva@lambda.min -cat("lambda.min: ", lambda.min, "\n") -alpha.min <- model.cva@alpha.min -cat("alpha.min: ", alpha.min, "\n") -plot(model.cva@models_list[[model.cva@alpha.min.index]], s=model.cva@lambda.min, verbose=FALSE) - -model.best <- aglm(x, y, family="binomial", lambda=lambda.min, alpha=alpha.min, - add_interaction_columns=FALSE) -y.pred.best <- predict(model.best, x.new, type="class") -print(table(y.true, y.pred.best[, length(y.pred.best[1, ])])) diff --git a/examples/plot-coefs.R b/examples/plot-coefs.R deleted file mode 100644 index 21d1987..0000000 --- a/examples/plot-coefs.R +++ /dev/null @@ -1,31 +0,0 @@ -library(MASS) # For Boston -library(aglm) - -## Read data -xy <- Boston # xy is a data.frame to be processed. -colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. - -## Split data into train and test -n <- nrow(xy) # Sample size. -x <- xy[-ncol(xy)] -y <- xy$y - -## Set chas and rad variables as factors -x$chas <- as.factor(x$chas) -x$rad <- as.ordered(x$rad) - -## Plots coefs of a model trained with single lambda. 
-# model <- aglm(x, y, lambda=0.5) -# plot(model) - -## Plots coefs of a model trained with multiple lambda, specifying a single lambda to be plotted. -# model <- aglm(x, y) -# plot(model, s=0.5) - -## Plots coefs of cross-validated model -cv.model <- cv.aglm(x, y) -plot(cv.model, s=cv.model@lambda.min, resid=TRUE, add_rug=TRUE) - -## Plots coefs of specified variables only -# plot(cv.model, s=cv.model@lambda.min, vars=c("rm", "nox")) # use name -# plot(cv.model, s=cv.model@lambda.min, vars=c(1, 2)) # use indices diff --git a/examples/boston.R b/examples/predict-and-plot-1.R similarity index 52% rename from examples/boston.R rename to examples/predict-and-plot-1.R index 6d3f8f1..59ac0a4 100644 --- a/examples/boston.R +++ b/examples/predict-and-plot-1.R @@ -1,3 +1,6 @@ + +#################### using plot() and predict() #################### + library(MASS) # For Boston library(aglm) @@ -16,12 +19,24 @@ y <- train$y newx <- test[-ncol(xy)] y_true <- test$y -## Select the best lambda -lambda.min <- cv.aglm(x, y)@lambda.min -cat("lambda.min: ", lambda.min, "\n") +## With the result of aglm() +model <- aglm(x, y) +lambda <- 0.1 + +plot(model, s=lambda, resid=TRUE, add_rug=TRUE, + verbose=FALSE, layout=c(3, 3)) -## Predict y for newx -model <- aglm(x, y, lambda=lambda.min) -y_pred <- predict(model, newx=newx) -cat("RMSE: ", sqrt(mean((y_true - y_pred)^2)), "\n") +y_pred <- predict(model, newx=newx, s=lambda) plot(y_true, y_pred) + +## With the result of cv.aglm() +model <- cv.aglm(x, y) +lambda <- model@lambda.min + +plot(model, s=lambda, resid=TRUE, add_rug=TRUE, + verbose=FALSE, layout=c(3, 3)) + +y_pred <- predict(model, newx=newx, s=lambda) +plot(y_true, y_pred) + + diff --git a/man/aglm.Rd b/man/aglm.Rd index 23430b6..8cfa487 100644 --- a/man/aglm.Rd +++ b/man/aglm.Rd @@ -58,7 +58,7 @@ By default, \code{aglm} uses O-dummies as the representation of a quantitative v To avoid this, set \code{use_LVar=TRUE} to use a different type of auxiliary variable (named 
L-Variables), such that the linear combination form a piecewise linear functions and is continuous.} \item{extrapolation}{Used to control values of linear combination for quantitative variables, outside where the data exists. -Currently, this parameter is useful only when \code{use_LVar=TRUE}, where values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists. +By default, values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists. You can set \code{extrapolation="flat"} to get constant values outside the data instead.} \item{add_linear_columns}{By default, for quantitative variables, \code{aglm} expands them by adding dummies and the original columns, i.e. the linear effects, are remained in the resulting model. @@ -90,6 +90,119 @@ See \link{AccurateGLM-class} for more details. A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s). See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. } +\examples{ + +#################### Gaussian case #################### + +library(MASS) # For Boston +library(aglm) + +## Read data +xy <- Boston # xy is a data.frame to be processed. +colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/4)) # ID numbders for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[-ncol(xy)] +y <- train$y +newx <- test[-ncol(xy)] +y_true <- test$y + +## Fit the model +model <- aglm(x, y) # alpha=1 (the default value) + +## Predict for various alpha and lambda +lambda <- 0.1 +y_pred <- predict(model, newx=newx, s=lambda) +rmse <- sqrt(mean((y_true - y_pred)^2)) +cat(sprintf("RMSE for lambda=\%.2f: \%.5f \n\n", lambda, rmse)) + +lambda <- 1.0 +y_pred <- predict(model, newx=newx, s=lambda) +rmse <- sqrt(mean((y_true - y_pred)^2)) +cat(sprintf("RMSE for lambda=\%.2f: \%.5f \n\n", lambda, rmse)) + +alpha <- 0 +model <- aglm(x, y, alpha=alpha) + +lambda <- 0.1 +y_pred <- predict(model, newx=newx, s=lambda) +rmse <- sqrt(mean((y_true - y_pred)^2)) +cat(sprintf("RMSE for alpha=\%.2f and lambda=\%.2f: \%.5f \n\n", alpha, lambda, rmse)) + +#################### Binomial case #################### + +library(aglm) +library(faraway) + +## Read data +xy <- nes96 + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/5)) # ID numbders for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +y <- train$vote +newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] + +## Fit the model +model <- aglm(x, y, family="binomial") + +## Make the confusion matrix +lambda <- 0.1 +y_true <- test$vote +y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))] + +print(table(y_true, y_pred)) + +#################### use_LVar and extrapolation #################### + +library(MASS) # For Boston +library(aglm) + +## Randomly created train and test data +set.seed(2021) +sd <- 0.2 +x <- 2 * runif(1000) + 1 +f <- function(x){x^3 - 6 * x^2 + 13 * x} +y <- f(x) + rnorm(1000, sd = sd) +xy <- data.frame(x=x, y=y) +x_test <- seq(0.75, 3.25, length.out=101) +y_test <- f(x_test) + rnorm(101, sd=sd) +xy_test <- data.frame(x=x_test, y=y_test) + +## Plot +nbin.max <- 10 +models <- c(cv.aglm(x, y, use_LVar=FALSE, extrapolation="default", nbin.max=nbin.max), + cv.aglm(x, y, use_LVar=FALSE, extrapolation="flat", nbin.max=nbin.max), + cv.aglm(x, y, use_LVar=TRUE, extrapolation="default", nbin.max=nbin.max), + cv.aglm(x, y, use_LVar=TRUE, extrapolation="flat", nbin.max=nbin.max)) + +titles <- c("O-Dummies with extrapolation=\"default\"", + "O-Dummies with extrapolation=\"flat\"", + "L-Variables with extrapolation=\"default\"", + "L-Variables with extrapolation=\"flat\"") + +par.old <- par(mfrow=c(2, 2)) +for (i in 1:4) { + model <- models[[i]] + title <- titles[[i]] + + pred <- predict(model, newx=x_test, s=model@lambda.min, type="response") + + plot(x_test, y_test, pch=20, col="grey", main=title) + lines(x_test, f(x_test), lty="dashed", lwd=2) # the theoretical line + lines(x_test, pred, col="blue", lwd=3) # the smoothed line by the model +} +par(par.old) +} \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr diff --git a/man/cv.aglm.Rd b/man/cv.aglm.Rd index 3215d83..4361a91 100644 --- a/man/cv.aglm.Rd +++ b/man/cv.aglm.Rd @@ -72,6 +72,37 @@ See \link{AccurateGLM-class} for more details. A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. } +\examples{ + +#################### Cross-validation for lambda #################### + +library(aglm) +library(faraway) + +## Read data +xy <- nes96 + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/5)) # ID numbders for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. +x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +y <- train$vote +newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] + +## Fit the model +model <- cv.aglm(x, y, family="binomial") + +## Make the confusion matrix +lambda <- model@lambda.min +y_true <- test$vote +y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))] + +cat(sprintf("Confusion matrix for lambda=\%.5f:\n", lambda)) +print(table(y_true, y_pred)) +} \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr diff --git a/man/cva.aglm.Rd b/man/cva.aglm.Rd index 0c26df9..1cc861e 100644 --- a/man/cva.aglm.Rd +++ b/man/cva.aglm.Rd @@ -39,6 +39,42 @@ See \link{CVA_AccurateGLM-class} for more details. A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}. See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}. 
} +\examples{ + +#################### Cross-validation for alpha and lambda #################### + +library(aglm) +library(faraway) + +## Read data +xy <- nes96 + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/5)) # ID numbders for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. +x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +y <- train$vote +newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] + +## Fit the model +cva_result <- cva.aglm(x, y, family="binomial") + +alpha <- cva_result@alpha.min +lambda <- cva_result@lambda.min + +mod_idx <- cva_result@alpha.min.index +model <- cva_result@models_list[[mod_idx]] + +## Make the confusion matrix +y_true <- test$vote +y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))] + +cat(sprintf("Confusion matrix for alpha=\%.5f and lambda=\%.5f:\n", alpha, lambda)) +print(table(y_true, y_pred)) +} \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr diff --git a/man/plot.AccurateGLM.Rd b/man/plot.AccurateGLM.Rd index ea78f1b..d4a79aa 100644 --- a/man/plot.AccurateGLM.Rd +++ b/man/plot.AccurateGLM.Rd @@ -74,6 +74,50 @@ To achieve this, set it to a pair of integer, which indicating number of rows an } \description{ Plot contribution of each variable and residuals +} +\examples{ + +#################### using plot() and predict() #################### + +library(MASS) # For Boston +library(aglm) + +## Read data +xy <- Boston # xy is a data.frame to be processed. +colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. + +## Split data into train and test +n <- nrow(xy) # Sample size. 
+set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/4)) # ID numbders for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. +x <- train[-ncol(xy)] +y <- train$y +newx <- test[-ncol(xy)] +y_true <- test$y + +## With the result of aglm() +model <- aglm(x, y) +lambda <- 0.1 + +plot(model, s=lambda, resid=TRUE, add_rug=TRUE, + verbose=FALSE, layout=c(3, 3)) + +y_pred <- predict(model, newx=newx, s=lambda) +plot(y_true, y_pred) + +## With the result of cv.aglm() +model <- cv.aglm(x, y) +lambda <- model@lambda.min + +plot(model, s=lambda, resid=TRUE, add_rug=TRUE, + verbose=FALSE, layout=c(3, 3)) + +y_pred <- predict(model, newx=newx, s=lambda) +plot(y_true, y_pred) + + } \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) diff --git a/man/predict.AccurateGLM.Rd b/man/predict.AccurateGLM.Rd index 121e5f6..9bfa523 100644 --- a/man/predict.AccurateGLM.Rd +++ b/man/predict.AccurateGLM.Rd @@ -36,6 +36,50 @@ See \link{predict.glmnet} for more details. } \description{ Make predictions for new data +} +\examples{ + +#################### using plot() and predict() #################### + +library(MASS) # For Boston +library(aglm) + +## Read data +xy <- Boston # xy is a data.frame to be processed. +colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y. + +## Split data into train and test +n <- nrow(xy) # Sample size. +set.seed(2018) # For reproducibility. +test.id <- sample(n, round(n/4)) # ID numbders for test data. +test <- xy[test.id,] # test is the data.frame for testing. +train <- xy[-test.id,] # train is the data.frame for training. 
+x <- train[-ncol(xy)] +y <- train$y +newx <- test[-ncol(xy)] +y_true <- test$y + +## With the result of aglm() +model <- aglm(x, y) +lambda <- 0.1 + +plot(model, s=lambda, resid=TRUE, add_rug=TRUE, + verbose=FALSE, layout=c(3, 3)) + +y_pred <- predict(model, newx=newx, s=lambda) +plot(y_true, y_pred) + +## With the result of cv.aglm() +model <- cv.aglm(x, y) +lambda <- model@lambda.min + +plot(model, s=lambda, resid=TRUE, add_rug=TRUE, + verbose=FALSE, layout=c(3, 3)) + +y_pred <- predict(model, newx=newx, s=lambda) +plot(y_true, y_pred) + + } \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) From 17c3f1689320050ac675715f9448228bb635ffe8 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Mon, 31 May 2021 00:06:09 +0900 Subject: [PATCH 13/26] Fixed several points where `devtools::check()` pointed out. --- DESCRIPTION | 1 + R/residuals-aglm.R | 3 +-- man/residuals.AccurateGLM.Rd | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7c716ab..49f32ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ Suggests: knitr, rmarkdown, MASS, + faraway, mathjaxr RdMacros: mathjaxr diff --git a/R/residuals-aglm.R b/R/residuals-aglm.R index 72de3fc..df760e8 100644 --- a/R/residuals-aglm.R +++ b/R/residuals-aglm.R @@ -1,7 +1,5 @@ #' Get residuals of various types #' -#' \loadmathjax -#' #' @param object #' A model object obtained from `aglm()` or `cv.aglm()`. #' @@ -22,6 +20,7 @@ #' If not given, `weights` for fitting is used. 
#' #' @param type +#' \loadmathjax #' A string representing type of deviance: #' * `"working"` get working residual #' \mjsdeqn{r^W_i = (y_i - \mu_i) \left(\frac{\partial \eta}{\partial \mu}\right)_{\mu=\mu_i},} diff --git a/man/residuals.AccurateGLM.Rd b/man/residuals.AccurateGLM.Rd index bdcc33f..eee71cd 100644 --- a/man/residuals.AccurateGLM.Rd +++ b/man/residuals.AccurateGLM.Rd @@ -30,7 +30,8 @@ If not given, \code{offset} for fitting is used.} \item{weights}{Sample weights. If not given, \code{weights} for fitting is used.} -\item{type}{A string representing type of deviance: +\item{type}{\loadmathjax +A string representing type of deviance: \itemize{ \item \code{"working"} get working residual \mjsdeqn{r^W_i = (y_i - \mu_i) \left(\frac{\partial \eta}{\partial \mu}\right)_{\mu=\mu_i},} @@ -51,7 +52,7 @@ where \eqn{d_i} is the contribution to deviance. A numeric vector representing calculated residuals. } \description{ -\loadmathjax +Get residuals of various types } \author{ Kenji Kondo From 2c2ed84f041d574da54752c5ad29fe65e5a9aae9 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Mon, 31 May 2021 02:21:16 +0900 Subject: [PATCH 14/26] Small fixes. --- R/aglm.R | 10 ++++++---- man/aglm.Rd | 12 +++++++----- tests/testthat/test_aglm.R | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/R/aglm.R b/R/aglm.R index 47b05b0..6bd9223 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -7,10 +7,12 @@ #' A design matrix. #' Usually a `data.frame` object is expected, but a `matrix` object is fine if all columns are of a same class. #' Each column may have one of the following classes, and `aglm` will automatically determine how to handle it: -#' * `numeric`: interpreted as a quantitative variable. `aglm` performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies). +#' * `numeric`: interpreted as a quantitative variable. 
`aglm` performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies/L-variables). #' * `factor` (unordered) or `logical` : interpreted as a qualitative variable without order. `aglm` creates dummy variables suitable for unordered values (named U-dummies). #' * `ordered`: interpreted as a qualitative variable with order. `aglm` creates both O-dummies and U-dummies. +#' #' These dummy variables are added to `x` and form a larger matrix, which is used internally as an actual design matrix. +#' See \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper} for more details on O-dummies, U-dummies, and L-variables. #' #' If you need to change the default behavior, use the following options: `qualitative_vars_UD_only`, `qualitative_vars_both`, `qualitative_vars_OD_only`, and `quantitative_vars`. #' @@ -34,9 +36,9 @@ #' Same as `qualitative_vars_UD_only`, except that specified variables are considered as quantitative variables. #' #' @param use_LVar -#' Used to get predictions without jumps by `aglm`. -#' By default, `aglm` uses O-dummies as the representation of a quantitative variable, but the resulting linear combination form a step function and has a jump at each break of binning. -#' To avoid this, set `use_LVar=TRUE` to use a different type of auxiliary variable (named L-Variables), such that the linear combination form a piecewise linear functions and is continuous. +#' Set to use L-variables. +#' By default, `aglm` uses O-dummies as the representation of a quantitative variable. +#' If `use_LVar=TRUE`, L-variables are used instead. #' #' @param extrapolation #' Used to control values of linear combination for quantitative variables, outside where the data exists. 
diff --git a/man/aglm.Rd b/man/aglm.Rd index 8cfa487..06fa097 100644 --- a/man/aglm.Rd +++ b/man/aglm.Rd @@ -29,12 +29,14 @@ aglm( Usually a \code{data.frame} object is expected, but a \code{matrix} object is fine if all columns are of a same class. Each column may have one of the following classes, and \code{aglm} will automatically determine how to handle it: \itemize{ -\item \code{numeric}: interpreted as a quantitative variable. \code{aglm} performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies). +\item \code{numeric}: interpreted as a quantitative variable. \code{aglm} performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies/L-variables). \item \code{factor} (unordered) or \code{logical} : interpreted as a qualitative variable without order. \code{aglm} creates dummy variables suitable for unordered values (named U-dummies). \item \code{ordered}: interpreted as a qualitative variable with order. \code{aglm} creates both O-dummies and U-dummies. -These dummy variables are added to \code{x} and form a larger matrix, which is used internally as an actual design matrix. } +These dummy variables are added to \code{x} and form a larger matrix, which is used internally as an actual design matrix. +See \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper} for more details on O-dummies, U-dummies, and L-variables. + If you need to change the default behavior, use the following options: \code{qualitative_vars_UD_only}, \code{qualitative_vars_both}, \code{qualitative_vars_OD_only}, and \code{quantitative_vars}.} \item{y}{A response variable.} @@ -53,9 +55,9 @@ This parameter may have one of the following classes: \item{quantitative_vars}{Same as \code{qualitative_vars_UD_only}, except that specified variables are considered as quantitative variables.} -\item{use_LVar}{Used to get predictions without jumps by \code{aglm}. 
-By default, \code{aglm} uses O-dummies as the representation of a quantitative variable, but the resulting linear combination form a step function and has a jump at each break of binning. -To avoid this, set \code{use_LVar=TRUE} to use a different type of auxiliary variable (named L-Variables), such that the linear combination form a piecewise linear functions and is continuous.} +\item{use_LVar}{Set to use L-variables. +By default, \code{aglm} uses O-dummies as the representation of a quantitative variable. +If \code{use_LVar=TRUE}, L-variables are used instead.} \item{extrapolation}{Used to control values of linear combination for quantitative variables, outside where the data exists. By default, values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists. diff --git a/tests/testthat/test_aglm.R b/tests/testthat/test_aglm.R index cbacae8..772bccb 100644 --- a/tests/testthat/test_aglm.R +++ b/tests/testthat/test_aglm.R @@ -105,7 +105,7 @@ test_that("Check for logical features", { # Generates non-linear reponse y <- xor(x[, 1], x[, 2]) - res <- cv.aglm(x, y, family=gaussian(), keep=TRUE) + res <- cv.aglm(x, y, family="gaussian", keep=TRUE) expect_true("AccurateGLM" %in% class(res)) expect_true("glmnet" %in% class(res@backend_models[[1]])) @@ -131,7 +131,7 @@ test_that("Check for binomial family", { nobs <- 1000 x1 <- rnorm(nobs); x2 <- rnorm(nobs); x <- cbind(x1, x2) y <- 1 * ((atan(0.25 * x1 - 0.5 * x2) / pi + 0.5) > 0.5) - model <- aglm(x, y, family = binomial(), alpha = 1, lambda = 0.003) + model <- aglm(x, y, family="binomial", alpha=1, lambda=0.003) newx1 <- rnorm(100); newx2 <- rnorm(100); newx <- cbind(newx1, newx2) aglm.pred <- predict(model, newx) From 9279db2355216c14de07679443ab329040053c84 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Mon, 31 May 2021 02:31:14 +0900 Subject: [PATCH 15/26] rewrite readme and added news. 
--- NEWS.md | 5 +++++ README.md | 22 ++++++++++------------ 2 files changed, 15 insertions(+), 12 deletions(-) create mode 100644 NEWS.md diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..ad73d35 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,5 @@ +# aglm 0.4.0 +- Rewrites helps and refactor all the examples. + +# aglm 0.3.2 +- Fixed to use `R` 4.0 and `glmnet` 4.0. diff --git a/README.md b/README.md index e325075..3b42ddc 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,18 @@ -### What is it? -A handy tool for actuarial modeling, which is designed to achieve both accuracy and accountability. +# What is it? +Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables. +More details can be found in [our paper]{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}. -### Installation +# Installation To install the latest version from `github` : ```r install.packages("devtools") devtools::install_github("kkondo1981/aglm") ``` -*Note:* -Now `aglm` requires the newest version of `R` and `glmnet`, so please update versions of your local packages before installation. +# Usage +See the help as below after installing `aglm`. -### Usage -See https://github.com/kkondo1981/aglm/tree/master/examples - -### Release note - -#### version 0.3.2 -- Fixed to use `R` 4.0 and `glmnet` 4.0. +```r +library(aglm) +?"aglm-package" +``` From 0acddd15764c95a3e80d9b8042d1d9e178de98a0 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Mon, 31 May 2021 02:32:38 +0900 Subject: [PATCH 16/26] bug fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b42ddc..c430663 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # What is it? 
Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables. -More details can be found in [our paper]{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}. +More details can be found in [our paper](https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1). # Installation To install the latest version from `github` : From 09ab64a8a175fd3598d1c58b0480449dbe16590a Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 14:24:25 +0900 Subject: [PATCH 17/26] Small fix --- NEWS.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index ad73d35..1f063f3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ # aglm 0.4.0 -- Rewrites helps and refactor all the examples. +- Rewirtes all documents and examples. # aglm 0.3.2 - Fixed to use `R` 4.0 and `glmnet` 4.0. diff --git a/README.md b/README.md index c430663..2da3014 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# What is it? +# What is this? Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables. More details can be found in [our paper](https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1). 
From 7ddde84bf08cc815bd3c181395ee8bb09fda6ac0 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 16:01:03 +0900 Subject: [PATCH 18/26] Modified Imports and Suggests field to pass `R CMD check` --- DESCRIPTION | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 49f32ea..ac34d97 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,13 +15,13 @@ Depends: Imports: glmnet (>= 4.0.2), assertthat, - methods + methods, + mathjaxr Suggests: testthat, knitr, rmarkdown, MASS, - faraway, - mathjaxr + faraway RdMacros: mathjaxr From 383391e9bdda0109c684de525d98108668493eb3 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 16:10:40 +0900 Subject: [PATCH 19/26] Passed spell-check --- NEWS.md | 2 +- R/aglm-package.R | 2 +- inst/WORDLIST | 17 +++++++++++++++++ man/aglm-package.Rd | 2 +- 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 inst/WORDLIST diff --git a/NEWS.md b/NEWS.md index 1f063f3..0533312 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ # aglm 0.4.0 -- Rewirtes all documents and examples. +- Updated all documents and examples. # aglm 0.3.2 - Fixed to use `R` 4.0 and `glmnet` 4.0. diff --git a/R/aglm-package.R b/R/aglm-package.R index f676c9c..aed6732 100644 --- a/R/aglm-package.R +++ b/R/aglm-package.R @@ -26,7 +26,7 @@ #' and \eqn{\lambda} determines the strength of the regularization. #' #' Searching hyper-parameters \eqn{\alpha} and \eqn{\lambda} is often useful to get better results, but usually time-consuming. -#' That's why the `aglm` package provides three fitting functions with different strategies for spcifying hyper-parameters as follows: +#' That's why the `aglm` package provides three fitting functions with different strategies for specifying hyper-parameters as follows: #' * \link{aglm}: A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s). 
#' * \link{cv.aglm}: A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. #' * \link{cva.aglm}: A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}. diff --git a/inst/WORDLIST b/inst/WORDLIST new file mode 100644 index 0000000..6d49cad --- /dev/null +++ b/inst/WORDLIST @@ -0,0 +1,17 @@ +AccurateGLM +Banno +coef +cv +cva +CVA +Fujita +glmnet +Hirokazu +Iwasawa +Kondo +multicollinearity +Suguru +Takahashi +Tanaka +th +Toyoto diff --git a/man/aglm-package.Rd b/man/aglm-package.Rd index c24617a..ddf0570 100644 --- a/man/aglm-package.Rd +++ b/man/aglm-package.Rd @@ -33,7 +33,7 @@ where \eqn{\beta_jk} is the k-th coefficient of auxiliary variables for the j-th and \eqn{\lambda} determines the strength of the regularization. Searching hyper-parameters \eqn{\alpha} and \eqn{\lambda} is often useful to get better results, but usually time-consuming. -That's why the \code{aglm} package provides three fitting functions with different strategies for spcifying hyper-parameters as follows: +That's why the \code{aglm} package provides three fitting functions with different strategies for specifying hyper-parameters as follows: \itemize{ \item \link{aglm}: A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s). \item \link{cv.aglm}: A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}. From 8fb673c1de6bcd6e8205a1d391919ca0718fd5e8 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 16:19:30 +0900 Subject: [PATCH 20/26] Added comments on Hachemeister prize. --- README.md | 3 +++ inst/WORDLIST | 1 + 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 2da3014..57ce6dc 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables. 
More details can be found in [our paper](https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1). +2021/6/6: +Our paper won the [Charles A. Hachemeister Prize](https://www.casact.org/about/awards-prizes-scholarships/charles-hachemeister-prize#:~:text=This%20prize%20was%20established%20in,between%20the%20CAS%20and%20ASTIN.). + # Installation To install the latest version from `github` : ```r diff --git a/inst/WORDLIST b/inst/WORDLIST index 6d49cad..87ff8f6 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -15,3 +15,4 @@ Takahashi Tanaka th Toyoto +Hachemeister From 904087460e917465e75079e6dc72a7207270649c Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 16:57:44 +0900 Subject: [PATCH 21/26] Added \dontrun tags to pass R CMD check. --- examples/cv-aglm-1.R | 5 +++++ examples/cva-aglm-1.R | 5 +++++ man/cv.aglm.Rd | 5 +++++ man/cva.aglm.Rd | 5 +++++ 4 files changed, 20 insertions(+) diff --git a/examples/cv-aglm-1.R b/examples/cv-aglm-1.R index b154d1c..6e78faf 100644 --- a/examples/cv-aglm-1.R +++ b/examples/cv-aglm-1.R @@ -17,6 +17,9 @@ x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "e y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +# NOTE: Codes bellow will take considerable time, so run it when you have time.
+\dontrun{ + ## Fit the model model <- cv.aglm(x, y, family="binomial") @@ -27,3 +30,5 @@ y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class") cat(sprintf("Confusion matrix for lambda=%.5f:\n", lambda)) print(table(y_true, y_pred)) + +} diff --git a/examples/cva-aglm-1.R b/examples/cva-aglm-1.R index 9e6158e..a498d39 100644 --- a/examples/cva-aglm-1.R +++ b/examples/cva-aglm-1.R @@ -17,6 +17,9 @@ x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "e y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +# NOTE: Codes bellow will take considerable time, so run it when you have time. +\dontrun{ + ## Fit the model cva_result <- cva.aglm(x, y, family="binomial") @@ -32,3 +35,5 @@ y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class") cat(sprintf("Confusion matrix for alpha=%.5f and lambda=%.5f:\n", alpha, lambda)) print(table(y_true, y_pred)) + +} diff --git a/man/cv.aglm.Rd b/man/cv.aglm.Rd index 4361a91..ef6dd83 100644 --- a/man/cv.aglm.Rd +++ b/man/cv.aglm.Rd @@ -92,6 +92,9 @@ x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "e y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +# NOTE: Codes bellow will take considerable time, so run it when you have time. +\dontrun{ + ## Fit the model model <- cv.aglm(x, y, family="binomial") @@ -102,6 +105,8 @@ y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class") cat(sprintf("Confusion matrix for lambda=\%.5f:\n", lambda)) print(table(y_true, y_pred)) + +} } \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) diff --git a/man/cva.aglm.Rd b/man/cva.aglm.Rd index 1cc861e..710bb21 100644 --- a/man/cva.aglm.Rd +++ b/man/cva.aglm.Rd @@ -59,6 +59,9 @@ x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "e y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] +# NOTE: Codes bellow will take considerable time, so run it when you have time. +\dontrun{ + ## Fit the model cva_result <- cva.aglm(x, y, family="binomial") @@ -74,6 +77,8 @@ y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class") cat(sprintf("Confusion matrix for alpha=\%.5f and lambda=\%.5f:\n", alpha, lambda)) print(table(y_true, y_pred)) + +} } \references{ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) From 9b67f3e592a0696b5c711d02b5be7458d660cc40 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 17:19:34 +0900 Subject: [PATCH 22/26] Changed not to use devtools to create a pdf manual when build. --- aglm.Rproj | 1 - 1 file changed, 1 deletion(-) diff --git a/aglm.Rproj b/aglm.Rproj index 526ad2d..e9a7f5e 100644 --- a/aglm.Rproj +++ b/aglm.Rproj @@ -16,6 +16,5 @@ AutoAppendNewline: Yes StripTrailingWhitespace: Yes BuildType: Package -PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace,vignette From ff0d9d1c73e3c97aad42296194c5e0802d6108d1 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 18:27:02 +0900 Subject: [PATCH 23/26] For package check. 
--- .Rbuildignore | 1 + DESCRIPTION | 2 +- aglm.Rproj | 1 + cran-comments.md | 31 +++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 cran-comments.md diff --git a/.Rbuildignore b/.Rbuildignore index d05fc86..791460b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,5 +4,6 @@ ^aglm.*\.tar\.gz$ ^aglm.*\.tgz$ ^LICENSE\.md$ +^cran-comments\.md ^\.github$ ^examples/* diff --git a/DESCRIPTION b/DESCRIPTION index ac34d97..2244ad1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: aglm Type: Package Title: Accurate Generalized Linear Model -Version: 0.3.2 +Version: 0.4.0 Author: Kenji Kondo, Kazuhisa Takahashi, others Maintainer: Kenji Kondo Description: A handy tool for actuarial modeling based on GLM. diff --git a/aglm.Rproj b/aglm.Rproj index e9a7f5e..4277d90 100644 --- a/aglm.Rproj +++ b/aglm.Rproj @@ -17,4 +17,5 @@ StripTrailingWhitespace: Yes BuildType: Package PackageInstallArgs: --no-multiarch --with-keep.source +PackageCheckArgs: --as-cran PackageRoxygenize: rd,collate,namespace,vignette diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..1ee272e --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,31 @@ +# Test environments +- local + - x86_64-w64-mingw32 +- r-hub + - ubuntu-gcc-release + - fedora-clang-devel +- win-builder (for R-release) + - x86_64-w64-mingw32 + + +# Results of `R CMD check --as-cran` + +Results are identical in all the environments. + +``` +Status: 1 NOTE +``` + +## Note #1 +``` +* checking CRAN incoming feasibility ... NOTE +Maintainer: 'Kenji Kondo ' + +New submission +``` + +This note just says this is the first submission of me, and there is actually no problem. + + +# `revdepcheck` results +There are currently no downstream dependencies for this package. From e80000a03d29dcfdcefb90790c6e7de35a5e09fc Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Sun, 6 Jun 2021 18:37:14 +0900 Subject: [PATCH 24/26] Submitted to CRAN.
--- .Rbuildignore | 1 + CRAN-RELEASE | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 CRAN-RELEASE diff --git a/.Rbuildignore b/.Rbuildignore index 791460b..87dba17 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^cran-comments\.md ^\.github$ ^examples/* +^CRAN-RELEASE$ diff --git a/CRAN-RELEASE b/CRAN-RELEASE new file mode 100644 index 0000000..2a18016 --- /dev/null +++ b/CRAN-RELEASE @@ -0,0 +1,2 @@ +This package was submitted to CRAN on 2021-06-06. +Once it is accepted, delete this file and tag the release (commit ff0d9d1). From 97c24a7f04fcec1d879d80fceb1440d91c70dc45 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Wed, 9 Jun 2021 10:49:37 +0900 Subject: [PATCH 25/26] To resubmit to CRAN. --- DESCRIPTION | 11 ++- R/aglm-package.R | 12 ++-- R/aglm.R | 2 +- R/coef-aglm.R | 11 +++ R/cv-aglm.R | 2 +- R/cva-aglm.R | 2 +- R/deviance-aglm.R | 3 + R/get-dummies.R | 4 +- R/plot-aglm.R | 15 ++-- R/predict-aglm.R | 2 +- R/print-aglm.R | 3 + cran-comments.md | 135 +++++++++++++++++++++++++++++++++++ examples/cv-aglm-1.R | 2 +- examples/cva-aglm-1.R | 2 +- man/aglm-package.Rd | 12 ++-- man/aglm.Rd | 2 +- man/coef.AccurateGLM.Rd | 13 ++++ man/cv.aglm.Rd | 4 +- man/cva.aglm.Rd | 4 +- man/deviance.AccurateGLM.Rd | 3 + man/getLVarMatForOneVec.Rd | 2 +- man/getODummyMatForOneVec.Rd | 2 +- man/plot.AccurateGLM.Rd | 5 +- man/predict.AccurateGLM.Rd | 2 +- man/print.AccurateGLM.Rd | 3 + 25 files changed, 220 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2244ad1..f85bc43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,9 +2,14 @@ Package: aglm Type: Package Title: Accurate Generalized Linear Model Version: 0.4.0 -Author: Kenji Kondo, Kazuhisa Takahashi, others -Maintainer: Kenji Kondo -Description: A handy tool for actuarial modeling based on GLM. 
+Authors@R: c( + person("Kenji", "Kondo", role=c("aut", "cre", "cph"), email="kkondo.odnokk@gmail.com"), + person("Kazuhisa", "Takahashi", role=c("ctb")), + person("Hikari", "Banno", role=c("ctb")) + ) +Description: Provides functions to fit Accurate Generalized Linear Model (AGLM) models, visualize them, and predict for new data. AGLM is defined as a regularized GLM which applies a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables. For more information on AGLM, see Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa (2020) . +URL: https://github.com/kkondo1981/aglm +BugReports: https://github.com/kkondo1981/aglm/issues License: GPL-2 Encoding: UTF-8 Language: en-US diff --git a/R/aglm-package.R b/R/aglm-package.R index aed6732..e693b88 100644 --- a/R/aglm-package.R +++ b/R/aglm-package.R @@ -1,9 +1,11 @@ #' aglm: Accurate Generalized Linear Model #' -#' Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying -#' a sort of feature transformations using a discretization of numerical features and specific -#' coding methodologies of dummy variables. More details can be found in -#' \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper}. +#' Provides functions to fit Accurate Generalized Linear Model (AGLM) models, +#' visualize them, and predict for new data. AGLM is defined as a regularized GLM +#' which applies a sort of feature transformations using a discretization of numerical +#' features and specific coding methodologies of dummy variables. +#' For more information on AGLM, see +#' \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa (2020)}. 
#' #' The collection of functions provided by the `aglm` package has almost the same structure as the famous `glmnet` package, #' so users familiar with the `glmnet` package will be able to handle it easily. @@ -66,7 +68,7 @@ #' #' @author #' * Kenji Kondo, -#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) #' #' #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) diff --git a/R/aglm.R b/R/aglm.R index 6bd9223..004c53c 100644 --- a/R/aglm.R +++ b/R/aglm.R @@ -87,7 +87,7 @@ #' #' @author #' * Kenji Kondo, -#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) #' #' #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) diff --git a/R/coef-aglm.R b/R/coef-aglm.R index f705e44..7f99319 100644 --- a/R/coef-aglm.R +++ b/R/coef-aglm.R @@ -19,6 +19,17 @@ #' @param ... #' Other arguments are passed directly to `coef.glmnet()`. #' +#' @return +#' If `index` or `name` is given, the function returns a list with the one or combination +#' of the following fields, consisting of coefficients related to the specified variable. +#' * `coef.linear`: A coefficient of the linear term. (If any) +#' * `coef.OD`: Coefficients of O-dummies. (If any) +#' * `coef.UD`: Coefficients of U-dummies. (If any) +#' * `coef.LV`: Coefficients of L-variables. (If any) +#' +#' If both `index` and `name` are not given, the function returns entire coefficients +#' corresponding to the internal designed matrix. 
+#' #' #' @author #' Kenji Kondo diff --git a/R/cv-aglm.R b/R/cv-aglm.R index c51b5ab..626acbc 100644 --- a/R/cv-aglm.R +++ b/R/cv-aglm.R @@ -69,7 +69,7 @@ #' #' @author #' * Kenji Kondo, -#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) #' #' #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) diff --git a/R/cva-aglm.R b/R/cva-aglm.R index 10148df..5a0759a 100644 --- a/R/cva-aglm.R +++ b/R/cva-aglm.R @@ -36,7 +36,7 @@ #' #' @author #' * Kenji Kondo, -#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) #' #' #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) diff --git a/R/deviance-aglm.R b/R/deviance-aglm.R index 307d0ff..9b3ab92 100644 --- a/R/deviance-aglm.R +++ b/R/deviance-aglm.R @@ -6,6 +6,9 @@ #' @param ... #' Other arguments are passed directly to `deviance.glmnet()`. #' +#' @return +#' The value of deviance extracted from the object `object`. +#' #' #' @author #' Kenji Kondo diff --git a/R/get-dummies.R b/R/get-dummies.R index 8d0f0d9..93a5cf4 100644 --- a/R/get-dummies.R +++ b/R/get-dummies.R @@ -47,7 +47,7 @@ getUDummyMatForOneVec <- function(x_vec, levels=NULL, drop_last=TRUE, only_info= #' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned. #' @param dummy_type Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated). #' -#' @return a list with the following fields: +#' @return A list with the following fields: #' * `breaks`: Same as input #' * `dummy_mat`: The created O-dummy matrix (only if `only_info=FALSE`). 
#' @@ -96,7 +96,7 @@ getODummyMatForOneVec <- function(x_vec, breaks=NULL, nbin.max=100, only_info=FA #' @param nbin.max The maximum number of bins (used only if `breaks=NULL`). #' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned. #' -#' @return a list with the following fields: +#' @return A list with the following fields: #' * `breaks`: Same as input #' * `dummy_mat`: The created L-variable matrix (only if `only_info=FALSE`). #' diff --git a/R/plot-aglm.R b/R/plot-aglm.R index 0dd3214..8f4674f 100644 --- a/R/plot-aglm.R +++ b/R/plot-aglm.R @@ -56,13 +56,16 @@ #' @param ... #' Other arguments are currently not used and just discarded. #' +#' @return +#' No return value, called for side effects. +#' #' #' @example examples/predict-and-plot-1.R #' #' #' @author #' * Kenji Kondo, -#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) #' #' #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) @@ -157,13 +160,15 @@ plot.AccurateGLM <- function(x, ## set par if (!only_plot) { - old.par <- par() + old.par <- par(no.readonly=TRUE) + on.exit(par(old.par), add=TRUE) par(oma=c(0, 0, 2, 0)) if (length(inds) == 1) layout <- c(1,1) par(mfrow=layout) } ask.old <- devAskNewPage() + on.exit(devAskNewPage(ask.old), add=TRUE) devAskNewPage(FALSE) ## Plotting @@ -346,10 +351,4 @@ plot.AccurateGLM <- function(x, first <- FALSE } } - devAskNewPage(ask.old) - - if (!only_plot) { - if (!is.null(old.par$oma)) par(oma=old.par$oma) - if (!is.null(old.par$mfrow)) par(mfrow=old.par$mfrow) - } } diff --git a/R/predict-aglm.R b/R/predict-aglm.R index f7ebe29..fb69200 100644 --- a/R/predict-aglm.R +++ b/R/predict-aglm.R @@ -32,7 +32,7 @@ #' #' @author #' * Kenji Kondo, -#' * Kazuhisa Takahashi and Banno (worked on L-Variable related features) +#' * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) #' #' #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) diff --git a/R/print-aglm.R b/R/print-aglm.R index 52d0347..e532264 100644 --- a/R/print-aglm.R +++ b/R/print-aglm.R @@ -9,6 +9,9 @@ #' @param ... #' Other arguments are passed directly to `print.glmnet()`. #' +#' @return +#' No return value, called for side effects. +#' #' #' @author #' Kenji Kondo diff --git a/cran-comments.md b/cran-comments.md index 1ee272e..7aef09c 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -29,3 +29,138 @@ This note just says this is the first submission of me, and there is actually no # `revdepcheck` results There are currently no downstream dependencies for this package. + + +# Resubmission + +I got some instructions from CRAN and fixed them as below. + +## Instruction #1 + +``` +The Description field is intended to be a (one paragraph) description +of what the package does and why it may be useful. +Please add more details about the package functionality and implemented +methods in your Description text. 
+ +If there are references describing the methods in your package, please +add these in the description field of your DESCRIPTION file in the form +authors (year) +authors (year) +authors (year, ISBN:...) +or if those are not available: +with no space after 'doi:', 'arXiv:', 'https:' and angle brackets for +auto-linking. +(If you want to add a title as well please put it in quotes: "Title") +``` + +I fixed the description field of my DESCRIPTION file. +For consistency, I also fixed the beginning of 'aglm-package.Rd'. + +I got a new note like below as a result, and just ignore it because these words are names of us and our method. +``` +Possibly mis-spelled words in DESCRIPTION: + AGLM (10:75, 10:131, 10:339) + Fujita (10:356) + Hirokazu (10:395) + Iwasawa (10:404) + Kenji (10:379) + Kondo (10:385) + Suguru (10:349) + Tanaka (10:371) + Toyoto (10:364) +``` + +Additionally, I noticed 'URL' and 'BugReports' fields are useful and added them as below: +``` +URL: https://github.com/kkondo1981/aglm +BugReports: https://github.com/kkondo1981/aglm/issues +``` + + +## Instruction #2 +``` +Please rather use the Authors@R field and declare Maintainer, Authors +and Contributors with their appropriate roles with person() calls. +e.g. something like: +Authors@R: c(person("Alice", "Developer", role = c("aut", "cre","cph"), + email = "alice.developer@some.domain.net"), + person("Bob", "Dev", role = "aut") ) + +Please always add all authors, contributors and copyright holders in the +Authors@R field with the appropriate roles, instead of writing "others". 
+``` + +I removed the 'Author' and 'Maintainer' fields, added an 'Authors@R' field, and wrote down all the contributors instead of writing 'others' as below: +``` +Authors@R: c( + person("Kenji", "Kondo", role=c("aut", "cre", "cph"), email="kkondo.odnokk@gmail.com"), + person("Kazuhisa", "Takahashi", role=c("ctb")), + person("Hikari", "Banno", role=c("ctb")) + ) +``` + +I also fixed \\author tags in the following files for consistency: +- aglm-package.Rd +- aglm.Rd +- cv.aglm.Rd +- cva.aglm.Rd +- plot.AccurateGLM.Rd +- predict.AccurateGLM.Rd + + +## Instruction #3 +``` +Please add \value to .Rd files regarding exported methods and explain +the functions results in the documentation. Please write about the +structure of the output (class) and also what the output means. (If a +function does not return a value, please document that too, e.g. +\value{No return value, called for side effects} or similar) +Missing Rd-tags: + coef.AccurateGLM.Rd: \value + deviance.AccurateGLM.Rd: \value + plot.AccurateGLM.Rd: \value + print.AccurateGLM.Rd: \value +``` + +I added \\value tags to the above-mentioned Rd files. +I also checked all the explanations in \\value tags, and believe that they are enough. + + +## Instruction #4 +``` +\dontrun{} should only be used if the example really cannot be executed +(e.g. because of missing additional software, missing API keys, ...) by +the user. That's why wrapping examples in \dontrun{} adds the comment +("# Not run:") as a warning for the user. +Does not seem necessary. +Please unwrap the examples if they are executable in < 5 sec, or replace +\dontrun{} with \donttest{}. +``` + +I replaced \\dontrun{} with \\donttest{} in examples of the following functions because they take more than 5 sec: +- `cv.aglm()` +- `cva.aglm()` + + +## Instruction #5 +``` +Please make sure that you do not change the user's options, par or +working directory.
If you really have to do so within functions, please +ensure with an *immediate* call of on.exit() that the settings are reset +when the function is exited. e.g.: +... +oldpar <- par(no.readonly = TRUE) # code line i +on.exit(par(oldpar)) # code line i + 1 +... +par(mfrow=c(2,2)) # somewhere after +... +e.g.: plot-aglm.R +If you're not familiar with the function, please check ?on.exit. This +function makes it possible to restore options before exiting a function +even if the function breaks. Therefore it needs to be called immediately +after the option change within a function. +``` + +I added two immediate calls of `on.exit()` in 'plot-aglm.R' to restore `par` and `devAskNewPage`. +Because `on.exit()` may be called twice in one function, I set `add=TRUE` in each call to `on.exit()`. diff --git a/examples/cv-aglm-1.R b/examples/cv-aglm-1.R index 6e78faf..8d17e58 100644 --- a/examples/cv-aglm-1.R +++ b/examples/cv-aglm-1.R @@ -18,7 +18,7 @@ y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] # NOTE: Codes bellow will take considerable time, so run it when you have time. -\dontrun{ +\donttest{ ## Fit the model model <- cv.aglm(x, y, family="binomial") diff --git a/examples/cva-aglm-1.R b/examples/cva-aglm-1.R index a498d39..90dd46c 100644 --- a/examples/cva-aglm-1.R +++ b/examples/cva-aglm-1.R @@ -18,7 +18,7 @@ y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] # NOTE: Codes bellow will take considerable time, so run it when you have time.
-\dontrun{ +\donttest{ ## Fit the model cva_result <- cva.aglm(x, y, family="binomial") diff --git a/man/aglm-package.Rd b/man/aglm-package.Rd index ddf0570..27491f2 100644 --- a/man/aglm-package.Rd +++ b/man/aglm-package.Rd @@ -5,10 +5,12 @@ \alias{aglm-package} \title{aglm: Accurate Generalized Linear Model} \description{ -Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying -a sort of feature transformations using a discretization of numerical features and specific -coding methodologies of dummy variables. More details can be found in -\href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper}. +Provides functions to fit Accurate Generalized Linear Model (AGLM) models, +visualize them, and predict for new data. AGLM is defined as a regularized GLM +which applies a sort of feature transformations using a discretization of numerical +features and specific coding methodologies of dummy variables. +For more information on AGLM, see +\href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa (2020)}. } \details{ The collection of functions provided by the \code{aglm} package has almost the same structure as the famous \code{glmnet} package, @@ -95,6 +97,6 @@ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) \author{ \itemize{ \item Kenji Kondo, -\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +\item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) } } diff --git a/man/aglm.Rd b/man/aglm.Rd index 06fa097..0bcca62 100644 --- a/man/aglm.Rd +++ b/man/aglm.Rd @@ -214,6 +214,6 @@ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) \author{ \itemize{ \item Kenji Kondo, -\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +\item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) } } diff --git a/man/coef.AccurateGLM.Rd b/man/coef.AccurateGLM.Rd index d006ac9..49f8371 100644 --- a/man/coef.AccurateGLM.Rd +++ b/man/coef.AccurateGLM.Rd @@ -20,6 +20,19 @@ Note that if both \code{index} and \code{name} are set, \code{index} is discarde \item{...}{Other arguments are passed directly to \code{coef.glmnet()}.} } +\value{ +If \code{index} or \code{name} is given, the function returns a list with the one or combination +of the following fields, consisting of coefficients related to the specified variable. +\itemize{ +\item \code{coef.linear}: A coefficient of the linear term. (If any) +\item \code{coef.OD}: Coefficients of O-dummies. (If any) +\item \code{coef.UD}: Coefficients of U-dummies. (If any) +\item \code{coef.LV}: Coefficients of L-variables. (If any) +} + +If both \code{index} and \code{name} are not given, the function returns entire coefficients +corresponding to the internal designed matrix. +} \description{ Get coefficients } diff --git a/man/cv.aglm.Rd b/man/cv.aglm.Rd index ef6dd83..deaf5d3 100644 --- a/man/cv.aglm.Rd +++ b/man/cv.aglm.Rd @@ -93,7 +93,7 @@ y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] # NOTE: Codes bellow will take considerable time, so run it when you have time. -\dontrun{ +\donttest{ ## Fit the model model <- cv.aglm(x, y, family="binomial") @@ -117,6 +117,6 @@ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) \author{ \itemize{ \item Kenji Kondo, -\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +\item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) } } diff --git a/man/cva.aglm.Rd b/man/cva.aglm.Rd index 710bb21..8063506 100644 --- a/man/cva.aglm.Rd +++ b/man/cva.aglm.Rd @@ -60,7 +60,7 @@ y <- train$vote newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")] # NOTE: Codes bellow will take considerable time, so run it when you have time. -\dontrun{ +\donttest{ ## Fit the model cva_result <- cva.aglm(x, y, family="binomial") @@ -89,6 +89,6 @@ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) \author{ \itemize{ \item Kenji Kondo, -\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +\item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) } } diff --git a/man/deviance.AccurateGLM.Rd b/man/deviance.AccurateGLM.Rd index 7bada40..4ee50c2 100644 --- a/man/deviance.AccurateGLM.Rd +++ b/man/deviance.AccurateGLM.Rd @@ -11,6 +11,9 @@ \item{...}{Other arguments are passed directly to \code{deviance.glmnet()}.} } +\value{ +The value of deviance extracted from the object \code{object}. +} \description{ Get deviance } diff --git a/man/getLVarMatForOneVec.Rd b/man/getLVarMatForOneVec.Rd index 63f3dd2..4fdbc4b 100644 --- a/man/getLVarMatForOneVec.Rd +++ b/man/getLVarMatForOneVec.Rd @@ -16,7 +16,7 @@ getLVarMatForOneVec(x_vec, breaks = NULL, nbin.max = 100, only_info = FALSE) \item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.} } \value{ -a list with the following fields: +A list with the following fields: \itemize{ \item \code{breaks}: Same as input \item \code{dummy_mat}: The created L-variable matrix (only if \code{only_info=FALSE}). 
diff --git a/man/getODummyMatForOneVec.Rd b/man/getODummyMatForOneVec.Rd index c37df67..483dda0 100644 --- a/man/getODummyMatForOneVec.Rd +++ b/man/getODummyMatForOneVec.Rd @@ -24,7 +24,7 @@ getODummyMatForOneVec( \item{dummy_type}{Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).} } \value{ -a list with the following fields: +A list with the following fields: \itemize{ \item \code{breaks}: Same as input \item \code{dummy_mat}: The created O-dummy matrix (only if \code{only_info=FALSE}). diff --git a/man/plot.AccurateGLM.Rd b/man/plot.AccurateGLM.Rd index d4a79aa..6e357b6 100644 --- a/man/plot.AccurateGLM.Rd +++ b/man/plot.AccurateGLM.Rd @@ -72,6 +72,9 @@ To achieve this, set it to a pair of integer, which indicating number of rows an \item{...}{Other arguments are currently not used and just discarded.} } +\value{ +No return value, called for side effects. +} \description{ Plot contribution of each variable and residuals } @@ -128,6 +131,6 @@ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020) \author{ \itemize{ \item Kenji Kondo, -\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +\item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) } } diff --git a/man/predict.AccurateGLM.Rd b/man/predict.AccurateGLM.Rd index 9bfa523..1267afa 100644 --- a/man/predict.AccurateGLM.Rd +++ b/man/predict.AccurateGLM.Rd @@ -90,6 +90,6 @@ Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. 
(2020) \author{ \itemize{ \item Kenji Kondo, -\item Kazuhisa Takahashi and Banno (worked on L-Variable related features) +\item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features) } } diff --git a/man/print.AccurateGLM.Rd b/man/print.AccurateGLM.Rd index fd7d096..20da039 100644 --- a/man/print.AccurateGLM.Rd +++ b/man/print.AccurateGLM.Rd @@ -13,6 +13,9 @@ \item{...}{Other arguments are passed directly to \code{print.glmnet()}.} } +\value{ +No return value, called for side effects. +} \description{ Display textual information of the model } From 55e0be21a394cdb95475c566d13beb25a98efe29 Mon Sep 17 00:00:00 2001 From: Kenji Kondo Date: Wed, 9 Jun 2021 15:59:57 +0900 Subject: [PATCH 26/26] CRAN-RELEASE is automatically updated. --- CRAN-RELEASE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CRAN-RELEASE b/CRAN-RELEASE index 2a18016..2adb330 100644 --- a/CRAN-RELEASE +++ b/CRAN-RELEASE @@ -1,2 +1,2 @@ -This package was submitted to CRAN on 2021-06-06. -Once it is accepted, delete this file and tag the release (commit ff0d9d1). +This package was submitted to CRAN on 2021-06-09. +Once it is accepted, delete this file and tag the release (commit 97c24a7).