Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

frollmax1: frollmax, frollmax adaptive, left adaptive support #5889

Merged
merged 21 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ S3method(cube, data.table)
S3method(rollup, data.table)
export(frollmean)
export(frollsum)
export(frollmax)
export(frollapply)
export(nafill)
export(setnafill)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
# 2:
```

2. New window function `frollmax` computes the rolling maximum. Request came from @gpierard who needs left-aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). Adaptive rolling functions did not have support for `align="left"`, therefore we added this feature as well for all adaptive rolling functions. We measure adaptive `frollmax` to be up to 50 times faster than the next fastest solution using `max` and grouping `by=.EACHI`.

## NOTES

1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1.
Expand Down
26 changes: 24 additions & 2 deletions R/froll.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,25 @@ froll = function(fun, x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "
stopifnot(!missing(fun), is.character(fun), length(fun)==1L, !is.na(fun))
algo = match.arg(algo)
align = match.arg(align)
leftadaptive = isTRUE(adaptive) && align=="left" ## support for left added in #5441
if (leftadaptive) {
rev2 = function(x) if (is.list(x)) sapply(x, rev, simplify=FALSE) else rev(x)
verbose = getOption("datatable.verbose")
if (verbose)
catf("froll: adaptive=TRUE && align='left' pre-processing for align='right'\n")
## TODO test atomic x but list of lists n (multiple windows)!
x = rev2(x)
n = rev2(n)
align = "right"
}
ans = .Call(CfrollfunR, fun, x, n, fill, algo, align, na.rm, hasNA, adaptive)
ans
if (!leftadaptive)
ans
else {
if (verbose)
catf("froll: adaptive=TRUE && align='left' post-processing from align='right'\n")
rev2(ans)
}
}

frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) {
Expand All @@ -12,9 +29,14 @@ frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "l
## Rolling sum: thin wrapper that forwards every argument, by name and
## unchanged, to froll() with fun="sum".
frollsum = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) {
  froll(
    fun="sum",
    x=x, n=n, fill=fill,
    algo=algo, align=align,
    na.rm=na.rm, hasNA=hasNA, adaptive=adaptive
  )
}
frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center")) {
## Rolling maximum: thin wrapper that forwards every argument, by name and
## unchanged, to froll() with fun="max".
frollmax = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) {
  froll(
    fun="max",
    x=x, n=n, fill=fill,
    algo=algo, align=align,
    na.rm=na.rm, hasNA=hasNA, adaptive=adaptive
  )
}
frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center"), adaptive) {
FUN = match.fun(FUN)
align = match.arg(align)
if (!missing(adaptive))
stopf("frollapply does not support 'adaptive' argument")
rho = new.env()
ans = .Call(CfrollapplyR, FUN, x, n, fill, align, rho)
ans
Expand Down
4 changes: 2 additions & 2 deletions inst/tests/froll.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -563,8 +563,8 @@ if (FALSE) {

#### adaptive limitations
test(6000.145, frollmean(1:2, 1:2, adaptive=TRUE, align="right"), c(1, 1.5))
test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align argument different than 'right' is not implemented")
test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented")
test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align 'center' is not implemented")
##test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented") ## support added in #5441, TODO add tests
test(6000.148, frollmean(list(1:2, 1:3), list(1:2), adaptive=TRUE), error="adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame. If you want to call rolling function on list having variable length of elements call it for each field separately")

#### adaptive exact
Expand Down
107 changes: 73 additions & 34 deletions man/froll.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
\alias{rollmean}
\alias{frollmean}
\alias{rollsum}
\alias{rollmax}
\alias{frollsum}
\alias{frollmax}
\alias{rollapply}
\alias{frollapply}
\title{Rolling functions}
Expand All @@ -19,7 +21,9 @@ frollmean(x, n, fill=NA, algo=c("fast", "exact"),
align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE)
frollsum(x, n, fill=NA, algo=c("fast","exact"),
align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE)
frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"))
frollmax(x, n, fill=NA, algo=c("fast","exact"),
align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE)
frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"), adaptive)
}
\arguments{
\item{x}{ Vector, \code{data.frame} or \code{data.table} of integer, numeric or logical columns over which to calculate the windowed aggregations. May also be a list, in which case the rolling function is applied to each of its elements. }
Expand All @@ -45,36 +49,69 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"))
conveniently within \code{data.table} syntax.

Argument \code{n} allows multiple values to apply rolling functions on
multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list.
Each list element must be integer vector of window sizes corresponding
to every single observation in each column; see Examples.
multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list,
see \emph{Adaptive rolling functions} section below for details.

When \code{algo="fast"} an \emph{"on-line"} algorithm is used, and
all of \code{NaN, +Inf, -Inf} are treated as \code{NA}.
Setting \code{algo="exact"} will make rolling functions to use a more
computationally-intensive algorithm that suffers less from floating point
rounding error (the same consideration applies to \code{\link[base]{mean}}).
\code{algo="exact"} also handles \code{NaN, +Inf, -Inf} consistently to
base R. In case of some functions (like \emph{mean}), it will additionally
When multiple columns or multiple window widths are provided, then they
are run in parallel. The exception is for \code{algo="exact"}, which runs in
parallel already. See \code{\link{setDTthreads}} for defaults and further details on parallelism in data.table.
}
\section{\code{hasNA} argument}{
\code{hasNA} can be used to speed up processing in cases when it is known
whether \code{x} contains non-finite values \code{NA, NaN, +Inf, -Inf}.
\itemize{
\item{ Default \code{hasNA=NA} will use faster \code{NA} agnostic implementation,
but when \code{NA}s are detected it will re-run \code{NA} aware implementation. }
\item{ \code{hasNA=TRUE} will use \code{NA} aware implementation straightaway. }
\item{ \code{hasNA=FALSE} will use faster \code{NA} agnostic implementation.
Then depending on the rolling function it will either
\itemize{
\item{ (\emph{mean, sum}) detect \code{NA}s, raise warning, re-run \code{NA} aware. }
\item{ (\emph{max}) not detect \code{NA}s and may silently produce an incorrect
answer. }}
Therefore \code{hasNA=FALSE} should be used with care.
}
}
}
\section{Implementation}{
\itemize{
\item{ \code{algo="fast"} uses \emph{"on-line"} algorithm, and
all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. Not all functions
have a \emph{fast} implementation available. As of now the combination of
\emph{max} and \code{adaptive=TRUE} does not have one, therefore it will
automatically fall back to the \emph{exact} implementation. \code{datatable.verbose}
option can be used to check that. }
\item{ \code{algo="exact"} will make rolling functions use a more
computationally-intensive algorithm. For each observation from input vector
it will compute a function on a window from scratch (complexity \eqn{O(n^2)}).
Depending on the function, this algorithm may suffer less from
floating point rounding error (the same consideration applies to base \code{\link[base]{mean}}).
Algorithm also handles \code{NaN, +Inf, -Inf} consistently to
base R, unless - for some functions (e.g. \emph{max}) - \code{hasNA} is \code{FALSE}
but NAs are present. In case of some functions (like \emph{mean}), it will additionally
make extra pass to perform floating point error correction. Error
corrections might not be truly exact on some platforms (like Windows)
when using multiple threads.

when using multiple threads. }
}
}
\section{Adaptive rolling functions}{
Adaptive rolling functions are a special case where each
observation has its own corresponding rolling window width. Due to the logic
of adaptive rolling functions, the following restrictions apply:
observation has its own corresponding rolling window width. \code{n}
argument must be a list, and each list element must be an integer vector
of window sizes corresponding to every single observation in each column;
see Examples. Due to the logic or implementation of adaptive rolling
functions, the following restrictions apply:
\itemize{
\item \code{align} only \code{"right"}.
\item \code{align} does not support \code{"center"}.
\item if list of vectors is passed to \code{x}, then all
vectors within it must have equal length.
\item functionality is not supported in \code{frollapply}.
}

When multiple columns or multiple windows width are provided, then they
are run in parallel. The exception is for \code{algo="exact"}, which runs in
parallel already.

}
\section{\code{frollapply}}{
\code{frollapply} computes rolling aggregate on arbitrary R functions.
The input \code{x} (first argument) to the function \code{FUN}
\code{adaptive} argument is not supported. The input
\code{x} (first argument) to the function \code{FUN}
is coerced to \emph{numeric} beforehand and \code{FUN}
has to return a scalar \emph{numeric} value. Checks for that are made only
during the first iteration when \code{FUN} is evaluated. Edge cases can be
Expand All @@ -84,32 +121,34 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"))
because there is no thread-safe API to R's C \code{eval}. Nevertheless we've
seen the computation speed up vis-a-vis versions implemented in base R.
}
\value{
A list except when the input is a \code{vector} and
\code{length(n)==1} in which case a \code{vector} is returned.
}
\note{
\section{\code{zoo} package users notice}{
Users coming from \code{zoo}, the most popular package for rolling
functions, should expect the following differences in the
\code{data.table} implementation.
\itemize{
\item rolling function will always return result of the same length as input.
\item \code{fill} defaults to \code{NA}.
\item rolling functions will always return result of the same length
as input.
\item \code{fill} defaults to \code{NA}.
\item \code{fill} accepts only constant values. It does not support
\emph{na.locf} or other functions.
\item \code{align} defaults to \code{"right"}.
\item \code{align} defaults to \code{"right"}.
\item \code{na.rm} is respected, and other functions are not needed
when input contains \code{NA}.
\item integers and logical are always coerced to double.
\item integers and logical are always coerced to double.
\item when \code{adaptive=FALSE} (default), then \code{n} must be a
numeric vector. List is not accepted.
\item when \code{adaptive=TRUE}, then \code{n} must be a vector of
length equal to \code{nrow(x)}, or list of such vectors.
\item \code{partial} window feature is not supported, although it can
be accomplished by using \code{adaptive=TRUE}, see
examples. \code{NA} is always returned for incomplete windows.
be accomplished by using \code{adaptive=TRUE}, see examples.
\code{NA} is always returned for incomplete windows.
}

}
\value{
A list except when the input is a \code{vector} and
\code{length(n)==1} in which case a \code{vector} is returned.
}
\note{
Be aware that rolling functions operate on the physical order of input.
If the intent is to roll values in a vector by a logical window, for
example an hour, or a day, one has to ensure that there are no gaps in
Expand Down
6 changes: 6 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,9 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool
void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose);
void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose);
void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose);
void frollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose);
void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose);
void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose);
void frollapply(double *x, int64_t nx, double *w, int k, ans_t *ans, int align, double fill, SEXP call, SEXP rho, bool verbose);

// frolladaptive.c
Expand All @@ -215,6 +218,9 @@ void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f
void fadaptiverollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
void fadaptiverollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);
//void fadaptiverollmaxFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // does not exist as of now
void fadaptiverollmaxExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose);

// frollR.c
SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasNA, SEXP adaptive);
Expand Down
Loading