Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dcast only computes default fill if necessary #5549

Merged
merged 32 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2886c4f
delete old commented code
tdhock Nov 30, 2022
90f0647
new test for no warning fails
tdhock Nov 30, 2022
26745f4
only compute default fill if missing cells present
tdhock Nov 30, 2022
03dc91d
any_NA_int helper
tdhock Nov 30, 2022
258befb
bugfix #5512
tdhock Dec 1, 2022
360ba9d
Update src/fcast.c
tdhock Dec 4, 2022
75102bf
Update src/fcast.c
tdhock Dec 4, 2022
6225799
mention warning text
tdhock Dec 5, 2022
5055306
const int args
tdhock Dec 5, 2022
6a93cb1
add back ithiscol
tdhock Dec 5, 2022
a40d969
get pointer before for loop
tdhock Dec 7, 2022
2019a5c
Merge branch 'master' into fix5512
tdhock Feb 15, 2024
c46cfaa
Merge branch 'master' into fix5512
tdhock Mar 4, 2024
1a8ba9c
add test case from Michael
tdhock Mar 4, 2024
47d735e
Merge branch 'fix5512' of https://github.com/Rdatatable/data.table in…
tdhock Mar 4, 2024
7198d08
merge
tdhock Mar 8, 2024
02f2c3a
test min(dbl) and no warning when fill specified
tdhock Mar 8, 2024
fc542ec
Revert "delete old commented code"
tdhock Mar 10, 2024
6ae4c76
use suggestions from Michael
tdhock Mar 10, 2024
eb95ab8
rm inline any_NA_int since that causes install to fail
tdhock Mar 10, 2024
6d8f614
clarify comment
tdhock Mar 10, 2024
3c7fb24
link 5390
tdhock Mar 11, 2024
dcb51ed
mymin test fails
tdhock Mar 13, 2024
83b0cf5
compute some_fill using anyNA in R then pass to C
tdhock Mar 13, 2024
6f4b711
Update R/fcast.R
tdhock Mar 13, 2024
ee93c5f
Update R/fcast.R
tdhock Mar 13, 2024
747c76c
dat_for_default_fill is zero-row dt
tdhock Mar 13, 2024
07c6838
merge
tdhock Mar 13, 2024
4d6c0e1
!length instead of length==0
tdhock Mar 13, 2024
359c3c3
new dcast tests with fill=character
tdhock Mar 14, 2024
4b96d35
dat_for_default_fill is dat again, not 0-row, because that causes som…
tdhock Mar 14, 2024
4ca3736
Merge branch 'master' into fix5512
MichaelChirico Mar 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -3720,6 +3720,12 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2,
DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2],20,TRUE), d1=runif(20), d2=1L)
test(1102.38, names(dcast(DT, x ~ y + z, fun.aggregate=length, value.var = "d2", sep=".")),
c("x", "1.a", "1.b", "2.a", "2.b"))

# test for #5512, only compute default fill if needed.
DT = data.table(chr=c("a","b","b"), int=1:3)
test(1102.39, dcast(DT, int ~ chr, min, value.var="int"), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int"), warning="NAs introduced by coercion to integer range")
test(1102.40, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))

}

# test for freading commands
Expand Down
2 changes: 1 addition & 1 deletion man/dcast.data.table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
\item{\dots}{Any other arguments that may be passed to the aggregating function.}
\item{margins}{Not implemented yet. Should take variable names to compute margins on. A value of \code{TRUE} would compute all margins.}
\item{subset}{Specified if casting should be done on a subset of the data. Ex: \code{subset = .(col1 <= 5)} or \code{subset = .(variable != "January")}.}
\item{fill}{Value with which to fill missing cells. If \code{fun.aggregate} is present, takes the value by applying the function on a 0-length vector.}
\item{fill}{Value with which to fill missing cells. If \code{fill=NULL} and missing cells are present, then \code{fun.aggregate} is used on a 0-length vector to obtain a fill value.}
\item{drop}{\code{FALSE} will cast by including all missing combinations.

\code{c(FALSE, TRUE)} will only include all missing combinations of formula \code{LHS}; \code{c(TRUE, FALSE)} will only include all missing combinations of formula RHS. See Examples.}
Expand Down
125 changes: 20 additions & 105 deletions src/fcast.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
// #include <signal.h> // the debugging machinery + breakpoint aidee
// raise(SIGINT);

bool any_NA_int(int N_data, int *idx){
for (int data_i=0; data_i<N_data; ++data_i) {
if(idx[data_i] == NA_INTEGER)return true;
}
return false;
}

// TO DO: margins
SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg) {
int nrows=INTEGER(nrowArg)[0], ncols=INTEGER(ncolArg)[0];
Expand All @@ -15,57 +22,54 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil
SET_VECTOR_ELT(ans, i, VECTOR_ELT(lhs, i));
}
// get val cols
bool some_fill = any_NA_int(nrows*ncols, idx);
for (int i=0; i<nval; ++i) {
SEXP thiscol = VECTOR_ELT(val, i);
SEXP thisfill = fill;
SEXPTYPE thistype = TYPEOF(thiscol);
int nprotect = 0;
if (isNull(fill)) {
if (LOGICAL(is_agg)[0]) {
thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++;
} else thisfill = VECTOR_ELT(fill_d, i);
}
if (TYPEOF(thisfill) != thistype) {
thisfill = PROTECT(coerceVector(thisfill, thistype)); nprotect++;
if(some_fill){
if (isNull(fill)) {
if (LOGICAL(is_agg)[0]) {
thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++;
} else thisfill = VECTOR_ELT(fill_d, i);
}
if (TYPEOF(thisfill) != thistype) {
thisfill = PROTECT(coerceVector(thisfill, thistype)); nprotect++;
}
}
switch (thistype) {
case INTSXP:
case LGLSXP: {
const int *ithiscol = INTEGER(thiscol);
const int *ithisfill = INTEGER(thisfill);
for (int j=0; j<ncols; ++j) {
SET_VECTOR_ELT(ans, nlhs+j+i*ncols, target=allocVector(thistype, nrows) );
int *itarget = INTEGER(target);
copyMostAttrib(thiscol, target);
for (int k=0; k<nrows; ++k) {
int thisidx = idx[k*ncols + j];
itarget[k] = (thisidx == NA_INTEGER) ? ithisfill[0] : ithiscol[thisidx-1];
itarget[k] = (thisidx == NA_INTEGER) ? INTEGER(thisfill)[0] : INTEGER(thiscol)[thisidx-1];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't we just use the same code for INTSXP, LGLSXP, REALSXP and CPLXSXP with coerceAs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is my first time hacking on this code, so I'm not sure, but I also had the feeling that it would be desirable to avoid the repeated logic in these switch cases. About usage of coerceAs, would that introduce unwanted overhead / performance penalty? I was thinking of solving that via a C macro. Anyway I would suggest saving that for another PR, though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Value added from coerceAs is handling attributes and therefore classes like int64, not sure if relevant here

}
}
} break;
case REALSXP: {
const double *dthiscol = REAL(thiscol);
const double *dthisfill = REAL(thisfill);
for (int j=0; j<ncols; ++j) {
SET_VECTOR_ELT(ans, nlhs+j+i*ncols, target=allocVector(thistype, nrows) );
double *dtarget = REAL(target);
copyMostAttrib(thiscol, target);
for (int k=0; k<nrows; ++k) {
int thisidx = idx[k*ncols + j];
dtarget[k] = (thisidx == NA_INTEGER) ? dthisfill[0] : dthiscol[thisidx-1];
dtarget[k] = (thisidx == NA_INTEGER) ? REAL(thisfill)[0] : REAL(thiscol)[thisidx-1];
}
}
} break;
case CPLXSXP: {
const Rcomplex *zthiscol = COMPLEX(thiscol);
const Rcomplex *zthisfill = COMPLEX(thisfill);
for (int j=0; j<ncols; ++j) {
SET_VECTOR_ELT(ans, nlhs+j+i*ncols, target=allocVector(thistype, nrows) );
Rcomplex *ztarget = COMPLEX(target);
copyMostAttrib(thiscol, target);
for (int k=0; k<nrows; ++k) {
int thisidx = idx[k*ncols + j];
ztarget[k] = (thisidx == NA_INTEGER) ? zthisfill[0] : zthiscol[thisidx-1];
ztarget[k] = (thisidx == NA_INTEGER) ? COMPLEX(thisfill)[0] : COMPLEX(thiscol)[thisidx-1];
}
}
} break;
Expand Down Expand Up @@ -96,92 +100,3 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil
UNPROTECT(1);
return(ans);
}

// commenting all unused functions, but not deleting it, just in case

// // internal functions that are not used anymore..

// // # nocov start
// // Note: all these functions below are internal functions and are designed specific to fcast.
// SEXP zero_init(R_len_t n) {
// SEXP ans;
// if (n < 0) error(_("Input argument 'n' to 'zero_init' must be >= 0"));
// ans = PROTECT(allocVector(INTSXP, n));
// for (int i=0; i<n; ++i) INTEGER(ans)[i] = 0;
// UNPROTECT(1);
// return(ans);
// }

// SEXP cast_order(SEXP v, SEXP env) {
// R_len_t len;
// SEXP call, ans;
// if (TYPEOF(env) != ENVSXP) error(_("Argument 'env' to (data.table internals) 'cast_order' must be an environment"));
// if (TYPEOF(v) == VECSXP) len = length(VECTOR_ELT(v, 0));
// else len = length(v);
// PROTECT(call = lang2(install("forder"), v)); // TODO: save the 'eval' by calling directly the C-function.
// ans = PROTECT(eval(call, env));
// if (length(ans) == 0) { // forder returns integer(0) if already sorted
// UNPROTECT(1); // ans
// ans = PROTECT(seq_int(len, 1));
// }
// UNPROTECT(2);
// return(ans);
// }

// SEXP cross_join(SEXP s, SEXP env) {
// // Calling CJ is faster and don't have to worry about sorting or setting key.
// SEXP call, r;
// if (!isNewList(s) || isNull(s)) error(_("Argument 's' to 'cross_join' must be a list of length > 0"));
// PROTECT(call = lang3(install("do.call"), install("CJ"), s));
// r = eval(call, env);
// UNPROTECT(1);
// return(r);
// }

// SEXP diff_int(SEXP x, R_len_t n) {
// SEXP ans;
// if (TYPEOF(x) != INTSXP) error(_("Argument 'x' to 'diff_int' must be an integer vector"));
// ans = PROTECT(allocVector(INTSXP, length(x)));
// for (int i=1; i<length(x); ++i)
// INTEGER(ans)[i-1] = INTEGER(x)[i] - INTEGER(x)[i-1];
// INTEGER(ans)[length(x)-1] = n - INTEGER(x)[length(x)-1] + 1;
// UNPROTECT(1);
// return(ans);
// }

// SEXP intrep(SEXP x, SEXP len) {
// R_len_t l=0, k=0;
// SEXP ans;
// if (TYPEOF(x) != INTSXP || TYPEOF(len) != INTSXP) error(_("Arguments 'x' and 'len' to 'intrep' should both be integer vectors"));
// if (length(x) != length(len)) error(_("'x' and 'len' must be of same length"));
// // assuming both are of length >= 1
// for (int i=0; i<length(len); ++i)
// l += INTEGER(len)[i]; // assuming positive values for len. internal use - can't bother to check.
// ans = PROTECT(allocVector(INTSXP, l));
// for (int i=0; i<length(len); ++i) {
// for (int j=0; j<INTEGER(len)[i]; ++j) {
// INTEGER(ans)[k++] = INTEGER(x)[i];
// }
// }
// UNPROTECT(1); // ans
// return(ans);
// }

// // taken match_transform() from base:::unique.c and modified
// SEXP coerce_to_char(SEXP s, SEXP env)
// {
// if (OBJECT(s)) {
// if (inherits(s, "factor")) return asCharacterFactor(s);
// else if(getAttrib(s, R_ClassSymbol) != R_NilValue) {
// SEXP call, r;
// PROTECT(call = lang2(install("as.character"), s));
// r = eval(call, env);
// UNPROTECT(1);
// return r;
// }
// }
// /* else */
// return coerceVector(s, STRSXP);
// }

// // # nocov end