-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Copy pathrbindlist.c
562 lines (550 loc) · 32.8 KB
/
rbindlist.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
#include "data.table.h"
#include <Rdefines.h>
#include <ctype.h> // for isdigit
SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignoreattrArg)
{
if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL)
error(_("%s should be TRUE or FALSE"), "fill");
if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1)
error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA
if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1 || LOGICAL(ignoreattrArg)[0] == NA_LOGICAL)
error(_("%s should be TRUE or FALSE"), "ignore.attr");
if (!length(l)) return(l);
if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists."));
int usenames = LOGICAL(usenamesArg)[0];
const bool fill = LOGICAL(fillArg)[0];
const bool ignoreattr = LOGICAL(ignoreattrArg)[0];
if (fill && usenames==NA_LOGICAL) {
usenames=TRUE;
}
const bool idcol = !isNull(idcolArg);
if (idcol && (!isString(idcolArg) || LENGTH(idcolArg)!=1)) internal_error(__func__, "idcol is not a single string"); // # nocov
int ncol=0, first=0;
int64_t nrow=0, upperBoundUniqueNames=1;
bool anyNames=false;
int numZero=0, firstZeroCol=0, firstZeroItem=0;
int *eachMax = (int *)R_alloc(LENGTH(l), sizeof(int));
// pre-check for any errors here to save having to get cleanup right below when usenames
for (int i=0; i<LENGTH(l); i++) { // length(l)>0 checked above
eachMax[i] = 0;
SEXP li = VECTOR_ELT(l, i);
if (isNull(li)) continue;
if (TYPEOF(li) != VECSXP) error(_("Item %d of input is not a data.frame, data.table or list"), i+1);
const int thisncol = length(li);
if (!thisncol) continue;
// delete as now more flexible ... if (fill && isNull(getAttrib(li, R_NamesSymbol))) error(_("When fill=TRUE every item of the input must have column names. Item %d does not."), i+1);
if (fill) {
if (thisncol>ncol) ncol=thisncol; // this section initializes ncol with max ncol. ncol may be increased when usenames is accounted for further down
} else {
if (ncol==0) { ncol=thisncol; first=i; }
else if (thisncol!=ncol) error(_("Item %d has %d columns, inconsistent with item %d which has %d columns. To fill missing columns use fill=TRUE."), i+1, thisncol, first+1, ncol);
}
int nNames = length(getAttrib(li, R_NamesSymbol));
if (nNames>0 && nNames!=thisncol) error(_("Item %d has %d columns but %d column names. Invalid object."), i+1, thisncol, nNames);
if (nNames>0) anyNames=true;
upperBoundUniqueNames += nNames;
int maxLen=0, whichMax=0;
for (int j=0; j<thisncol; ++j) { int tt=length(VECTOR_ELT(li,j)); if (tt>maxLen) { maxLen=tt; whichMax=j; } }
for (int j=0; j<thisncol; ++j) {
int tt = length(VECTOR_ELT(li, j));
if (tt>1 && tt!=maxLen) error(_("Column %d of item %d is length %d inconsistent with column %d which is length %d. Only length-1 columns are recycled."), j+1, i+1, tt, whichMax+1, maxLen);
if (tt==0 && maxLen>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; }
}
eachMax[i] = maxLen;
nrow += maxLen;
}
if (numZero) { // #1871
SEXP names = getAttrib(VECTOR_ELT(l, firstZeroItem), R_NamesSymbol);
const char *ch = names==R_NilValue ? "" : CHAR(STRING_ELT(names, firstZeroCol));
warning(Pl_(numZero-1,
"Column %d ['%s'] of item %d is length 0. This (and %d other like it) has been filled with NA (NULL for list columns) to make each item uniform.",
"Column %d ['%s'] of item %d is length 0. This (and %d others like it) has been filled with NA (NULL for list columns) to make each item uniform."),
firstZeroCol+1, ch, firstZeroItem+1, numZero-1);
}
if (nrow==0 && ncol==0) return(R_NilValue);
if (nrow>INT32_MAX) error(_("Total rows in the list is %"PRId64" which is larger than the maximum number of rows, currently %d"), (int64_t)nrow, INT32_MAX);
if (usenames==TRUE && !anyNames) error(_("use.names=TRUE but no item of input list has any names"));
int *colMap=NULL; // maps each column in final result to the column of each list item
if (usenames==TRUE || usenames==NA_LOGICAL) {
// here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch
// when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item
// first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names))))
SEXP *uniq = (SEXP *)malloc(upperBoundUniqueNames * sizeof(SEXP)); // upperBoundUniqueNames was initialized with 1 to ensure this is defined (otherwise 0 when no item has names)
if (!uniq)
error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov
savetl_init();
int nuniq=0;
// first pass - gather unique column names
for (int i=0; i<LENGTH(l); i++) {
SEXP li = VECTOR_ELT(l, i);
int thisncol=LENGTH(li);
if (isNull(li) || !LENGTH(li)) continue;
const SEXP cn = getAttrib(li, R_NamesSymbol);
if (!length(cn)) continue;
const SEXP *cnp = STRING_PTR_RO(cn);
for (int j=0; j<thisncol; j++) {
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
if (TRUELENGTH(s)<0) continue; // seen this name before
if (TRUELENGTH(s)>0) savetl(s);
uniq[nuniq++] = s;
SET_TRUELENGTH(s,-nuniq);
}
}
if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare
// now count the dups (if any) and how they're distributed across the items
int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames
int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
if (!counts || !maxdup) {
// # nocov start
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
free(uniq); free(counts); free(maxdup);
savetl_end();
error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq);
// # nocov end
}
// second pass - count duplicates
for (int i=0; i<LENGTH(l); i++) {
SEXP li = VECTOR_ELT(l, i);
int thisncol=length(li);
if (thisncol==0) continue;
const SEXP cn = getAttrib(li, R_NamesSymbol);
if (!length(cn)) continue;
const SEXP *cnp = STRING_PTR_RO(cn);
memset(counts, 0, nuniq*sizeof(int));
for (int j=0; j<thisncol; j++) {
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
counts[ -TRUELENGTH(s)-1 ]++;
}
for (int u=0; u<nuniq; u++) {
if (counts[u] > maxdup[u]) maxdup[u] = counts[u];
}
}
int ttncol = 0;
for (int u=0; u<nuniq; ++u) ttncol+=maxdup[u];
if (ttncol>ncol) ncol=ttncol;
free(maxdup); maxdup=NULL; // not needed again
// ncol is now the final number of columns accounting for unique and dups across all colnames
// allocate a matrix: nrows==length(list) each entry contains which column to fetch for that final column
int *colMapRaw = (int *)malloc(LENGTH(l)*ncol * sizeof(int)); // the result of this scope used later
int *uniqMap = (int *)malloc(ncol * sizeof(int)); // maps the ith unique string to the first time it occurs in the final result
int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc
if (!colMapRaw || !uniqMap || !dupLink) {
// # nocov start
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
free(uniq); free(counts); free(colMapRaw); free(uniqMap); free(dupLink);
savetl_end();
error(_("Failed to allocate ncol=%d items working memory in rbindlist.c"), ncol);
// # nocov end
}
for (int i=0; i<LENGTH(l)*ncol; ++i) colMapRaw[i]=-1; // 0-based so use -1
for (int i=0; i<ncol; ++i) {uniqMap[i] = dupLink[i] = -1;}
int nextCol=0, lastDup=ncol-1;
// third pass - create final column mapping colMapRaw
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
int thisncol=length(li);
if (thisncol==0) continue;
const SEXP cn = getAttrib(li, R_NamesSymbol);
if (!length(cn)) {
for (int j=0; j<thisncol; j++) colMapRaw[i*ncol + j] = j;
} else {
const SEXP *cnp = STRING_PTR_RO(cn);
memset(counts, 0, nuniq*sizeof(int));
for (int j=0; j<thisncol; j++) {
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
int w = -TRUELENGTH(s)-1;
int wi = counts[w]++; // how many dups have we seen before of this name within this item
if (uniqMap[w]==-1) {
// first time seen this name across all items
uniqMap[w] = nextCol++;
} else {
while (wi && dupLink[w]>0) { w=dupLink[w]; --wi; } // hop through the dups
if (wi && dupLink[w]==-1) {
// first time we've seen this number of dups of this name
w = dupLink[w] = lastDup--;
uniqMap[w] = nextCol++;
}
}
colMapRaw[i*ncol + uniqMap[w]] = j;
}
}
}
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0); // zero out our usage of tl
free(uniq); free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
savetl_end(); // restore R's usage
// colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
// move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.
colMap = (int *)R_alloc(LENGTH(l)*ncol, sizeof(int));
// This R_alloc could fail with out-of-memory but given it is very small it's very unlikely. If it does fail, colMapRaw will leak.
// But colMapRaw leaking now in this very rare situation is better than colMapRaw leaking in the more likely but still rare conditions later.
// And it's better than having to trap all exit point from here to the end of rbindlist, which may not be possible; e.g. writeNA() could error inside it with unsupported type.
// This very unlikely leak could be fixed by using an on.exit() at R level rbindlist(); R-exts$6.1.2 refers to pwilcox for example. However, that would not
// solve the (mere) leak if we ever call rbindlist internally from other C functions.
memcpy(colMap, colMapRaw, LENGTH(l)*ncol*sizeof(int));
free(colMapRaw); // local scope in this branch to ensure can't be used below
// to view map when debugging ...
// for (int i=0; i<LENGTH(l); ++i) { for (int j=0; j<ncol; ++j) Rprintf(_("%2d "),colMap[i*ncol + j]); Rprintf(_("\n")); }
}
if (fill && usenames==NA_LOGICAL) internal_error(__func__, "usenames==NA but fill=TRUE. usenames should have been set to TRUE earlier with warning"); // # nocov
if (!fill && (usenames==TRUE || usenames==NA_LOGICAL)) {
// Ensure no missings in both cases, and (when usenames==NA) all columns in same order too
// We proceeded earlier as if fill was true, so varying ncol items will have missing here
char buff[1001] = "";
const char *extra = usenames==TRUE?"":_(" use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for "\
" backwards compatibility. See news item 5 in v1.12.2 for options to control this message.");
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
if (!length(li) || !length(getAttrib(li, R_NamesSymbol))) continue;
for (int j=0; j<ncol; ++j) {
const int w = colMap[i*ncol + j];
if (w==-1) {
int missi = i;
while (colMap[i*ncol + j]==-1 && i<LENGTH(l)) i++;
if (i==LENGTH(l)) internal_error(__func__, "could not find the first column name not present in earlier item"); // # nocov
SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
int w2 = colMap[i*ncol + j];
const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : "";
snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
w2+1, str, i+1, missi+1, extra );
if (usenames==TRUE) error("%s", buff); // # notranslate
i = LENGTH(l); // break from outer i loop
break; // break from inner j loop
}
if (w!=j && usenames==NA_LOGICAL) {
SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
if (!isString(s) || i==0) internal_error(__func__, "usenames==NA but an out-of-order name has been found in an item with no names or the first item. [%d]", i); // # nocov
snprintf(buff, 1000, _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
w+1, CHAR(STRING_ELT(s,w)), i+1, j+1, i, extra);
i = LENGTH(l);
break;
}
}
if (buff[0]) {
SEXP opt = GetOption(install("datatable.rbindlist.check"), R_NilValue);
if (!isNull(opt) && !(isString(opt) && length(opt)==1)) {
warning(_("options()$datatable.rbindlist.check is set but is not a single string. See news item 5 in v1.12.2."));
opt = R_NilValue;
}
const char *o = isNull(opt) ? "message" : CHAR(STRING_ELT(opt,0));
if (strcmp(o,"message")==0) { eval(PROTECT(lang2(install("message"),PROTECT(ScalarString(mkChar(buff))))), R_GlobalEnv); UNPROTECT(2); }
else if (strcmp(o,"warning")==0) warning("%s", buff); // # notranslate
else if (strcmp(o,"error")==0) error("%s", buff); // # notranslate
else if (strcmp(o,"none")!=0) warning(_("options()$datatable.rbindlist.check=='%s' which is not 'message'|'warning'|'error'|'none'. See news item 5 in v1.12.2."), o);
}
}
}
if (usenames==NA_LOGICAL) {
usenames=FALSE; // for backwards compatibility, see warning above which says this will change to TRUE in future
ncol = length(VECTOR_ELT(l, first)); // ncol was increased as if fill=true, so reduce it back given fill=false (fill==false checked above)
}
int nprotect = 2;
SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol));
SEXP ansNames = PROTECT(allocVector(STRSXP, idcol + ncol));
setAttrib(ans, R_NamesSymbol, ansNames);
if (idcol) {
SET_STRING_ELT(ansNames, 0, STRING_ELT(idcolArg, 0));
SEXP idval, listNames=getAttrib(l, R_NamesSymbol);
if (length(listNames)) {
SET_VECTOR_ELT(ans, 0, idval=allocVector(STRSXP, nrow));
for (int i=0,ansloc=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
if (!length(li)) continue;
const int thisnrow = eachMax[i];
SEXP thisname = STRING_ELT(listNames, i);
for (int k=0; k<thisnrow; ++k) SET_STRING_ELT(idval, ansloc++, thisname);
}
} else {
SET_VECTOR_ELT(ans, 0, idval=allocVector(INTSXP, nrow));
int *idvald = INTEGER(idval);
for (int i=0,ansloc=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
if (!length(li)) continue;
const int thisnrow = eachMax[i];
for (int k=0; k<thisnrow; ++k) idvald[ansloc++] = i+1;
}
}
}
SEXP coercedForFactor = NULL;
for(int j=0; j<ncol; ++j) {
int maxType=LGLSXP; // initialize with LGLSXP for test 2002.3 which has col x NULL in both lists to be filled with NA for #1871
bool factor=false, orderedFactor=false; // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true.
int longestLen=-1, longestW=-1, longestI=-1; // just for ordered factor; longestLen must be initialized as -1 so that rbind zero-length ordered factor could work #4795
SEXP longestLevels=R_NilValue; // just for ordered factor
bool int64=false, date=false, posixct=false, itime=false, asis=false;
const char *foundName=NULL;
bool anyNotStringOrFactor=false;
SEXP firstCol=R_NilValue;
int firsti=-1, firstw=-1;
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
if (!length(li)) continue;
int w = usenames ? colMap[i*ncol + j] : (j<length(li) ? j : -1); // colMap tells us which item to fetch for each of the final result columns, so we can stack column-by-column // check if j exceeds length for fill=TRUE and usenames=FALSE #5444
if (w==-1) continue; // column j of final result has no input from this item (fill must be true)
if (!foundName) {
SEXP cn=PROTECT(getAttrib(li, R_NamesSymbol));
if (length(cn)) { SEXP tt; SET_STRING_ELT(ansNames, idcol+j, tt=STRING_ELT(cn, w)); foundName=CHAR(tt); }
UNPROTECT(1);
}
SEXP thisCol = VECTOR_ELT(li, w);
int thisType = TYPEOF(thisCol);
// Use >= for #546 -- TYPEORDER=0 for both LGLSXP and EXPRSXP (but also NULL)
if (TYPEORDER(thisType)>=TYPEORDER(maxType) && !isNull(thisCol)) maxType=thisType;
if (isFactor(thisCol)) {
if (isNull(getAttrib(thisCol,R_LevelsSymbol))) error(_("Column %d of item %d has type 'factor' but has no levels; i.e. malformed."), w+1, i+1);
factor = true;
if (isOrdered(thisCol)) {
orderedFactor = true;
int thisLen = length(getAttrib(thisCol, R_LevelsSymbol));
if (thisLen>longestLen) { longestLen=thisLen; longestLevels=getAttrib(thisCol, R_LevelsSymbol); /*for warnings later ...*/longestW=w; longestI=i; }
}
} else if (!isString(thisCol)) anyNotStringOrFactor=true; // even for length 0 columns for consistency; test 2113.3
if (INHERITS(thisCol, char_integer64)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below
int64=true;
} else if (INHERITS(thisCol, char_Date)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; }
date=true;
} else if (INHERITS(thisCol, char_POSIXct)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; }
posixct=true;
} else if (INHERITS(thisCol, char_ITime)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; }
itime=true;
} else if (!asis && INHERITS(thisCol, char_AsIs)) {
asis=true;
}
if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; }
else {
if (!factor && !int64 && date == posixct && !itime && !asis) { // prohibit binding of date and posixct
if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)),
PROTECT(getAttrib(firstCol, R_ClassSymbol)),
0) && !ignoreattr) {
error(_("Class attribute on column %d of item %d does not match with column %d of item %d. You can deactivate this safety-check by using ignore.attr=TRUE"), w+1, i+1, firstw+1, firsti+1);
}
UNPROTECT(2);
}
}
}
if (!foundName) { static char buff[12]; snprintf(buff,12,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); foundName=buff; } // # notranslate
if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option)
if (int64 && !(maxType==REALSXP || maxType==STRSXP || maxType==VECSXP || maxType==CPLXSXP))
internal_error(__func__, "column %d of result is determined to be integer64 but maxType=='%s' != REALSXP", j+1, type2char(maxType)); // # nocov
if (date && INHERITS(firstCol, char_IDate)) maxType=INTSXP; // first encountered Date determines class and type #5309
SEXP target;
SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow)); // does not initialize logical & numerics, but does initialize character and list
// #5504 do not copy class for mixing int64 and higher maxTypes CPLXSXP/STRSXP/VECSXP
if (!factor && !(int64 && (maxType==STRSXP || maxType==VECSXP || maxType==CPLXSXP))) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.
if (factor && anyNotStringOrFactor) {
// in future warn, or use list column instead ... warning(_("Column %d contains a factor but not all items for the column are character or factor"), idcol+j+1);
// some coercing from (likely) integer/numeric to character will be needed. But this coerce can feasibly fail with out-of-memory, so we have to do it up-front
// before the savetl_init() because we have no hook to clean up tl if coerceVector fails.
if (coercedForFactor==NULL) { coercedForFactor=PROTECT(allocVector(VECSXP, LENGTH(l))); nprotect++; }
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
int w = usenames ? colMap[i*ncol + j] : (j<length(li) ? j : -1); // check if j exceeds length for fill=TRUE and usenames=FALSE #5444
if (w==-1) continue;
SEXP thisCol = VECTOR_ELT(li, w);
if (!isFactor(thisCol) && !isString(thisCol)) {
SET_VECTOR_ELT(coercedForFactor, i, coerceVector(thisCol, STRSXP));
}
}
}
int ansloc=0;
if (factor) {
char warnStr[1000] = "";
savetl_init(); // no error from now (or warning given options(warn=2)) until savetl_end
int nLevel=0, allocLevel=0;
SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc.
if (orderedFactor) {
// If all sets of ordered levels are compatible (no ambiguities or conflicts) then an ordered factor is created, otherwise regular factor.
// Currently the longest set of ordered levels is taken and all other ordered levels must be a compatible subset of that.
// e.g. c( a<c<b, z<a<c<b, a<b ) => z<a<c<b [ the longest is the middle one, and the other two are ordered subsets of it ]
// c( a<c<b, z<c<a<b, a<b ) => regular factor because it contains an ambiguity: is a<c or c<a?
// c( a<c<b, c<b, 'c,b' ) => a<c<b because the regular factor/character items c and b exist in the ordered levels
// c( a<c<b, c<b, 'c,d' ) => a<c<b<d 'd' from non-ordered item added on the end of longest ordered levels
// c( a<c<b, c<b<d<e ) => regular factor because this case isn't yet implemented. a<c<b<d<e would be possible in future (extending longest at the beginning or end)
const SEXP *sd = STRING_PTR_RO(longestLevels);
nLevel = allocLevel = longestLen;
levelsRaw = (SEXP *)malloc(nLevel * sizeof(SEXP));
if (!levelsRaw) {
savetl_end(); // # nocov
error(_("Failed to allocate working memory for %d ordered factor levels of result column %d"), nLevel, idcol+j+1); // # nocov
}
for (int k=0; k<longestLen; ++k) {
SEXP s = sd[k];
if (TRUELENGTH(s)>0) savetl(s);
levelsRaw[k] = s;
SET_TRUELENGTH(s,-k-1);
}
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
int w = usenames ? colMap[i*ncol + j] : (j<length(li) ? j : -1); // check if j exceeds length for fill=TRUE and usenames=FALSE #5444
if (w==-1) continue;
SEXP thisCol = VECTOR_ELT(li, w);
if (isOrdered(thisCol)) {
SEXP levels = getAttrib(thisCol, R_LevelsSymbol);
const SEXP *levelsD = STRING_PTR_RO(levels);
const int n = length(levels);
for (int k=0, last=0; k<n; ++k) {
SEXP s = levelsD[k];
const int tl = TRUELENGTH(s);
if (tl>=last) { // if tl>=0 then also tl>=last because last<=0
if (tl>=0) {
snprintf(warnStr, 1000, // not direct warning as we're inside tl region
_("Column %d of item %d is an ordered factor but level %d ['%s'] is missing from the ordered levels from column %d of item %d. " \
"Each set of ordered factor levels should be an ordered subset of the first longest. A regular factor will be created for this column."),
w+1, i+1, k+1, CHAR(s), longestW+1, longestI+1);
} else {
snprintf(warnStr, 1000,
_("Column %d of item %d is an ordered factor with '%s'<'%s' in its levels. But '%s'<'%s' in the ordered levels from column %d of item %d. " \
"A regular factor will be created for this column due to this ambiguity."),
w+1, i+1, CHAR(levelsD[k-1]), CHAR(s), CHAR(s), CHAR(levelsD[k-1]), longestW+1, longestI+1);
// k>=1 (so k-1 is ok) because when k==0 last==0 and this branch wouldn't happen
}
orderedFactor=false;
i=LENGTH(l); // break outer i loop
break; // break inner k loop
// we leave the tl set for the longest levels; the regular factor will be created with the longest ordered levels first in case that useful for user
}
last = tl; // negative ordinal; last should monotonically grow more negative if the levels are an ordered subset of the longest
}
}
}
}
for (int i=0; i<LENGTH(l); ++i) {
const int thisnrow = eachMax[i];
SEXP li = VECTOR_ELT(l, i);
if (!length(li)) continue; // NULL items in the list() of DT/DF; not if thisnrow==0 because we need to retain (unused) factor levels (#3508)
int w = usenames ? colMap[i*ncol + j] : (j<length(li) ? j : -1); // check if j exceeds length for fill=TRUE and usenames=FALSE #5444
if (w==-1) {
writeNA(target, ansloc, thisnrow, false);
} else {
SEXP thisCol = VECTOR_ELT(li, w);
SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : (isString(thisCol) ? thisCol : VECTOR_ELT(coercedForFactor, i));
const int n = length(thisColStr);
const SEXP *thisColStrD = STRING_PTR_RO(thisColStr); // D for data
for (int k=0; k<n; ++k) {
SEXP s = thisColStrD[k];
if (s==NA_STRING || // remove NA from levels; test 1979 found by package emil when revdep testing 1.12.2 (#3473)
TRUELENGTH(s)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns
if (TRUELENGTH(s)>0) savetl(s);
if (allocLevel==nLevel) { // including initial time when allocLevel==nLevel==0
SEXP *tt = NULL;
if (allocLevel<INT_MAX) {
int64_t new = (int64_t)allocLevel+n-k+1024; // if all remaining levels in this item haven't been seen before, plus 1024 margin in case of many very short levels
allocLevel = (new>(int64_t)INT_MAX) ? INT_MAX : (int)new;
tt = (SEXP *)realloc(levelsRaw, allocLevel*sizeof(SEXP)); // first time levelsRaw==NULL and realloc==malloc in that case
}
if (tt==NULL) {
// # nocov start
// C spec states that if realloc() fails (above) the original block (levelsRaw) is left untouched: it is not freed or moved. We ...
for (int k=0; k<nLevel; k++) SET_TRUELENGTH(levelsRaw[k], 0); // ... rely on that in this loop which uses levelsRaw.
free(levelsRaw);
savetl_end();
error(_("Failed to allocate working memory for %d factor levels of result column %d when reading item %d of item %d"), allocLevel, idcol+j+1, w+1, i+1);
// # nocov end
}
levelsRaw = tt;
}
SET_TRUELENGTH(s,-(++nLevel));
levelsRaw[nLevel-1] = s;
}
int *targetd = INTEGER(target);
if (isFactor(thisCol)) {
const int *id = INTEGER(thisCol);
if (length(thisCol)<=1) {
// recycle length-1, or NA-fill length-0
SEXP lev;
const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRUELENGTH(lev) : NA_INTEGER;
// ^^ #3915 and tests 2015.2-5
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
} else {
// length(thisCol)==thisnrow already checked before this truelength-clobber region
// If all i==truelength(i) then just do a memcpy since hop is identity. Otherwise hop via the integer map.
bool hop = false;
if (orderedFactor) {
// retain the position of NA level (if any) and the integer mappings to it
for (int k=0; k<n; ++k) {
SEXP s = thisColStrD[k];
if (s!=NA_STRING && -TRUELENGTH(s)!=k+1) { hop=true; break; }
}
} else {
for (int k=0; k<n; ++k) {
SEXP s = thisColStrD[k];
if (s==NA_STRING || -TRUELENGTH(s)!=k+1) { hop=true; break; }
}
}
if (hop) {
if (orderedFactor) {
for (int r=0; r<thisnrow; ++r)
targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(thisColStrD[id[r]-1]);
} else {
for (int r=0; r<thisnrow; ++r) {
SEXP lev;
targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -TRUELENGTH(lev);
}
}
} else {
memcpy(targetd+ansloc, id, thisnrow*SIZEOF(thisCol));
}
}
} else {
const SEXP *sd = STRING_PTR_RO(thisColStr);
if (length(thisCol)<=1) {
const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -TRUELENGTH(sd[0]) : NA_INTEGER;
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
} else {
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -TRUELENGTH(sd[r]);
}
}
}
ansloc += thisnrow;
}
for (int k=0; k<nLevel; ++k) SET_TRUELENGTH(levelsRaw[k], 0);
savetl_end();
if (warnStr[0]) warning("%s", warnStr); // # notranslate. now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
SEXP levelsSxp;
setAttrib(target, R_LevelsSymbol, levelsSxp=allocVector(STRSXP, nLevel));
for (int k=0; k<nLevel; ++k) SET_STRING_ELT(levelsSxp, k, levelsRaw[k]);
free(levelsRaw);
if (orderedFactor) {
SEXP tt;
setAttrib(target, R_ClassSymbol, tt=allocVector(STRSXP, 2));
SET_STRING_ELT(tt, 0, char_ordered);
SET_STRING_ELT(tt, 1, char_factor);
} else {
setAttrib(target, R_ClassSymbol, ScalarString(char_factor));
}
} else { // factor==false
for (int i=0; i<LENGTH(l); ++i) {
const int thisnrow = eachMax[i];
if (thisnrow==0) continue;
SEXP li = VECTOR_ELT(l, i);
int w = usenames ? colMap[i*ncol + j] : (j<length(li) ? j : -1); // check if j exceeds length for fill=TRUE and usenames=FALSE #5444
SEXP thisCol;
if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871
writeNA(target, ansloc, thisnrow, false); // writeNA is integer64 aware and writes INT64_MIN
} else {
bool listprotect = (TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target);
// do an as.list() on the atomic column; #3528
if (listprotect) {
// coerceAs for int64 to copy attributes (coerceVector does not copy atts)
thisCol = PROTECT(INHERITS(thisCol, char_integer64) ? coerceAs(thisCol, target, ScalarLogical(TRUE)) : coerceVector(thisCol, TYPEOF(target)));
// else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909)
const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName);
UNPROTECT(1); // earlier unprotect rbindlist calls with lots of lists #4536
if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret);
} else {
const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName);
if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret);
}
// e.g. when precision is lost like assigning 3.4 to integer64; test 2007.2
// TODO: but maxType should handle that and this should never warn
}
ansloc += thisnrow;
}
}
}
UNPROTECT(nprotect); // ans, ansNames, coercedForFactor?
return(ans);
}