From d78a1b5d9063a3482684572cb8955136214018ea Mon Sep 17 00:00:00 2001 From: Arun Srinivasan Date: Mon, 13 Oct 2014 10:34:43 +0200 Subject: [PATCH] Closes #872. Clearer explanation of duplicated(). --- README.md | 1 + man/duplicated.Rd | 55 +++++++++++++++++++++++------------------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index d2ae8b6d27..504c92aa55 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ #### NOTES + 1. Clearer explanation of what `duplicated()` does (borrowed from base). Thanks to @matthieugomez for pointing it out. Closes [#872](https://github.com/Rdatatable/data.table/issues/872). ### Changes in v1.9.4 (on CRAN 2 Oct 2014) diff --git a/man/duplicated.Rd b/man/duplicated.Rd index 81bc407dae..2a204af5f6 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -7,10 +7,10 @@ \alias{anyDuplicated.data.table} \title{ Determine Duplicate Rows } \description{ - \code{duplicated} returns a logical vector indicating which rows of a \code{data.table} - have duplicate rows (by key). + \code{duplicated} returns a logical vector indicating which rows of a \code{data.table} (by + key columns or, when no key, all columns) are duplicates of a row with a smaller subscript. - \code{unique} returns a data table with duplicated rows (by key) removed, or + \code{unique} returns a \code{data.table} with duplicated rows (by key) removed, or (when no key) duplicated rows by all columns removed. \code{anyDuplicated} returns the \emph{index} \code{i} of the first duplicated entry if there is one, and 0 otherwise. 
@@ -65,38 +65,37 @@ } \seealso{ \code{\link{data.table}}, \code{\link{duplicated}}, \code{\link{unique}}, \code{\link{all.equal}}} \examples{ - DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6), key = "A,B") - duplicated(DT) - unique(DT) +DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6), key = "A,B") +duplicated(DT) +unique(DT) - duplicated(DT, by="B") - unique(DT, by="B") +duplicated(DT, by="B") +unique(DT, by="B") - duplicated(DT, by=c("A", "C")) - unique(DT, by=c("A", "C")) +duplicated(DT, by=c("A", "C")) +unique(DT, by=c("A", "C")) - DT = data.table(a=c(2L,1L,2L), b=c(1L,2L,1L)) # no key - unique(DT) # rows 1 and 2 (row 3 is a duplicate of row 1) +DT = data.table(a=c(2L,1L,2L), b=c(1L,2L,1L)) # no key +unique(DT) # rows 1 and 2 (row 3 is a duplicate of row 1) - DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6)) - unique(DT) # rows 1,2 and 5 +DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6)) +unique(DT) # rows 1,2 and 5 - DT = data.table(a=tan(pi*(1/4 + 1:10)), b=rep(1,10)) # example from ?all.equal - length(unique(DT$a)) # 10 strictly unique floating point values - all.equal(DT$a,rep(1,10)) # TRUE, all within tolerance of 1.0 - DT[,which.min(a)] # row 10, the strictly smallest floating point value - identical(unique(DT),DT[1]) # TRUE, stable within tolerance - identical(unique(DT),DT[10]) # FALSE +DT = data.table(a=tan(pi*(1/4 + 1:10)), b=rep(1,10)) # example from ?all.equal +length(unique(DT$a)) # 10 strictly unique floating point values +all.equal(DT$a,rep(1,10)) # TRUE, all within tolerance of 1.0 +DT[,which.min(a)] # row 10, the strictly smallest floating point value +identical(unique(DT),DT[1]) # TRUE, stable within tolerance +identical(unique(DT),DT[10]) # FALSE - # fromLast=TRUE - DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6), key = "A,B") - duplicated(DT, by="B", fromLast=TRUE) - unique(DT, by="B", fromLast=TRUE) +# 
fromLast=TRUE +DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6), key = "A,B") +duplicated(DT, by="B", fromLast=TRUE) +unique(DT, by="B", fromLast=TRUE) - # anyDuplicated - anyDuplicated(DT, by=c("A", "B")) # 3L - any(duplicated(DT, by=c("A", "B"))) # TRUE +# anyDuplicated +anyDuplicated(DT, by=c("A", "B")) # 3L +any(duplicated(DT, by=c("A", "B"))) # TRUE } \keyword{ data } -