From cc27941eac63a6f147a36f4aeccf83d654e4c7be Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 10 Apr 2020 18:23:39 +0100 Subject: [PATCH 01/15] uniqueN logical unfold for codecov --- src/uniqlist.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index 447b3ea057..e2220a64fd 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -346,8 +346,10 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul SEXP uniqueNlogical(SEXP x, SEXP narmArg) { // single pass; short-circuit and return as soon as all 3 values are found - if (!isLogical(x)) error(_("x is not a logical vector")); - if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error(_("na.rm must be TRUE or FALSE")); + if (!isLogical(x)) + error(_("x is not a logical vector")); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("na.rm must be TRUE or FALSE")); bool narm = LOGICAL(narmArg)[0]==1; const R_xlen_t n = xlength(x); if (n==0) @@ -356,16 +358,27 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) { R_xlen_t i=0; const int *ix = LOGICAL(x); while (++i Date: Fri, 10 Apr 2020 18:53:13 +0100 Subject: [PATCH 02/15] uniqlengths reduce loop overhead, unfold for codecov --- src/uniqlist.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index e2220a64fd..89b1af81d0 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -144,15 +144,19 @@ SEXP uniqlist(SEXP l, SEXP order) } SEXP uniqlengths(SEXP x, SEXP n) { - // seems very similar to rbindlist.c:uniq_lengths. TODO: centralize into common function - if (TYPEOF(x) != INTSXP) error(_("Input argument 'x' to 'uniqlengths' must be an integer vector")); - if (TYPEOF(n) != INTSXP || length(n) != 1) error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1")); + if (!isInteger(x)) + error(_("Input argument 'x' to 'uniqlengths' must be an integer vector")); + if (!isInteger(x) || length(n)!=1 || INTEGER(n)[0]==NA_INTEGER) + error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA")); R_len_t len = length(x); SEXP ans = PROTECT(allocVector(INTSXP, len)); - for (R_len_t i=1; i0) INTEGER(ans)[len-1] = INTEGER(n)[0] - INTEGER(x)[len-1] + 1; + if (len>0) + ansp[len-1] = INTEGER(n)[0] - xp[len-1] + 1; UNPROTECT(1); return(ans); } From 96ec1a2390bd775e6d1c707961488ab6b8a483f6 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 10 Apr 2020 19:45:54 +0100 Subject: [PATCH 03/15] refactor code for easy folding better codecov more local variable declaration more meaningful variable names better int64 class inheritance check --- src/uniqlist.c | 120 ++++++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 57 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index 89b1af81d0..7c582e989e 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -1,5 +1,29 @@ #include "data.table.h" +/* uniqlist macros */ +#define COMPARE1 \ +prev = *vd; \ +for (int i=1; i=isize) { \ + isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); \ + iidx = Realloc(iidx, isize, int); \ + } \ + } \ + prev = elem; \ +} + // DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes // Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up) SEXP uniqlist(SEXP l, SEXP order) @@ -11,48 +35,26 @@ SEXP uniqlist(SEXP l, SEXP order) // (maximum length the number of rows) and the length returned in anslen. // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported. // DONE: ans is now grown - if (!isNewList(l)) error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov + if (!isNewList(l)) + error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov + R_len_t ncol = length(l); R_len_t nrow = length(VECTOR_ELT(l,0)); - if (!isInteger(order)) error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov - if (LENGTH(order)<1) error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov - if (LENGTH(order)>1 && LENGTH(order)!=nrow) error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov + if (!isInteger(order)) + error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov + if (LENGTH(order)<1) + error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov + if (LENGTH(order)>1 && LENGTH(order)!=nrow) + error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov bool via_order = INTEGER(order)[0] != -1; // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows unsigned long long *ulv; // for numeric check speed-up - SEXP v, ans; - R_len_t len, thisi, previ, isize=1000; + R_len_t isize=1000, len=1; int *iidx = Calloc(isize, int); // for 'idx' - len = 1; iidx[0] = 1; // first row is always the first of the first group - if (ncol==1) { - -#define COMPARE1 \ - prev = *vd; \ - for (int i=1; i=isize) { \ - isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); \ - iidx = Realloc(iidx, isize, int); \ - } \ - } \ - prev = elem; \ - } - SEXP v = VECTOR_ELT(l,0); - int *o = INTEGER(order); // only used when via_order is true + const int *o = INTEGER(order); // only used when via_order is true switch(TYPEOF(v)) { case INTSXP : case LGLSXP : { const int *vd=INTEGER(v); @@ -78,7 +80,7 @@ SEXP uniqlist(SEXP l, SEXP order) const uint64_t *vd=(const uint64_t *)REAL(v); uint64_t prev, elem; // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine - if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) { + if (getNumericRounding_C()==0 /*default*/ || Rinherits(v,char_integer64)) { if (via_order) { COMPARE1_VIA_ORDER COMPARE2 } else { @@ -92,51 +94,55 @@ SEXP uniqlist(SEXP l, SEXP order) } } } break; - default : - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + default : { + error(_("Type '%s' not supported"), type2char(TYPEOF(v))); } - } else { - // ncol>1 - thisi = via_order ? INTEGER(order)[0]-1 : 0; + } + } else { // ncol>1 + R_len_t previ, thisi = via_order ? INTEGER(order)[0]-1 : 0; bool *i64 = (bool *)R_alloc(ncol, sizeof(bool)); - for (int i=0; i=0 && b) { - v=VECTOR_ELT(l,j); + bool same = true; // flag to indicate if the values in a row are same as the previous row, if flag false then we can move on to next group + while (--j>=0 && same) { + SEXP v = VECTOR_ELT(l,j); switch (TYPEOF(v)) { - case INTSXP : case LGLSXP : // NA_INTEGER==NA_LOGICAL checked in init.c - b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break; - case STRSXP : + case INTSXP : case LGLSXP : { // NA_INTEGER==NA_LOGICAL checked in init.c + same = INTEGER(v)[thisi]==INTEGER(v)[previ]; + } break; + case STRSXP : { // fix for #469, when key is set, duplicated calls uniqlist, where encoding // needs to be taken care of. - b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings. - case REALSXP : + same = ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); + } break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings. + case REALSXP : { ulv = (unsigned long long *)REAL(v); - b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) - if (!b && !i64[j]) { - b = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ); + same = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) + if (!same && !i64[j]) { + same = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ); // could store LHS for use next time as RHS (to save calling dtwiddle twice). However: i) there could be multiple double columns so vector of RHS would need // to be stored, ii) many short-circuit early before the if (!b) anyway (negating benefit) and iii) we may not have needed LHS this time so logic would be complex. } - break; - default : - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + } break; + default : { + error(_("Type '%s' not supported"), type2char(TYPEOF(v))); + } } } - if (!b) { + if (!same) { iidx[len++] = i+1; if (len >= isize) { isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); iidx = Realloc(iidx, isize, int); } - } + } // almost like COMPARE2 but no last line: prev = elem } } - PROTECT(ans = allocVector(INTSXP, len)); + SEXP ans = PROTECT(allocVector(INTSXP, len)); memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues Free(iidx); UNPROTECT(1); From ac8e6b8c17dba7ba36b998a49d70d1ebe555c01f Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 00:02:24 +0100 Subject: [PATCH 04/15] exporting uniq function (uniqlist), #900 rework edge cases, that were not being used, to be more consistent --- NAMESPACE | 1 + R/uniqlist.R | 39 ++++++++++++++++++++------------- man/uniq.Rd | 35 +++++++++++++++++++++++++++++ src/data.table.h | 2 +- src/init.c | 2 +- src/uniqlist.c | 57 ++++++++++++++++++++++++------------------------ 6 files changed, 90 insertions(+), 46 deletions(-) create mode 100644 man/uniq.Rd diff --git a/NAMESPACE b/NAMESPACE index c2c095a1d8..fb8a09c294 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(uniq) S3method("[", data.table) S3method("[<-", data.table) diff --git a/R/uniqlist.R b/R/uniqlist.R index b0c1c9fdd0..1e0b0e35f0 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -1,18 +1,28 @@ +nrow2 = function(x) { + if (!length(x)) return(0L) + if (is.data.table(x)) nrow(x) else if (is.list(x)) length(x[[1L]]) else stop("nrow2 expects data.table or list") +} -uniqlist = function (l, order = -1L) -{ - # Assumes input list is ordered by each list item (or by 'order' if supplied), and that all list elements are the same length - # Finds the non-duplicate rows. Was called duplist but now grows vector - doesn't over-allocate result vector and - # is >2x times faster on numeric types - # TO DO: Possibly reinstate reverse argument : - # FALSE works in the usual duplicated() way, the first in a sequence of dups, will be FALSE - # TRUE has the last in a sequence of dups FALSE (so you can keep the last if that's required) - # l = list(...) - if (!is.list(l)) - stop("l not type list") - if (!length(l)) return(list(0L)) - ans = .Call(Cuniqlist, l, as.integer(order)) - ans +uniqlist = function (l, order=integer()) { + # used in "[.data.table" when doing groupby (!byjoin) to find the groups using byval + # (length(byval) && length(byval[[1L]])) && (bysameorder || byindex) + # and in duplicated.data.table when + # haskey(x) && length(by) <= length(key(x)) && all(head(key(x), length(by)) == by) + uniq(l, order, internal=TRUE) +} + +uniq = function(x, order=integer(), internal=FALSE) { + if (!isTRUE(internal) && !missing(order)) { + if (!is.integer(order)) + stop("'order' must be an integer") + if (length(order) != nrow2(x)) + stop("'order' must be same length as nrow of 'x'") + if (anyDuplicated(order)) + stop("'order' should not contain duplicates") + if (anyNA(order)) + stop("'order' should not contain NAs") + } + .Call(Cuniq, x, order) } # implemented for returning the lengths of groups obtained from uniqlist (for internal use only) @@ -21,4 +31,3 @@ uniqlengths = function(x, len) { ans = .Call(Cuniqlengths, as.integer(x), as.integer(len)) ans } - diff --git a/man/uniq.Rd b/man/uniq.Rd new file mode 100644 index 0000000000..c924b694c9 --- /dev/null +++ b/man/uniq.Rd @@ -0,0 +1,35 @@ +\name{uniq} +\alias{uniq} +\alias{uniqlist} +\title{ Consecutively unique rows } +\description{ + Finds the consecutively unique rows. +} +\usage{ +uniq(x, order, internal=FALSE) +} +\arguments{ + \item{x}{ data.table type object. } + \item{order}{ integer vector order of \code{x}. } + \item{internal}{ logical flag, escapes checks of \code{order} argument. } +} +\details{ + Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row. + Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, sorted, no duplicates, no NAs). +} +\value{ + Integer vector corresponding to rows which are consecutively unique. +} +\seealso{ \code{\link{data.table}}, \code{\link{rleid}} } +\examples{ +uniq(data.table()) +uniq(data.table(x=integer())) +uniq(data.table(x=integer(), y=integer())) +uniq(data.table(x=1L)) +uniq(data.table(x=1L, y=1L)) +uniq(data.table(x=1:2)) +uniq(data.table(x=1:2, y=1:2)) +uniq(data.table(x=1:2)[c(1L,1:2)]) +uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]) +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 90ff7fb6fc..646a32cc1a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val); SEXP vecseq(SEXP x, SEXP len, SEXP clamp); // uniqlist.c -SEXP uniqlist(SEXP l, SEXP order); +SEXP uniq(SEXP x, SEXP order); SEXP uniqlengths(SEXP x, SEXP n); // chmatch.c diff --git a/src/init.c b/src/init.c index aed2da3dbd..b5b5da2bf1 100644 --- a/src/init.c +++ b/src/init.c @@ -149,7 +149,7 @@ R_CallMethodDef callMethods[] = { {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1}, {"Cfmelt", (DL_FUNC) &fmelt, -1}, {"Cfcast", (DL_FUNC) &fcast, -1}, -{"Cuniqlist", (DL_FUNC) &uniqlist, -1}, +{"Cuniq", (DL_FUNC) &uniq, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, {"Cforder", (DL_FUNC) &forder, -1}, {"Cfsorted", (DL_FUNC) &fsorted, -1}, diff --git a/src/uniqlist.c b/src/uniqlist.c index 7c582e989e..4cf93a5232 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -1,6 +1,6 @@ #include "data.table.h" -/* uniqlist macros */ +/* uniq macros */ #define COMPARE1 \ prev = *vd; \ for (int i=1; i 2x speed-up) -SEXP uniqlist(SEXP l, SEXP order) -{ - // This works like UNIX uniq as referred to by ?base::unique; i.e., it - // drops immediately repeated rows but doesn't drop duplicates of any - // previous row. Unless, order is provided, then it also drops any previous - // row. l must be a list of same length vectors ans is allocated first - // (maximum length the number of rows) and the length returned in anslen. - // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported. - // DONE: ans is now grown - if (!isNewList(l)) - error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov - - R_len_t ncol = length(l); - R_len_t nrow = length(VECTOR_ELT(l,0)); +SEXP uniq(SEXP x, SEXP order) { + if (!isNewList(x)) + error(_("'x' must be a data.table type object")); if (!isInteger(order)) - error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov - if (LENGTH(order)<1) - error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov - if (LENGTH(order)>1 && LENGTH(order)!=nrow) - error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov - bool via_order = INTEGER(order)[0] != -1; // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows - - unsigned long long *ulv; // for numeric check speed-up + error(_("'order' must be an integer, you should be more careful when using internal=TRUE")); + const bool verbose = GetVerbose(); + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + R_len_t ncol = length(x); + R_len_t nrow = length(VECTOR_ELT(x,0)); + if (!ncol || !nrow) { + SEXP ans = PROTECT(allocVector(INTSXP, 0)); + if (verbose) + Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic); + UNPROTECT(1); + return(ans); + } + bool via_order = LENGTH(order) > 0; + if (via_order && LENGTH(order)!=nrow) + error(_("uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); + unsigned long long *ulv; // for numeric check speed-up (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up) R_len_t isize=1000, len=1; int *iidx = Calloc(isize, int); // for 'idx' iidx[0] = 1; // first row is always the first of the first group if (ncol==1) { - SEXP v = VECTOR_ELT(l,0); + SEXP v = VECTOR_ELT(x,0); const int *o = INTEGER(order); // only used when via_order is true switch(TYPEOF(v)) { case INTSXP : case LGLSXP : { @@ -102,14 +99,14 @@ SEXP uniqlist(SEXP l, SEXP order) R_len_t previ, thisi = via_order ? INTEGER(order)[0]-1 : 0; bool *i64 = (bool *)R_alloc(ncol, sizeof(bool)); for (int i=0; i=0 && same) { - SEXP v = VECTOR_ELT(l,j); + SEXP v = VECTOR_ELT(x,j); switch (TYPEOF(v)) { case INTSXP : case LGLSXP : { // NA_INTEGER==NA_LOGICAL checked in init.c same = INTEGER(v)[thisi]==INTEGER(v)[previ]; @@ -142,9 +139,11 @@ SEXP uniqlist(SEXP l, SEXP order) } // almost like COMPARE2 but no last line: prev = elem } } - SEXP ans = PROTECT(allocVector(INTSXP, len)); + SEXP ans = PROTECT(allocVector(INTSXP, len)); memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues Free(iidx); + if (verbose) + Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic); UNPROTECT(1); return(ans); } From 4a4fc1d5196100b040593816b3be962960ad7342 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 11:08:26 +0100 Subject: [PATCH 05/15] extend man for order arg example, fix old tests, breaking change? --- inst/tests/tests.Rraw | 5 ++--- man/uniq.Rd | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..b03fed4b07 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13266,9 +13266,8 @@ test(1962.0091, anyDuplicated(DT, by = NULL), 2L) test(1962.0092, uniqueN(DT, by = NULL), 3L) ## uniqlist.R -test(1962.010, uniqlist(1:5), - error = 'not type list') -test(1962.011, uniqlist(list()), list(0L)) +test(1962.010, uniqlist(1:5), error = 'must be a data.table type') +test(1962.011, uniqlist(list()), integer()) ## merge.R DT1 = data.table(a = 1:3, V = 'a') diff --git a/man/uniq.Rd b/man/uniq.Rd index c924b694c9..389a9c5aaa 100644 --- a/man/uniq.Rd +++ b/man/uniq.Rd @@ -31,5 +31,13 @@ uniq(data.table(x=1:2)) uniq(data.table(x=1:2, y=1:2)) uniq(data.table(x=1:2)[c(1L,1:2)]) uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]) + +# order argument +x = data.table(id = 1:8, v = rep(1:2, each=4)) +uniq(x[,"v"]) +x = x[c(1:2,7:8,3:4,5:6)] +uniq(x[,"v"]) +o = order(x$id) +uniq(x[,"v"], order=o) } \keyword{ data } From f5bcbf3c575ce4f5b042d13285931edf94cf48ef Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 11:18:05 +0100 Subject: [PATCH 06/15] internal uniqlist reverted to match previous tests --- R/uniqlist.R | 9 ++++++++- inst/tests/tests.Rraw | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/uniqlist.R b/R/uniqlist.R index 1e0b0e35f0..c5237169a6 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -3,11 +3,18 @@ nrow2 = function(x) { if (is.data.table(x)) nrow(x) else if (is.list(x)) length(x[[1L]]) else stop("nrow2 expects data.table or list") } -uniqlist = function (l, order=integer()) { +uniqlist = function (l, order = -1L) { # used in "[.data.table" when doing groupby (!byjoin) to find the groups using byval # (length(byval) && length(byval[[1L]])) && (bysameorder || byindex) # and in duplicated.data.table when # haskey(x) && length(by) <= length(key(x)) && all(head(key(x), length(by)) == by) + + # those are only for backward compatibility, probably not really used anywhere, will keep 1962.010 and 1962.011 happy + if (!is.list(l)) stop("l not type list") + if (!length(l)) return(list(0L)) + # this is for compatibility to new uniq.c code + if (identical(order, -1L)) order = integer() + uniq(l, order, internal=TRUE) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b03fed4b07..7cc6819e8f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13266,8 +13266,9 @@ test(1962.0091, anyDuplicated(DT, by = NULL), 2L) test(1962.0092, uniqueN(DT, by = NULL), 3L) ## uniqlist.R -test(1962.010, uniqlist(1:5), error = 'must be a data.table type') -test(1962.011, uniqlist(list()), integer()) +test(1962.010, uniqlist(1:5), + error = 'not type list') +test(1962.011, uniqlist(list()), list(0L)) ## merge.R DT1 = data.table(a = 1:3, V = 'a') From 02415a5057f87a1ce2fcccc7a57ffcd4c6295237 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 14:18:43 +0100 Subject: [PATCH 07/15] uniq unit tests, uniqlist as well, document segfault when internal=T --- R/uniqlist.R | 19 +++++++++------ inst/tests/tests.Rraw | 57 +++++++++++++++++++++++++++++++++++++++++++ src/uniqlist.c | 6 +++++ 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/R/uniqlist.R b/R/uniqlist.R index c5237169a6..a4869fdc92 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -21,13 +21,18 @@ uniqlist = function (l, order = -1L) { uniq = function(x, order=integer(), internal=FALSE) { if (!isTRUE(internal) && !missing(order)) { if (!is.integer(order)) - stop("'order' must be an integer") - if (length(order) != nrow2(x)) - stop("'order' must be same length as nrow of 'x'") - if (anyDuplicated(order)) - stop("'order' should not contain duplicates") - if (anyNA(order)) - stop("'order' should not contain NAs") + order = as.integer(order) + if (length(order)) { + len = nrow2(x) + if (length(order) != len) + stop("'order' must be same length as nrow of 'x'") + if (anyDuplicated(order)) + stop("'order' must not contain duplicates") + if (anyNA(order)) + stop("'order' must not contain NAs") + if (any(!between(order, 1L, len))) + stop("'order' must be in range of 1:nrow(x)") + } } .Call(Cuniq, x, order) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..fcdc8e0cd8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -54,6 +54,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist + nrow2 = data.table:::nrow2 which_ = data.table:::which_ which.first = data.table:::which.first which.last = data.table:::which.last @@ -3885,6 +3886,62 @@ if (.Machine$sizeof.longdouble == 16) { test(1149.1, forderv(integer(0)), integer(0)) test(1149.2, forderv(numeric(0)), integer(0)) +# test uniq (uniqlist) #900 ## test number 1150 looks to be unused so taking over +test(1150.01, uniq(data.table()), integer()) # examples +test(1150.02, uniq(data.table(x=integer())), integer()) +test(1150.03, uniq(data.table(x=integer(), y=integer())), integer()) +test(1150.04, uniq(data.table(x=1L)), 1L) +test(1150.05, uniq(data.table(x=1L, y=1L)), 1L) +test(1150.06, uniq(data.table(x=1:2)), 1:2) +test(1150.07, uniq(data.table(x=1:2, y=1:2)), 1:2) +test(1150.08, uniq(data.table(x=1:2)[c(1L,1:2)]), c(1L,3L)) +test(1150.09, uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]), c(1L,3L)) +x = data.table(id = 1:8, v = rep(1:2, each=4)) # 'order' argument example +test(1150.11, uniq(x[,"v"]), c(1L,5L)) +x = x[c(1:2,7:8,3:4,5:6)] +test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L)) +o = order(x$id) +test(1150.13, uniq(x[,"v"], order=o), c(1L,5L)) +test(1150.21, uniq(1:5), error="must be a data.table type") # error code cov +test(1150.22, uniq(list(b = as.raw(1:5))), error="not supported") +test(1150.23, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported") +test(1150.31, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L)) # 'order' exceptions internal=FALSE +test(1150.32, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow of 'x'") +test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), error="must not contain duplicates") +test(1150.34, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must not contain NAs") +test(1150.35, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range") +test(1150.36, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range") +test(1150.41, uniq(x[,"v"], order=as.numeric(o), internal=TRUE), error="must be an integer") # 'order' exceptions internal=TRUE +test(1150.42, uniq(x[,"v"], order=o[-1L], internal=TRUE), error="has been passed length.*nrow") +test(1150.43, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal +#test(1150.441, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), internal=TRUE), error="must not contain NAs") ## unhandled bad usage! thus internal # segfault, see uniqlist.c:COMPARE1_VIA_ORDER +test(1150.442, uniq(x[c(o[1:6],o[c(NA,7L)]), "v"], internal=TRUE), c(1L,5L,7L,8L)) ## this is ok as are not using 'order' arg +test(1150.45, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal +test(1150.46, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal +x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behaviour of 'order' special case +o = order(x$id) +test(1150.51, uniq(x[,"v"], order=o), c(1L,5L)) +test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L)) +test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow") +test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range") +test(1150.61, uniq(x[,"v"], order=o, internal=TRUE), c(1L,5L)) +test(1150.62, uniq(x[,"v"], order=integer(), internal=TRUE), c(1L,5L)) +test(1150.63, uniq(x[,"v"], order=-1L, internal=TRUE), error="has been passed length.*nrow") +test(1150.64, uniq(x[1L,"v"], order=-1L, internal=TRUE), 1L) ## unhandled bad usage! thus internal +test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L)) +test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L)) +test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L)) +test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L) +op = options(datatable.verbose=TRUE) +test(1150.91, uniq(data.table()), integer(), output="took") +test(1150.92, uniq(data.table(x=integer())), integer(), output="took") +test(1150.93, uniq(data.table(x=1:2)), 1:2, output="took") +options(op) +test(1150.96, nrow2(list()), 0L) # nrow2 helper tests +test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L) +test(1150.98, nrow2(data.table(x=1:10, b=1:5)), 10L) +test(1150.99, nrow2(1:5), error="nrow2 expects data.table or list") + # test uniqlengths set.seed(45) x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) diff --git a/src/uniqlist.c b/src/uniqlist.c index 4cf93a5232..c15ab2b5c9 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -7,6 +7,12 @@ for (int i=1; i nrow) error("internal error: *++o not in range of 1:nrow(x)\n"); elem = vd[oi -1]; + */ #define COMPARE1_VIA_ORDER \ prev = vd[*o -1]; \ for (int i=1; i Date: Sat, 11 Apr 2020 14:31:33 +0100 Subject: [PATCH 08/15] uniqueNlogical codecov, uniqlengths test num to int --- R/uniqlist.R | 2 +- inst/tests/tests.Rraw | 2 ++ src/uniqlist.c | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/uniqlist.R b/R/uniqlist.R index a4869fdc92..2f706e8a4d 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -12,7 +12,7 @@ uniqlist = function (l, order = -1L) { # those are only for backward compatibility, probably not really used anywhere, will keep 1962.010 and 1962.011 happy if (!is.list(l)) stop("l not type list") if (!length(l)) return(list(0L)) - # this is for compatibility to new uniq.c code + # this is for compatibility to new uniq C code if (identical(order, -1L)) order = integer() uniq(l, order, internal=TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fcdc8e0cd8..fc03f27181 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3950,6 +3950,7 @@ o1 <- uniqlist(list(x), ox) test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) o1 <- uniqlist(list(x)) test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +test(1151.3, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(as.numeric(o1), as.numeric(length(x)))) rm(list=c("x","ox","o1")) gc() @@ -6786,6 +6787,7 @@ test(1475.13, uniqueN(NA), 1L) test(1475.14, uniqueN(NA, na.rm=TRUE), 0L) test(1475.15, uniqueN(logical()), 0L) test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L) +test(1475.17, uniqueN(TRUE, na.rm=NA), error="must be TRUE or FALSE") # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) diff --git a/src/uniqlist.c b/src/uniqlist.c index c15ab2b5c9..88a8b1641e 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -156,9 +156,9 @@ SEXP uniq(SEXP x, SEXP order) { SEXP uniqlengths(SEXP x, SEXP n) { if (!isInteger(x)) - error(_("Input argument 'x' to 'uniqlengths' must be an integer vector")); + error(_("internal error: Input argument 'x' to 'uniqlengths' must be an integer vector")); // # nocov if (!isInteger(x) || length(n)!=1 || INTEGER(n)[0]==NA_INTEGER) - error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA")); + error(_("internal error: Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA")); // # nocov R_len_t len = length(x); SEXP ans = PROTECT(allocVector(INTSXP, len)); const int *xp = INTEGER(x); @@ -362,7 +362,7 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul SEXP uniqueNlogical(SEXP x, SEXP narmArg) { // single pass; short-circuit and return as soon as all 3 values are found if (!isLogical(x)) - error(_("x is not a logical vector")); + error(_("internal error: x is not a logical vector")); // # nocov if (!IS_TRUE_OR_FALSE(narmArg)) error(_("na.rm must be TRUE or FALSE")); bool narm = LOGICAL(narmArg)[0]==1; From fef3aece6207c3ee76255eeae8606131ab0fc002 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 14:48:46 +0100 Subject: [PATCH 09/15] news and more manual --- NEWS.md | 2 ++ man/uniq.Rd | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..db206de8ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,8 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. New function `uniq` has been exported (previously known as `uniqlist` when used internally). Function is useful to find consecutively unique rows, [#900](https://github.com/Rdatatable/data.table/issues/900). Thanks to @anhqle for feature request. For more details about usage see function manual [`?uniq`](https://rdatatable.gitlab.io/data.table/library/data.table/html/uniq.html). + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/man/uniq.Rd b/man/uniq.Rd index 389a9c5aaa..c8ecd2c44c 100644 --- a/man/uniq.Rd +++ b/man/uniq.Rd @@ -15,7 +15,7 @@ uniq(x, order, internal=FALSE) } \details{ Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row. - Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, sorted, no duplicates, no NAs). + Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, no duplicates, no NAs). Anyone setting \code{internal} to \code{TRUE} is obligated to check the \code{order} argument beforehand, otherwise \emph{segfault} exception is likely to happen and terminate R process abnormally. } \value{ Integer vector corresponding to rows which are consecutively unique. @@ -32,12 +32,16 @@ uniq(data.table(x=1:2, y=1:2)) uniq(data.table(x=1:2)[c(1L,1:2)]) uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]) -# order argument +# 'order' argument x = data.table(id = 1:8, v = rep(1:2, each=4)) uniq(x[,"v"]) x = x[c(1:2,7:8,3:4,5:6)] uniq(x[,"v"]) o = order(x$id) uniq(x[,"v"], order=o) + +if (!anyDuplicated(o) && !anyNA(o) && between(o, 1L, nrow(x))) { + uniq(x[,"v"], order=o, internal=TRUE) +} } \keyword{ data } From bcdc65a8d6cd017c4abffa04dc244e166bef3d93 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 14:50:59 +0100 Subject: [PATCH 10/15] if needs all, unlike stopifnot, fix --- man/uniq.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/uniq.Rd b/man/uniq.Rd index c8ecd2c44c..63dccf2953 100644 --- a/man/uniq.Rd +++ b/man/uniq.Rd @@ -40,7 +40,7 @@ uniq(x[,"v"]) o = order(x$id) uniq(x[,"v"], order=o) -if (!anyDuplicated(o) && !anyNA(o) && between(o, 1L, nrow(x))) { +if (!anyDuplicated(o) && !anyNA(o) && all(between(o, 1L, nrow(x)))) { uniq(x[,"v"], order=o, internal=TRUE) } } From 58beb639aa65b1cb529738e84e00fae05e8429e0 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 19 Apr 2020 20:43:50 +0100 Subject: [PATCH 11/15] segfault proof public api --- R/uniqlist.R | 33 +++++++++++---------- inst/tests/tests.Rraw | 52 ++++++++++++++++++-------------- man/uniq.Rd | 17 ++++++----- src/data.table.h | 2 +- src/uniqlist.c | 69 ++++++++++++++++++++++++++++++++----------- 5 files changed, 110 insertions(+), 63 deletions(-) diff --git a/R/uniqlist.R b/R/uniqlist.R index 2f706e8a4d..fedba89646 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -15,26 +15,27 @@ uniqlist = function (l, order = -1L) { # this is for compatibility to new uniq C code if (identical(order, -1L)) order = integer() - uniq(l, order, internal=TRUE) + funiq(l, order, safe=FALSE) } -uniq = function(x, order=integer(), internal=FALSE) { - if (!isTRUE(internal) && !missing(order)) { - if (!is.integer(order)) +uniq = function(x, order=integer()) { + if (!is.list(x)) + stop("'x' must be a data.table type object"); + if (!is.integer(order)) { + if (is.numeric(order)) order = as.integer(order) - if (length(order)) { - len = nrow2(x) - if (length(order) != len) - stop("'order' must be same length as nrow of 'x'") - if (anyDuplicated(order)) - stop("'order' must not contain duplicates") - if (anyNA(order)) - stop("'order' must not contain NAs") - if (any(!between(order, 1L, len))) - stop("'order' must be in range of 1:nrow(x)") - } + else + stop("'order' must be an integer") } - .Call(Cuniq, x, order) + if (length(order) && length(order)!=nrow2(x)) + stop("'order' must be same length as nrow of 'x'") + funiq(x, order, safe=TRUE) +} + +# use safe=F when you are sure that 'order' is in 1:nrow(x) +# otherwise it segfaults, thus internal +funiq = function(x, order=integer(), safe=FALSE) { + .Call(Cuniq, x, order, safe) } # implemented for returning the lengths of groups obtained from uniqlist (for internal use only) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fc03f27181..0a0699b53e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -54,6 +54,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist + funiq = data.table:::funiq nrow2 = data.table:::nrow2 which_ = data.table:::which_ which.first = data.table:::which.first @@ -3902,32 +3903,38 @@ x = x[c(1:2,7:8,3:4,5:6)] test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L)) o = order(x$id) test(1150.13, uniq(x[,"v"], order=o), c(1L,5L)) -test(1150.21, uniq(1:5), error="must be a data.table type") # error code cov -test(1150.22, uniq(list(b = as.raw(1:5))), error="not supported") -test(1150.23, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported") -test(1150.31, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L)) # 'order' exceptions internal=FALSE -test(1150.32, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow of 'x'") -test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), error="must not contain duplicates") -test(1150.34, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must not contain NAs") -test(1150.35, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range") -test(1150.36, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range") -test(1150.41, uniq(x[,"v"], order=as.numeric(o), internal=TRUE), error="must be an integer") # 'order' exceptions internal=TRUE -test(1150.42, uniq(x[,"v"], order=o[-1L], internal=TRUE), error="has been passed length.*nrow") -test(1150.43, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal -#test(1150.441, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), internal=TRUE), error="must not contain NAs") ## unhandled bad usage! thus internal # segfault, see uniqlist.c:COMPARE1_VIA_ORDER -test(1150.442, uniq(x[c(o[1:6],o[c(NA,7L)]), "v"], internal=TRUE), c(1L,5L,7L,8L)) ## this is ok as are not using 'order' arg -test(1150.45, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal -test(1150.46, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal -x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behaviour of 'order' special case +x = data.table(id = 1:8, v = rep(1:2, each=4), w=1L) +o = order(x$id) +test(1150.21, uniq(1:5), error="must be a data.table type object") +test(1150.22, funiq(1:5), error="internal.*must be a data.table type object") +test(1150.23, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L)) +test(1150.24, funiq(x[,"v"], order=as.numeric(o)), error="internal.*must be an integer") +test(1150.25, uniq(x[,"v"], order="a"), error="must be an integer") +test(1150.26, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow") +test(1150.27, funiq(x[,"v"], order=o[-1L]), error="internal.*has been passed length.*nrow") +test(1150.28, uniq(list(b = as.raw(1:5))), error="not supported") +test(1150.29, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported") +test(1150.30, funiq(x[,"v"], safe=NA), error="must be TRUE or FALSE") +test(1150.31, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must be in range") +test(1150.32, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range") +test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range") +test(1150.34, uniq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)])), error="must be in range") +test(1150.35, funiq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") ## trying safe=F would segfault! +test(1150.36, funiq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), safe=TRUE), error="must be in range") +test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error="must be in range") +test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") +test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"]) +test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) +x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case o = order(x$id) test(1150.51, uniq(x[,"v"], order=o), c(1L,5L)) test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L)) test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow") test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range") -test(1150.61, uniq(x[,"v"], order=o, internal=TRUE), c(1L,5L)) -test(1150.62, uniq(x[,"v"], order=integer(), internal=TRUE), c(1L,5L)) -test(1150.63, uniq(x[,"v"], order=-1L, internal=TRUE), error="has been passed length.*nrow") -test(1150.64, uniq(x[1L,"v"], order=-1L, internal=TRUE), 1L) ## unhandled bad usage! thus internal +test(1150.61, funiq(x[,"v"], order=o), c(1L,5L)) +test(1150.62, funiq(x[,"v"], order=integer()), c(1L,5L)) +test(1150.63, funiq(x[,"v"], order=-1L), error="has been passed length.*nrow") +test(1150.64, funiq(x[1L,"v"], order=-1L), error="must be in range") test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L)) test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L)) test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L)) @@ -3935,7 +3942,8 @@ test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L) op = options(datatable.verbose=TRUE) test(1150.91, uniq(data.table()), integer(), output="took") test(1150.92, uniq(data.table(x=integer())), integer(), output="took") -test(1150.93, uniq(data.table(x=1:2)), 1:2, output="took") +test(1150.93, uniq(data.table(x=1L)), 1L, output="took") +test(1150.94, uniq(data.table(x=1:2)), 1:2, output="took") options(op) test(1150.96, nrow2(list()), 0L) # nrow2 helper tests test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L) diff --git a/man/uniq.Rd b/man/uniq.Rd index 63dccf2953..37652b0b58 100644 --- a/man/uniq.Rd +++ b/man/uniq.Rd @@ -1,21 +1,23 @@ \name{uniq} \alias{uniq} \alias{uniqlist} +\alias{funiq} \title{ Consecutively unique rows } \description{ Finds the consecutively unique rows. } \usage{ -uniq(x, order, internal=FALSE) +uniq(x, order=integer()) } \arguments{ \item{x}{ data.table type object. } - \item{order}{ integer vector order of \code{x}. } - \item{internal}{ logical flag, escapes checks of \code{order} argument. } + \item{order}{ integer vector order of \code{x}, must not contain duplicates. } } \details{ Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row. - Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, no duplicates, no NAs). Anyone setting \code{internal} to \code{TRUE} is obligated to check the \code{order} argument beforehand, otherwise \emph{segfault} exception is likely to happen and terminate R process abnormally. +} +\note{ + It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception. } \value{ Integer vector corresponding to rows which are consecutively unique. @@ -37,11 +39,12 @@ x = data.table(id = 1:8, v = rep(1:2, each=4)) uniq(x[,"v"]) x = x[c(1:2,7:8,3:4,5:6)] uniq(x[,"v"]) + o = order(x$id) uniq(x[,"v"], order=o) - -if (!anyDuplicated(o) && !anyNA(o) && all(between(o, 1L, nrow(x)))) { - uniq(x[,"v"], order=o, internal=TRUE) +# or if we are not sure if 'o' has no duplicates +if (!anyDuplicated(o)) { + uniq(x[,"v"], order=o) } } \keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 646a32cc1a..51fe3598ff 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val); SEXP vecseq(SEXP x, SEXP len, SEXP clamp); // uniqlist.c -SEXP uniq(SEXP x, SEXP order); +SEXP uniq(SEXP x, SEXP order, SEXP safe); SEXP uniqlengths(SEXP x, SEXP n); // chmatch.c diff --git a/src/uniqlist.c b/src/uniqlist.c index 88a8b1641e..f0578f0d97 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -1,24 +1,28 @@ #include "data.table.h" /* uniq macros */ + #define COMPARE1 \ prev = *vd; \ for (int i=1; i nrow) error("internal error: *++o not in range of 1:nrow(x)\n"); elem = vd[oi -1]; - */ #define COMPARE1_VIA_ORDER \ prev = vd[*o -1]; \ for (int i=1; i nrow) \ + error("'order' must be in range 1:nrow(x)"); \ + elem = vd[oi -1]; \ + if (elem!=prev + #define COMPARE2 \ ) { \ iidx[len++] = i+1; \ @@ -30,11 +34,15 @@ for (int i=1; i 0; + bool via_order_safe = via_order && LOGICAL(safe)[0]; if (via_order && LENGTH(order)!=nrow) - error(_("uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); + error(_("internal error: uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); + const int *o = INTEGER(order); // only used when via_order[_safe] is true + if (nrow==1) { + if (via_order && o[0]!=1) + error("'order' must be in range 1:nrow(x)"); + SEXP ans = PROTECT(allocVector(INTSXP, 1)); + INTEGER(ans)[0] = 1; + if (verbose) + Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic); + UNPROTECT(1); + return(ans); + } + unsigned long long *ulv; // for numeric check speed-up (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up) R_len_t isize=1000, len=1; int *iidx = Calloc(isize, int); // for 'idx' iidx[0] = 1; // first row is always the first of the first group if (ncol==1) { SEXP v = VECTOR_ELT(x,0); - const int *o = INTEGER(order); // only used when via_order is true switch(TYPEOF(v)) { case INTSXP : case LGLSXP : { const int *vd=INTEGER(v); int prev, elem; - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE COMPARE2 + } else if (via_order) { // ad hoc by (order passed in) COMPARE1_VIA_ORDER COMPARE2 } else { @@ -73,7 +95,9 @@ SEXP uniq(SEXP x, SEXP order) { case STRSXP : { const SEXP *vd=STRING_PTR(v); SEXP prev, elem; - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 + } else if (via_order) { COMPARE1_VIA_ORDER && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 // but most of the time they are equal, so ENC2UTF8 doesn't need to be called } else { COMPARE1 && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 @@ -84,13 +108,17 @@ SEXP uniq(SEXP x, SEXP order) { uint64_t prev, elem; // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine if (getNumericRounding_C()==0 /*default*/ || Rinherits(v,char_integer64)) { - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE COMPARE2 + } else if (via_order) { COMPARE1_VIA_ORDER COMPARE2 } else { COMPARE1 COMPARE2 } } else { - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 + } else if (via_order) { COMPARE1_VIA_ORDER && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 } else { COMPARE1 && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 @@ -102,13 +130,20 @@ SEXP uniq(SEXP x, SEXP order) { } } } else { // ncol>1 - R_len_t previ, thisi = via_order ? INTEGER(order)[0]-1 : 0; + R_len_t previ, thisi = via_order ? o[0]-1 : 0; bool *i64 = (bool *)R_alloc(ncol, sizeof(bool)); for (int i=0; i nrow)) + error("'order' must be in range 1:nrow(x)"); + thisi = oi-1; + } int j = ncol; // the last column varies the most frequently so check that first and work backwards bool same = true; // flag to indicate if the values in a row are same as the previous row, if flag false then we can move on to next group while (--j>=0 && same) { From aed71cf9041ac5f5b6881b8e0cf75d5a1f410c71 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 19 Apr 2020 20:57:59 +0100 Subject: [PATCH 12/15] manual entry align --- man/uniq.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/uniq.Rd b/man/uniq.Rd index 37652b0b58..1a46603f38 100644 --- a/man/uniq.Rd +++ b/man/uniq.Rd @@ -11,7 +11,7 @@ uniq(x, order=integer()) } \arguments{ \item{x}{ data.table type object. } - \item{order}{ integer vector order of \code{x}, must not contain duplicates. } + \item{order}{ integer vector order of \code{x}, must not contain duplicates. } } \details{ Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row. From c93ed24c0471ca20620e8e2cf88522e5e0da17d8 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 19 Apr 2020 21:00:09 +0100 Subject: [PATCH 13/15] another manual entry fix --- man/uniq.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/uniq.Rd b/man/uniq.Rd index 1a46603f38..01d01a5b2a 100644 --- a/man/uniq.Rd +++ b/man/uniq.Rd @@ -17,7 +17,7 @@ uniq(x, order=integer()) Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row. } \note{ - It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception. + It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order}} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception. } \value{ Integer vector corresponding to rows which are consecutively unique. From b589fad0241b6c0e415a3ae925c4fdfed7341508 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 11:18:57 +0100 Subject: [PATCH 14/15] test coverage --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0a0699b53e..7f182d511b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3925,6 +3925,13 @@ test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error= test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"]) test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) +test(1150.41, uniq(data.table(x = c("a","a","b","b","c"))), c(1L,3L,5L)) ## test coverage +old = getNumericRounding() +setNumericRounding(0) +test(1150.42, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L)) +setNumericRounding(2) +test(1150.43, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L)) +setNumericRounding(old) x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case o = order(x$id) test(1150.51, uniq(x[,"v"], order=o), c(1L,5L)) From a09a54f5f382cb19b5825adb591c3c728fcb4da9 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 11:47:29 +0100 Subject: [PATCH 15/15] proper codecov fix --- inst/tests/tests.Rraw | 9 ++++++--- src/uniqlist.c | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7f182d511b..31f7e21092 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3925,12 +3925,15 @@ test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error= test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"]) test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) -test(1150.41, uniq(data.table(x = c("a","a","b","b","c"))), c(1L,3L,5L)) ## test coverage +test(1150.41, uniq(data.table(x = c("a","a","b","b","c")), order=1:5), c(1L,3L,5L)) ## test coverage old = getNumericRounding() setNumericRounding(0) -test(1150.42, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L)) +test(1150.42, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L)) setNumericRounding(2) -test(1150.43, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L)) +test(1150.43, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L)) +if (test_bit64) { + test(1150.44, uniq(data.table(x = as.integer64(c(1,1,2,2,3))), order=1:5), c(1L,3L,5L)) +} setNumericRounding(old) x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case o = order(x$id) diff --git a/src/uniqlist.c b/src/uniqlist.c index f0578f0d97..1a9e27489e 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -107,7 +107,8 @@ SEXP uniq(SEXP x, SEXP order, SEXP safe) { const uint64_t *vd=(const uint64_t *)REAL(v); uint64_t prev, elem; // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine - if (getNumericRounding_C()==0 /*default*/ || Rinherits(v,char_integer64)) { + if (getNumericRounding_C()==0 /*default*/ || + Rinherits(v,char_integer64)) { if (via_order_safe) { COMPARE1_VIA_ORDER_SAFE COMPARE2 } else if (via_order) {