diff --git a/NAMESPACE b/NAMESPACE index c2c095a1d8..fb8a09c294 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(uniq) S3method("[", data.table) S3method("[<-", data.table) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..db206de8ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,8 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. New function `uniq` has been exported (previously known as `uniqlist` when used internally). Function is useful to find consecutively unique rows, [#900](https://github.com/Rdatatable/data.table/issues/900). Thanks to @anhqle for feature request. For more details about usage see function manual [`?uniq`](https://rdatatable.gitlab.io/data.table/library/data.table/html/uniq.html). + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/R/uniqlist.R b/R/uniqlist.R index b0c1c9fdd0..fedba89646 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -1,18 +1,41 @@ +nrow2 = function(x) { + if (!length(x)) return(0L) + if (is.data.table(x)) nrow(x) else if (is.list(x)) length(x[[1L]]) else stop("nrow2 expects data.table or list") +} -uniqlist = function (l, order = -1L) -{ - # Assumes input list is ordered by each list item (or by 'order' if supplied), and that all list elements are the same length - # Finds the non-duplicate rows. Was called duplist but now grows vector - doesn't over-allocate result vector and - # is >2x times faster on numeric types - # TO DO: Possibly reinstate reverse argument : - # FALSE works in the usual duplicated() way, the first in a sequence of dups, will be FALSE - # TRUE has the last in a sequence of dups FALSE (so you can keep the last if that's required) - # l = list(...) - if (!is.list(l)) - stop("l not type list") - if (!length(l)) return(list(0L)) - ans = .Call(Cuniqlist, l, as.integer(order)) - ans +uniqlist = function (l, order = -1L) { + # used in "[.data.table" when doing groupby (!byjoin) to find the groups using byval + # (length(byval) && length(byval[[1L]])) && (bysameorder || byindex) + # and in duplicated.data.table when + # haskey(x) && length(by) <= length(key(x)) && all(head(key(x), length(by)) == by) + + # those are only for backward compatibility, probably not really used anywhere, will keep 1962.010 and 1962.011 happy + if (!is.list(l)) stop("l not type list") + if (!length(l)) return(list(0L)) + # this is for compatibility to new uniq C code + if (identical(order, -1L)) order = integer() + + funiq(l, order, safe=FALSE) +} + +uniq = function(x, order=integer()) { + if (!is.list(x)) + stop("'x' must be a data.table type object"); + if (!is.integer(order)) { + if (is.numeric(order)) + order = as.integer(order) + else + stop("'order' must be an integer") + } + if (length(order) && length(order)!=nrow2(x)) + stop("'order' must be same length as nrow of 'x'") + funiq(x, order, safe=TRUE) +} + +# use safe=F when you are sure that 'order' is in 1:nrow(x) +# otherwise it segfaults, thus internal +funiq = function(x, order=integer(), safe=FALSE) { + .Call(Cuniq, x, order, safe) } # implemented for returning the lengths of groups obtained from uniqlist (for internal use only) @@ -21,4 +44,3 @@ uniqlengths = function(x, len) { ans = .Call(Cuniqlengths, as.integer(x), as.integer(len)) ans } - diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..31f7e21092 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -54,6 +54,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist + funiq = data.table:::funiq + nrow2 = data.table:::nrow2 which_ = data.table:::which_ which.first = data.table:::which.first which.last = data.table:::which.last @@ -3885,6 +3887,79 @@ if (.Machine$sizeof.longdouble == 16) { test(1149.1, forderv(integer(0)), integer(0)) test(1149.2, forderv(numeric(0)), integer(0)) +# test uniq (uniqlist) #900 ## test number 1150 looks to be unused so taking over +test(1150.01, uniq(data.table()), integer()) # examples +test(1150.02, uniq(data.table(x=integer())), integer()) +test(1150.03, uniq(data.table(x=integer(), y=integer())), integer()) +test(1150.04, uniq(data.table(x=1L)), 1L) +test(1150.05, uniq(data.table(x=1L, y=1L)), 1L) +test(1150.06, uniq(data.table(x=1:2)), 1:2) +test(1150.07, uniq(data.table(x=1:2, y=1:2)), 1:2) +test(1150.08, uniq(data.table(x=1:2)[c(1L,1:2)]), c(1L,3L)) +test(1150.09, uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]), c(1L,3L)) +x = data.table(id = 1:8, v = rep(1:2, each=4)) # 'order' argument example +test(1150.11, uniq(x[,"v"]), c(1L,5L)) +x = x[c(1:2,7:8,3:4,5:6)] +test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L)) +o = order(x$id) +test(1150.13, uniq(x[,"v"], order=o), c(1L,5L)) +x = data.table(id = 1:8, v = rep(1:2, each=4), w=1L) +o = order(x$id) +test(1150.21, uniq(1:5), error="must be a data.table type object") +test(1150.22, funiq(1:5), error="internal.*must be a data.table type object") +test(1150.23, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L)) +test(1150.24, funiq(x[,"v"], order=as.numeric(o)), error="internal.*must be an integer") +test(1150.25, uniq(x[,"v"], order="a"), error="must be an integer") +test(1150.26, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow") +test(1150.27, funiq(x[,"v"], order=o[-1L]), error="internal.*has been passed length.*nrow") +test(1150.28, uniq(list(b = as.raw(1:5))), error="not supported") +test(1150.29, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported") +test(1150.30, funiq(x[,"v"], safe=NA), error="must be TRUE or FALSE") +test(1150.31, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must be in range") +test(1150.32, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range") +test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range") +test(1150.34, uniq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)])), error="must be in range") +test(1150.35, funiq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") ## trying safe=F would segfault! +test(1150.36, funiq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), safe=TRUE), error="must be in range") +test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error="must be in range") +test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") +test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"]) +test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) +test(1150.41, uniq(data.table(x = c("a","a","b","b","c")), order=1:5), c(1L,3L,5L)) ## test coverage +old = getNumericRounding() +setNumericRounding(0) +test(1150.42, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L)) +setNumericRounding(2) +test(1150.43, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L)) +if (test_bit64) { + test(1150.44, uniq(data.table(x = as.integer64(c(1,1,2,2,3))), order=1:5), c(1L,3L,5L)) +} +setNumericRounding(old) +x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case +o = order(x$id) +test(1150.51, uniq(x[,"v"], order=o), c(1L,5L)) +test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L)) +test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow") +test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range") +test(1150.61, funiq(x[,"v"], order=o), c(1L,5L)) +test(1150.62, funiq(x[,"v"], order=integer()), c(1L,5L)) +test(1150.63, funiq(x[,"v"], order=-1L), error="has been passed length.*nrow") +test(1150.64, funiq(x[1L,"v"], order=-1L), error="must be in range") +test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L)) +test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L)) +test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L)) +test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L) +op = options(datatable.verbose=TRUE) +test(1150.91, uniq(data.table()), integer(), output="took") +test(1150.92, uniq(data.table(x=integer())), integer(), output="took") +test(1150.93, uniq(data.table(x=1L)), 1L, output="took") +test(1150.94, uniq(data.table(x=1:2)), 1:2, output="took") +options(op) +test(1150.96, nrow2(list()), 0L) # nrow2 helper tests +test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L) +test(1150.98, nrow2(data.table(x=1:10, b=1:5)), 10L) +test(1150.99, nrow2(1:5), error="nrow2 expects data.table or list") + # test uniqlengths set.seed(45) x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) @@ -3893,6 +3968,7 @@ o1 <- uniqlist(list(x), ox) test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) o1 <- uniqlist(list(x)) test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +test(1151.3, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(as.numeric(o1), as.numeric(length(x)))) rm(list=c("x","ox","o1")) gc() @@ -6729,6 +6805,7 @@ test(1475.13, uniqueN(NA), 1L) test(1475.14, uniqueN(NA, na.rm=TRUE), 0L) test(1475.15, uniqueN(logical()), 0L) test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L) +test(1475.17, uniqueN(TRUE, na.rm=NA), error="must be TRUE or FALSE") # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) diff --git a/man/uniq.Rd b/man/uniq.Rd new file mode 100644 index 0000000000..01d01a5b2a --- /dev/null +++ b/man/uniq.Rd @@ -0,0 +1,50 @@ +\name{uniq} +\alias{uniq} +\alias{uniqlist} +\alias{funiq} +\title{ Consecutively unique rows } +\description{ + Finds the consecutively unique rows. +} +\usage{ +uniq(x, order=integer()) +} +\arguments{ + \item{x}{ data.table type object. } + \item{order}{ integer vector order of \code{x}, must not contain duplicates. } +} +\details{ + Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row. +} +\note{ + It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order}} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception. +} +\value{ + Integer vector corresponding to rows which are consecutively unique. +} +\seealso{ \code{\link{data.table}}, \code{\link{rleid}} } +\examples{ +uniq(data.table()) +uniq(data.table(x=integer())) +uniq(data.table(x=integer(), y=integer())) +uniq(data.table(x=1L)) +uniq(data.table(x=1L, y=1L)) +uniq(data.table(x=1:2)) +uniq(data.table(x=1:2, y=1:2)) +uniq(data.table(x=1:2)[c(1L,1:2)]) +uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]) + +# 'order' argument +x = data.table(id = 1:8, v = rep(1:2, each=4)) +uniq(x[,"v"]) +x = x[c(1:2,7:8,3:4,5:6)] +uniq(x[,"v"]) + +o = order(x$id) +uniq(x[,"v"], order=o) +# or if we are not sure if 'o' has no duplicates +if (!anyDuplicated(o)) { + uniq(x[,"v"], order=o) +} +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 90ff7fb6fc..51fe3598ff 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val); SEXP vecseq(SEXP x, SEXP len, SEXP clamp); // uniqlist.c -SEXP uniqlist(SEXP l, SEXP order); +SEXP uniq(SEXP x, SEXP order, SEXP safe); SEXP uniqlengths(SEXP x, SEXP n); // chmatch.c diff --git a/src/init.c b/src/init.c index aed2da3dbd..b5b5da2bf1 100644 --- a/src/init.c +++ b/src/init.c @@ -149,7 +149,7 @@ R_CallMethodDef callMethods[] = { {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1}, {"Cfmelt", (DL_FUNC) &fmelt, -1}, {"Cfcast", (DL_FUNC) &fcast, -1}, -{"Cuniqlist", (DL_FUNC) &uniqlist, -1}, +{"Cuniq", (DL_FUNC) &uniq, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, {"Cforder", (DL_FUNC) &forder, -1}, {"Cfsorted", (DL_FUNC) &fsorted, -1}, diff --git a/src/uniqlist.c b/src/uniqlist.c index 447b3ea057..1a9e27489e 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -1,63 +1,90 @@ #include "data.table.h" -// DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes -// Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up) -SEXP uniqlist(SEXP l, SEXP order) -{ - // This works like UNIX uniq as referred to by ?base::unique; i.e., it - // drops immediately repeated rows but doesn't drop duplicates of any - // previous row. Unless, order is provided, then it also drops any previous - // row. l must be a list of same length vectors ans is allocated first - // (maximum length the number of rows) and the length returned in anslen. - // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported. - // DONE: ans is now grown - if (!isNewList(l)) error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov - R_len_t ncol = length(l); - R_len_t nrow = length(VECTOR_ELT(l,0)); - if (!isInteger(order)) error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov - if (LENGTH(order)<1) error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov - if (LENGTH(order)>1 && LENGTH(order)!=nrow) error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov - bool via_order = INTEGER(order)[0] != -1; // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows +/* uniq macros */ - unsigned long long *ulv; // for numeric check speed-up - SEXP v, ans; - R_len_t len, thisi, previ, isize=1000; - int *iidx = Calloc(isize, int); // for 'idx' - len = 1; - iidx[0] = 1; // first row is always the first of the first group - - if (ncol==1) { +#define COMPARE1 \ +prev = *vd; \ +for (int i=1; i nrow) \ + error("'order' must be in range 1:nrow(x)"); \ + elem = vd[oi -1]; \ + if (elem!=prev -#define COMPARE2 \ - ) { \ - iidx[len++] = i+1; \ - if (len>=isize) { \ - isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); \ - iidx = Realloc(iidx, isize, int); \ - } \ - } \ - prev = elem; \ - } +#define COMPARE2 \ + ) { \ + iidx[len++] = i+1; \ + if (len>=isize) { \ + isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); \ + iidx = Realloc(iidx, isize, int); \ + } \ + } \ + prev = elem; \ +} - SEXP v = VECTOR_ELT(l,0); - int *o = INTEGER(order); // only used when via_order is true +SEXP uniq(SEXP x, SEXP order, SEXP safe) { + // safe=true ensure no segfault using COMPARE1_VIA_ORDER_SAFE + // safe=false is not available in public API + if (!isNewList(x)) + error(_("internal error: 'x' must be a data.table type object")); + if (!isInteger(order)) + error(_("internal error: 'order' must be an integer")); + if (!IS_TRUE_OR_FALSE(safe)) + error(_("internal error: 'safe' must be TRUE or FALSE")); + const bool verbose = GetVerbose(); + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + R_len_t ncol = length(x); + R_len_t nrow = length(VECTOR_ELT(x,0)); + if (!ncol || !nrow) { + SEXP ans = PROTECT(allocVector(INTSXP, 0)); + if (verbose) + Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic); + UNPROTECT(1); + return(ans); + } + bool via_order = LENGTH(order) > 0; + bool via_order_safe = via_order && LOGICAL(safe)[0]; + if (via_order && LENGTH(order)!=nrow) + error(_("internal error: uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); + const int *o = INTEGER(order); // only used when via_order[_safe] is true + if (nrow==1) { + if (via_order && o[0]!=1) + error("'order' must be in range 1:nrow(x)"); + SEXP ans = PROTECT(allocVector(INTSXP, 1)); + INTEGER(ans)[0] = 1; + if (verbose) + Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic); + UNPROTECT(1); + return(ans); + } + + unsigned long long *ulv; // for numeric check speed-up (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up) + R_len_t isize=1000, len=1; + int *iidx = Calloc(isize, int); // for 'idx' + iidx[0] = 1; // first row is always the first of the first group + if (ncol==1) { + SEXP v = VECTOR_ELT(x,0); switch(TYPEOF(v)) { case INTSXP : case LGLSXP : { const int *vd=INTEGER(v); int prev, elem; - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE COMPARE2 + } else if (via_order) { // ad hoc by (order passed in) COMPARE1_VIA_ORDER COMPARE2 } else { @@ -68,7 +95,9 @@ SEXP uniqlist(SEXP l, SEXP order) case STRSXP : { const SEXP *vd=STRING_PTR(v); SEXP prev, elem; - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 + } else if (via_order) { COMPARE1_VIA_ORDER && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 // but most of the time they are equal, so ENC2UTF8 doesn't need to be called } else { COMPARE1 && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2 @@ -78,81 +107,103 @@ SEXP uniqlist(SEXP l, SEXP order) const uint64_t *vd=(const uint64_t *)REAL(v); uint64_t prev, elem; // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine - if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) { - if (via_order) { + if (getNumericRounding_C()==0 /*default*/ || + Rinherits(v,char_integer64)) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE COMPARE2 + } else if (via_order) { COMPARE1_VIA_ORDER COMPARE2 } else { COMPARE1 COMPARE2 } } else { - if (via_order) { + if (via_order_safe) { + COMPARE1_VIA_ORDER_SAFE && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 + } else if (via_order) { COMPARE1_VIA_ORDER && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 } else { COMPARE1 && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 } } } break; - default : - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + default : { + error(_("Type '%s' not supported"), type2char(TYPEOF(v))); } - } else { - // ncol>1 - thisi = via_order ? INTEGER(order)[0]-1 : 0; + } + } else { // ncol>1 + R_len_t previ, thisi = via_order ? o[0]-1 : 0; bool *i64 = (bool *)R_alloc(ncol, sizeof(bool)); - for (int i=0; i nrow)) + error("'order' must be in range 1:nrow(x)"); + thisi = oi-1; + } int j = ncol; // the last column varies the most frequently so check that first and work backwards - bool b = true; - while (--j>=0 && b) { - v=VECTOR_ELT(l,j); + bool same = true; // flag to indicate if the values in a row are same as the previous row, if flag false then we can move on to next group + while (--j>=0 && same) { + SEXP v = VECTOR_ELT(x,j); switch (TYPEOF(v)) { - case INTSXP : case LGLSXP : // NA_INTEGER==NA_LOGICAL checked in init.c - b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break; - case STRSXP : + case INTSXP : case LGLSXP : { // NA_INTEGER==NA_LOGICAL checked in init.c + same = INTEGER(v)[thisi]==INTEGER(v)[previ]; + } break; + case STRSXP : { // fix for #469, when key is set, duplicated calls uniqlist, where encoding // needs to be taken care of. - b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings. - case REALSXP : + same = ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); + } break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings. + case REALSXP : { ulv = (unsigned long long *)REAL(v); - b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) - if (!b && !i64[j]) { - b = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ); + same = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) + if (!same && !i64[j]) { + same = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ); // could store LHS for use next time as RHS (to save calling dtwiddle twice). However: i) there could be multiple double columns so vector of RHS would need // to be stored, ii) many short-circuit early before the if (!b) anyway (negating benefit) and iii) we may not have needed LHS this time so logic would be complex. } - break; - default : - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + } break; + default : { + error(_("Type '%s' not supported"), type2char(TYPEOF(v))); + } } } - if (!b) { + if (!same) { iidx[len++] = i+1; if (len >= isize) { isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i))); iidx = Realloc(iidx, isize, int); } - } + } // almost like COMPARE2 but no last line: prev = elem } } - PROTECT(ans = allocVector(INTSXP, len)); + SEXP ans = PROTECT(allocVector(INTSXP, len)); memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues Free(iidx); + if (verbose) + Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic); UNPROTECT(1); return(ans); } SEXP uniqlengths(SEXP x, SEXP n) { - // seems very similar to rbindlist.c:uniq_lengths. TODO: centralize into common function - if (TYPEOF(x) != INTSXP) error(_("Input argument 'x' to 'uniqlengths' must be an integer vector")); - if (TYPEOF(n) != INTSXP || length(n) != 1) error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1")); + if (!isInteger(x)) + error(_("internal error: Input argument 'x' to 'uniqlengths' must be an integer vector")); // # nocov + if (!isInteger(x) || length(n)!=1 || INTEGER(n)[0]==NA_INTEGER) + error(_("internal error: Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA")); // # nocov R_len_t len = length(x); SEXP ans = PROTECT(allocVector(INTSXP, len)); - for (R_len_t i=1; i0) INTEGER(ans)[len-1] = INTEGER(n)[0] - INTEGER(x)[len-1] + 1; + if (len>0) + ansp[len-1] = INTEGER(n)[0] - xp[len-1] + 1; UNPROTECT(1); return(ans); } @@ -346,8 +397,10 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul SEXP uniqueNlogical(SEXP x, SEXP narmArg) { // single pass; short-circuit and return as soon as all 3 values are found - if (!isLogical(x)) error(_("x is not a logical vector")); - if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error(_("na.rm must be TRUE or FALSE")); + if (!isLogical(x)) + error(_("internal error: x is not a logical vector")); // # nocov + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("na.rm must be TRUE or FALSE")); bool narm = LOGICAL(narmArg)[0]==1; const R_xlen_t n = xlength(x); if (n==0) @@ -356,16 +409,27 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) { R_xlen_t i=0; const int *ix = LOGICAL(x); while (++i