From cc27941eac63a6f147a36f4aeccf83d654e4c7be Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 10 Apr 2020 18:23:39 +0100
Subject: [PATCH 01/15] uniqueN logical unfold for codecov

---
 src/uniqlist.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/uniqlist.c b/src/uniqlist.c
index 447b3ea057..e2220a64fd 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -346,8 +346,10 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul
 
 SEXP uniqueNlogical(SEXP x, SEXP narmArg) {
   // single pass; short-circuit and return as soon as all 3 values are found
-  if (!isLogical(x)) error(_("x is not a logical vector"));
-  if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error(_("na.rm must be TRUE or FALSE"));
+  if (!isLogical(x))
+    error(_("x is not a logical vector"));
+  if (!IS_TRUE_OR_FALSE(narmArg))
+    error(_("na.rm must be TRUE or FALSE"));
   bool narm = LOGICAL(narmArg)[0]==1;
   const R_xlen_t n = xlength(x);
   if (n==0)
@@ -356,16 +358,27 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) {
   R_xlen_t i=0;
   const int *ix = LOGICAL(x);
   while (++i<n && ix[i]==first);
-  if (i==n)
-    return ScalarInteger(first==NA_INTEGER && narm ? 0 : 1); // all one value
+  if (i==n) { // all values the same
+    if (first==NA_INTEGER && narm) {
+      return ScalarInteger(0);
+    } else {
+      return ScalarInteger(1);
+    }
+  }
   Rboolean second = ix[i];
   // we've found 2 different values (first and second). Which one didn't we find? Then just look for that.
   // NA_LOGICAL == INT_MIN checked in init.c
-  const int third = (first+second == 1) ? NA_LOGICAL : ( first+second == INT_MIN ? TRUE : FALSE );
+  int third;
+  if (first+second == 1) {
+    third = NA_LOGICAL;
+  } else {
+    third = first+second == INT_MIN;
+  }
   if (third==NA_LOGICAL && narm)
-    return ScalarInteger(2);  // TRUE and FALSE found before any NA, but na.rm=TRUE so we're done
-  while (++i<n) if (ix[i]==third)
-    return ScalarInteger(3-narm);
+    return ScalarInteger(2); // TRUE and FALSE found before any NA, but na.rm=TRUE so we're done
+  while (++i<n) {
+    if (ix[i]==third)
+      return ScalarInteger(3-narm);
+  }
   return ScalarInteger(2-(narm && third!=NA_LOGICAL));
 }
-

From 81e7391bc9f45258300b0ae473c63f3c00a4f384 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 10 Apr 2020 18:53:13 +0100
Subject: [PATCH 02/15] uniqlengths reduce loop overhead, unfold for codecov

---
 src/uniqlist.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/uniqlist.c b/src/uniqlist.c
index e2220a64fd..89b1af81d0 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -144,15 +144,19 @@ SEXP uniqlist(SEXP l, SEXP order)
 }
 
 SEXP uniqlengths(SEXP x, SEXP n) {
-  // seems very similar to rbindlist.c:uniq_lengths. TODO: centralize into common function
-  if (TYPEOF(x) != INTSXP) error(_("Input argument 'x' to 'uniqlengths' must be an integer vector"));
-  if (TYPEOF(n) != INTSXP || length(n) != 1) error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1"));
+  if (!isInteger(x))
+    error(_("Input argument 'x' to 'uniqlengths' must be an integer vector"));
+  if (!isInteger(x) || length(n)!=1 || INTEGER(n)[0]==NA_INTEGER)
+    error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA"));
   R_len_t len = length(x);
   SEXP ans = PROTECT(allocVector(INTSXP, len));
-  for (R_len_t i=1; i<len; i++) {
-    INTEGER(ans)[i-1] = INTEGER(x)[i] - INTEGER(x)[i-1];
+  const int *xp = INTEGER(x);
+  int *ansp = INTEGER(ans);
+  for (R_len_t i=1; i<len; ++i) {
+    ansp[i-1] = xp[i] - xp[i-1];
   }
-  if (len>0) INTEGER(ans)[len-1] = INTEGER(n)[0] - INTEGER(x)[len-1] + 1;
+  if (len>0)
+    ansp[len-1] = INTEGER(n)[0] - xp[len-1] + 1;
   UNPROTECT(1);
   return(ans);
 }

From 96ec1a2390bd775e6d1c707961488ab6b8a483f6 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 10 Apr 2020 19:45:54 +0100
Subject: [PATCH 03/15] refactor code for easy folding

better codecov

more local variable declaration

more meaningful variable names

better int64 class inheritance check
---
 src/uniqlist.c | 120 ++++++++++++++++++++++++++-----------------------
 1 file changed, 63 insertions(+), 57 deletions(-)

diff --git a/src/uniqlist.c b/src/uniqlist.c
index 89b1af81d0..7c582e989e 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -1,5 +1,29 @@
 #include "data.table.h"
 
+/* uniqlist macros */
+#define COMPARE1                                                              \
+prev = *vd;                                                                   \
+for (int i=1; i<nrow; i++) {                                                  \
+  elem = *++vd;                                                               \
+  if (elem!=prev
+
+#define COMPARE1_VIA_ORDER                                                    \
+prev = vd[*o -1];                                                             \
+for (int i=1; i<nrow; i++) {                                                  \
+  elem = vd[*++o -1];                                                         \
+  if (elem!=prev
+
+#define COMPARE2                                                              \
+  ) {                                                                         \
+    iidx[len++] = i+1;                                                        \
+    if (len>=isize) {                                                         \
+      isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));        \
+      iidx = Realloc(iidx, isize, int);                                       \
+    }                                                                         \
+  }                                                                           \
+  prev = elem;                                                                \
+}
+
 // DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes
 // Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
 SEXP uniqlist(SEXP l, SEXP order)
@@ -11,48 +35,26 @@ SEXP uniqlist(SEXP l, SEXP order)
   // (maximum length the number of rows) and the length returned in anslen.
   // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported.
   // DONE: ans is now grown
-  if (!isNewList(l)) error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov
+  if (!isNewList(l))
+    error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov
+
   R_len_t ncol = length(l);
   R_len_t nrow = length(VECTOR_ELT(l,0));
-  if (!isInteger(order)) error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov
-  if (LENGTH(order)<1) error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov
-  if (LENGTH(order)>1 && LENGTH(order)!=nrow) error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov
+  if (!isInteger(order))
+    error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov
+  if (LENGTH(order)<1)
+    error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov
+  if (LENGTH(order)>1 && LENGTH(order)!=nrow)
+    error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov
   bool via_order = INTEGER(order)[0] != -1;  // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows
 
   unsigned long long *ulv; // for numeric check speed-up
-  SEXP v, ans;
-  R_len_t len, thisi, previ, isize=1000;
+  R_len_t isize=1000, len=1;
   int *iidx = Calloc(isize, int); // for 'idx'
-  len = 1;
   iidx[0] = 1; // first row is always the first of the first group
-
   if (ncol==1) {
-
-#define COMPARE1                                                                 \
-      prev = *vd;                                                                \
-      for (int i=1; i<nrow; i++) {                                               \
-        elem = *++vd;                                                            \
-        if (elem!=prev
-
-#define COMPARE1_VIA_ORDER                                                       \
-      prev = vd[*o -1];                                                          \
-      for (int i=1; i<nrow; i++) {                                               \
-        elem = vd[*++o -1];                                                      \
-        if (elem!=prev
-
-#define COMPARE2                                                                 \
-                        ) {                                                      \
-          iidx[len++] = i+1;                                                     \
-          if (len>=isize) {                                                      \
-            isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));     \
-            iidx = Realloc(iidx, isize, int);                                    \
-          }                                                                      \
-        }                                                                        \
-        prev = elem;                                                             \
-      }
-
     SEXP v = VECTOR_ELT(l,0);
-    int *o = INTEGER(order);  // only used when via_order is true
+    const int *o = INTEGER(order);  // only used when via_order is true
     switch(TYPEOF(v)) {
     case INTSXP : case LGLSXP : {
       const int *vd=INTEGER(v);
@@ -78,7 +80,7 @@ SEXP uniqlist(SEXP l, SEXP order)
       const uint64_t *vd=(const uint64_t *)REAL(v);
       uint64_t prev, elem;
       // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine
-      if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) {
+      if (getNumericRounding_C()==0 /*default*/ || Rinherits(v,char_integer64)) {
         if (via_order) {
           COMPARE1_VIA_ORDER COMPARE2
         } else {
@@ -92,51 +94,55 @@ SEXP uniqlist(SEXP l, SEXP order)
         }
       }
     } break;
-    default :
-      error(_("Type '%s' not supported"), type2char(TYPEOF(v)));  // # nocov
+    default : {
+      error(_("Type '%s' not supported"), type2char(TYPEOF(v)));
     }
-  } else {
-    // ncol>1
-    thisi = via_order ? INTEGER(order)[0]-1 : 0;
+    }
+  } else { // ncol>1
+    R_len_t previ, thisi = via_order ? INTEGER(order)[0]-1 : 0;
     bool *i64 = (bool *)R_alloc(ncol, sizeof(bool));
-    for (int i=0; i<ncol; i++) i64[i] = INHERITS(VECTOR_ELT(l,i), char_integer64);
+    for (int i=0; i<ncol; i++)
+      i64[i] = Rinherits(VECTOR_ELT(l,i),char_integer64);
     for (int i=1; i<nrow; i++) {
       previ = thisi;
       thisi = via_order ? INTEGER(order)[i]-1 : i;
       int j = ncol;  // the last column varies the most frequently so check that first and work backwards
-      bool b = true;
-      while (--j>=0 && b) {
-        v=VECTOR_ELT(l,j);
+      bool same = true; // flag to indicate if the values in a row are same as the previous row, if flag false then we can move on to next group
+      while (--j>=0 && same) {
+        SEXP v = VECTOR_ELT(l,j);
         switch (TYPEOF(v)) {
-        case INTSXP : case LGLSXP :  // NA_INTEGER==NA_LOGICAL checked in init.c
-          b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
-        case STRSXP :
+        case INTSXP : case LGLSXP : { // NA_INTEGER==NA_LOGICAL checked in init.c
+          same = INTEGER(v)[thisi]==INTEGER(v)[previ];
+        } break;
+        case STRSXP : {
           // fix for #469, when key is set, duplicated calls uniqlist, where encoding
           // needs to be taken care of.
-          b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break;  // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
-        case REALSXP :
+          same = ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ));
+        } break;  // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
+        case REALSXP : {
           ulv = (unsigned long long *)REAL(v);
-          b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
-          if (!b && !i64[j]) {
-            b = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ);
+          same = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
+          if (!same && !i64[j]) {
+            same = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ);
             // could store LHS for use next time as RHS (to save calling dtwiddle twice). However: i) there could be multiple double columns so vector of RHS would need
             // to be stored, ii) many short-circuit early before the if (!b) anyway (negating benefit) and iii) we may not have needed LHS this time so logic would be complex.
           }
-          break;
-        default :
-          error(_("Type '%s' not supported"), type2char(TYPEOF(v)));  // # nocov
+        } break;
+        default : {
+          error(_("Type '%s' not supported"), type2char(TYPEOF(v)));
+        }
         }
       }
-      if (!b) {
+      if (!same) {
         iidx[len++] = i+1;
         if (len >= isize) {
           isize = MIN(nrow, (size_t)(1.1*(double)isize*((double)nrow/i)));
           iidx = Realloc(iidx, isize, int);
         }
-      }
+      } // almost like COMPARE2 but no last line: prev = elem
     }
   }
-  PROTECT(ans = allocVector(INTSXP, len));
+  SEXP ans =  PROTECT(allocVector(INTSXP, len));
   memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues
   Free(iidx);
   UNPROTECT(1);

From ac8e6b8c17dba7ba36b998a49d70d1ebe555c01f Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 00:02:24 +0100
Subject: [PATCH 04/15] exporting uniq function (uniqlist), #900

rework edge cases, that were not being used, to be more consistent
---
 NAMESPACE        |  1 +
 R/uniqlist.R     | 39 ++++++++++++++++++++-------------
 man/uniq.Rd      | 35 +++++++++++++++++++++++++++++
 src/data.table.h |  2 +-
 src/init.c       |  2 +-
 src/uniqlist.c   | 57 ++++++++++++++++++++++++------------------------
 6 files changed, 90 insertions(+), 46 deletions(-)
 create mode 100644 man/uniq.Rd

diff --git a/NAMESPACE b/NAMESPACE
index c2c095a1d8..fb8a09c294 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -56,6 +56,7 @@ export(nafill)
 export(setnafill)
 export(.Last.updated)
 export(fcoalesce)
+export(uniq)
 
 S3method("[", data.table)
 S3method("[<-", data.table)
diff --git a/R/uniqlist.R b/R/uniqlist.R
index b0c1c9fdd0..1e0b0e35f0 100644
--- a/R/uniqlist.R
+++ b/R/uniqlist.R
@@ -1,18 +1,28 @@
+nrow2 = function(x) {
+  if (!length(x)) return(0L)
+  if (is.data.table(x)) nrow(x) else if (is.list(x)) length(x[[1L]]) else stop("nrow2 expects data.table or list")
+}
 
-uniqlist = function (l, order = -1L)
-{
-  # Assumes input list is ordered by each list item (or by 'order' if supplied), and that all list elements are the same length
-  # Finds the non-duplicate rows. Was called duplist but now grows vector - doesn't over-allocate result vector and
-  # is >2x times faster on numeric types
-  # TO DO: Possibly reinstate reverse argument :
-  #    FALSE works in the usual duplicated() way,  the first in a sequence of dups, will be FALSE
-  #    TRUE has the last in a sequence of dups FALSE (so you can keep the last if that's required)
-  # l = list(...)
-  if (!is.list(l))
-    stop("l not type list")
-  if (!length(l))  return(list(0L))
-  ans = .Call(Cuniqlist, l, as.integer(order))
-  ans
+uniqlist = function (l, order=integer()) {
+  # used in "[.data.table" when doing groupby (!byjoin) to find the groups using byval
+  # (length(byval) && length(byval[[1L]])) && (bysameorder || byindex)
+  # and in duplicated.data.table when
+  # haskey(x) && length(by) <= length(key(x)) && all(head(key(x), length(by)) == by)
+  uniq(l, order, internal=TRUE)
+}
+
+uniq = function(x, order=integer(), internal=FALSE) {
+  if (!isTRUE(internal) && !missing(order)) {
+    if (!is.integer(order))
+      stop("'order' must be an integer")
+    if (length(order) != nrow2(x))
+      stop("'order' must be same length as nrow of 'x'")
+    if (anyDuplicated(order))
+      stop("'order' should not contain duplicates")
+    if (anyNA(order))
+      stop("'order' should not contain NAs")
+  }
+  .Call(Cuniq, x, order)
 }
 
 # implemented for returning the lengths of groups obtained from uniqlist (for internal use only)
@@ -21,4 +31,3 @@ uniqlengths = function(x, len) {
   ans = .Call(Cuniqlengths, as.integer(x), as.integer(len))
   ans
 }
-
diff --git a/man/uniq.Rd b/man/uniq.Rd
new file mode 100644
index 0000000000..c924b694c9
--- /dev/null
+++ b/man/uniq.Rd
@@ -0,0 +1,35 @@
+\name{uniq}
+\alias{uniq}
+\alias{uniqlist}
+\title{ Consecutively unique rows }
+\description{
+  Finds the consecutively unique rows.
+}
+\usage{
+uniq(x, order, internal=FALSE)
+}
+\arguments{
+  \item{x}{ data.table type object. }
+  \item{order}{ integer vector order of \code{x}. }
+  \item{internal}{ logical flag, escapes checks of \code{order} argument. }
+}
+\details{
+  Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row.
+  Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, sorted, no duplicates, no NAs).
+}
+\value{
+  Integer vector corresponding to rows which are consecutively unique.
+}
+\seealso{ \code{\link{data.table}}, \code{\link{rleid}} }
+\examples{
+uniq(data.table())
+uniq(data.table(x=integer()))
+uniq(data.table(x=integer(), y=integer()))
+uniq(data.table(x=1L))
+uniq(data.table(x=1L, y=1L))
+uniq(data.table(x=1:2))
+uniq(data.table(x=1:2, y=1:2))
+uniq(data.table(x=1:2)[c(1L,1:2)])
+uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)])
+}
+\keyword{ data }
diff --git a/src/data.table.h b/src/data.table.h
index 90ff7fb6fc..646a32cc1a 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val);
 SEXP vecseq(SEXP x, SEXP len, SEXP clamp);
 
 // uniqlist.c
-SEXP uniqlist(SEXP l, SEXP order);
+SEXP uniq(SEXP x, SEXP order);
 SEXP uniqlengths(SEXP x, SEXP n);
 
 // chmatch.c
diff --git a/src/init.c b/src/init.c
index aed2da3dbd..b5b5da2bf1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -149,7 +149,7 @@ R_CallMethodDef callMethods[] = {
 {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1},
 {"Cfmelt", (DL_FUNC) &fmelt, -1},
 {"Cfcast", (DL_FUNC) &fcast, -1},
-{"Cuniqlist", (DL_FUNC) &uniqlist, -1},
+{"Cuniq", (DL_FUNC) &uniq, -1},
 {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1},
 {"Cforder", (DL_FUNC) &forder, -1},
 {"Cfsorted", (DL_FUNC) &fsorted, -1},
diff --git a/src/uniqlist.c b/src/uniqlist.c
index 7c582e989e..4cf93a5232 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -1,6 +1,6 @@
 #include "data.table.h"
 
-/* uniqlist macros */
+/* uniq macros */
 #define COMPARE1                                                              \
 prev = *vd;                                                                   \
 for (int i=1; i<nrow; i++) {                                                  \
@@ -24,36 +24,33 @@ for (int i=1; i<nrow; i++) {                                                  \
   prev = elem;                                                                \
 }
 
-// DONE: return 'uniqlist' as a vector (same as duplist) and write a separate function to get group sizes
-// Also improvements for numeric type with a hack of checking unsigned int (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
-SEXP uniqlist(SEXP l, SEXP order)
-{
-  // This works like UNIX uniq as referred to by ?base::unique; i.e., it
-  // drops immediately repeated rows but doesn't drop duplicates of any
-  // previous row. Unless, order is provided, then it also drops any previous
-  // row. l must be a list of same length vectors ans is allocated first
-  // (maximum length the number of rows) and the length returned in anslen.
-  // No NA in order which is guaranteed since internal-only. Used at R level internally (Cuniqlist) but is not and should not be exported.
-  // DONE: ans is now grown
-  if (!isNewList(l))
-    error(_("Internal error: uniqlist has not been passed a list of columns")); // # nocov
-
-  R_len_t ncol = length(l);
-  R_len_t nrow = length(VECTOR_ELT(l,0));
+SEXP uniq(SEXP x, SEXP order) {
+  if (!isNewList(x))
+    error(_("'x' must be a data.table type object"));
   if (!isInteger(order))
-    error(_("Internal error: uniqlist has been passed a non-integer order")); // # nocov
-  if (LENGTH(order)<1)
-    error(_("Internal error: uniqlist has been passed a length-0 order")); // # nocov
-  if (LENGTH(order)>1 && LENGTH(order)!=nrow)
-    error(_("Internal error: uniqlist has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow); // # nocov
-  bool via_order = INTEGER(order)[0] != -1;  // has an ordering vector been passed in that we have to hop via? Don't use MISSING() here as it appears unstable on Windows
-
-  unsigned long long *ulv; // for numeric check speed-up
+    error(_("'order' must be an integer, you should be more careful when using internal=TRUE"));
+  const bool verbose = GetVerbose();
+  double tic = 0;
+  if (verbose)
+    tic = omp_get_wtime();
+  R_len_t ncol = length(x);
+  R_len_t nrow = length(VECTOR_ELT(x,0));
+  if (!ncol || !nrow) {
+    SEXP ans = PROTECT(allocVector(INTSXP, 0));
+    if (verbose)
+      Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic);
+    UNPROTECT(1);
+    return(ans);
+  }
+  bool via_order = LENGTH(order) > 0;
+  if (via_order && LENGTH(order)!=nrow)
+    error(_("uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow);
+  unsigned long long *ulv; // for numeric check speed-up (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
   R_len_t isize=1000, len=1;
   int *iidx = Calloc(isize, int); // for 'idx'
   iidx[0] = 1; // first row is always the first of the first group
   if (ncol==1) {
-    SEXP v = VECTOR_ELT(l,0);
+    SEXP v = VECTOR_ELT(x,0);
     const int *o = INTEGER(order);  // only used when via_order is true
     switch(TYPEOF(v)) {
     case INTSXP : case LGLSXP : {
@@ -102,14 +99,14 @@ SEXP uniqlist(SEXP l, SEXP order)
     R_len_t previ, thisi = via_order ? INTEGER(order)[0]-1 : 0;
     bool *i64 = (bool *)R_alloc(ncol, sizeof(bool));
     for (int i=0; i<ncol; i++)
-      i64[i] = Rinherits(VECTOR_ELT(l,i),char_integer64);
+      i64[i] = Rinherits(VECTOR_ELT(x,i),char_integer64);
     for (int i=1; i<nrow; i++) {
       previ = thisi;
       thisi = via_order ? INTEGER(order)[i]-1 : i;
       int j = ncol;  // the last column varies the most frequently so check that first and work backwards
       bool same = true; // flag to indicate if the values in a row are same as the previous row, if flag false then we can move on to next group
       while (--j>=0 && same) {
-        SEXP v = VECTOR_ELT(l,j);
+        SEXP v = VECTOR_ELT(x,j);
         switch (TYPEOF(v)) {
         case INTSXP : case LGLSXP : { // NA_INTEGER==NA_LOGICAL checked in init.c
           same = INTEGER(v)[thisi]==INTEGER(v)[previ];
@@ -142,9 +139,11 @@ SEXP uniqlist(SEXP l, SEXP order)
       } // almost like COMPARE2 but no last line: prev = elem
     }
   }
-  SEXP ans =  PROTECT(allocVector(INTSXP, len));
+  SEXP ans = PROTECT(allocVector(INTSXP, len));
   memcpy(INTEGER(ans), iidx, sizeof(int)*len); // sizeof is of type size_t - no integer overflow issues
   Free(iidx);
+  if (verbose)
+    Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic);
   UNPROTECT(1);
   return(ans);
 }

From 4a4fc1d5196100b040593816b3be962960ad7342 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 11:08:26 +0100
Subject: [PATCH 05/15] extend man for order arg example, fix old tests,
 breaking change?

---
 inst/tests/tests.Rraw | 5 ++---
 man/uniq.Rd           | 8 ++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 7cc6819e8f..b03fed4b07 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -13266,9 +13266,8 @@ test(1962.0091, anyDuplicated(DT, by = NULL), 2L)
 test(1962.0092, uniqueN(DT, by = NULL), 3L)
 
 ## uniqlist.R
-test(1962.010, uniqlist(1:5),
-     error = 'not type list')
-test(1962.011, uniqlist(list()), list(0L))
+test(1962.010, uniqlist(1:5), error = 'must be a data.table type')
+test(1962.011, uniqlist(list()), integer())
 
 ## merge.R
 DT1 = data.table(a = 1:3, V = 'a')
diff --git a/man/uniq.Rd b/man/uniq.Rd
index c924b694c9..389a9c5aaa 100644
--- a/man/uniq.Rd
+++ b/man/uniq.Rd
@@ -31,5 +31,13 @@ uniq(data.table(x=1:2))
 uniq(data.table(x=1:2, y=1:2))
 uniq(data.table(x=1:2)[c(1L,1:2)])
 uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)])
+
+# order argument
+x = data.table(id = 1:8, v = rep(1:2, each=4))
+uniq(x[,"v"])
+x = x[c(1:2,7:8,3:4,5:6)]
+uniq(x[,"v"])
+o = order(x$id)
+uniq(x[,"v"], order=o)
 }
 \keyword{ data }

From f5bcbf3c575ce4f5b042d13285931edf94cf48ef Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 11:18:05 +0100
Subject: [PATCH 06/15] internal uniqlist reverted to match previous tests

---
 R/uniqlist.R          | 9 ++++++++-
 inst/tests/tests.Rraw | 5 +++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/R/uniqlist.R b/R/uniqlist.R
index 1e0b0e35f0..c5237169a6 100644
--- a/R/uniqlist.R
+++ b/R/uniqlist.R
@@ -3,11 +3,18 @@ nrow2 = function(x) {
   if (is.data.table(x)) nrow(x) else if (is.list(x)) length(x[[1L]]) else stop("nrow2 expects data.table or list")
 }
 
-uniqlist = function (l, order=integer()) {
+uniqlist = function (l, order = -1L) {
   # used in "[.data.table" when doing groupby (!byjoin) to find the groups using byval
   # (length(byval) && length(byval[[1L]])) && (bysameorder || byindex)
   # and in duplicated.data.table when
   # haskey(x) && length(by) <= length(key(x)) && all(head(key(x), length(by)) == by)
+
+  # those are only for backward compatibility, probably not really used anywhere, will keep 1962.010 and 1962.011 happy
+  if (!is.list(l)) stop("l not type list")
+  if (!length(l)) return(list(0L))
+  # this is for compatibility to new uniq.c code
+  if (identical(order, -1L)) order = integer()
+
   uniq(l, order, internal=TRUE)
 }
 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index b03fed4b07..7cc6819e8f 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -13266,8 +13266,9 @@ test(1962.0091, anyDuplicated(DT, by = NULL), 2L)
 test(1962.0092, uniqueN(DT, by = NULL), 3L)
 
 ## uniqlist.R
-test(1962.010, uniqlist(1:5), error = 'must be a data.table type')
-test(1962.011, uniqlist(list()), integer())
+test(1962.010, uniqlist(1:5),
+     error = 'not type list')
+test(1962.011, uniqlist(list()), list(0L))
 
 ## merge.R
 DT1 = data.table(a = 1:3, V = 'a')

From 02415a5057f87a1ce2fcccc7a57ffcd4c6295237 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 14:18:43 +0100
Subject: [PATCH 07/15] uniq unit tests, uniqlist as well, document segfault
 when internal=T

---
 R/uniqlist.R          | 19 +++++++++------
 inst/tests/tests.Rraw | 57 +++++++++++++++++++++++++++++++++++++++++++
 src/uniqlist.c        |  6 +++++
 3 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/R/uniqlist.R b/R/uniqlist.R
index c5237169a6..a4869fdc92 100644
--- a/R/uniqlist.R
+++ b/R/uniqlist.R
@@ -21,13 +21,18 @@ uniqlist = function (l, order = -1L) {
 uniq = function(x, order=integer(), internal=FALSE) {
   if (!isTRUE(internal) && !missing(order)) {
     if (!is.integer(order))
-      stop("'order' must be an integer")
-    if (length(order) != nrow2(x))
-      stop("'order' must be same length as nrow of 'x'")
-    if (anyDuplicated(order))
-      stop("'order' should not contain duplicates")
-    if (anyNA(order))
-      stop("'order' should not contain NAs")
+      order = as.integer(order)
+    if (length(order)) {
+      len = nrow2(x)
+      if (length(order) != len)
+        stop("'order' must be same length as nrow of 'x'")
+      if (anyDuplicated(order))
+        stop("'order' must not contain duplicates")
+      if (anyNA(order))
+        stop("'order' must not contain NAs")
+      if (any(!between(order, 1L, len)))
+        stop("'order' must be in range of 1:nrow(x)")
+    }
   }
   .Call(Cuniq, x, order)
 }
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 7cc6819e8f..fcdc8e0cd8 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -54,6 +54,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   test = data.table:::test
   uniqlengths = data.table:::uniqlengths
   uniqlist = data.table:::uniqlist
+  nrow2 = data.table:::nrow2
   which_ = data.table:::which_
   which.first = data.table:::which.first
   which.last = data.table:::which.last
@@ -3885,6 +3886,62 @@ if (.Machine$sizeof.longdouble == 16) {
 test(1149.1, forderv(integer(0)), integer(0))
 test(1149.2, forderv(numeric(0)), integer(0))
 
+# test uniq (uniqlist) #900         ## test number 1150 looks to be unused so taking over
+test(1150.01, uniq(data.table()), integer()) # examples
+test(1150.02, uniq(data.table(x=integer())), integer())
+test(1150.03, uniq(data.table(x=integer(), y=integer())), integer())
+test(1150.04, uniq(data.table(x=1L)), 1L)
+test(1150.05, uniq(data.table(x=1L, y=1L)), 1L)
+test(1150.06, uniq(data.table(x=1:2)), 1:2)
+test(1150.07, uniq(data.table(x=1:2, y=1:2)), 1:2)
+test(1150.08, uniq(data.table(x=1:2)[c(1L,1:2)]), c(1L,3L))
+test(1150.09, uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]), c(1L,3L))
+x = data.table(id = 1:8, v = rep(1:2, each=4)) # 'order' argument example
+test(1150.11, uniq(x[,"v"]), c(1L,5L))
+x = x[c(1:2,7:8,3:4,5:6)]
+test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L))
+o = order(x$id)
+test(1150.13, uniq(x[,"v"], order=o), c(1L,5L))
+test(1150.21, uniq(1:5), error="must be a data.table type") # error code cov
+test(1150.22, uniq(list(b = as.raw(1:5))), error="not supported")
+test(1150.23, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported")
+test(1150.31, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L)) # 'order' exceptions internal=FALSE
+test(1150.32, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow of 'x'")
+test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), error="must not contain duplicates")
+test(1150.34, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must not contain NAs")
+test(1150.35, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range")
+test(1150.36, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range")
+test(1150.41, uniq(x[,"v"], order=as.numeric(o), internal=TRUE), error="must be an integer") # 'order' exceptions internal=TRUE
+test(1150.42, uniq(x[,"v"], order=o[-1L], internal=TRUE), error="has been passed length.*nrow")
+test(1150.43, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal
+#test(1150.441, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), internal=TRUE), error="must not contain NAs") ## unhandled bad usage! thus internal # segfault, see uniqlist.c:COMPARE1_VIA_ORDER
+test(1150.442, uniq(x[c(o[1:6],o[c(NA,7L)]), "v"], internal=TRUE), c(1L,5L,7L,8L)) ## this is ok as are not using 'order' arg
+test(1150.45, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal
+test(1150.46, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal
+x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behaviour of 'order' special case
+o = order(x$id)
+test(1150.51, uniq(x[,"v"], order=o), c(1L,5L))
+test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L))
+test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow")
+test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range")
+test(1150.61, uniq(x[,"v"], order=o, internal=TRUE), c(1L,5L))
+test(1150.62, uniq(x[,"v"], order=integer(), internal=TRUE), c(1L,5L))
+test(1150.63, uniq(x[,"v"], order=-1L, internal=TRUE), error="has been passed length.*nrow")
+test(1150.64, uniq(x[1L,"v"], order=-1L, internal=TRUE), 1L) ## unhandled bad usage! thus internal
+test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L))
+test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L))
+test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L))
+test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L)
+op = options(datatable.verbose=TRUE)
+test(1150.91, uniq(data.table()), integer(), output="took")
+test(1150.92, uniq(data.table(x=integer())), integer(), output="took")
+test(1150.93, uniq(data.table(x=1:2)), 1:2, output="took")
+options(op)
+test(1150.96, nrow2(list()), 0L) # nrow2 helper tests
+test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L)
+test(1150.98, nrow2(data.table(x=1:10, b=1:5)), 10L)
+test(1150.99, nrow2(1:5), error="nrow2 expects data.table or list")
+
 # test uniqlengths
 set.seed(45)
 x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
diff --git a/src/uniqlist.c b/src/uniqlist.c
index 4cf93a5232..c15ab2b5c9 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -7,6 +7,12 @@ for (int i=1; i<nrow; i++) {                                                  \
   elem = *++vd;                                                               \
   if (elem!=prev
 
+/* segfault in COMPARE1_VIA_ORDER when using internal=TRUE (disabled test 1150.441)
+ * uniq(data.table(a=1:3), order=c(1L,NA,3L), internal=TRUE) # where NA is any int not in 1:nrow(x)
+ * could be avoided by extra check, line change
+ * from:  elem = vd[*++o -1];
+ * into:  int oi = *++o; if (oi < 1 || oi > nrow) error("internal error: *++o not in range of 1:nrow(x)\n"); elem = vd[oi -1];
+ */
 #define COMPARE1_VIA_ORDER                                                    \
 prev = vd[*o -1];                                                             \
 for (int i=1; i<nrow; i++) {                                                  \

From 11e1020c5b3a0fb2058cff86615839738a91c7aa Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 14:31:33 +0100
Subject: [PATCH 08/15] uniqueNlogical codecov, uniqlengths test num to int

---
 R/uniqlist.R          | 2 +-
 inst/tests/tests.Rraw | 2 ++
 src/uniqlist.c        | 6 +++---
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/uniqlist.R b/R/uniqlist.R
index a4869fdc92..2f706e8a4d 100644
--- a/R/uniqlist.R
+++ b/R/uniqlist.R
@@ -12,7 +12,7 @@ uniqlist = function (l, order = -1L) {
   # those are only for backward compatibility, probably not really used anywhere, will keep 1962.010 and 1962.011 happy
   if (!is.list(l)) stop("l not type list")
   if (!length(l)) return(list(0L))
-  # this is for compatibility to new uniq.c code
+  # this is for compatibility to new uniq C code
   if (identical(order, -1L)) order = integer()
 
   uniq(l, order, internal=TRUE)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index fcdc8e0cd8..fc03f27181 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -3950,6 +3950,7 @@ o1 <- uniqlist(list(x), ox)
 test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
 o1 <- uniqlist(list(x))
 test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
+test(1151.3, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(as.numeric(o1), as.numeric(length(x))))
 rm(list=c("x","ox","o1"))
 gc()
 
@@ -6786,6 +6787,7 @@ test(1475.13, uniqueN(NA),                               1L)
 test(1475.14, uniqueN(NA, na.rm=TRUE),                   0L)
 test(1475.15, uniqueN(logical()),                        0L)
 test(1475.16, uniqueN(logical(), na.rm=TRUE),            0L)
+test(1475.17, uniqueN(TRUE, na.rm=NA), error="must be TRUE or FALSE")
 
 # preserve class attribute in GForce mean (and sum)
 DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day")))
diff --git a/src/uniqlist.c b/src/uniqlist.c
index c15ab2b5c9..88a8b1641e 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -156,9 +156,9 @@ SEXP uniq(SEXP x, SEXP order) {
 
 SEXP uniqlengths(SEXP x, SEXP n) {
   if (!isInteger(x))
-    error(_("Input argument 'x' to 'uniqlengths' must be an integer vector"));
+    error(_("internal error: Input argument 'x' to 'uniqlengths' must be an integer vector")); // # nocov
   if (!isInteger(x) || length(n)!=1 || INTEGER(n)[0]==NA_INTEGER)
-    error(_("Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA"));
+    error(_("internal error: Input argument 'n' to 'uniqlengths' must be an integer vector of length 1 non NA")); // # nocov
   R_len_t len = length(x);
   SEXP ans = PROTECT(allocVector(INTSXP, len));
   const int *xp = INTEGER(x);
@@ -362,7 +362,7 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul
 SEXP uniqueNlogical(SEXP x, SEXP narmArg) {
   // single pass; short-circuit and return as soon as all 3 values are found
   if (!isLogical(x))
-    error(_("x is not a logical vector"));
+    error(_("internal error: x is not a logical vector")); // # nocov
   if (!IS_TRUE_OR_FALSE(narmArg))
     error(_("na.rm must be TRUE or FALSE"));
   bool narm = LOGICAL(narmArg)[0]==1;

From fef3aece6207c3ee76255eeae8606131ab0fc002 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 14:48:46 +0100
Subject: [PATCH 09/15] news and more manual

---
 NEWS.md     | 2 ++
 man/uniq.Rd | 8 ++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 71fd76aa65..db206de8ab 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -81,6 +81,8 @@ unit = "s")
 
 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.
 
+15. New function `uniq` has been exported (previously known as `uniqlist` when used internally). Function is useful to find consecutively unique rows, [#900](https://github.com/Rdatatable/data.table/issues/900). Thanks to @anhqle for feature request. For more details about usage see function manual [`?uniq`](https://rdatatable.gitlab.io/data.table/library/data.table/html/uniq.html).
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
diff --git a/man/uniq.Rd b/man/uniq.Rd
index 389a9c5aaa..c8ecd2c44c 100644
--- a/man/uniq.Rd
+++ b/man/uniq.Rd
@@ -15,7 +15,7 @@ uniq(x, order, internal=FALSE)
 }
 \details{
   Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row.
-  Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, sorted, no duplicates, no NAs).
+  Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, no duplicates, no NAs). Anyone setting \code{internal} to \code{TRUE} is obligated to check the \code{order} argument beforehand, otherwise \emph{segfault} exception is likely to happen and terminate R process abnormally.
 }
 \value{
   Integer vector corresponding to rows which are consecutively unique.
@@ -32,12 +32,16 @@ uniq(data.table(x=1:2, y=1:2))
 uniq(data.table(x=1:2)[c(1L,1:2)])
 uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)])
 
-# order argument
+# 'order' argument
 x = data.table(id = 1:8, v = rep(1:2, each=4))
 uniq(x[,"v"])
 x = x[c(1:2,7:8,3:4,5:6)]
 uniq(x[,"v"])
 o = order(x$id)
 uniq(x[,"v"], order=o)
+
+if (!anyDuplicated(o) && !anyNA(o) && between(o, 1L, nrow(x))) {
+  uniq(x[,"v"], order=o, internal=TRUE)
+}
 }
 \keyword{ data }

From bcdc65a8d6cd017c4abffa04dc244e166bef3d93 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 11 Apr 2020 14:50:59 +0100
Subject: [PATCH 10/15] if needs all, unlike stopifnot, fix

---
 man/uniq.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/uniq.Rd b/man/uniq.Rd
index c8ecd2c44c..63dccf2953 100644
--- a/man/uniq.Rd
+++ b/man/uniq.Rd
@@ -40,7 +40,7 @@ uniq(x[,"v"])
 o = order(x$id)
 uniq(x[,"v"], order=o)
 
-if (!anyDuplicated(o) && !anyNA(o) && between(o, 1L, nrow(x))) {
+if (!anyDuplicated(o) && !anyNA(o) && all(between(o, 1L, nrow(x)))) {
   uniq(x[,"v"], order=o, internal=TRUE)
 }
 }

From 58beb639aa65b1cb529738e84e00fae05e8429e0 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sun, 19 Apr 2020 20:43:50 +0100
Subject: [PATCH 11/15] segfault proof public api

---
 R/uniqlist.R          | 33 +++++++++++----------
 inst/tests/tests.Rraw | 52 ++++++++++++++++++--------------
 man/uniq.Rd           | 17 ++++++-----
 src/data.table.h      |  2 +-
 src/uniqlist.c        | 69 ++++++++++++++++++++++++++++++++-----------
 5 files changed, 110 insertions(+), 63 deletions(-)

diff --git a/R/uniqlist.R b/R/uniqlist.R
index 2f706e8a4d..fedba89646 100644
--- a/R/uniqlist.R
+++ b/R/uniqlist.R
@@ -15,26 +15,27 @@ uniqlist = function (l, order = -1L) {
   # this is for compatibility to new uniq C code
   if (identical(order, -1L)) order = integer()
 
-  uniq(l, order, internal=TRUE)
+  funiq(l, order, safe=FALSE)
 }
 
-uniq = function(x, order=integer(), internal=FALSE) {
-  if (!isTRUE(internal) && !missing(order)) {
-    if (!is.integer(order))
+uniq = function(x, order=integer()) {
+  if (!is.list(x))
+    stop("'x' must be a data.table type object");
+  if (!is.integer(order)) {
+    if (is.numeric(order))
       order = as.integer(order)
-    if (length(order)) {
-      len = nrow2(x)
-      if (length(order) != len)
-        stop("'order' must be same length as nrow of 'x'")
-      if (anyDuplicated(order))
-        stop("'order' must not contain duplicates")
-      if (anyNA(order))
-        stop("'order' must not contain NAs")
-      if (any(!between(order, 1L, len)))
-        stop("'order' must be in range of 1:nrow(x)")
-    }
+    else
+      stop("'order' must be an integer")
   }
-  .Call(Cuniq, x, order)
+  if (length(order) && length(order)!=nrow2(x))
+    stop("'order' must be same length as nrow of 'x'")
+  funiq(x, order, safe=TRUE)
+}
+
+# use safe=F when you are sure that 'order' is in 1:nrow(x)
+# otherwise it segfaults, thus internal
+funiq = function(x, order=integer(), safe=FALSE) {
+  .Call(Cuniq, x, order, safe)
 }
 
 # implemented for returning the lengths of groups obtained from uniqlist (for internal use only)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index fc03f27181..0a0699b53e 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -54,6 +54,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   test = data.table:::test
   uniqlengths = data.table:::uniqlengths
   uniqlist = data.table:::uniqlist
+  funiq = data.table:::funiq
   nrow2 = data.table:::nrow2
   which_ = data.table:::which_
   which.first = data.table:::which.first
@@ -3902,32 +3903,38 @@ x = x[c(1:2,7:8,3:4,5:6)]
 test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L))
 o = order(x$id)
 test(1150.13, uniq(x[,"v"], order=o), c(1L,5L))
-test(1150.21, uniq(1:5), error="must be a data.table type") # error code cov
-test(1150.22, uniq(list(b = as.raw(1:5))), error="not supported")
-test(1150.23, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported")
-test(1150.31, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L)) # 'order' exceptions internal=FALSE
-test(1150.32, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow of 'x'")
-test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), error="must not contain duplicates")
-test(1150.34, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must not contain NAs")
-test(1150.35, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range")
-test(1150.36, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range")
-test(1150.41, uniq(x[,"v"], order=as.numeric(o), internal=TRUE), error="must be an integer") # 'order' exceptions internal=TRUE
-test(1150.42, uniq(x[,"v"], order=o[-1L], internal=TRUE), error="has been passed length.*nrow")
-test(1150.43, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal
-#test(1150.441, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), internal=TRUE), error="must not contain NAs") ## unhandled bad usage! thus internal # segfault, see uniqlist.c:COMPARE1_VIA_ORDER
-test(1150.442, uniq(x[c(o[1:6],o[c(NA,7L)]), "v"], internal=TRUE), c(1L,5L,7L,8L)) ## this is ok as are not using 'order' arg
-test(1150.45, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal
-test(1150.46, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), internal=TRUE), c(1L,5L,7L,8L)) ## unhandled bad usage! thus internal
-x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behaviour of 'order' special case
+x = data.table(id = 1:8, v = rep(1:2, each=4), w=1L)
+o = order(x$id)
+test(1150.21, uniq(1:5), error="must be a data.table type object")
+test(1150.22, funiq(1:5), error="internal.*must be a data.table type object")
+test(1150.23, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L))
+test(1150.24, funiq(x[,"v"], order=as.numeric(o)), error="internal.*must be an integer")
+test(1150.25, uniq(x[,"v"], order="a"), error="must be an integer")
+test(1150.26, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow")
+test(1150.27, funiq(x[,"v"], order=o[-1L]), error="internal.*has been passed length.*nrow")
+test(1150.28, uniq(list(b = as.raw(1:5))), error="not supported")
+test(1150.29, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported")
+test(1150.30, funiq(x[,"v"], safe=NA), error="must be TRUE or FALSE")
+test(1150.31, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must be in range")
+test(1150.32, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range")
+test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range")
+test(1150.34, uniq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)])), error="must be in range")
+test(1150.35, funiq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") ## trying safe=F would segfault!
+test(1150.36, funiq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), safe=TRUE), error="must be in range")
+test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error="must be in range")
+test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range")
+test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"])
+test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L))
+x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case
 o = order(x$id)
 test(1150.51, uniq(x[,"v"], order=o), c(1L,5L))
 test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L))
 test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow")
 test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range")
-test(1150.61, uniq(x[,"v"], order=o, internal=TRUE), c(1L,5L))
-test(1150.62, uniq(x[,"v"], order=integer(), internal=TRUE), c(1L,5L))
-test(1150.63, uniq(x[,"v"], order=-1L, internal=TRUE), error="has been passed length.*nrow")
-test(1150.64, uniq(x[1L,"v"], order=-1L, internal=TRUE), 1L) ## unhandled bad usage! thus internal
+test(1150.61, funiq(x[,"v"], order=o), c(1L,5L))
+test(1150.62, funiq(x[,"v"], order=integer()), c(1L,5L))
+test(1150.63, funiq(x[,"v"], order=-1L), error="has been passed length.*nrow")
+test(1150.64, funiq(x[1L,"v"], order=-1L), error="must be in range")
 test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L))
 test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L))
 test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L))
@@ -3935,7 +3942,8 @@ test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L)
 op = options(datatable.verbose=TRUE)
 test(1150.91, uniq(data.table()), integer(), output="took")
 test(1150.92, uniq(data.table(x=integer())), integer(), output="took")
-test(1150.93, uniq(data.table(x=1:2)), 1:2, output="took")
+test(1150.93, uniq(data.table(x=1L)), 1L, output="took")
+test(1150.94, uniq(data.table(x=1:2)), 1:2, output="took")
 options(op)
 test(1150.96, nrow2(list()), 0L) # nrow2 helper tests
 test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L)
diff --git a/man/uniq.Rd b/man/uniq.Rd
index 63dccf2953..37652b0b58 100644
--- a/man/uniq.Rd
+++ b/man/uniq.Rd
@@ -1,21 +1,23 @@
 \name{uniq}
 \alias{uniq}
 \alias{uniqlist}
+\alias{funiq}
 \title{ Consecutively unique rows }
 \description{
   Finds the consecutively unique rows.
 }
 \usage{
-uniq(x, order, internal=FALSE)
+uniq(x, order=integer())
 }
 \arguments{
   \item{x}{ data.table type object. }
-  \item{order}{ integer vector order of \code{x}. }
-  \item{internal}{ logical flag, escapes checks of \code{order} argument. }
+    \item{order}{ integer vector order of \code{x}, must not contain duplicates. }
 }
 \details{
   Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row.
-  Function should be considered experimental. Argument \code{internal} should not be set to \code{FALSE} unless \code{order} is a \emph{shuffle} (strict permutation of \code{1:nrow(x)}, no duplicates, no NAs). Anyone setting \code{internal} to \code{TRUE} is obligated to check the \code{order} argument beforehand, otherwise \emph{segfault} exception is likely to happen and terminate R process abnormally.
+}
+\note{
+  It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception.
 }
 \value{
   Integer vector corresponding to rows which are consecutively unique.
@@ -37,11 +39,12 @@ x = data.table(id = 1:8, v = rep(1:2, each=4))
 uniq(x[,"v"])
 x = x[c(1:2,7:8,3:4,5:6)]
 uniq(x[,"v"])
+
 o = order(x$id)
 uniq(x[,"v"], order=o)
-
-if (!anyDuplicated(o) && !anyNA(o) && all(between(o, 1L, nrow(x)))) {
-  uniq(x[,"v"], order=o, internal=TRUE)
+# or if we are not sure if 'o' has no duplicates
+if (!anyDuplicated(o)) {
+  uniq(x[,"v"], order=o)
 }
 }
 \keyword{ data }
diff --git a/src/data.table.h b/src/data.table.h
index 646a32cc1a..51fe3598ff 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val);
 SEXP vecseq(SEXP x, SEXP len, SEXP clamp);
 
 // uniqlist.c
-SEXP uniq(SEXP x, SEXP order);
+SEXP uniq(SEXP x, SEXP order, SEXP safe);
 SEXP uniqlengths(SEXP x, SEXP n);
 
 // chmatch.c
diff --git a/src/uniqlist.c b/src/uniqlist.c
index 88a8b1641e..f0578f0d97 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -1,24 +1,28 @@
 #include "data.table.h"
 
 /* uniq macros */
+
 #define COMPARE1                                                              \
 prev = *vd;                                                                   \
 for (int i=1; i<nrow; i++) {                                                  \
   elem = *++vd;                                                               \
   if (elem!=prev
 
-/* segfault in COMPARE1_VIA_ORDER when using internal=TRUE (disabled test 1150.441)
- * uniq(data.table(a=1:3), order=c(1L,NA,3L), internal=TRUE) # where NA is any int not in 1:nrow(x)
- * could be avoided by extra check, line change
- * from:  elem = vd[*++o -1];
- * into:  int oi = *++o; if (oi < 1 || oi > nrow) error("internal error: *++o not in range of 1:nrow(x)\n"); elem = vd[oi -1];
- */
 #define COMPARE1_VIA_ORDER                                                    \
 prev = vd[*o -1];                                                             \
 for (int i=1; i<nrow; i++) {                                                  \
   elem = vd[*++o -1];                                                         \
   if (elem!=prev
 
+#define COMPARE1_VIA_ORDER_SAFE                                               \
+prev = vd[*o -1];                                                             \
+for (int i=1; i<nrow; i++) {                                                  \
+  int oi = *++o;                                                              \
+  if (oi < 1 || oi > nrow)                                                    \
+    error("'order' must be in range 1:nrow(x)");                              \
+  elem = vd[oi -1];                                                           \
+  if (elem!=prev
+
 #define COMPARE2                                                              \
   ) {                                                                         \
     iidx[len++] = i+1;                                                        \
@@ -30,11 +34,15 @@ for (int i=1; i<nrow; i++) {                                                  \
   prev = elem;                                                                \
 }
 
-SEXP uniq(SEXP x, SEXP order) {
+SEXP uniq(SEXP x, SEXP order, SEXP safe) {
+  // safe=true ensure no segfault using COMPARE1_VIA_ORDER_SAFE
+  // safe=false is not available in public API
   if (!isNewList(x))
-    error(_("'x' must be a data.table type object"));
+    error(_("internal error: 'x' must be a data.table type object"));
   if (!isInteger(order))
-    error(_("'order' must be an integer, you should be more careful when using internal=TRUE"));
+    error(_("internal error: 'order' must be an integer"));
+  if (!IS_TRUE_OR_FALSE(safe))
+    error(_("internal error: 'safe' must be TRUE or FALSE"));
   const bool verbose = GetVerbose();
   double tic = 0;
   if (verbose)
@@ -49,20 +57,34 @@ SEXP uniq(SEXP x, SEXP order) {
     return(ans);
   }
   bool via_order = LENGTH(order) > 0;
+  bool via_order_safe = via_order && LOGICAL(safe)[0];
   if (via_order && LENGTH(order)!=nrow)
-    error(_("uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow);
+    error(_("internal error: uniq has been passed length(order)==%d but nrow==%d"), LENGTH(order), nrow);
+  const int *o = INTEGER(order);  // only used when via_order[_safe] is true
+  if (nrow==1) {
+    if (via_order && o[0]!=1)
+      error("'order' must be in range 1:nrow(x)");
+    SEXP ans = PROTECT(allocVector(INTSXP, 1));
+    INTEGER(ans)[0] = 1;
+    if (verbose)
+      Rprintf(_("uniq: took %.3fs\n"), omp_get_wtime()-tic);
+    UNPROTECT(1);
+    return(ans);
+  }
+  
   unsigned long long *ulv; // for numeric check speed-up (to overcome NA/NaN/Inf/-Inf comparisons) (> 2x speed-up)
   R_len_t isize=1000, len=1;
   int *iidx = Calloc(isize, int); // for 'idx'
   iidx[0] = 1; // first row is always the first of the first group
   if (ncol==1) {
     SEXP v = VECTOR_ELT(x,0);
-    const int *o = INTEGER(order);  // only used when via_order is true
     switch(TYPEOF(v)) {
     case INTSXP : case LGLSXP : {
       const int *vd=INTEGER(v);
       int prev, elem;
-      if (via_order) {
+      if (via_order_safe) {
+        COMPARE1_VIA_ORDER_SAFE COMPARE2
+      } else if (via_order) {
         // ad hoc by (order passed in)
         COMPARE1_VIA_ORDER COMPARE2
       } else {
@@ -73,7 +95,9 @@ SEXP uniq(SEXP x, SEXP order) {
     case STRSXP : {
       const SEXP *vd=STRING_PTR(v);
       SEXP prev, elem;
-      if (via_order) {
+      if (via_order_safe) {
+        COMPARE1_VIA_ORDER_SAFE && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2
+      } else if (via_order) {
         COMPARE1_VIA_ORDER && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2   // but most of the time they are equal, so ENC2UTF8 doesn't need to be called
       } else {
         COMPARE1           && ENC2UTF8(elem)!=ENC2UTF8(prev) COMPARE2
@@ -84,13 +108,17 @@ SEXP uniq(SEXP x, SEXP order) {
       uint64_t prev, elem;
       // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine
       if (getNumericRounding_C()==0 /*default*/ || Rinherits(v,char_integer64)) {
-        if (via_order) {
+        if (via_order_safe) {
+          COMPARE1_VIA_ORDER_SAFE COMPARE2
+        } else if (via_order) {
           COMPARE1_VIA_ORDER COMPARE2
         } else {
           COMPARE1           COMPARE2
         }
       } else {
-        if (via_order) {
+        if (via_order_safe) {
+          COMPARE1_VIA_ORDER_SAFE && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2
+        } else if (via_order) {
           COMPARE1_VIA_ORDER && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2
         } else {
           COMPARE1           && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2
@@ -102,13 +130,20 @@ SEXP uniq(SEXP x, SEXP order) {
     }
     }
   } else { // ncol>1
-    R_len_t previ, thisi = via_order ? INTEGER(order)[0]-1 : 0;
+    R_len_t previ, thisi = via_order ? o[0]-1 : 0;
     bool *i64 = (bool *)R_alloc(ncol, sizeof(bool));
     for (int i=0; i<ncol; i++)
       i64[i] = Rinherits(VECTOR_ELT(x,i),char_integer64);
     for (int i=1; i<nrow; i++) {
       previ = thisi;
-      thisi = via_order ? INTEGER(order)[i]-1 : i;
+      if (!via_order) {
+        thisi = i;
+      } else {
+        int oi = o[i];
+        if (via_order_safe && (oi < 1 || oi > nrow))
+          error("'order' must be in range 1:nrow(x)");
+        thisi = oi-1;
+      }
       int j = ncol;  // the last column varies the most frequently so check that first and work backwards
       bool same = true; // flag to indicate if the values in a row are same as the previous row, if flag false then we can move on to next group
       while (--j>=0 && same) {

From aed71cf9041ac5f5b6881b8e0cf75d5a1f410c71 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sun, 19 Apr 2020 20:57:59 +0100
Subject: [PATCH 12/15] manual entry align

---
 man/uniq.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/uniq.Rd b/man/uniq.Rd
index 37652b0b58..1a46603f38 100644
--- a/man/uniq.Rd
+++ b/man/uniq.Rd
@@ -11,7 +11,7 @@ uniq(x, order=integer())
 }
 \arguments{
   \item{x}{ data.table type object. }
-    \item{order}{ integer vector order of \code{x}, must not contain duplicates. }
+  \item{order}{ integer vector order of \code{x}, must not contain duplicates. }
 }
 \details{
   Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row.

From c93ed24c0471ca20620e8e2cf88522e5e0da17d8 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sun, 19 Apr 2020 21:00:09 +0100
Subject: [PATCH 13/15] another manual entry fix

---
 man/uniq.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/uniq.Rd b/man/uniq.Rd
index 1a46603f38..01d01a5b2a 100644
--- a/man/uniq.Rd
+++ b/man/uniq.Rd
@@ -17,7 +17,7 @@ uniq(x, order=integer())
   Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. Unless \code{order} is provided, then it also drops any previous row.
 }
 \note{
-  It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception.
+  It is an undefined behavior when \code{order} argument contains duplicates. It was designed to take what the \code{\link[base]{order}} function returns. We do not check for duplicates, although we still check for values to be in range \code{1:nrow(x)} and non-NA, to avoid \emph{segfault} exception.
 }
 \value{
   Integer vector corresponding to rows which are consecutively unique.

From b589fad0241b6c0e415a3ae925c4fdfed7341508 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 20 Apr 2020 11:18:57 +0100
Subject: [PATCH 14/15] test coverage

---
 inst/tests/tests.Rraw | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 0a0699b53e..7f182d511b 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -3925,6 +3925,13 @@ test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error=
 test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range")
 test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"])
 test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L))
+test(1150.41, uniq(data.table(x = c("a","a","b","b","c"))), c(1L,3L,5L)) ## test coverage
+old = getNumericRounding()
+setNumericRounding(0)
+test(1150.42, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L))
+setNumericRounding(2)
+test(1150.43, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L))
+setNumericRounding(old)
 x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case
 o = order(x$id)
 test(1150.51, uniq(x[,"v"], order=o), c(1L,5L))

From a09a54f5f382cb19b5825adb591c3c728fcb4da9 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 20 Apr 2020 11:47:29 +0100
Subject: [PATCH 15/15] proper codecov fix

---
 inst/tests/tests.Rraw | 9 ++++++---
 src/uniqlist.c        | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 7f182d511b..31f7e21092 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -3925,12 +3925,15 @@ test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error=
 test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range")
 test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"])
 test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L))
-test(1150.41, uniq(data.table(x = c("a","a","b","b","c"))), c(1L,3L,5L)) ## test coverage
+test(1150.41, uniq(data.table(x = c("a","a","b","b","c")), order=1:5), c(1L,3L,5L)) ## test coverage
 old = getNumericRounding()
 setNumericRounding(0)
-test(1150.42, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L))
+test(1150.42, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L))
 setNumericRounding(2)
-test(1150.43, uniq(data.table(x = c(1,1,2,2,3))), c(1L,3L,5L))
+test(1150.43, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L))
+if (test_bit64) {
+  test(1150.44, uniq(data.table(x = as.integer64(c(1,1,2,2,3))), order=1:5), c(1L,3L,5L))
+}
 setNumericRounding(old)
 x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case
 o = order(x$id)
diff --git a/src/uniqlist.c b/src/uniqlist.c
index f0578f0d97..1a9e27489e 100644
--- a/src/uniqlist.c
+++ b/src/uniqlist.c
@@ -107,7 +107,8 @@ SEXP uniq(SEXP x, SEXP order, SEXP safe) {
       const uint64_t *vd=(const uint64_t *)REAL(v);
       uint64_t prev, elem;
       // grouping by integer64 makes sense (ids). grouping by float supported but a good use-case for that is harder to imagine
-      if (getNumericRounding_C()==0 /*default*/ || Rinherits(v,char_integer64)) {
+      if (getNumericRounding_C()==0 /*default*/ ||
+          Rinherits(v,char_integer64)) {
         if (via_order_safe) {
           COMPARE1_VIA_ORDER_SAFE COMPARE2
         } else if (via_order) {