export uniq function (uniqlist) #4372

MichaelChirico · 2024-09-09T16:32:27Z

Suggested change

if (!is.integer(order)) {

if (is.numeric(order))

order = as.integer(order)

else

stop("'order' must be an integer")

}

if (!is.numeric(order))

stopf("'%s' must be an integer", "order")

order = as.integer(order)

-Original file line number
+Diff line change
@@ Expand Up / @@ -56,6 +56,7 @@ export(nafill) @@
     export(setnafill)
     export(.Last.updated)
     export(fcoalesce)
+    export(uniq)
     S3method("[", data.table)
     S3method("[<-", data.table)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -81,6 +81,8 @@ unit = "s") @@
 . Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.
+. New function `uniq` has been exported (previously used internally as `uniqlist`). It finds consecutively unique rows, [#900](https://github.com/Rdatatable/data.table/issues/900). Thanks to @anhqle for the feature request. For usage details, see the manual page [`?uniq`](https://rdatatable.gitlab.io/data.table/library/data.table/html/uniq.html).
     ## BUG FIXES
 . A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -54,6 +54,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { @@
       test = data.table:::test
       uniqlengths = data.table:::uniqlengths
       uniqlist = data.table:::uniqlist
+      funiq = data.table:::funiq
+      nrow2 = data.table:::nrow2
       which_ = data.table:::which_
       which.first = data.table:::which.first
       which.last = data.table:::which.last
@@ Expand Down Expand Up / @@ -3885,6 +3887,79 @@ if (.Machine$sizeof.longdouble == 16) { @@
     test(1149.1, forderv(integer(0)), integer(0))
     test(1149.2, forderv(numeric(0)), integer(0))
+    # test uniq (uniqlist) #900         ## test number 1150 looks to be unused so taking over
+    test(1150.01, uniq(data.table()), integer()) # examples
+    test(1150.02, uniq(data.table(x=integer())), integer())
+    test(1150.03, uniq(data.table(x=integer(), y=integer())), integer())
+    test(1150.04, uniq(data.table(x=1L)), 1L)
+    test(1150.05, uniq(data.table(x=1L, y=1L)), 1L)
+    test(1150.06, uniq(data.table(x=1:2)), 1:2)
+    test(1150.07, uniq(data.table(x=1:2, y=1:2)), 1:2)
+    test(1150.08, uniq(data.table(x=1:2)[c(1L,1:2)]), c(1L,3L))
+    test(1150.09, uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]), c(1L,3L))
+    x = data.table(id = 1:8, v = rep(1:2, each=4)) # 'order' argument example
+    test(1150.11, uniq(x[,"v"]), c(1L,5L))
+    x = x[c(1:2,7:8,3:4,5:6)]
+    test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L))
+    o = order(x$id)
+    test(1150.13, uniq(x[,"v"], order=o), c(1L,5L))
+    x = data.table(id = 1:8, v = rep(1:2, each=4), w=1L)
+    o = order(x$id)
+    test(1150.21, uniq(1:5), error="must be a data.table type object")
+    test(1150.22, funiq(1:5), error="internal.*must be a data.table type object")
+    test(1150.23, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L))
+    test(1150.24, funiq(x[,"v"], order=as.numeric(o)), error="internal.*must be an integer")
+    test(1150.25, uniq(x[,"v"], order="a"), error="must be an integer")
+    test(1150.26, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow")
+    test(1150.27, funiq(x[,"v"], order=o[-1L]), error="internal.*has been passed length.*nrow")
+    test(1150.28, uniq(list(b = as.raw(1:5))), error="not supported")
+    test(1150.29, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported")
+    test(1150.30, funiq(x[,"v"], safe=NA), error="must be TRUE or FALSE")
+    test(1150.31, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must be in range")
+    test(1150.32, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range")
+    test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range")
+    test(1150.34, uniq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)])), error="must be in range")
+    test(1150.35, funiq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") ## trying safe=F would segfault!
+    test(1150.36, funiq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), safe=TRUE), error="must be in range")
+    test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error="must be in range")
+    test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range")
+    test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"])
+    test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L))
+    test(1150.41, uniq(data.table(x = c("a","a","b","b","c")), order=1:5), c(1L,3L,5L)) ## test coverage
+    old = getNumericRounding()
+    setNumericRounding(0)
+    test(1150.42, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L))
+    setNumericRounding(2)
+    test(1150.43, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L))
+    if (test_bit64) {
+      test(1150.44, uniq(data.table(x = as.integer64(c(1,1,2,2,3))), order=1:5), c(1L,3L,5L))
+    }
+    setNumericRounding(old)
+    x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case
+    o = order(x$id)
+    test(1150.51, uniq(x[,"v"], order=o), c(1L,5L))
+    test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L))
+    test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow")
+    test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range")
+    test(1150.61, funiq(x[,"v"], order=o), c(1L,5L))
+    test(1150.62, funiq(x[,"v"], order=integer()), c(1L,5L))
+    test(1150.63, funiq(x[,"v"], order=-1L), error="has been passed length.*nrow")
+    test(1150.64, funiq(x[1L,"v"], order=-1L), error="must be in range")
+    test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L))
+    test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L))
+    test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L))
+    test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L)
+    op = options(datatable.verbose=TRUE)
+    test(1150.91, uniq(data.table()), integer(), output="took")
+    test(1150.92, uniq(data.table(x=integer())), integer(), output="took")
+    test(1150.93, uniq(data.table(x=1L)), 1L, output="took")
+    test(1150.94, uniq(data.table(x=1:2)), 1:2, output="took")
+    options(op)
+    test(1150.96, nrow2(list()), 0L) # nrow2 helper tests
+    test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L)
+    test(1150.98, nrow2(data.table(x=1:10, b=1:5)), 10L)
+    test(1150.99, nrow2(1:5), error="nrow2 expects data.table or list")
     # test uniqlengths
     set.seed(45)
     x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
@@ Expand All / @@ -3893,6 +3968,7 @@ o1 <- uniqlist(list(x), ox) @@
     test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
     o1 <- uniqlist(list(x))
     test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
+    test(1151.3, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(as.numeric(o1), as.numeric(length(x))))
     rm(list=c("x","ox","o1"))
     gc()
@@ Expand Down Expand Up / @@ -6729,6 +6805,7 @@ test(1475.13, uniqueN(NA), 1L) @@
     test(1475.14, uniqueN(NA, na.rm=TRUE),                   0L)
     test(1475.15, uniqueN(logical()),                        0L)
     test(1475.16, uniqueN(logical(), na.rm=TRUE),            0L)
+    test(1475.17, uniqueN(TRUE, na.rm=NA), error="must be TRUE or FALSE")
     # preserve class attribute in GForce mean (and sum)
     DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day")))
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,50 @@
+    \name{uniq}
+    \alias{uniq}
+    \alias{uniqlist}
+    \alias{funiq}
+    \title{ Consecutively unique rows }
+    \description{
+      Finds the consecutively unique rows.
+    }
+    \usage{
+    uniq(x, order=integer())
+    }
+    \arguments{
+      \item{x}{ data.table type object. }
+      \item{order}{ integer vector giving an ordering of the rows of \code{x}; must not contain duplicates. }
+    }
+    \details{
+      Works like UNIX \emph{uniq} as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but doesn't drop duplicates of any previous row. When \code{order} is provided, rows are compared in the sequence given by \code{order}, so duplicates that become adjacent under that ordering are dropped as well.
+    }
+    \note{
+      Behavior is undefined when the \code{order} argument contains duplicates. The argument was designed to take what the \code{\link[base]{order}} function returns. We do not check for duplicates, but we do check that values are non-NA and in the range \code{1:nrow(x)}, to avoid a \emph{segfault}.
+    }
+    \value{
+      Integer vector corresponding to rows which are consecutively unique.
+    }
+    \seealso{ \code{\link{data.table}}, \code{\link{rleid}} }
+    \examples{
+    uniq(data.table())
+    uniq(data.table(x=integer()))
+    uniq(data.table(x=integer(), y=integer()))
+    uniq(data.table(x=1L))
+    uniq(data.table(x=1L, y=1L))
+    uniq(data.table(x=1:2))
+    uniq(data.table(x=1:2, y=1:2))
+    uniq(data.table(x=1:2)[c(1L,1:2)])
+    uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)])
+    # 'order' argument
+    x = data.table(id = 1:8, v = rep(1:2, each=4))
+    uniq(x[,"v"])
+    x = x[c(1:2,7:8,3:4,5:6)]
+    uniq(x[,"v"])
+    o = order(x$id)
+    uniq(x[,"v"], order=o)
+    # or if we are not sure if 'o' has no duplicates
+    if (!anyDuplicated(o)) {
+      uniq(x[,"v"], order=o)
+    }
+    }
+    \keyword{ data }

-Original file line number
+Diff line change
@@ Expand Up / @@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val); @@
     SEXP vecseq(SEXP x, SEXP len, SEXP clamp);
     // uniqlist.c
-    SEXP uniqlist(SEXP l, SEXP order);
+    SEXP uniq(SEXP x, SEXP order, SEXP safe);
     SEXP uniqlengths(SEXP x, SEXP n);
     // chmatch.c
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

export uniq function (uniqlist) #4372

Uh oh!

Diff view

Diff view

There are no files selected for viewing

MichaelChirico Sep 9, 2024

Uh oh!

Uh oh!

-Original file line number
+Diff line change
@@ Expand Up / @@ -149,7 +149,7 @@ R_CallMethodDef callMethods[] = { @@
     {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1},
     {"Cfmelt", (DL_FUNC) &fmelt, -1},
     {"Cfcast", (DL_FUNC) &fcast, -1},
-    {"Cuniqlist", (DL_FUNC) &uniqlist, -1},
+    {"Cuniq", (DL_FUNC) &uniq, -1},
     {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1},
     {"Cforder", (DL_FUNC) &forder, -1},
     {"Cfsorted", (DL_FUNC) &fsorted, -1},
@@ Expand Down @@

export uniq function (uniqlist) #4372

Are you sure you want to change the base?

Uh oh!

export uniq function (uniqlist) #4372

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

MichaelChirico Sep 9, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!