Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export(nafill)
export(setnafill)
export(.Last.updated)
export(fcoalesce)
export(uniq)

S3method("[", data.table)
S3method("[<-", data.table)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ unit = "s")

14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.

15. New function `uniq` has been exported (it was previously used internally under the name `uniqlist`). It finds consecutively unique rows, [#900](https://github.com/Rdatatable/data.table/issues/900). Thanks to @anhqle for the feature request. For more details about usage, see the manual page [`?uniq`](https://rdatatable.gitlab.io/data.table/library/data.table/html/uniq.html).

## BUG FIXES

1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
Expand Down
52 changes: 37 additions & 15 deletions R/uniqlist.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,41 @@
nrow2 = function(x) {
  # Number of rows of a data.table or of a plain list.
  #   x : a data.table or a list; anything of length zero counts as having 0 rows
  # Returns nrow(x) for a data.table and the length of the first element for any other list.
  # Stops with an error for non-empty input that is neither a data.table nor a list.
  if (length(x) == 0L) return(0L)
  if (is.data.table(x)) return(nrow(x))
  if (!is.list(x)) stop("nrow2 expects data.table or list")
  length(x[[1L]])
}

uniqlist = function (l, order = -1L)
{
# Assumes input list is ordered by each list item (or by 'order' if supplied), and that all list elements are the same length
# Finds the non-duplicate rows. Was called duplist but now grows vector - doesn't over-allocate result vector and
# is >2x times faster on numeric types
# TO DO: Possibly reinstate reverse argument :
# FALSE works in the usual duplicated() way, the first in a sequence of dups, will be FALSE
# TRUE has the last in a sequence of dups FALSE (so you can keep the last if that's required)
# l = list(...)
if (!is.list(l))
stop("l not type list")
if (!length(l)) return(list(0L))
ans = .Call(Cuniqlist, l, as.integer(order))
ans
uniqlist = function (l, order = -1L) {
  # Internal, backward-compatible wrapper around funiq() that keeps the old uniqlist() calling convention.
  #   l     : list (or data.table) of columns
  #   order : ordering of the rows of 'l'; the legacy default -1L means "no order supplied"
  # Returns the result of funiq(l, order, safe=FALSE); a zero-length list returns list(0L) as before.
  #
  # used in "[.data.table" when doing groupby (!byjoin) to find the groups using byval
  #   (length(byval) && length(byval[[1L]])) && (bysameorder || byindex)
  # and in duplicated.data.table when
  #   haskey(x) && length(by) <= length(key(x)) && all(head(key(x), length(by)) == by)

  # those are only for backward compatibility, probably not really used anywhere, will keep 1962.010 and 1962.011 happy
  if (!is.list(l)) stop("l not type list")
  if (!length(l)) return(list(0L))
  # the previous implementation passed as.integer(order) to C; keep coercing here because
  # funiq() rejects a non-integer 'order' and a double -1 would otherwise miss the check below
  if (!is.integer(order)) order = as.integer(order)
  # this is for compatibility to new uniq C code: the legacy -1L marker becomes a zero-length 'order'
  if (identical(order, -1L)) order = integer()

  # safe=FALSE skips the range check of 'order'; callers are trusted to pass values in 1:nrow
  funiq(l, order, safe=FALSE)
}

uniq = function(x, order=integer()) {
  # Exported, checked entry point: finds the consecutively unique rows of 'x'.
  #   x     : a list (data.table type object)
  #   order : optional integer vector; when non-empty it must have one element per row of 'x'.
  #           A non-integer numeric vector is coerced with as.integer().
  # Returns the result of funiq(x, order, safe=TRUE).
  # Stops when 'x' is not a list, when 'order' is neither integer nor numeric,
  # or when a non-empty 'order' differs in length from nrow2(x).
  if (!is.list(x))
    stop("'x' must be a data.table type object")
  if (!is.integer(order)) {
    # anything that is not numeric cannot be turned into an ordering vector
    if (!is.numeric(order))
      stop("'order' must be an integer")
    order = as.integer(order)
  }
  # a zero-length 'order' means "no order supplied", so the length check is skipped for it
  if (length(order) && length(order)!=nrow2(x))
    stop("'order' must be same length as nrow of 'x'")
  # safe=TRUE because 'order' comes from the user and is not trusted here
  funiq(x, order, safe=TRUE)
}

# Internal, thin wrapper around the C routine 'Cuniq'.
# Pass safe=FALSE only when every value of 'order' is known to be in 1:nrow(x);
# otherwise it segfaults, which is why this function is kept internal.
funiq = function(x, order=integer(), safe=FALSE) {
  ans = .Call(Cuniq, x, order, safe)
  ans
}

# implemented for returning the lengths of groups obtained from uniqlist (for internal use only)
Expand All @@ -21,4 +44,3 @@ uniqlengths = function(x, len) {
ans = .Call(Cuniqlengths, as.integer(x), as.integer(len))
ans
}

77 changes: 77 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
test = data.table:::test
uniqlengths = data.table:::uniqlengths
uniqlist = data.table:::uniqlist
funiq = data.table:::funiq
nrow2 = data.table:::nrow2
which_ = data.table:::which_
which.first = data.table:::which.first
which.last = data.table:::which.last
Expand Down Expand Up @@ -3885,6 +3887,79 @@ if (.Machine$sizeof.longdouble == 16) {
test(1149.1, forderv(integer(0)), integer(0))
test(1149.2, forderv(numeric(0)), integer(0))

# test uniq (uniqlist) #900 ## test number 1150 looks to be unused so taking over
# 1150.01-1150.09: same calls as the examples in man/uniq.Rd
test(1150.01, uniq(data.table()), integer()) # examples
test(1150.02, uniq(data.table(x=integer())), integer())
test(1150.03, uniq(data.table(x=integer(), y=integer())), integer())
test(1150.04, uniq(data.table(x=1L)), 1L)
test(1150.05, uniq(data.table(x=1L, y=1L)), 1L)
test(1150.06, uniq(data.table(x=1:2)), 1:2)
test(1150.07, uniq(data.table(x=1:2, y=1:2)), 1:2)
test(1150.08, uniq(data.table(x=1:2)[c(1L,1:2)]), c(1L,3L))
test(1150.09, uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)]), c(1L,3L))
x = data.table(id = 1:8, v = rep(1:2, each=4)) # 'order' argument example
test(1150.11, uniq(x[,"v"]), c(1L,5L))
x = x[c(1:2,7:8,3:4,5:6)]
test(1150.12, uniq(x[,"v"]), c(1L,3L,5L,7L))
o = order(x$id)
test(1150.13, uniq(x[,"v"], order=o), c(1L,5L))
# 1150.21-1150.30: argument validation; 'x' and 'o' are rebuilt here and reused by every test down to 1150.40
# the funiq() errors in 1150.22, 1150.24 and 1150.27 are expected to match 'internal.*'
x = data.table(id = 1:8, v = rep(1:2, each=4), w=1L)
o = order(x$id)
test(1150.21, uniq(1:5), error="must be a data.table type object")
test(1150.22, funiq(1:5), error="internal.*must be a data.table type object")
test(1150.23, uniq(x[,"v"], order=as.numeric(o)), c(1L,5L))
test(1150.24, funiq(x[,"v"], order=as.numeric(o)), error="internal.*must be an integer")
test(1150.25, uniq(x[,"v"], order="a"), error="must be an integer")
test(1150.26, uniq(x[,"v"], order=o[-1L]), error="must be same length as nrow")
test(1150.27, funiq(x[,"v"], order=o[-1L]), error="internal.*has been passed length.*nrow")
test(1150.28, uniq(list(b = as.raw(1:5))), error="not supported")
test(1150.29, uniq(list(a = 1:2, b = as.raw(1:5))), error="not supported")
test(1150.30, funiq(x[,"v"], safe=NA), error="must be TRUE or FALSE")
# 1150.31-1150.38: NA or out-of-range values in 'order' must raise 'must be in range', via uniq() and via funiq(safe=TRUE)
test(1150.31, uniq(x[,"v"], order=c(o[1:6],o[c(NA,7L)])), error="must be in range")
test(1150.32, uniq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L])), error="must be in range")
test(1150.33, uniq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L])), error="must be in range")
test(1150.34, uniq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)])), error="must be in range")
test(1150.35, funiq(x[,"v"], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range") ## trying safe=F would segfault!
test(1150.36, funiq(x[,"v"], order=c(o[1:6],o[7L]+10L,o[8L]), safe=TRUE), error="must be in range")
test(1150.37, funiq(x[,"v"], order=c(o[1:6],o[7L]-10L,o[8L]), safe=TRUE), error="must be in range")
test(1150.38, funiq(x[,c("v","w")], order=c(o[1:6],o[c(NA,7L)]), safe=TRUE), error="must be in range")
test(1150.39, uniq(x[,"v"], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L)) ## duplicates in 'order' undefined behavior, see note in ?uniq, seems to behave like: uniq(x[c(o[1:6],o[c(2L,7L)]),"v"])
test(1150.40, uniq(x[,c("v","w")], order=c(o[1:6],o[c(2L,7L)])), c(1L,5L,7L,8L))
test(1150.41, uniq(data.table(x = c("a","a","b","b","c")), order=1:5), c(1L,3L,5L)) ## test coverage
# double columns are exercised under setNumericRounding(0) and setNumericRounding(2); the previous setting is saved in 'old' and restored below
old = getNumericRounding()
setNumericRounding(0)
test(1150.42, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L))
setNumericRounding(2)
test(1150.43, uniq(data.table(x = c(1,1,2,2,3)), order=1:5), c(1L,3L,5L))
if (test_bit64) {
test(1150.44, uniq(data.table(x = as.integer64(c(1,1,2,2,3))), order=1:5), c(1L,3L,5L))
}
setNumericRounding(old)
x = data.table(id = 1:8, v = rep(1:2, each=4)) # changed behavior of 'order' special case
o = order(x$id)
# 1150.5x: in uniq() order=-1L is treated as an ordinary (invalid) order vector
test(1150.51, uniq(x[,"v"], order=o), c(1L,5L))
test(1150.52, uniq(x[,"v"], order=integer()), c(1L,5L))
test(1150.53, uniq(x[,"v"], order=-1L), error="must be same length as nrow")
test(1150.54, uniq(x[1L,"v"], order=-1L), error="must be in range")
# 1150.6x: the same inputs through the internal funiq()
test(1150.61, funiq(x[,"v"], order=o), c(1L,5L))
test(1150.62, funiq(x[,"v"], order=integer()), c(1L,5L))
test(1150.63, funiq(x[,"v"], order=-1L), error="has been passed length.*nrow")
test(1150.64, funiq(x[1L,"v"], order=-1L), error="must be in range")
# 1150.7x: uniqlist() still accepts the legacy order=-1L and gives the same result as order=integer()
test(1150.71, uniqlist(x[,"v"], order=o), c(1L,5L))
test(1150.72, uniqlist(x[,"v"], order=integer()), c(1L,5L))
test(1150.73, uniqlist(x[,"v"], order=-1L), c(1L,5L))
test(1150.74, uniqlist(x[1L,"v"], order=-1L), 1L)
# 1150.9x: with datatable.verbose=TRUE the output is expected to contain 'took'; options are restored afterwards
op = options(datatable.verbose=TRUE)
test(1150.91, uniq(data.table()), integer(), output="took")
test(1150.92, uniq(data.table(x=integer())), integer(), output="took")
test(1150.93, uniq(data.table(x=1L)), 1L, output="took")
test(1150.94, uniq(data.table(x=1:2)), 1:2, output="took")
options(op)
test(1150.96, nrow2(list()), 0L) # nrow2 helper tests
test(1150.97, nrow2(list(x=1:10, b=1:5)), 10L)
test(1150.98, nrow2(data.table(x=1:10, b=1:5)), 10L)
test(1150.99, nrow2(1:5), error="nrow2 expects data.table or list")

# test uniqlengths
set.seed(45)
x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
Expand All @@ -3893,6 +3968,7 @@ o1 <- uniqlist(list(x), ox)
test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
o1 <- uniqlist(list(x))
test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
test(1151.3, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(as.numeric(o1), as.numeric(length(x))))
rm(list=c("x","ox","o1"))
gc()

Expand Down Expand Up @@ -6729,6 +6805,7 @@ test(1475.13, uniqueN(NA), 1L)
test(1475.14, uniqueN(NA, na.rm=TRUE), 0L)
test(1475.15, uniqueN(logical()), 0L)
test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L)
test(1475.17, uniqueN(TRUE, na.rm=NA), error="must be TRUE or FALSE")

# preserve class attribute in GForce mean (and sum)
DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day")))
Expand Down
50 changes: 50 additions & 0 deletions man/uniq.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
\name{uniq}
\alias{uniq}
\alias{uniqlist}
\alias{funiq}
\title{ Consecutively unique rows }
\description{
Finds the consecutively unique rows.
}
\usage{
uniq(x, order=integer())
}
\arguments{
\item{x}{ data.table type object. }
\item{order}{ integer vector order of \code{x}, must not contain duplicates. }
}
\details{
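Works like UNIX \emph{uniq}, as referred to by \code{\link[base]{unique}}; i.e., it drops immediately repeated rows but does not drop duplicates of earlier, non-adjacent rows. When \code{order} is provided, rows are compared consecutively in that order, so if \code{order} sorts the rows of \code{x}, duplicates of any previous row are dropped as well.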
}
\note{
Behavior is undefined when the \code{order} argument contains duplicates. The argument is designed to take the value returned by the \code{\link[base]{order}} function. Duplicates are not checked for, but the values are still checked to be non-NA and within \code{1:nrow(x)}, to avoid a \emph{segfault}.
}
\value{
Integer vector corresponding to rows which are consecutively unique.
}
\seealso{ \code{\link{data.table}}, \code{\link{rleid}} }
\examples{
uniq(data.table())
uniq(data.table(x=integer()))
uniq(data.table(x=integer(), y=integer()))
uniq(data.table(x=1L))
uniq(data.table(x=1L, y=1L))
uniq(data.table(x=1:2))
uniq(data.table(x=1:2, y=1:2))
uniq(data.table(x=1:2)[c(1L,1:2)])
uniq(data.table(x=1:2, y=1:2)[c(1L,1:2)])

# 'order' argument
x = data.table(id = 1:8, v = rep(1:2, each=4))
uniq(x[,"v"])
x = x[c(1:2,7:8,3:4,5:6)]
uniq(x[,"v"])

o = order(x$id)
uniq(x[,"v"], order=o)
# or if we are not sure if 'o' has no duplicates
if (!anyDuplicated(o)) {
uniq(x[,"v"], order=o)
}
}
\keyword{ data }
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ SEXP int_vec_init(R_len_t n, int val);
SEXP vecseq(SEXP x, SEXP len, SEXP clamp);

// uniqlist.c
SEXP uniqlist(SEXP l, SEXP order);
SEXP uniq(SEXP x, SEXP order, SEXP safe);
SEXP uniqlengths(SEXP x, SEXP n);

// chmatch.c
Expand Down
2 changes: 1 addition & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ R_CallMethodDef callMethods[] = {
{"CexpandAltRep", (DL_FUNC) &expandAltRep, -1},
{"Cfmelt", (DL_FUNC) &fmelt, -1},
{"Cfcast", (DL_FUNC) &fcast, -1},
{"Cuniqlist", (DL_FUNC) &uniqlist, -1},
{"Cuniq", (DL_FUNC) &uniq, -1},
{"Cuniqlengths", (DL_FUNC) &uniqlengths, -1},
{"Cforder", (DL_FUNC) &forder, -1},
{"Cfsorted", (DL_FUNC) &fsorted, -1},
Expand Down
Loading