Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

## NEW FEATURES

1. Group-wise queries like `x[.N]` or `.SD[.N-1L]` (i.e. an offset of `k>=0` from the end of the group at index `.N`) are now optimized, [#4809](https://github.com/Rdatatable/data.table/issues/4809) and part of [#735](https://github.com/Rdatatable/data.table/issues/4809). Previously only fixed positive offsets like `x[1L]` or `.SD[2L]` were optimized. Thanks to @matthewgson for the report.

## BUG FIXES

1. `as.matrix(<empty DT>)` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting.
Expand Down
39 changes: 32 additions & 7 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1667,10 +1667,25 @@ replace_dot_alias = function(e) {
if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875
if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE)
# ... head-tail uses default value n=6 which as of now should not go gforce ^^
# otherwise there must be three arguments, and only in two cases:
# 1) head/tail(x, 1) or 2) x[n], n>0
length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) &&
( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) )
# otherwise there must be three arguments, and only in these cases:
# 1) head/tail(x, 1); 2) x[n], n>0; 3) x[.N-k], k>=0; 4) x[[n]], n>0, is.atomic(x)
if (length(q) != 3L) return(FALSE)
q3 = q[[3L]]
switch(
as.character(q[[1L]]),
head = , tail = {
length(q3) == 1L && is.numeric(q3) && q3 == 1L
},
"[" = {
if (length(q3) == 1L) (is.numeric(q3) && q3 > 0L) || (is.name(q3) && q3 == ".N")
else if (length(q3) == 3L) q3 %iscall% "-" && is.name(q3[[2L]]) && q3[[2L]] == ".N" && is.numeric(q3[[3L]]) && q3[[3L]] >= 0L
else FALSE
},
"[[" = {
length(q3) == 1L && is.numeric(q3) && q3 > 0L && eval(call('is.atomic', q[[2L]]), envir=x)
},
FALSE
)
}
if (jsub[[1L]]=="list") {
GForce = TRUE
Expand All @@ -1681,11 +1696,21 @@ replace_dot_alias = function(e) {
if (GForce) {
if (jsub[[1L]]=="list")
for (ii in seq_along(jsub)[-1L]) {
if (dotN(jsub[[ii]])) next; # For #334
jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]]))
ji <- jsub[[ii]]
if (dotN(ji)) next; # For #334
jsub[[ii]][[1L]] = as.name(paste0("g", ji[[1L]]))
if (ji[[1L]] == "[" && (ji[[3L]] == ".N" || (length(ji[[3L]]) == 3L && ji[[3L]][[2L]] == ".N"))) {
# `[`(x, .N-k) becomes `g[`(x, k, fromLast=TRUE)
jsub[[ii]][[3L]] = if (ji[[3L]] == ".N") 0L else ji[[3L]][[3L]]
jsub[[ii]][[4L]] = TRUE
}
if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4
}
else {
if (jsub[[1L]] == "[" && (jsub[[3L]] == ".N" || (length(jsub[[3L]]) == 3L && jsub[[3L]][[2L]] == ".N"))) {
jsub[[3L]] = if (jsub[[3L]] == ".N") 0L else jsub[[3L]][[3L]]
jsub[[4L]] = TRUE
}
jsub[[1L]] = as.name(paste0("g", jsub[[1L]]))
if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5
}
Expand Down Expand Up @@ -2850,7 +2875,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) {
# (3) define the gfun = function() R wrapper
gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod",
"median", "min", "max", "var", "sd", ".N") # added .N for #334
`g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here.
`g[` = `g[[` = function(x, n, fromLast=FALSE) .Call(Cgnthvalue, x, as.integer(n), as.logical(fromLast)) # n is of length=1 here.
ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment
gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment
gfirst = function(x) .Call(Cgfirst, x)
Expand Down
27 changes: 27 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8114,6 +8114,33 @@ fun = function (DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"]
fun(DT)
test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')])

# #4809 g[ with fromLast=TRUE (i.e. on queries like x[.N-k])
DT = data.table(id = c(1, 1, 2), value = c(FALSE, TRUE, FALSE))
options(datatable.optimize=1L)
test(1581.20, ans1 <- DT[ , value[.N], by=id, verbose=TRUE], output="GForce FALSE")
test(1581.21, ans2 <- DT[ , value[.N-1L], by=id, verbose=TRUE], output="GForce FALSE")
options(datatable.optimize=Inf)
test(1581.22, ans3 <- DT[ , value[.N], by=id, verbose=TRUE], output="GForce TRUE")
test(1581.23, ans4 <- DT[ , value[.N-1L], by=id, verbose=TRUE], output="GForce TRUE")
test(1581.24, ans1, ans3)
#test(1581.25, ans2, ans4)

idx = c(2L, 3L)
DT[ , value := 0:2]
test(1581.26, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)])

DT[ , value := value + .1]
test(1581.27, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)])

DT[ , value := value + 1i]
test(1581.28, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)])

DT[ , value := letters[1:3]]
test(1581.29, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)])

DT[ , value := .(list(1, 1:2, 1:3))]
test(1581.30, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)])

# handle NULL value correctly #1429
test(1582, uniqueN(NULL), 0L)

Expand Down
145 changes: 105 additions & 40 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1167,83 +1167,148 @@ SEXP ghead(SEXP x, SEXP valArg) {
return (gfirst(x));
}

SEXP gnthvalue(SEXP x, SEXP valArg) {
SEXP gnthvalue(SEXP x, SEXP valArg, SEXP fromLastArg) {

if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<=0) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov
R_len_t i,k, val=INTEGER(valArg)[0];
if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<0 || (!LOGICAL(fromLastArg)[0] && INTEGER(valArg)[0] == 0)) {
error(_("Internal error, `g[` (gnthvalue) is only implemented single-value subsets with positive index, e.g., .SD[2], or single-value tail subsets with non-negative index, e.g. .SD[.N-2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov
}
R_len_t i,k;
const R_len_t val = INTEGER(valArg)[0];
const Rboolean fromLast = LOGICAL(fromLastArg)[0];
int n = (irowslen == -1) ? length(x) : irowslen;
SEXP ans;
if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "ghead");
if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gnthvalue");
switch(TYPEOF(x)) {
case LGLSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
ians[i] = ix[k];
if (fromLast) {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]-1) { LOGICAL(ans)[i] = NA_LOGICAL; continue; }
k = ff[i]+grpsize[i]-val-2; // = (ff[i]-1) + (grpsize[i]-1) - val
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
ians[i] = ix[k];
}
} else {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; }
Rprintf("i=%d, ff[i]=%d\n", i, ff[i]);
k = ff[i]+val-2; // = (ff[i]-1) + (val-1)
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
ians[i] = ix[k];
}
}
}
break;
case INTSXP: {
const int *ix = INTEGER(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = INTEGER(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
ians[i] = ix[k];
if (fromLast) {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]-1) { INTEGER(ans)[i] = NA_INTEGER; continue; }
k = ff[i]+grpsize[i]-val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
ians[i] = ix[k];
}
} else {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
ians[i] = ix[k];
}
}
}
break;
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
dans[i] = dx[k];
if (fromLast) {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]-1) { REAL(ans)[i] = NA_REAL; continue; }
k = ff[i]+grpsize[i]-val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
dans[i] = dx[k];
}
} else {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
dans[i] = dx[k];
}
}
}
break;
case CPLXSXP: {
const Rcomplex *dx = COMPLEX(x);
ans = PROTECT(allocVector(CPLXSXP, ngrp));
Rcomplex *dans = COMPLEX(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
dans[i] = dx[k];
if (fromLast) {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]-1) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; }
k = ff[i]+grpsize[i]-val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
dans[i] = dx[k];
}
} else {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
dans[i] = dx[k];
}
}
} break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_STRING_ELT(ans, i, STRING_ELT(x, k));
if (fromLast) {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]-1) { SET_STRING_ELT(ans, i, NA_STRING); continue; }
k = ff[i]+grpsize[i]-val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_STRING_ELT(ans, i, STRING_ELT(x, k));
}
} else {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_STRING_ELT(ans, i, STRING_ELT(x, k));
}
}
break;
case VECSXP:
ans = PROTECT(allocVector(VECSXP, ngrp));
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k));
if (fromLast) {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]-1) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; }
k = ff[i]+grpsize[i]-val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k));
}
} else {
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k));
}
}
break;
default:
Expand Down