From d548a724a2b2407e611258de035e0ff23a62a8cd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 17 Nov 2020 00:53:48 -0500 Subject: [PATCH] extend g[ to work for tail queries like x[.N] --- NEWS.md | 2 + R/data.table.R | 39 ++++++++++-- inst/tests/tests.Rraw | 27 ++++++++ src/gsumm.c | 145 ++++++++++++++++++++++++++++++------------ 4 files changed, 166 insertions(+), 47 deletions(-) diff --git a/NEWS.md b/NEWS.md index 94e1fee81c..06c2e0bbc3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## NEW FEATURES +1. Group-wise queries like `x[.N]` or `.SD[.N-1L]` (i.e. an offset of `k>=0` from the end of the group at index `.N`) are now optimized, [#4809](https://github.com/Rdatatable/data.table/issues/4809) and part of [#735](https://github.com/Rdatatable/data.table/issues/4809). Previously only fixed positive offsets like `x[1L]` or `.SD[2L]` were optimized. Thanks to @matthewgson for the report. + ## BUG FIXES 1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. diff --git a/R/data.table.R b/R/data.table.R index d513891b93..5448fabea3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1667,10 +1667,25 @@ replace_dot_alias = function(e) { if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE) # ... head-tail uses default value n=6 which as of now should not go gforce ^^ - # otherwise there must be three arguments, and only in two cases: - # 1) head/tail(x, 1) or 2) x[n], n>0 - length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) + # otherwise there must be three arguments, and only in these cases: + # 1) head/tail(x, 1); 2) x[n], n>0; 3) x[.N-k], k>=0; 4) x[[n]], n>0, is.atomic(x) + if (length(q) != 3L) return(FALSE) + q3 = q[[3L]] + switch( + as.character(q[[1L]]), + head = , tail = { + length(q3) == 1L && is.numeric(q3) && q3 == 1L + }, + "[" = { + if (length(q3) == 1L) (is.numeric(q3) && q3 > 0L) || (is.name(q3) && q3 == ".N") + else if (length(q3) == 3L) q3 %iscall% "-" && is.name(q3[[2L]]) && q3[[2L]] == ".N" && is.numeric(q3[[3L]]) && q3[[3L]] >= 0L + else FALSE + }, + "[[" = { + length(q3) == 1L && is.numeric(q3) && q3 > 0L && eval(call('is.atomic', q[[2L]]), envir=x) + }, + FALSE + ) } if (jsub[[1L]]=="list") { GForce = TRUE @@ -1681,11 +1696,21 @@ replace_dot_alias = function(e) { if (GForce) { if (jsub[[1L]]=="list") for (ii in seq_along(jsub)[-1L]) { - if (dotN(jsub[[ii]])) next; # For #334 - jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) + ji <- jsub[[ii]] + if (dotN(ji)) next; # For #334 + jsub[[ii]][[1L]] = as.name(paste0("g", ji[[1L]])) + if (ji[[1L]] == "[" && (ji[[3L]] == ".N" || (length(ji[[3L]]) == 3L && ji[[3L]][[2L]] == ".N"))) { + # `[`(x, .N-k) becomes `g[`(x, k, fromLast=TRUE) + jsub[[ii]][[3L]] = if (ji[[3L]] == ".N") 0L else ji[[3L]][[3L]] + jsub[[ii]][[4L]] = TRUE + } if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { + if (jsub[[1L]] == "[" && (jsub[[3L]] == ".N" || (length(jsub[[3L]]) == 3L && jsub[[3L]][[2L]] == ".N"))) { + jsub[[3L]] = if (jsub[[3L]] == ".N") 0L else jsub[[3L]][[3L]] + jsub[[4L]] = TRUE + } jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } @@ -2850,7 +2875,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { # (3) define the gfun = function() R wrapper gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", "median", "min", "max", "var", "sd", ".N") # added .N for #334 -`g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. +`g[` = `g[[` = function(x, n, fromLast=FALSE) .Call(Cgnthvalue, x, as.integer(n), as.logical(fromLast)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment gfirst = function(x) .Call(Cgfirst, x) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4dd2809f7d..aa447d70d2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8114,6 +8114,33 @@ fun = function (DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] fun(DT) test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) +# #4809 g[ with fromLast=TRUE (i.e. on queries like x[.N-k]) +DT = data.table(id = c(1, 1, 2), value = c(FALSE, TRUE, FALSE)) +options(datatable.optimize=1L) +test(1581.20, ans1 <- DT[ , value[.N], by=id, verbose=TRUE], output="GForce FALSE") +test(1581.21, ans2 <- DT[ , value[.N-1L], by=id, verbose=TRUE], output="GForce FALSE") +options(datatable.optimize=Inf) +test(1581.22, ans3 <- DT[ , value[.N], by=id, verbose=TRUE], output="GForce TRUE") +test(1581.23, ans4 <- DT[ , value[.N-1L], by=id, verbose=TRUE], output="GForce TRUE") +test(1581.24, ans1, ans3) +#test(1581.25, ans2, ans4) + +idx = c(2L, 3L) +DT[ , value := 0:2] +test(1581.26, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)]) + +DT[ , value := value + .1] +test(1581.27, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)]) + +DT[ , value := value + 1i] +test(1581.28, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)]) + +DT[ , value := letters[1:3]] +test(1581.29, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)]) + +DT[ , value := .(list(1, 1:2, 1:3))] +test(1581.30, DT[ , value[.N], by=id], DT[idx, .(id, V1 = value)]) + # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) diff --git a/src/gsumm.c b/src/gsumm.c index 372ae59440..adc66de649 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1167,24 +1167,39 @@ SEXP ghead(SEXP x, SEXP valArg) { return (gfirst(x)); } -SEXP gnthvalue(SEXP x, SEXP valArg) { +SEXP gnthvalue(SEXP x, SEXP valArg, SEXP fromLastArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<=0) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov - R_len_t i,k, val=INTEGER(valArg)[0]; + if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<0 || (!LOGICAL(fromLastArg)[0] && INTEGER(valArg)[0] == 0)) { + error(_("Internal error, `g[` (gnthvalue) is only implemented single-value subsets with positive index, e.g., .SD[2], or single-value tail subsets with non-negative index, e.g. .SD[.N-2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov + } + R_len_t i,k; + const R_len_t val = INTEGER(valArg)[0]; + const Rboolean fromLast = LOGICAL(fromLastArg)[0]; int n = (irowslen == -1) ? length(x) : irowslen; SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "ghead"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gnthvalue"); switch(TYPEOF(x)) { case LGLSXP: { const int *ix = LOGICAL(x); ans = PROTECT(allocVector(LGLSXP, ngrp)); int *ians = LOGICAL(ans); - for (i=0; i grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; } - k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - ians[i] = ix[k]; + if (fromLast) { + for (i=0; i grpsize[i]-1) { LOGICAL(ans)[i] = NA_LOGICAL; continue; } + k = ff[i]+grpsize[i]-val-2; // = (ff[i]-1) + (grpsize[i]-1) - val + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[i] = ix[k]; + } + } else { + for (i=0; i grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; } + Rprintf("i=%d, ff[i]=%d\n", i, ff[i]); + k = ff[i]+val-2; // = (ff[i]-1) + (val-1) + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[i] = ix[k]; + } } } break; @@ -1192,12 +1207,22 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { const int *ix = INTEGER(x); ans = PROTECT(allocVector(INTSXP, ngrp)); int *ians = INTEGER(ans); - for (i=0; i grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; } - k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - ians[i] = ix[k]; + if (fromLast) { + for (i=0; i grpsize[i]-1) { INTEGER(ans)[i] = NA_INTEGER; continue; } + k = ff[i]+grpsize[i]-val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[i] = ix[k]; + } + } else { + for (i=0; i grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; } + k = ff[i]+val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[i] = ix[k]; + } } } break; @@ -1205,12 +1230,22 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { const double *dx = REAL(x); ans = PROTECT(allocVector(REALSXP, ngrp)); double *dans = REAL(ans); - for (i=0; i grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; } - k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - dans[i] = dx[k]; + if (fromLast) { + for (i=0; i grpsize[i]-1) { REAL(ans)[i] = NA_REAL; continue; } + k = ff[i]+grpsize[i]-val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[i] = dx[k]; + } + } else { + for (i=0; i grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; } + k = ff[i]+val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[i] = dx[k]; + } } } break; @@ -1218,32 +1253,62 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { const Rcomplex *dx = COMPLEX(x); ans = PROTECT(allocVector(CPLXSXP, ngrp)); Rcomplex *dans = COMPLEX(ans); - for (i=0; i grpsize[i]) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; } - k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - dans[i] = dx[k]; + if (fromLast) { + for (i=0; i grpsize[i]-1) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; } + k = ff[i]+grpsize[i]-val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[i] = dx[k]; + } + } else { + for (i=0; i grpsize[i]) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; } + k = ff[i]+val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[i] = dx[k]; + } } } break; case STRSXP: ans = PROTECT(allocVector(STRSXP, ngrp)); - for (i=0; i grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } - k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - SET_STRING_ELT(ans, i, STRING_ELT(x, k)); + if (fromLast) { + for (i=0; i grpsize[i]-1) { SET_STRING_ELT(ans, i, NA_STRING); continue; } + k = ff[i]+grpsize[i]-val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_STRING_ELT(ans, i, STRING_ELT(x, k)); + } + } else { + for (i=0; i grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } + k = ff[i]+val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_STRING_ELT(ans, i, STRING_ELT(x, k)); + } } break; case VECSXP: ans = PROTECT(allocVector(VECSXP, ngrp)); - for (i=0; i grpsize[i]) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; } - k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k)); + if (fromLast) { + for (i=0; i grpsize[i]-1) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; } + k = ff[i]+grpsize[i]-val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k)); + } + } else { + for (i=0; i grpsize[i]) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; } + k = ff[i]+val-2; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k)); + } } break; default: