From 6c1548152b0aa89eee9cfec3651fe7943813e9a1 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 5 Apr 2025 13:45:34 +0300 Subject: [PATCH 1/2] groupingsets(): introduce and make use of 'enclos' When forwarding NSE arguments via jj=, also forward the environment containing the symbols they may be referencing. Now groupingsets() substitutes all the local arguments into the call and then evaluates x[, jj, ...] in the specified environment, making it possible for jj to refer to local symbols without confusing them to variables belonging to groupingsets() or its caller. To avoid cedta() problems, set .datatable.aware = TRUE in the environment where the call is evaluated. (A column named .datatable.aware would be shadowed. So it goes.) --- R/groupingsets.R | 17 ++++++++++++----- inst/tests/tests.Rraw | 24 ++++++++++++++++++++++++ man/groupingsets.Rd | 3 ++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 63e94d1b62..8a2eddcde9 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -13,7 +13,7 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { sets = lapply(length(by):0L, function(i) by[0L:i]) # redirect to workhorse function jj = substitute(j) - groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label) + groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame()) } cube = function(x, ...) { @@ -35,13 +35,13 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { sets = lapply((2L^n):1L, function(jj) by[keepBool[jj, ]]) # redirect to workhorse function jj = substitute(j) - groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label) + groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame()) } groupingsets = function(x, ...) { UseMethod("groupingsets") } -groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, ...) { +groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), ...) { # input data type basic validation if (!is.data.table(x)) stopf("Argument 'x' must be a data.table object") @@ -112,7 +112,10 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL if (length(names(by))) by = unname(by) # 0 rows template data.table to keep colorder and type - empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] + # inline all arguments that might clash with enclosing environment + pcall = substitute(x[0L, jj, by], list(x = x, jj = jj, by = by)) + if (length(.SDcols)) pcall$.SDcols = .SDcols + empty = eval(pcall, list(.datatable.aware = TRUE), enclos) if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field stopf("When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'.") if (anyDuplicated(names(empty)) > 0L) @@ -150,8 +153,12 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe stopf("Using integer64 class columns require to have 'bit64' package installed.") # nocov int64.by.cols = intersect(int64.cols, by) # aggregate function called for each grouping set + # inline all arguments that might clash with enclosing environment + pcall = substitute(x[, jj], list(x = x, jj = jj)) + if (length(.SDcols)) pcall$.SDcols = .SDcols aggregate.set = function(by.set) { - r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set] + pcall$by = by.set + r = eval(pcall, list(.datatable.aware = TRUE), enclos) if (id) { # integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE # 3267: strtoi("", base = 2L) output apparently unstable across platforms diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ea964f2047..ed0a71b3ff 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21129,3 +21129,27 @@ test(2311.2, nlevels(DT$V1), 2L) # used to be 3 # avoid translateChar*() in OpenMP threads, #6883 DF = list(rep(iconv("\uf8", from = "UTF-8", to = "latin1"), 2e6)) test(2312, fwrite(DF, nullfile(), encoding = "UTF-8", nThread = 2L), NULL) + +# 'sets' is a local variable in groupingsets(), cube(), rollup() and shouldn't leak into the 'j' expression +n = 24L +set.seed(25) +DT = data.table( + color = sample(c("green","yellow","red"), n, TRUE), + year = as.Date(sample(paste0(2011:2015,"-01-01"), n, TRUE)), + status = as.factor(sample(c("removed","active","inactive","archived"), n, TRUE)), + amount = sample(1:5, n, TRUE), + value = sample(c(3, 3.5, 2.5, 2), n, TRUE) +) +sets = 0 +test(2313.0, + groupingsets(DT, j = c(list(count=.N + ..sets)), by = c("color","year","status"), sets = list("color", c("year","status"), character()), id=TRUE), + groupingsets(DT, j = c(list(count=.N + 0)), by = c("color","year","status"), sets = list("color", c("year","status"), character()), id=TRUE) +) +test(2313.1, + cube(DT, j = sum(value) + ..sets, by = c("color","year","status"), id=TRUE), + cube(DT, j = sum(value), by = c("color","year","status"), id=TRUE) +) +test(2313.2, + rollup(DT, j = sum(value) + ..sets, by=c("color","year","status"), label="total"), + rollup(DT, j = sum(value), by=c("color","year","status"), label="total") +) diff --git a/man/groupingsets.Rd b/man/groupingsets.Rd index 5d8a309548..7d87175dec 100644 --- a/man/groupingsets.Rd +++ b/man/groupingsets.Rd @@ -15,7 +15,7 @@ rollup(x, \dots) cube(x, \dots) \method{cube}{data.table}(x, j, by, .SDcols, id = FALSE, label = NULL, \dots) groupingsets(x, \dots) -\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, \dots) +\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), \dots) } \arguments{ \item{x}{\code{data.table}.} @@ -27,6 +27,7 @@ groupingsets(x, \dots) \item{id}{logical default \code{FALSE}. If \code{TRUE} it will add leading column with bit mask of grouping sets.} \item{jj}{quoted version of \code{j} argument, for convenience. When provided function will ignore \code{j} argument.} \item{label}{label(s) to be used in the 'total' rows in the grouping variable columns of the output, that is, in rows where the grouping variable has been aggregated. Can be a named list of scalars, or a scalar, or \code{NULL}. Defaults to \code{NULL}, which results in the grouping variables having \code{NA} in their 'total' rows. See Details.} + \item{enclos}{the environment containing the symbols referenced by \code{jj}. When writing functions that accept a \code{j} environment for non-standard evaluation by \pkg{data.table}, \code{\link[base]{substitute}()} it and forward it to \code{groupingsets} using the \code{jj} argument, set this to the \code{\link[base]{parent.frame}()} of the function that captures \code{j}.} } \details{ All three functions \code{rollup, cube, groupingsets} are generic methods, \code{data.table} methods are provided. From dc0d366cdc5993bbd913b9a0863ad449cd02f286 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Tue, 1 Jul 2025 13:59:20 +0300 Subject: [PATCH 2/2] NEWS item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5a6c73d482..901dfc6c7a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ 2. `melt()` now supports using `patterns()` with `id.vars`, [#6867](https://github.com/Rdatatable/data.table/issues/6867). Thanks to Toby Dylan Hocking for the suggestion and PR. +3. `groupingsets()` gets a new argument `enclos` for use together with the `jj` argument in functions wrapping `groupingsets()`, including the existing wrappers `rollup()` and `cube()`. When forwarding a `j`-expression as `groupingsets(jj = substitute(j))`, make sure to pass `enclos = parent.frame()` as well, so that the `j`-expression will be evaluated in the right context. This makes it possible for `j` to refer to variables outside the `data.table`. + ## BUG FIXES 1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.