Rdatatable · mattdowle · May 9, 2021 · Sep 26, 2020 · Sep 26, 2020 · Sep 26, 2020
@@ -1,3 +1,4 @@
+.dir-locals.el
 ^\.Rprofile$
 ^data\.table_.*\.tar\.gz$
 ^vignettes/plots/figures$

@@ -56,6 +56,8 @@
 
 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing.
 
+9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing.
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.

@@ -948,7 +948,7 @@ replace_dot_alias = function(e) {
           } else {
             if (colsub %iscall% 'patterns') {
               # each pattern gives a new filter condition, intersect the end result
-              .SDcols = Reduce(intersect, do_patterns(colsub, names_x))
+              .SDcols = Reduce(intersect, eval_with_cols(colsub, names_x))
             } else {
               .SDcols = eval(colsub, parent.frame(), parent.frame())
               # allow filtering via function in .SDcols, #3950

@@ -3,7 +3,7 @@
 #   reshape2 package is deprecated since December 2017, so we'll deprecate our
 #   redirection as well
 
-melt <- function(data, ..., na.rm = FALSE, value.name = "value") {
+melt = function(data, ..., na.rm = FALSE, value.name = "value") {
   if (is.data.table(data)) {
     UseMethod("melt", data)
     # if data is not data.table and reshape2 is installed, this won't dispatch to reshape2's method;
@@ -22,10 +22,144 @@ melt <- function(data, ..., na.rm = FALSE, value.name = "value") {
 patterns = function(..., cols=character(0L)) {
   # if ... has no names, names(list(...)) will be "";
   #   this assures they'll be NULL instead
-  p = unlist(list(...), use.names = any(nzchar(names(...))))
+  L = list(...)
+  p = unlist(L, use.names = any(nzchar(names(L))))
   if (!is.character(p))
     stop("Input patterns must be of type character.")
-  lapply(p, grep, cols)
+  matched = lapply(p, grep, cols)
+  # replace with lengths when R 3.2.0 dependency arrives
+  if (length(idx <- which(sapply(matched, length) == 0L)))
+    stop('Pattern', if (length(idx) > 1L) 's', ' not found: [',
+         paste(p[idx], collapse = ', '), ']')
+  matched
+}
+
+measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") {
+  # 1. basic error checking.
+  if (!missing(sep) && !missing(pattern)) {
+    stop("both sep and pattern arguments used in measure; must use either sep or pattern (not both)")
+  }
+  if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) {
+    stop("multiple.keyword must be a character string with nchar>0")
+  }
+  if (!is.character(cols)) {
+    stop("cols must be a character vector of column names")
+  }
+  # 2. compute conversion function list with group names.
+  mcall = match.call()
+  L = as.list(mcall)[-1]
+  formal.names <- names(formals())
+  fun.list = L[-which(names(L) %in% formal.names)]
+  user.named = names(fun.list) != ""
+  is.symb = sapply(fun.list, is.symbol)
+  bad.i = which((!user.named) & (!is.symb))
+  if (length(bad.i)) {
+    stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=","))
+  }
+  names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste)
+  # group names error checking.
+  group.is.formal <- names(fun.list) %in% formal.names
+  if (any(group.is.formal)) {
+    bad.names <- names(fun.list)[group.is.formal]
+    stop("group names specified in ... conflict with measure argument names; please fix by changing group names: ", paste(bad.names, collapse=","))
+  }
+  err.names.unique <- function(err.what, name.vec) {
+    name.tab = table(name.vec)
+    bad.counts = name.tab[1 < name.tab]
+    if (length(bad.counts)) {
+      stop(err.what, " names should be unique, problems: ", paste(names(bad.counts), collapse=","))
+    }
+  }
+  err.args.groups <- function(type, N){
+    if (N != length(fun.list)) {
+      stop("number of ... arguments to measure =", length(fun.list), " must be same as ", type, " =", N)
+    }
+  }
+  err.names.unique("measure group", names(fun.list))
+  # 3. compute initial group data table, used as variable_table attribute.
+  group.mat = if (!missing(pattern)) {
+    if (!is.character(pattern)) {
+      stop("pattern must be character string")
+    }
+    match.vec = regexpr(pattern, cols, perl=TRUE)
+    measure.vec = which(0 < match.vec)
+    if (length(measure.vec) == 0L) {
+      stop("pattern did not match any cols, so nothing would be melted; fix by changing pattern")
+    }
+    start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE]
+    if (is.null(start)) {
+      stop("pattern must contain at least one capture group (parenthesized sub-pattern)")
+    }
+    err.args.groups("number of capture groups in pattern", ncol(start))
+    end = attr(match.vec, "capture.length")[measure.vec,]+start-1L
+    names.mat = matrix(cols[measure.vec], nrow(start), ncol(start))
+    substr(names.mat, start, end)
+  } else { #pattern not specified, so split using sep.
+    if (!is.character(sep)) {
+      stop("sep must be character string")
+    }
+    list.of.vectors = strsplit(cols, sep, fixed=TRUE)
+    vector.lengths = sapply(list.of.vectors, length)
+    n.groups = max(vector.lengths)
+    if (n.groups == 1) {
+      stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
+    }
+    err.args.groups("max number of items after splitting column names", n.groups)
+    measure.vec = which(vector.lengths==n.groups)
+    do.call(rbind, list.of.vectors[measure.vec])
+  }
+  err.names.unique("measured column", cols[measure.vec])
+  uniq.mat <- unique(group.mat)
+  if (nrow(uniq.mat) < nrow(group.mat)) {
+    stop("number of unique column IDs =", nrow(uniq.mat), " is less than number of melted columns =", nrow(group.mat), "; fix by changing pattern/sep")
+  }
+  colnames(group.mat) = names(fun.list)
+  group.dt = data.table(group.mat)
+  # 4. apply conversion functions to group data table.
+  for (group.i in which(user.named)) {
+    group.name = names(fun.list)[[group.i]]
+    fun = eval(fun.list[[group.name]], parent.frame(1L))
+    if (!is.function(fun) || length(formals(args(fun)))==0) {
+      stop("each ... argument to measure must be a function with at least one argument, problem: ", group.name)
+    }
+    group.val = fun(group.dt[[group.name]])
+    if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) {
+      stop("each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name)
+    }
+    if (all(is.na(group.val))) {
+      stop(group.name, " conversion function returned vector of all NA")
+    }
+    set(group.dt, j=group.name, value=group.val)
+  }
+  group.uniq <- unique(group.dt)
+  if (nrow(group.uniq) < nrow(group.dt)) {
+    stop("number of unique groups after applying type conversion functions less than number of groups, change type conversion")
+  }
+  # 5. compute measure.vars list or vector.
+  if (multiple.keyword %in% names(fun.list)) {# multiple output columns.
+    if (!is.character(group.dt[[multiple.keyword]])) {
+      stop(multiple.keyword, " column class=", class(group.dt[[multiple.keyword]])[[1L]], " after applying conversion function, but must be character")
+    }
+    is.other = names(group.dt) != multiple.keyword
+    if (!any(is.other)) {
+      stop(multiple.keyword, " is the only group; fix by creating at least one more group")
+    }
+    other.values = lapply(group.dt[, is.other, with=FALSE], unique)
+    other.values$stringsAsFactors = FALSE
+    other.dt = data.table(do.call(expand.grid, other.values))
+    measure.list = structure(list(), variable_table=other.dt)
+    column.values = unique(group.dt[[multiple.keyword]])
+    for(column.val in column.values){
+      select.dt = data.table(other.dt)
+      set(select.dt, j=multiple.keyword, value=column.val)
+      measure.list[[column.val]] = data.table(
+        measure.vec, group.dt
+      )[select.dt, measure.vec, on=names(select.dt)]
+    }
+    measure.list
+  } else {# single output column.
+    structure(measure.vec, variable_table=group.dt)
+  }
 }
 
 melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable",
@@ -35,8 +169,11 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl
   if (missing(id.vars)) id.vars=NULL
   if (missing(measure.vars)) measure.vars = NULL
   measure.sub = substitute(measure.vars)
-  if (measure.sub %iscall% "patterns") {
-    measure.vars = do_patterns(measure.sub, names(data))
+  if (is.call(measure.sub)) {
+    eval.result = eval_with_cols(measure.sub, names(data))
+    if (!is.null(eval.result)) {
+      measure.vars = eval.result
+    }
   }
   if (is.list(measure.vars) && length(measure.vars) > 1L) {
     meas.nm = names(measure.vars)

@@ -105,22 +105,28 @@ brackify = function(x, quote=FALSE) {
 }
 
 # patterns done via NSE in melt.data.table and .SDcols in `[.data.table`
-do_patterns = function(pat_sub, all_cols) {
-  # received as substitute(patterns(...))
-  pat_sub = as.list(pat_sub)[-1L]
-  # identify cols = argument if present
-  idx = which(names(pat_sub) == "cols")
-  if (length(idx)) {
-    cols = eval(pat_sub[["cols"]], parent.frame(2L))
-    pat_sub = pat_sub[-idx]
-  } else cols = all_cols
-  pats = lapply(pat_sub, eval, parent.frame(2L))
-  matched = patterns(pats, cols=cols)
-  # replace with lengths when R 3.2.0 dependency arrives
-  if (length(idx <- which(sapply(matched, length) == 0L)))
-    stop('Pattern', if (length(idx) > 1L) 's', ' not found: ', brackify(pats[idx]))
-
-  return(matched)
+# was called do_patterns() before PR#4731
+eval_with_cols = function(orig_call, all_cols) {
+  parent = parent.frame(2L)
+  fun_uneval = orig_call[[1L]]
+  # take fun from either calling env (parent) or from data.table
+  fun = tryCatch({
+    maybe_fun = eval(fun_uneval, parent)
+    # parent env could have a non-function with this name, which we
+    # should ignore.
+    stopifnot(is.function(maybe_fun))
+    maybe_fun
+  }, error=function(e) {
+    eval(fun_uneval)#take function from data.table namespace.
+  })
+  if (!is.primitive(fun)) {
+    named_call = match.call(fun, orig_call)
+    if ("cols" %in% names(formals(fun)) && !"cols" %in% names(named_call)) {
+      named_call[["cols"]] = all_cols
+    }
+    named_call[[1L]] = fun
+    eval(named_call, parent)
+  }
 }
 
 # check UTC status