From cadfd099a547075b9ffb6503550e8ac120076b51 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Sep 2020 22:48:25 -0700 Subject: [PATCH 01/57] support missing values in measure.vars arg to melt --- inst/tests/tests.Rraw | 5 +++ src/fmelt.c | 93 ++++++++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 27 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 058c7db3b2..e36e3752f2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17138,3 +17138,8 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't chan test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) +# fix for #4027 via PR#TODO. +DT.wide = data.table(a2=2, b1=1, b2=2) +expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) +test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) +test(2154.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) diff --git a/src/fmelt.c b/src/fmelt.c index 22a4ac1fc5..6794a461cb 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -97,6 +97,21 @@ static const char *concat(SEXP vec, SEXP idx) { return ans; } +// input: character vector of column names (maybe missing), output: +// integer vector of column indices with NA_INTEGER in the positions +// with missing inputs. +SEXP chmatch_na(SEXP x, SEXP table){ + SEXP ans; + PROTECT(ans = chmatch(x, table, 0)); + for(int i=0; i ncol) + if (INTEGER(tmp)[i] != NA_INTEGER && (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)){ error(_("One or more values in 'measure.vars' is invalid.")); + } else if (!LOGICAL(booltmp)[i]) targetcols++; else continue; } @@ -260,18 +276,27 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } ans = PROTECT(allocVector(VECSXP, 2)); protecti++; SET_VECTOR_ELT(ans, 0, idcols); - SET_VECTOR_ELT(ans, 1, valuecols); + SET_VECTOR_ELT(ans, 1, valuecols);//List of integer vectors. UNPROTECT(protecti); return(ans); } struct processData { SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 - SEXP idcols, valuecols, naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively - int lids, lvalues, lmax, lmin, totlen, nrow; - int *isfactor, *leach, *isidentical; + SEXP idcols, + valuecols, // list with one element per output/value column, each + // element is an integer vector. + naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively + int *isfactor, + *leach, // length of each element of the valuecols(measure.vars) list. + *isidentical; // are all inputs for this value column the same type? + int lids, // number of id columns. + lvalues, // number of value columns. + lmax, //max length of valuecols elements / number of times to repeat ids. + totlen, // of output/long DT result of melt operation. + nrow; // of input/wide DT to be melted. SEXPTYPE *maxtype; - Rboolean narm; + Rboolean narm; // remove missing values? }; static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) { @@ -279,7 +304,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SEXP vars,tmp,thiscol; SEXPTYPE type; int i,j; - data->lmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); + data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); @@ -296,29 +321,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); - for (i=0; ilvalues; i++) { + // first find max type of each output column. + for (i=0; ilvalues; i++) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; - data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i]; - for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if (isFactor(thiscol)) { - data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; - data->maxtype[i] = STRSXP; - } else { - type = TYPEOF(thiscol); - if (type > data->maxtype[i]) data->maxtype[i] = type; + for (j=0; jleach[i]; j++) { // for each input column. + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if (isFactor(thiscol)) { + data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; + data->maxtype[i] = STRSXP; + } else { + type = TYPEOF(thiscol); + if (type > data->maxtype[i]) data->maxtype[i] = type; + } } } for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { - data->isidentical[i] = 0; - break; + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || + (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { + data->isidentical[i] = 0; + break; + } } } } @@ -427,18 +459,25 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax)); Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean)); SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues)); - for (int i=0; ilvalues; ++i) { + for (int i=0; ilvalues; ++i) {//for each output/value column. bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor; SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy SET_VECTOR_ELT(ansvals, i, target); UNPROTECT(1); // still protected by virtue of being member of protected ansval. - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids. int counter = 0; bool copyattr = false; - for (int j=0; jlmax; ++j) { + for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1) - : allocNAVector(data->maxtype[i], data->nrow); + // TODO use this line of code if NA specified. + SEXP thiscol; + int input_column_num = INTEGER(thisvaluecols)[j]; + if (j >= data->leach[i] || // fewer indices than the max were specified. + input_column_num == NA_INTEGER) { // NA was specified. + thiscol = allocNAVector(data->maxtype[i], data->nrow); + }else{ + thiscol = VECTOR_ELT(DT, input_column_num-1); + } if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = true; From 522cc789b6c26e22006b2baf36a4a84bd9db7788 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Sep 2020 23:12:40 -0700 Subject: [PATCH 02/57] PR4720 --- NEWS.md | 7 +++++++ inst/tests/tests.Rraw | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 308d427250..3e6668e135 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,13 @@ ## NEW FEATURES +1. `melt.data.table()` now supports `NA` entries when specifying a + list of `measure.vars`, which translate into runs of missing values + in the output. Fixes + [#4027](https://github.com/Rdatatable/data.table/issues/4027) via + [PR#4720](https://github.com/Rdatatable/data.table/pull/4720) from + @tdhock. + ## BUG FIXES 1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e36e3752f2..4d059fc772 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17138,7 +17138,7 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't chan test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) -# fix for #4027 via PR#TODO. +# fix for #4027 via PR#4720. DT.wide = data.table(a2=2, b1=1, b2=2) expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) From 61a3c8ce5b557f6165de004cc2bdc48ed6c5c256 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 26 Sep 2020 07:10:08 -0700 Subject: [PATCH 03/57] bump variable when all data are missing --- inst/tests/tests.Rraw | 5 ++--- src/fmelt.c | 8 +++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 058c7db3b2..024e1e2610 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3172,9 +3172,9 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") x[, c("y1","z1"):=NA] test(1037.405, dim(melt(x, measure.vars=patterns("^y", "^z"))), INT(4,5)) test(1037.406, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE)), INT(2,5)) - test(1037.407, ans$variable, factor(c("1","1"))) + test(1037.407, ans$variable, factor(c("2","2"), c("1", "2"))) test(1037.408, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE, variable.factor=FALSE)), INT(2,5)) - test(1037.409, ans$variable, c("1","1")) + test(1037.409, ans$variable, c("2","2")) test(1037.410, melt(data.table(NULL), verbose=TRUE), data.table(NULL), output="ncol(data) is 0. Nothing to melt") @@ -17137,4 +17137,3 @@ test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L))) test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) - diff --git a/src/fmelt.c b/src/fmelt.c index 22a4ac1fc5..3a84afd635 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -538,7 +538,6 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str } else { for (int j=0, ansloc=0, level=1; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - if (thislen==0) continue; // so as not to bump level char buff[20]; snprintf(buff, 20, "%d", level++); SEXP str = PROTECT(mkChar(buff)); @@ -546,11 +545,11 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str UNPROTECT(1); } } - } else { + } else {// varfactor==TRUE SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (data->lvalues == 1) { + if (data->lvalues == 1) {//single output column. SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; @@ -573,12 +572,11 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; for (int k=0; klmax)); protecti++; for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - if (thislen==0) continue; // so as not to bump level char buff[20]; snprintf(buff, 20, "%d", nlevel+1); SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels From 264169e73d71eda55eefd8790065fa741a7da4c9 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 26 Sep 2020 07:49:19 -0700 Subject: [PATCH 04/57] test/fix with id.vars --- inst/tests/tests.Rraw | 4 ++++ src/fmelt.c | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d059fc772..73da3f0726 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17143,3 +17143,7 @@ DT.wide = data.table(a2=2, b1=1, b2=2) expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) test(2154.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) +DTid <- data.table(DT.wide, id=1) +exid <- data.table(id=1, expected) +test(2154.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) +test(2154.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) diff --git a/src/fmelt.c b/src/fmelt.c index 6794a461cb..bb5c5f3554 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -153,6 +153,10 @@ static SEXP unlist_(SEXP xint) { return(ans); } +bool invalid_measure(int i, int ncol) { + return i != NA_INTEGER && (i <= 0 || i > ncol); +} + SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans; @@ -218,9 +222,8 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++; for (i=0; i ncol)){ + if (invalid_measure(INTEGER(tmp)[i], ncol)) error(_("One or more values in 'measure.vars' is invalid.")); - } else if (!LOGICAL(booltmp)[i]) targetcols++; else continue; } @@ -265,7 +268,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { tmp = PROTECT(unlist_(tmp2)); protecti++; } for (i=0; i ncol) + if (invalid_measure(INTEGER(tmp)[i], ncol)) error(_("One or more values in 'measure.vars' is invalid.")); } if (isNewList(measure)) valuecols = tmp2; From 9a58c4ce9927b94755f7af3712213ac87c52be48 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 13:25:16 -0700 Subject: [PATCH 05/57] only link issue not PR in NEWS --- NEWS.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3e6668e135..995565d172 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,10 +8,11 @@ 1. `melt.data.table()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values - in the output. Fixes - [#4027](https://github.com/Rdatatable/data.table/issues/4027) via - [PR#4720](https://github.com/Rdatatable/data.table/pull/4720) from - @tdhock. + in the output. Useful for melting wide data tables with some + missing columns, + [#4027](https://github.com/Rdatatable/data.table/issues/4027). + Thanks to @vspinu for reporting, and @tdhock for implementing the + changes to fmelt. ## BUG FIXES From 0b69386aeadf0ab0cc02063dfdef7f4f8245b36f Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 13:26:04 -0700 Subject: [PATCH 06/57] doc/exmple for missing entries of measure.vars list --- man/melt.data.table.Rd | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index a9d69b5f66..a9403a4281 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -64,7 +64,11 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt -into multiple columns in a single function call efficiently. The function +into multiple columns in a single function call efficiently. +If a vector in the list containts missing values, or is shorter than the +max length of the list elements, then the output will include runs of +missing values at the specified position, or at the end. +The function \code{\link{patterns}} can be used to provide regular expression patterns. When used along with \code{melt}, if \code{cols} argument is not provided, the patterns will be matched against \code{names(data)}, for convenience. @@ -134,6 +138,10 @@ melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) +# measure list with missing/short entries results in output with runs of NA +DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] +melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) + } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} From 2a242e01a13b58612fa672eb9da940337b1e343d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 22:14:02 -0700 Subject: [PATCH 07/57] no newlines in news items --- NEWS.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 995565d172..6c189a2888 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,13 +6,7 @@ ## NEW FEATURES -1. `melt.data.table()` now supports `NA` entries when specifying a - list of `measure.vars`, which translate into runs of missing values - in the output. Useful for melting wide data tables with some - missing columns, - [#4027](https://github.com/Rdatatable/data.table/issues/4027). - Thanks to @vspinu for reporting, and @tdhock for implementing the - changes to fmelt. +1. `melt.data.table()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data tables with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing the changes to fmelt. ## BUG FIXES From 50b7b2eca82833bff447fce9df7a2977823b6dc5 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 22:16:57 -0700 Subject: [PATCH 08/57] fix typo --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index a9403a4281..7f3c742db2 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -65,7 +65,7 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt into multiple columns in a single function call efficiently. -If a vector in the list containts missing values, or is shorter than the +If a vector in the list contains missing values, or is shorter than the max length of the list elements, then the output will include runs of missing values at the specified position, or at the end. The function From e535423475793ac2a0b13ff2e94c7bf21eb4c8f6 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 22:30:33 -0700 Subject: [PATCH 09/57] bugfix for melt with na.rm=T and list for measure.vars --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 308d427250..42859117b0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ 3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. +4. When using melt with na.rm=TRUE and a list for measure.vars, variable was not incremented for groups with all missing values, which caused inconsistent results between na.rm=TRUE and FALSE, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. + ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` suggests `bit64` but does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be improved to include suggests in reverse dependency testing. From 6c8bb5158d2786913e9338670f6fbac08c3a86c3 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 28 Sep 2020 11:46:07 -0700 Subject: [PATCH 10/57] remove tabs --- src/fmelt.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index bb5c5f3554..403c1989b3 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -288,7 +288,7 @@ struct processData { SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 SEXP idcols, valuecols, // list with one element per output/value column, each - // element is an integer vector. + // element is an integer vector. naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively int *isfactor, *leach, // length of each element of the valuecols(measure.vars) list. @@ -335,25 +335,25 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna for (j=0; jleach[i]; j++) { // for each input column. int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ - thiscol = VECTOR_ELT(DT, this_col_num-1); - if (isFactor(thiscol)) { - data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; - data->maxtype[i] = STRSXP; - } else { - type = TYPEOF(thiscol); - if (type > data->maxtype[i]) data->maxtype[i] = type; - } + thiscol = VECTOR_ELT(DT, this_col_num-1); + if (isFactor(thiscol)) { + data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; + data->maxtype[i] = STRSXP; + } else { + type = TYPEOF(thiscol); + if (type > data->maxtype[i]) data->maxtype[i] = type; + } } } for (j=0; jleach[i]; j++) { int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ - thiscol = VECTOR_ELT(DT, this_col_num-1); - if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || - (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { - data->isidentical[i] = 0; - break; - } + thiscol = VECTOR_ELT(DT, this_col_num-1); + if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || + (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { + data->isidentical[i] = 0; + break; + } } } } @@ -472,14 +472,13 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s bool copyattr = false; for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - // TODO use this line of code if NA specified. SEXP thiscol; int input_column_num = INTEGER(thisvaluecols)[j]; if (j >= data->leach[i] || // fewer indices than the max were specified. - input_column_num == NA_INTEGER) { // NA was specified. - thiscol = allocNAVector(data->maxtype[i], data->nrow); + input_column_num == NA_INTEGER) { // NA was specified. + thiscol = allocNAVector(data->maxtype[i], data->nrow); }else{ - thiscol = VECTOR_ELT(DT, input_column_num-1); + thiscol = VECTOR_ELT(DT, input_column_num-1); } if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); From 999c91b99a482212bc69258f617b60cfcd0a2714 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 28 Sep 2020 21:44:01 -0700 Subject: [PATCH 11/57] start with melt --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 8fd5f9fd31..5ebd01a6ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,7 +16,7 @@ 4. `fread("1.2\n", colClasses='integer')` would segfault when creating the warning message due to no column names in the output, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. -4. When using melt with na.rm=TRUE and a list for measure.vars, variable was not incremented for groups with all missing values, which caused inconsistent results between na.rm=TRUE and FALSE, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. +4. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. ## NOTES From c1fb10f590e590e746cd35d6826e98e77151ab5e Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 30 Sep 2020 22:36:45 -0700 Subject: [PATCH 12/57] document values in variable column --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index a9d69b5f66..b73f4643e5 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -31,7 +31,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } -\item{variable.name}{name for the measured variable names column. The default name is \code{'variable'}.} +\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors.} \item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. } \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten data.} From 9b68a1bbdd4090bd18f5de93d8ddf83d5ac61368 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Oct 2020 16:09:17 -0700 Subject: [PATCH 13/57] new variable.name attribute for measure.vars --- R/fmelt.R | 42 ++++++- inst/tests/tests.Rraw | 9 ++ src/fmelt.c | 254 ++++++++++++++++++++++++++---------------- 3 files changed, 208 insertions(+), 97 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 12dd9fa5ac..46483792b7 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -3,7 +3,7 @@ # reshape2 package is deprecated since December 2017, so we'll deprecate our # redirection as well -melt <- function(data, ..., na.rm = FALSE, value.name = "value") { +melt = function(data, ..., na.rm = FALSE, value.name = "value") { if (is.data.table(data)) { UseMethod("melt", data) # if data is not data.table and reshape2 is installed, this won't dispatch to reshape2's method; @@ -28,6 +28,46 @@ patterns = function(..., cols=character(0L)) { lapply(p, grep, cols) } +pattern_list = function(pat, fun.list=list(), cols=character(0L)){ + match.vec = regexpr(pat, cols, perl=TRUE) + capture.names = attr(match.vec, "capture.names") + if(! "column" %in% capture.names){ + stop("need capture group named column") + } + if(any("" == capture.names)){ + stop("each capture group needs a name (?pattern)") + } + match.i = which(0 < match.vec) + start = attr(match.vec, "capture.start")[match.i,] + end = attr(match.vec, "capture.length")[match.i,]+start-1L + matched.names = cols[match.i] + names.mat = matrix( + matched.names, + nrow(start), ncol(start), + dimnames=list( + column=matched.names, + group=capture.names)) + group.mat = substr(names.mat, start, end) + group.dt = data.table(group.mat) + for(group.name in names(fun.list)){ + fun = fun.list[[group.name]] + set(group.dt, j=group.name, value=fun(group.dt[[group.name]])) + } + is.other = names(group.dt)!="column" + other.values = lapply(group.dt[, ..is.other], unique) + other.values$stringsAsFactors = FALSE + other.dt = data.table(do.call(expand.grid, other.values)) + measure.vars = structure(list(), variable.name=other.dt) + column.values = unique(group.dt[["column"]]) + for(column.val in column.values){ + select.dt = data.table(column=column.val, other.dt) + measure.vars[[column.val]] = data.table( + match.i, group.dt + )[select.dt, match.i, on=names(select.dt)] + } + measure.vars +} + melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", value.name = "value", ..., na.rm = FALSE, variable.factor = TRUE, value.factor = FALSE, verbose = getOption("datatable.verbose")) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1a902beb00..9cb4d5aaf6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17159,3 +17159,12 @@ DTid <- data.table(DT.wide, id=1) exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) + +# new variable.name attribute for measure.vars +measure <- data.table:::pattern_list("(?[ab])(?[12])", cols=names(DTid)) +test(2155.1, melt(DTid, measure.vars=measure)[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2155.2, melt(DTid, measure.vars=structure(measure, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") +test(2155.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") +test(2155.4, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") +test(2155.5, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") +test(2155.6, melt(DTid, measure.vars=structure(measure, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") diff --git a/src/fmelt.c b/src/fmelt.c index 03e8dcb5e4..b68265d818 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -3,6 +3,8 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); +static SEXP VarNameSymbol = NULL; + // generate from 1 to n (a simple fun for melt, vecseq is convenient from R due to SEXP inputs) SEXP seq_int(int n, int start) { if (n <= 0) return(R_NilValue); @@ -119,17 +121,17 @@ SEXP measurelist(SEXP measure, SEXP dtnames) { for (int i=0; i ncol) @@ -257,11 +259,11 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } idcols = PROTECT(tmp); protecti++; switch(TYPEOF(measure)) { - case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0)); protecti++; break; - case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; - case INTSXP : tmp2 = measure; break; - case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; - default : error(_("Unknown 'measure.vars' type %s, must be character or integer vector"), type2char(TYPEOF(measure))); + case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0)); protecti++; break; + case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; + case INTSXP : tmp2 = measure; break; + case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; + default : error(_("Unknown 'measure.vars' type %s, must be character or integer vector"), type2char(TYPEOF(measure))); } tmp = tmp2; if (isNewList(measure)) { @@ -286,14 +288,16 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { struct processData { SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 - SEXP idcols, + SEXP idcols, + variable_table, // NULL or data for variable column(s). valuecols, // list with one element per output/value column, each - // element is an integer vector. + // element is an integer vector. naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively int *isfactor, *leach, // length of each element of the valuecols(measure.vars) list. *isidentical; // are all inputs for this value column the same type? int lids, // number of id columns. + lvars, // number of variable columns. lvalues, // number of value columns. lmax, //max length of valuecols elements / number of times to repeat ids. totlen, // of output/long DT result of melt operation. @@ -324,7 +328,6 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); - // first find max type of each output column. for (i=0; ilvalues; i++) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); @@ -332,6 +335,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; + // first find max type of this output column. for (j=0; jleach[i]; j++) { // for each input column. int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ @@ -345,6 +349,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna } } } + // then compute isidentical for this output column. for (j=0; jleach[i]; j++) { int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ @@ -360,6 +365,25 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna if (data->narm) { SET_VECTOR_ELT(data->RCHK, 1, data->naidx = allocVector(VECSXP, data->lmax)); } + // TDH 1 Oct 2020 variable table. + if (VarNameSymbol == NULL) VarNameSymbol = install("variable.name"); + data->variable_table = getAttrib(measure, VarNameSymbol); + if (isNull(data->variable_table)) { + // We need to include this check first because isNewList(NULL) == + // TRUE + data->lvars = 1; + } else if (isNewList(data->variable_table)) { + data->lvars = length(data->variable_table); + if (data->lvars == 0) { + error(_("variable.name attribute of measure.vars should be a data table with at least one column")); + } + int nrow = length(VECTOR_ELT(data->variable_table, 0)); + if (data->lmax != nrow) { + error(_("variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); + } + } else {//neither NULL nor DT. + error(_("variable.name attribute of measure.vars should be either NULL or a data table")); + } } static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType, Rboolean * isRowOrdered) @@ -563,76 +587,107 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str // reworked in PR#3455 to create character/factor directly for efficiency, and handle duplicates (#1754) // data->nrow * data->lmax == data->totlen int protecti=0; - SEXP ansvars=PROTECT(allocVector(VECSXP, 1)); protecti++; - SEXP target; + SEXP ansvars=PROTECT(allocVector(VECSXP, data->lvars)); protecti++; + SEXP target; if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov - if (!varfactor) { - SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (data->lvalues == 1) { - const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); - for (int j=0, ansloc=0; jlmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - SEXP str = STRING_ELT(dtnames, thisvaluecols[j]-1); - for (int k=0; kvariable_table)) { + if (!varfactor) { + SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); + if (data->lvalues == 1) {//one value column to output. + const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); + for (int j=0, ansloc=0; jlmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + SEXP str = STRING_ELT(dtnames, thisvaluecols[j]-1); + for (int k=0; klmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + char buff[20]; + snprintf(buff, 20, "%d", level++); + SEXP str = PROTECT(mkChar(buff)); + for (int k=0; klmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - char buff[20]; - snprintf(buff, 20, "%d", level++); - SEXP str = PROTECT(mkChar(buff)); - for (int k=0; ktotlen)); + SEXP levels; + int *td = INTEGER(target); + if (data->lvalues == 1) {//one value column to output. + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); + int len = length(thisvaluecols); + levels = PROTECT(allocVector(STRSXP, len)); protecti++; + const int *vd = INTEGER(thisvaluecols); + for (int j=0; jnarm && length(VECTOR_ELT(data->naidx, j))==0)) { numRemove++; md[j]=0; } + } + if (numRemove) { + SEXP newlevels = PROTECT(allocVector(STRSXP, len-numRemove)); protecti++; + for (int i=0, loc=0; ilmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + for (int k=0; klmax)); protecti++; + for (int j=0, ansloc=0; jlmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + char buff[20]; + snprintf(buff, 20, "%d", nlevel+1); + SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels + for (int k=0; klmax) { + // data->narm is true and there are some all-NA items causing at least one 'if (thislen==0) continue' above + // shrink the levels + SEXP newlevels = PROTECT(allocVector(STRSXP, nlevel)); protecti++; + for (int i=0; itotlen)); - SEXP levels; - int *td = INTEGER(target); - if (data->lvalues == 1) {//single output column. - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); - int len = length(thisvaluecols); - levels = PROTECT(allocVector(STRSXP, len)); protecti++; - const int *vd = INTEGER(thisvaluecols); - for (int j=0; jnarm && length(VECTOR_ELT(data->naidx, j))==0)) { numRemove++; md[j]=0; } - } - if (numRemove) { - SEXP newlevels = PROTECT(allocVector(STRSXP, len-numRemove)); protecti++; - for (int i=0, loc=0; ilmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - for (int k=0; klmax)); protecti++; + } else { //variable.name table specified + for (int out_col_i=0; out_col_ilvars; out_col_i++) { + SEXP out_col = VECTOR_ELT(data->variable_table, out_col_i); + SET_VECTOR_ELT(ansvars, out_col_i, target=allocVector(TYPEOF(out_col), data->totlen)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - char buff[20]; - snprintf(buff, 20, "%d", nlevel+1); - SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels - for (int k=0; klmax) { - // data->narm is true and there are some all-NA items causing at least one 'if (thislen==0) continue' above - // shrink the levels - SEXP newlevels = PROTECT(allocVector(STRSXP, nlevel)); protecti++; - for (int i=0; ivariable_table, R_NamesSymbol), out_col_i))); + } } } - setAttrib(target, R_LevelsSymbol, levels); - setAttrib(target, R_ClassSymbol, ScalarString(char_factor)); } UNPROTECT(protecti); return(ansvars); @@ -763,13 +818,20 @@ SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP SET_VECTOR_ELT(ans, data.lids+1+i, VECTOR_ELT(ansvals, i)); } // fill in 'ansnames' - ansnames = PROTECT(allocVector(STRSXP, data.lids+1+data.lvalues)); protecti++; + ansnames = PROTECT(allocVector(STRSXP, data.lids+data.lvars+data.lvalues)); protecti++; for (int i=0; i Date: Thu, 1 Oct 2020 16:38:25 -0700 Subject: [PATCH 14/57] use lvars in output column number allocation --- inst/tests/tests.Rraw | 9 +++++++++ src/fmelt.c | 14 ++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9cb4d5aaf6..20f9b34289 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17168,3 +17168,12 @@ test(2155.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error test(2155.4, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") test(2155.5, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") test(2155.6, melt(DTid, measure.vars=structure(measure, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") + +input <- data.table( + school = c("A","B"), + read_1 = c(20,22), read_1_sp = c(T,F), + read_2 = c(45,47), read_2_sp = c(F,F), + math_1 = c(20,22), math_1_sp = c(T,F), + math_2 = c(NA,35), math_2_sp = c(NA,F)) +measure <- data.table:::pattern_list("(?[^_]+)_(?[12])(?.*)", list(column=function(x)ifelse(x=="", "score", "sp")), cols=names(input)) +melt(input, measure=measure, na.rm=TRUE) diff --git a/src/fmelt.c b/src/fmelt.c index b68265d818..bbe4315ae4 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -809,25 +809,27 @@ SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP ansids = PROTECT(getidcols(DT, dtnames, verbose, &data)); protecti++; // populate 'ans' - ans = PROTECT(allocVector(VECSXP, data.lids+1+data.lvalues)); protecti++; // 1 is for variable column + int ncol_ans = data.lids+data.lvars+data.lvalues; + ans = PROTECT(allocVector(VECSXP, ncol_ans)); protecti++; // 1 is for variable column for (int i=0; i Date: Thu, 1 Oct 2020 17:08:26 -0700 Subject: [PATCH 15/57] fix segfault with na.rm=T --- inst/tests/tests.Rraw | 1 + src/fmelt.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db14c662e2..9b9cdc3e91 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17159,3 +17159,4 @@ DTid <- data.table(DT.wide, id=1) exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) +test(2155.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 diff --git a/src/fmelt.c b/src/fmelt.c index 403c1989b3..d018a86d4e 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -442,11 +442,13 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int i=0; ilmax; ++i) { SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (int j=0; jlvalues; ++j) { - if (i < data->leach[j]) { - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1)); - } else { + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); + int input_column_num = INTEGER(thisvaluecols)[i]; + if (j >= data->leach[j] || //fewer indices than the max were specified. + input_column_num == NA_INTEGER) { //NA was specified. SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); + } else { + SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, input_column_num-1)); } } tmp = PROTECT(dt_na(tmp, seqcols)); From b6407ab16ad524595c07b9e90cb0fb59e8e7861f Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Oct 2020 21:48:56 -0700 Subject: [PATCH 16/57] sep_ funs --- R/fmelt.R | 74 +++++++++++++++++++++++++++++++++++-------- inst/tests/tests.Rraw | 30 ++++++++++++------ 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 46483792b7..b78d020589 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -28,19 +28,16 @@ patterns = function(..., cols=character(0L)) { lapply(p, grep, cols) } -pattern_list = function(pat, fun.list=list(), cols=character(0L)){ +pattern_match_info = function(pat, fun.list, cols){ match.vec = regexpr(pat, cols, perl=TRUE) capture.names = attr(match.vec, "capture.names") - if(! "column" %in% capture.names){ - stop("need capture group named column") - } if(any("" == capture.names)){ stop("each capture group needs a name (?pattern)") } - match.i = which(0 < match.vec) - start = attr(match.vec, "capture.start")[match.i,] - end = attr(match.vec, "capture.length")[match.i,]+start-1L - matched.names = cols[match.i] + measure.vars = which(0 < match.vec) + start = attr(match.vec, "capture.start")[measure.vars,] + end = attr(match.vec, "capture.length")[measure.vars,]+start-1L + matched.names = cols[measure.vars] names.mat = matrix( matched.names, nrow(start), ncol(start), @@ -53,19 +50,70 @@ pattern_list = function(pat, fun.list=list(), cols=character(0L)){ fun = fun.list[[group.name]] set(group.dt, j=group.name, value=fun(group.dt[[group.name]])) } + list(measure.vars=measure.vars, group.dt=group.dt) +} + +info_to_list = function(measure.vars, group.dt){ + if(! "column" %in% names(group.dt)){ + stop("need capture group named column") + } is.other = names(group.dt)!="column" other.values = lapply(group.dt[, ..is.other], unique) other.values$stringsAsFactors = FALSE other.dt = data.table(do.call(expand.grid, other.values)) - measure.vars = structure(list(), variable.name=other.dt) + measure.list = structure(list(), variable.name=other.dt) column.values = unique(group.dt[["column"]]) for(column.val in column.values){ select.dt = data.table(column=column.val, other.dt) - measure.vars[[column.val]] = data.table( - match.i, group.dt - )[select.dt, match.i, on=names(select.dt)] + measure.list[[column.val]] = data.table( + measure.vars, group.dt + )[select.dt, measure.vars, on=names(select.dt)] } - measure.vars + measure.list +} + +pattern_list = function(pat, fun.list=list(), cols=character(0L)){ + m.list = pattern_match_info(pat, fun.list, cols) + do.call(info_to_list, m.list) +} + +info_to_vec = function(measure.vars, group.dt){ + structure(measure.vars, variable.name=group.dt) +} + +pattern_vec = function(pat, fun.list=list(), cols=character(0L)){ + m.list <- pattern_match_info(pat, fun.list, cols) + do.call(info_to_vec, m.list) +} + +sep_call_info = function(sep, cols){ + parent = sys.parent() + mcall = match.call(definition=sys.function(parent), call=sys.call(parent)) + L = as.list(mcall)[-1] + fun.list = L[-which(names(L)%in%names(formals()))] + no.fun = names(fun.list)=="" + names(fun.list)[no.fun] = sapply(fun.list[no.fun], paste) + list.of.vectors = strsplit(cols, sep, fixed=TRUE) + vector.lengths = sapply(list.of.vectors, length) + measure.vars = which(vector.lengths==max(vector.lengths)) + mat = do.call(rbind, list.of.vectors[measure.vars]) + colnames(mat) = names(fun.list) + group.dt = data.table(mat) + for(group.i in which(!no.fun)){ + fun = eval(fun.list[[group.i]]) + set(group.dt, j=group.i, value=fun(group.dt[[group.i]])) + } + list(measure.vars=measure.vars, group.dt=group.dt) +} + +sep_list = function(..., sep="_", cols){ + sep.list = sep_call_info(sep, cols) + do.call(info_to_list, sep.list) +} + +sep_vec = function(..., sep="_", cols){ + sep.list = sep_call_info(sep, cols) + do.call(info_to_vec, sep.list) } melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 51846837f2..c673439660 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15612,7 +15612,7 @@ DT <- data.table( f_1 = factor(c('a', 'c', 'b', NA, 'c', 'b', 'c', 'c', NA, 'c', NA, 'c', 'a', 'b', NA, NA, NA, 'a')), c_1 = c("a", "c", NA, NA, NA, "c", "b", NA, "a", "b", NA, "a", "c", "b", "c", "b", "a", "b") ) -test(2063.1, melt(DT, id=1:2, measure=3:4), melt(DT, id=c("i_1", "i_2"), measure=c("f_1", "c_1"))) +test(2063.1, melt(DT, id=1:2, measure.vars=3:4), melt(DT, id=c("i_1", "i_2"), measure.vars=c("f_1", "c_1"))) ## fun --> fun.aggregate DT = melt(as.data.table(ChickWeight), id.vars=2:4) setnames(DT, tolower(names(DT))) @@ -17159,7 +17159,6 @@ DTid <- data.table(DT.wide, id=1) exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) - # new variable.name attribute for measure.vars measure <- data.table:::pattern_list("(?[ab])(?[12])", cols=names(DTid)) test(2156.1, melt(DTid, measure.vars=measure)[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) @@ -17168,12 +17167,23 @@ test(2156.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error test(2156.4, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") test(2156.5, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") test(2156.6, melt(DTid, measure.vars=structure(measure, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") - -input <- data.table( +schools.wide <- data.table( school = c("A","B"), - read_1 = c(20,22), read_1_sp = c(T,F), - read_2 = c(45,47), read_2_sp = c(F,F), - math_1 = c(20,22), math_1_sp = c(T,F), - math_2 = c(NA,35), math_2_sp = c(NA,F)) -measure <- data.table:::pattern_list("(?[^_]+)_(?[12])(?.*)", list(column=function(x)ifelse(x=="", "score", "sp")), cols=names(input)) -melt(input, measure=measure, na.rm=TRUE) + read_1 = c(1.1,2.1), read_1_sp = c(T,T), + read_2 = c(1.2,2.2), + math_1 = c(10.1,20.1), math_1_sp = c(T,T), + math_2 = c(NA,20.2), math_2_sp = c(NA,F)) +measure <- data.table:::pattern_list("(?[^_]+)_(?[12])(?.*)", list(number=as.integer, column=function(x)ifelse(x=="", "score", "sp")), cols=names(schools.wide)) +schools.tall <- melt(schools.wide, measure.vars=measure, na.rm=TRUE) +schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) +test(2156.6, schools.tall, schools.expected) +who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) +measure <- data.table:::pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", list(ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y))), cols=names(who)) +test(2156.7, melt(who, measure.vars=measure), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) +# sep_ funs. +measure = data.table:::sep_list(column, subject, number=as.integer, cols=names(wide.again)) +test(2157.1, melt(wide.again, measure.vars=measure, na.rm=TRUE)[order(score)], schools.expected) +test(2157.2, names(melt(data.table(iris), measure.vars=data.table:::sep_list(column, dim, sep=".", cols=names(iris)))), c("Species", "dim", "Sepal", "Petal")) +test(2157.3, names(melt(data.table(iris), measure.vars=data.table:::sep_list(part, column, sep=".", cols=names(iris)))), c("Species", "part", "Length", "Width")) +test(2157.4, names(melt(data.table(iris), measure.vars=data.table:::sep_vec(part, dim, sep=".", cols=names(iris)))), c("Species", "part", "dim", "value")) From 190e0194540acc6e4fe58fd4001a9e64839f0d29 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Oct 2020 22:38:44 -0700 Subject: [PATCH 17/57] increment test num, print statements --- inst/tests/tests.Rraw | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c673439660..da47976c8b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17176,14 +17176,29 @@ schools.wide <- data.table( measure <- data.table:::pattern_list("(?[^_]+)_(?[12])(?.*)", list(number=as.integer, column=function(x)ifelse(x=="", "score", "sp")), cols=names(schools.wide)) schools.tall <- melt(schools.wide, measure.vars=measure, na.rm=TRUE) schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) -test(2156.6, schools.tall, schools.expected) +test(2156.7, schools.tall, schools.expected) who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) measure <- data.table:::pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", list(ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y))), cols=names(who)) -test(2156.7, melt(who, measure.vars=measure), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +test(2156.8, melt(who, measure.vars=measure), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # sep_ funs. measure = data.table:::sep_list(column, subject, number=as.integer, cols=names(wide.again)) -test(2157.1, melt(wide.again, measure.vars=measure, na.rm=TRUE)[order(score)], schools.expected) -test(2157.2, names(melt(data.table(iris), measure.vars=data.table:::sep_list(column, dim, sep=".", cols=names(iris)))), c("Species", "dim", "Sepal", "Petal")) -test(2157.3, names(melt(data.table(iris), measure.vars=data.table:::sep_list(part, column, sep=".", cols=names(iris)))), c("Species", "part", "Length", "Width")) -test(2157.4, names(melt(data.table(iris), measure.vars=data.table:::sep_vec(part, dim, sep=".", cols=names(iris)))), c("Species", "part", "dim", "value")) +print(measure) +molt = melt(wide.again, measure.vars=measure, na.rm=TRUE)[order(score)] +print(molt) +test(2157.1, molt, schools.expected) +measure = data.table:::sep_list(column, dim, sep=".", cols=names(iris)) +print(measure) +molt = melt(data.table(iris), measure.vars=measure) +print(molt) +test(2157.2, names(molt), c("Species", "dim", "Sepal", "Petal")) +measure = data.table:::sep_list(part, column, sep=".", cols=names(iris)) +print(measure) +molt = melt(data.table(iris), measure.vars=measure) +print(molt) +test(2157.3, names(molt), c("Species", "part", "Length", "Width")) +measure = data.table:::sep_vec(part, dim, sep=".", cols=names(iris)) +print(measure) +molt = melt(data.table(iris), measure.vars=measure) +print(molt) +test(2157.4, names(molt), c("Species", "part", "dim", "value")) From 22fe79ae4cee23d944e737a9e7c96084b40bd2ac Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 14:19:13 -0700 Subject: [PATCH 18/57] use datasets::iris to avoid iris copy in earlier tests --- inst/tests/tests.Rraw | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index da47976c8b..280ad28e43 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17183,22 +17183,15 @@ test(2156.8, melt(who, measure.vars=measure), data.table(id=1, diagnosis=c("sp", wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # sep_ funs. measure = data.table:::sep_list(column, subject, number=as.integer, cols=names(wide.again)) -print(measure) molt = melt(wide.again, measure.vars=measure, na.rm=TRUE)[order(score)] -print(molt) test(2157.1, molt, schools.expected) -measure = data.table:::sep_list(column, dim, sep=".", cols=names(iris)) -print(measure) -molt = melt(data.table(iris), measure.vars=measure) -print(molt) +iris.dt = data.table(datasets::iris) +measure = data.table:::sep_list(column, dim, sep=".", cols=names(iris.dt)) +molt = melt(iris.dt, measure.vars=measure) test(2157.2, names(molt), c("Species", "dim", "Sepal", "Petal")) -measure = data.table:::sep_list(part, column, sep=".", cols=names(iris)) -print(measure) -molt = melt(data.table(iris), measure.vars=measure) -print(molt) +measure = data.table:::sep_list(part, column, sep=".", cols=names(iris.dt)) +molt = melt(iris.dt, measure.vars=measure) test(2157.3, names(molt), c("Species", "part", "Length", "Width")) -measure = data.table:::sep_vec(part, dim, sep=".", cols=names(iris)) -print(measure) -molt = melt(data.table(iris), measure.vars=measure) -print(molt) +measure = data.table:::sep_vec(part, dim, sep=".", cols=names(iris.dt)) +molt = melt(iris.dt, measure.vars=measure) test(2157.4, names(molt), c("Species", "part", "dim", "value")) From 775f8730f1628feb5dabe08aea46c4b0bb63200e Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 16:00:31 -0700 Subject: [PATCH 19/57] eval_with_cols instead of do_patterns --- R/data.table.R | 2 +- R/fmelt.R | 17 +++++++++++++---- R/utils.R | 36 +++++++++++++++++++----------------- inst/tests/tests.Rraw | 33 +++++++++++++-------------------- 4 files changed, 46 insertions(+), 42 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 99afcfb271..2c4a965e7b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -941,7 +941,7 @@ replace_dot_alias = function(e) { } else { if (colsub %iscall% 'patterns') { # each pattern gives a new filter condition, intersect the end result - .SDcols = Reduce(intersect, do_patterns(colsub, names_x)) + .SDcols = Reduce(intersect, eval_with_cols(colsub, names_x)) } else { .SDcols = eval(colsub, parent.frame(), parent.frame()) # allow filtering via function in .SDcols, #3950 diff --git a/R/fmelt.R b/R/fmelt.R index b78d020589..e34e04c983 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -22,10 +22,16 @@ melt = function(data, ..., na.rm = FALSE, value.name = "value") { patterns = function(..., cols=character(0L)) { # if ... has no names, names(list(...)) will be ""; # this assures they'll be NULL instead - p = unlist(list(...), use.names = any(nzchar(names(...)))) + L <- list(...) + p = unlist(L, use.names = any(nzchar(names(L)))) if (!is.character(p)) stop("Input patterns must be of type character.") - lapply(p, grep, cols) + matched = lapply(p, grep, cols) + # replace with lengths when R 3.2.0 dependency arrives + if (length(idx <- which(sapply(matched, length) == 0L))) + stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', + paste(p[idx], collapse = ', '), ']') + matched } pattern_match_info = function(pat, fun.list, cols){ @@ -123,8 +129,11 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl if (missing(id.vars)) id.vars=NULL if (missing(measure.vars)) measure.vars = NULL measure.sub = substitute(measure.vars) - if (measure.sub %iscall% "patterns") { - measure.vars = do_patterns(measure.sub, names(data)) + if (is.call(measure.sub)) { + eval.result = eval_with_cols(measure.sub, names(data)) + if (!is.null(eval.result)) { + measure.vars = eval.result + } } if (is.list(measure.vars) && length(measure.vars) > 1L) { meas.nm = names(measure.vars) diff --git a/R/utils.R b/R/utils.R index 42e67ea8de..cde3a504c7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -105,23 +105,25 @@ brackify = function(x, quote=FALSE) { } # patterns done via NSE in melt.data.table and .SDcols in `[.data.table` -do_patterns = function(pat_sub, all_cols) { - # received as substitute(patterns(...)) - pat_sub = as.list(pat_sub)[-1L] - # identify cols = argument if present - idx = which(names(pat_sub) == "cols") - if (length(idx)) { - cols = eval(pat_sub[["cols"]], parent.frame(2L)) - pat_sub = pat_sub[-idx] - } else cols = all_cols - pats = lapply(pat_sub, eval, parent.frame(2L)) - matched = patterns(pats, cols=cols) - # replace with lengths when R 3.2.0 dependency arrives - if (length(idx <- which(sapply(matched, length) == 0L))) - stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', - paste(pats[idx], collapse = ', '), ']') - - return(matched) +eval_with_cols = function(orig_call, all_cols) { + parent = parent.frame(2L) + fun_uneval = orig_call[[1L]] + fun = tryCatch({ + maybe_fun = eval(fun_uneval, parent) + # parent env could have a non-function with this name, which we + # should ignore. + stopifnot(is.function(maybe.fun)) + }, error=function(e) { + eval(fun_uneval)#take function from data.table namespace. + }) + if (!is.primitive(fun)) { + named_call = match.call(fun, orig_call) + if ("cols" %in% names(formals(fun)) && !"cols" %in% names(named_call)) { + named_call[["cols"]] = all_cols + } + named_call[[1L]] = fun + eval(named_call, parent) + } } # check UTC status diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 280ad28e43..e5161d5c39 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17160,38 +17160,31 @@ exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) # new variable.name attribute for measure.vars -measure <- data.table:::pattern_list("(?[ab])(?[12])", cols=names(DTid)) -test(2156.1, melt(DTid, measure.vars=measure)[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) -test(2156.2, melt(DTid, measure.vars=structure(measure, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") +myfun <- function(cols)cols #user-defined function for computing measure.vars. +pattern_list = list("foo", "bar")#pattern_list below should not use this. +test(2156.0, melt(DT.wide, measure.vars=myfun()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +test(2156.1, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2156.2, melt(DTid, measure.vars=structure(list(1, 2), variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") test(2156.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") test(2156.4, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") test(2156.5, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") -test(2156.6, melt(DTid, measure.vars=structure(measure, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +test(2156.6, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") schools.wide <- data.table( school = c("A","B"), read_1 = c(1.1,2.1), read_1_sp = c(T,T), read_2 = c(1.2,2.2), math_1 = c(10.1,20.1), math_1_sp = c(T,T), math_2 = c(NA,20.2), math_2_sp = c(NA,F)) -measure <- data.table:::pattern_list("(?[^_]+)_(?[12])(?.*)", list(number=as.integer, column=function(x)ifelse(x=="", "score", "sp")), cols=names(schools.wide)) -schools.tall <- melt(schools.wide, measure.vars=measure, na.rm=TRUE) +schools.tall <- melt(schools.wide, measure.vars=pattern_list("(?[^_]+)_(?[12])(?.*)", list(number=as.integer, column=function(x)ifelse(x=="", "score", "sp"))), na.rm=TRUE) schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) test(2156.7, schools.tall, schools.expected) who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) -measure <- data.table:::pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", list(ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y))), cols=names(who)) -test(2156.8, melt(who, measure.vars=measure), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +test(2156.8, melt(who, measure.vars=pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", list(ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y))))), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # sep_ funs. -measure = data.table:::sep_list(column, subject, number=as.integer, cols=names(wide.again)) -molt = melt(wide.again, measure.vars=measure, na.rm=TRUE)[order(score)] -test(2157.1, molt, schools.expected) +test(2157.1, melt(wide.again, measure.vars=sep_list(column, subject, number=as.integer), na.rm=TRUE)[order(score)], schools.expected) iris.dt = data.table(datasets::iris) -measure = data.table:::sep_list(column, dim, sep=".", cols=names(iris.dt)) -molt = melt(iris.dt, measure.vars=measure) -test(2157.2, names(molt), c("Species", "dim", "Sepal", "Petal")) -measure = data.table:::sep_list(part, column, sep=".", cols=names(iris.dt)) -molt = melt(iris.dt, measure.vars=measure) -test(2157.3, names(molt), c("Species", "part", "Length", "Width")) -measure = data.table:::sep_vec(part, dim, sep=".", cols=names(iris.dt)) -molt = melt(iris.dt, measure.vars=measure) -test(2157.4, names(molt), c("Species", "part", "dim", "value")) +test(2157.2, names(melt(iris.dt, measure.vars=sep_list(column, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) +test(2157.3, names(melt(iris.dt, measure.vars=sep_list(part, column, sep="."))), c("Species", "part", "Length", "Width")) +test(2157.4, names(melt(iris.dt, measure.vars=sep_vec(part, dim, sep="."))), c("Species", "part", "dim", "value")) +test(2157.5, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) From ab9cbbb493859b6d599166ff52e5993da6a67f97 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 19:34:20 -0700 Subject: [PATCH 20/57] maybe_fun not maybe.fun --- R/utils.R | 4 +++- inst/tests/tests.Rraw | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index cde3a504c7..c42c11ace5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -108,11 +108,13 @@ brackify = function(x, quote=FALSE) { eval_with_cols = function(orig_call, all_cols) { parent = parent.frame(2L) fun_uneval = orig_call[[1L]] + # take fun from either calling env (parent) or from data.table fun = tryCatch({ maybe_fun = eval(fun_uneval, parent) # parent env could have a non-function with this name, which we # should ignore. - stopifnot(is.function(maybe.fun)) + stopifnot(is.function(maybe_fun)) + maybe_fun }, error=function(e) { eval(fun_uneval)#take function from data.table namespace. }) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e5161d5c39..d0ba0fc96e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17160,9 +17160,9 @@ exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) # new variable.name attribute for measure.vars -myfun <- function(cols)cols #user-defined function for computing measure.vars. -pattern_list = list("foo", "bar")#pattern_list below should not use this. -test(2156.0, melt(DT.wide, measure.vars=myfun()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +pattern_list = function(cols)cols#user-defined function for computing measure.vars, same name as data.table:::pattern_list but user-defined version should be used. +test(2156.0, melt(DT.wide, measure.vars=pattern_list()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +pattern_list = list("foo", "bar")#pattern_list below should not use this since it is not a function. test(2156.1, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2156.2, melt(DTid, measure.vars=structure(list(1, 2), variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") test(2156.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") From ac6e82a681b99680f49a6069ea1b2c643420f7fd Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 20:33:46 -0700 Subject: [PATCH 21/57] pattern_* takes ... instead of fun.list, error checking --- .Rbuildignore | 1 + R/fmelt.R | 41 ++++++++++++++++++++++++++++------------- inst/tests/tests.Rraw | 10 ++++++++-- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index a910621f52..b544468c67 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ +.dir-locals.el ^\.Rprofile$ ^data\.table_.*\.tar\.gz$ ^vignettes/plots/figures$ diff --git a/R/fmelt.R b/R/fmelt.R index e34e04c983..c4e32ae811 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -34,7 +34,17 @@ patterns = function(..., cols=character(0L)) { matched } -pattern_match_info = function(pat, fun.list, cols){ +pattern_list = function(pat, ..., cols=character(0L)){ + m.list = pattern_match_info(pat, ..., cols=cols) + do.call(info_to_list, m.list) +} + +pattern_vec = function(pat, ..., cols=character(0L)){ + m.list <- pattern_match_info(pat, ..., cols=cols) + do.call(info_to_vec, m.list) +} + +pattern_match_info = function(pat, cols=character(0L), ...){ match.vec = regexpr(pat, cols, perl=TRUE) capture.names = attr(match.vec, "capture.names") if(any("" == capture.names)){ @@ -52,9 +62,24 @@ pattern_match_info = function(pat, fun.list, cols){ group=capture.names)) group.mat = substr(names.mat, start, end) group.dt = data.table(group.mat) - for(group.name in names(fun.list)){ + fun.list = list(...) + for (group.i in seq_along(fun.list)) { + group.name = names(fun.list)[[group.i]] + if (is.null(group.name) || nchar(group.name)==0) { + stop("each argument to pattern_* in ... must be named") + } + if (! group.name %in% names(group.dt)) { + stop("each argument name to pattern_* in ... must be one of the capture group names, problem: ", group.name) + } fun = fun.list[[group.name]] - set(group.dt, j=group.name, value=fun(group.dt[[group.name]])) + if (!is.function(fun) || length(formals(fun))==0) { + stop("each argument to pattern_* in ... must be a function with at least one argument, problem: ", group.name) + } + group.val = fun(group.dt[[group.name]]) + if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { + stop("each argument to pattern_* in ... must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) + } + set(group.dt, j=group.name, value=group.val) } list(measure.vars=measure.vars, group.dt=group.dt) } @@ -78,20 +103,10 @@ info_to_list = function(measure.vars, group.dt){ measure.list } -pattern_list = function(pat, fun.list=list(), cols=character(0L)){ - m.list = pattern_match_info(pat, fun.list, cols) - do.call(info_to_list, m.list) -} - info_to_vec = function(measure.vars, group.dt){ structure(measure.vars, variable.name=group.dt) } -pattern_vec = function(pat, fun.list=list(), cols=character(0L)){ - m.list <- pattern_match_info(pat, fun.list, cols) - do.call(info_to_vec, m.list) -} - sep_call_info = function(sep, cols){ parent = sys.parent() mcall = match.call(definition=sys.function(parent), call=sys.call(parent)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d0ba0fc96e..33a2a66567 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17175,11 +17175,11 @@ schools.wide <- data.table( read_2 = c(1.2,2.2), math_1 = c(10.1,20.1), math_1_sp = c(T,T), math_2 = c(NA,20.2), math_2_sp = c(NA,F)) -schools.tall <- melt(schools.wide, measure.vars=pattern_list("(?[^_]+)_(?[12])(?.*)", list(number=as.integer, column=function(x)ifelse(x=="", "score", "sp"))), na.rm=TRUE) +schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=pattern_list("(?[^_]+)_(?[12])(?.*)", number=as.integer, column=function(x)ifelse(x=="", "score", "sp"))) schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) test(2156.7, schools.tall, schools.expected) who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) -test(2156.8, melt(who, measure.vars=pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", list(ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y))))), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +test(2156.8, melt(who, measure.vars=pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)))), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # sep_ funs. test(2157.1, melt(wide.again, measure.vars=sep_list(column, subject, number=as.integer), na.rm=TRUE)[order(score)], schools.expected) @@ -17188,3 +17188,9 @@ test(2157.2, names(melt(iris.dt, measure.vars=sep_list(column, dim, sep="."))), test(2157.3, names(melt(iris.dt, measure.vars=sep_list(part, column, sep="."))), c("Species", "part", "Length", "Width")) test(2157.4, names(melt(iris.dt, measure.vars=sep_vec(part, dim, sep="."))), c("Species", "part", "dim", "value")) test(2157.5, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) +# pattern_ errors +test(2158.01, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", identity)), error="each argument to pattern_* in ... must be named") +test(2158.02, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", foo=identity)), error="each argument name to pattern_* in ... must be one of the capture group names, problem: foo") +test(2158.03, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr="bar")), error="each argument to pattern_* in ... must be a function with at least one argument, problem: istr") +test(2158.04, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function()1)), error="each argument to pattern_* in ... must be a function with at least one argument, problem: istr") +test(2158.05, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function(x)1)), error="each argument to pattern_* in ... must be a function that returns an atomic vector with same length as its first argument, problem: istr") From 2cb0a5918199b454a337336452c52212996ee337 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 22:00:17 -0700 Subject: [PATCH 22/57] group_funs error checking --- R/fmelt.R | 33 ++++++++++++++++++++------------- inst/tests/tests.Rraw | 15 +++++++++------ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index c4e32ae811..d84d8b5d96 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -63,25 +63,29 @@ pattern_match_info = function(pat, cols=character(0L), ...){ group.mat = substr(names.mat, start, end) group.dt = data.table(group.mat) fun.list = list(...) + list(measure.vars=measure.vars, group.dt=group_funs(group.dt, fun.list)) +} + +group_funs = function(group.dt, fun.list) { for (group.i in seq_along(fun.list)) { group.name = names(fun.list)[[group.i]] if (is.null(group.name) || nchar(group.name)==0) { - stop("each argument to pattern_* in ... must be named") + stop("each argument must be named") } if (! group.name %in% names(group.dt)) { - stop("each argument name to pattern_* in ... must be one of the capture group names, problem: ", group.name) + stop("each argument name must be one of the capture group names, problem: ", group.name) } - fun = fun.list[[group.name]] - if (!is.function(fun) || length(formals(fun))==0) { - stop("each argument to pattern_* in ... must be a function with at least one argument, problem: ", group.name) + fun = eval(fun.list[[group.name]]) + if (!is.function(fun) || (!is.primitive(fun) && length(formals(fun))==0)) { + stop("each argument must be a function with at least one argument, problem: ", group.name) } group.val = fun(group.dt[[group.name]]) if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { - stop("each argument to pattern_* in ... must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) + stop("each argument must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) } set(group.dt, j=group.name, value=group.val) } - list(measure.vars=measure.vars, group.dt=group.dt) + group.dt } info_to_list = function(measure.vars, group.dt){ @@ -116,15 +120,18 @@ sep_call_info = function(sep, cols){ names(fun.list)[no.fun] = sapply(fun.list[no.fun], paste) list.of.vectors = strsplit(cols, sep, fixed=TRUE) vector.lengths = sapply(list.of.vectors, length) - measure.vars = which(vector.lengths==max(vector.lengths)) + n.groups = max(vector.lengths) + if (n.groups != length(fun.list)) { + stop( + "number of arguments to sep_* =", length(fun.list), + " must be same as max number of items after splitting column names =", n.groups) + } + measure.vars = which(vector.lengths==n.groups) mat = do.call(rbind, list.of.vectors[measure.vars]) colnames(mat) = names(fun.list) group.dt = data.table(mat) - for(group.i in which(!no.fun)){ - fun = eval(fun.list[[group.i]]) - set(group.dt, j=group.i, value=fun(group.dt[[group.i]])) - } - list(measure.vars=measure.vars, group.dt=group.dt) + some.funs = fun.list[which(!no.fun)] + list(measure.vars=measure.vars, group.dt=group_funs(group.dt, some.funs)) } sep_list = function(..., sep="_", cols){ diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 33a2a66567..56c1bffbb4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17182,15 +17182,18 @@ who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) test(2156.8, melt(who, measure.vars=pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)))), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # sep_ funs. -test(2157.1, melt(wide.again, measure.vars=sep_list(column, subject, number=as.integer), na.rm=TRUE)[order(score)], schools.expected) +test(2157.1, melt(wide.again, na.rm=TRUE, measure.vars=sep_list(column, subject, number=as.integer))[order(score)], schools.expected) iris.dt = data.table(datasets::iris) test(2157.2, names(melt(iris.dt, measure.vars=sep_list(column, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) test(2157.3, names(melt(iris.dt, measure.vars=sep_list(part, column, sep="."))), c("Species", "part", "Length", "Width")) test(2157.4, names(melt(iris.dt, measure.vars=sep_vec(part, dim, sep="."))), c("Species", "part", "dim", "value")) test(2157.5, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) # pattern_ errors -test(2158.01, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", identity)), error="each argument to pattern_* in ... must be named") -test(2158.02, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", foo=identity)), error="each argument name to pattern_* in ... must be one of the capture group names, problem: foo") -test(2158.03, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr="bar")), error="each argument to pattern_* in ... must be a function with at least one argument, problem: istr") -test(2158.04, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function()1)), error="each argument to pattern_* in ... must be a function with at least one argument, problem: istr") -test(2158.05, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function(x)1)), error="each argument to pattern_* in ... must be a function that returns an atomic vector with same length as its first argument, problem: istr") +test(2158.01, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", identity)), error="each argument must be named") +test(2158.02, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", foo=identity)), error="each argument name must be one of the capture group names, problem: foo") +test(2158.03, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr="bar")), error="each argument must be a function with at least one argument, problem: istr") +test(2158.04, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function()1)), error="each argument must be a function with at least one argument, problem: istr") +test(2158.05, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function(x)1)), error="each argument must be a function that returns an atomic vector with same length as its first argument, problem: istr") +myfac = function(x)factor(x) +test(2158.06, melt(iris.dt, measure.vars=sep_list(column, dim="bar", sep=".")), error="each argument must be a function with at least one argument, problem: dim") +test(2158.07, melt(iris.dt, measure.vars=sep_list(column, dim, baz, sep=".")), error="number of arguments to sep_* =3 must be same as max number of items after splitting column names =2") From fefbb25abdb8d84347130f7b994569e6dc572a5b Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 22:17:30 -0700 Subject: [PATCH 23/57] test named capture errors --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 56c1bffbb4..09599b1a02 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17197,3 +17197,6 @@ test(2158.05, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12]) myfac = function(x)factor(x) test(2158.06, melt(iris.dt, measure.vars=sep_list(column, dim="bar", sep=".")), error="each argument must be a function with at least one argument, problem: dim") test(2158.07, melt(iris.dt, measure.vars=sep_list(column, dim, baz, sep=".")), error="number of arguments to sep_* =3 must be same as max number of items after splitting column names =2") +test(2158.08, melt(DTid, measure.vars=pattern_list("(?[ab])([12])", istr=as.integer)), error="each capture group needs a name (?pattern)") +test(2158.08, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=as.integer)), error="need capture group named column") +##TODO factors From 25d9dfe0f820dbb63e5568bee3ace0b65588f81e Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Oct 2020 22:21:10 -0700 Subject: [PATCH 24/57] remove factor renumbering code which is never used anymore --- src/fmelt.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 3a84afd635..39b93d2e81 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -582,13 +582,6 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels for (int k=0; klmax) { - // data->narm is true and there are some all-NA items causing at least one 'if (thislen==0) continue' above - // shrink the levels - SEXP newlevels = PROTECT(allocVector(STRSXP, nlevel)); protecti++; - for (int i=0; i Date: Sat, 3 Oct 2020 07:26:33 -0700 Subject: [PATCH 25/57] factor test/variable output col --- inst/tests/tests.Rraw | 5 +++-- src/fmelt.c | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 09599b1a02..66f9f0bd60 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17198,5 +17198,6 @@ myfac = function(x)factor(x) test(2158.06, melt(iris.dt, measure.vars=sep_list(column, dim="bar", sep=".")), error="each argument must be a function with at least one argument, problem: dim") test(2158.07, melt(iris.dt, measure.vars=sep_list(column, dim, baz, sep=".")), error="number of arguments to sep_* =3 must be same as max number of items after splitting column names =2") test(2158.08, melt(DTid, measure.vars=pattern_list("(?[ab])([12])", istr=as.integer)), error="each capture group needs a name (?pattern)") -test(2158.08, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=as.integer)), error="need capture group named column") -##TODO factors +test(2158.09, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=as.integer)), error="need capture group named column") +# factors +test(2159.00, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", letter=factor)), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) diff --git a/src/fmelt.c b/src/fmelt.c index 683e116639..a6aaa9e8de 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -684,6 +684,11 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str case LGLSXP : for (int k=0; kvariable_table, R_NamesSymbol), out_col_i))); From 8126a6c2beb95ef1b1c876355964634a9e6515c2 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 3 Oct 2020 08:15:37 -0700 Subject: [PATCH 26/57] test to increase code coverage --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 63d8c5b8bf..02d6b0014c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3014,6 +3014,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) error="Unknown 'id.vars' type raw") test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") + test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1)) + test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) + test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 5725b25b37416b104f573dc2dc0b50b267f15220 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 08:20:48 -0700 Subject: [PATCH 27/57] test with narm=TRUE that caused a segfault --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9b9cdc3e91..d3766eee7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3014,6 +3014,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) error="Unknown 'id.vars' type raw") test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") + test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1)) + test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) + test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 854f18973c3dbfd44f20ba05806f63ea7a6c5dc4 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 08:21:14 -0700 Subject: [PATCH 28/57] fix segfault via new fun input_col_or_na --- src/fmelt.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index d018a86d4e..e64ebba32d 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -427,6 +427,16 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType return ans; } +SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { + if (in_col < data->leach[out_col]) { + int input_column_num = INTEGER(thisvaluecols)[in_col]; + if (input_column_num != NA_INTEGER) { + return VECTOR_ELT(DT, input_column_num-1); + } + } + return allocNAVector(data->maxtype[out_col], data->nrow); +} + SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { for (int i=0; ilvalues; ++i) { SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); @@ -442,14 +452,8 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int i=0; ilmax; ++i) { SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (int j=0; jlvalues; ++j) { - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - int input_column_num = INTEGER(thisvaluecols)[i]; - if (j >= data->leach[j] || //fewer indices than the max were specified. - input_column_num == NA_INTEGER) { //NA was specified. - SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); - } else { - SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, input_column_num-1)); - } + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); + SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i)); } tmp = PROTECT(dt_na(tmp, seqcols)); SEXP w; @@ -474,14 +478,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s bool copyattr = false; for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol; - int input_column_num = INTEGER(thisvaluecols)[j]; - if (j >= data->leach[i] || // fewer indices than the max were specified. - input_column_num == NA_INTEGER) { // NA was specified. - thiscol = allocNAVector(data->maxtype[i], data->nrow); - }else{ - thiscol = VECTOR_ELT(DT, input_column_num-1); - } + SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j); if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = true; From 9039be5415de32676ae79cf5beca247dadf52ffa Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 14:01:23 -0700 Subject: [PATCH 29/57] measure function instead of sep_* pattern_* --- R/fmelt.R | 148 +++++++++++++++++------------------------- inst/tests/tests.Rraw | 46 +++++++------ 2 files changed, 81 insertions(+), 113 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index d84d8b5d96..f09080551b 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -34,40 +34,49 @@ patterns = function(..., cols=character(0L)) { matched } -pattern_list = function(pat, ..., cols=character(0L)){ - m.list = pattern_match_info(pat, ..., cols=cols) - do.call(info_to_list, m.list) -} - -pattern_vec = function(pat, ..., cols=character(0L)){ - m.list <- pattern_match_info(pat, ..., cols=cols) - do.call(info_to_vec, m.list) -} - -pattern_match_info = function(pat, cols=character(0L), ...){ - match.vec = regexpr(pat, cols, perl=TRUE) - capture.names = attr(match.vec, "capture.names") - if(any("" == capture.names)){ - stop("each capture group needs a name (?pattern)") +measure = function(..., sep, pattern, cols){ + # 1. error checking on sep/pattern args. + if (missing(sep) && missing(pattern)) { + stop( + "neither sep nor pattern arguments used in measure; ", + "must use either sep or pattern") + } + if (!missing(sep) && !missing(pattern)) { + stop( + "both sep and pattern arguments used in measure; ", + "must use either sep or pattern (not both)") + } + # 2. compute conversion function list with group names. + mcall = match.call() + L = as.list(mcall)[-1] + fun.list = L[-which(names(L)%in%names(formals()))] + no.fun = names(fun.list)=="" + names(fun.list)[no.fun] = sapply(fun.list[no.fun], paste) + # 3. compute initial group data table, used as variable.name attribute. + group.mat = if (!missing(pattern)) { + match.vec = regexpr(pattern, cols, perl=TRUE) + measure.vec = which(0 < match.vec) + start = attr(match.vec, "capture.start")[measure.vec,] + end = attr(match.vec, "capture.length")[measure.vec,]+start-1L + matched.names = cols[measure.vec] + names.mat = matrix(matched.names, nrow(start), ncol(start)) + substr(names.mat, start, end) + } else { + list.of.vectors = strsplit(cols, sep, fixed=TRUE) + vector.lengths = sapply(list.of.vectors, length) + n.groups = max(vector.lengths) + if (n.groups != length(fun.list)) { + stop( + "number of arguments to sep_* =", length(fun.list), + " must be same as max number of items after splitting column names =", n.groups) + } + measure.vec = which(vector.lengths==n.groups) + do.call(rbind, list.of.vectors[measure.vec]) } - measure.vars = which(0 < match.vec) - start = attr(match.vec, "capture.start")[measure.vars,] - end = attr(match.vec, "capture.length")[measure.vars,]+start-1L - matched.names = cols[measure.vars] - names.mat = matrix( - matched.names, - nrow(start), ncol(start), - dimnames=list( - column=matched.names, - group=capture.names)) - group.mat = substr(names.mat, start, end) + colnames(group.mat) = names(fun.list) group.dt = data.table(group.mat) - fun.list = list(...) - list(measure.vars=measure.vars, group.dt=group_funs(group.dt, fun.list)) -} - -group_funs = function(group.dt, fun.list) { - for (group.i in seq_along(fun.list)) { + # 4. apply conversion functions to group data table. + for (group.i in which(!no.fun)) { group.name = names(fun.list)[[group.i]] if (is.null(group.name) || nchar(group.name)==0) { stop("each argument must be named") @@ -75,7 +84,7 @@ group_funs = function(group.dt, fun.list) { if (! group.name %in% names(group.dt)) { stop("each argument name must be one of the capture group names, problem: ", group.name) } - fun = eval(fun.list[[group.name]]) + fun = eval(fun.list[[group.name]], parent.frame(1L)) if (!is.function(fun) || (!is.primitive(fun) && length(formals(fun))==0)) { stop("each argument must be a function with at least one argument, problem: ", group.name) } @@ -85,63 +94,24 @@ group_funs = function(group.dt, fun.list) { } set(group.dt, j=group.name, value=group.val) } - group.dt -} - -info_to_list = function(measure.vars, group.dt){ - if(! "column" %in% names(group.dt)){ - stop("need capture group named column") - } - is.other = names(group.dt)!="column" - other.values = lapply(group.dt[, ..is.other], unique) - other.values$stringsAsFactors = FALSE - other.dt = data.table(do.call(expand.grid, other.values)) - measure.list = structure(list(), variable.name=other.dt) - column.values = unique(group.dt[["column"]]) - for(column.val in column.values){ - select.dt = data.table(column=column.val, other.dt) - measure.list[[column.val]] = data.table( - measure.vars, group.dt - )[select.dt, measure.vars, on=names(select.dt)] - } - measure.list -} - -info_to_vec = function(measure.vars, group.dt){ - structure(measure.vars, variable.name=group.dt) -} - -sep_call_info = function(sep, cols){ - parent = sys.parent() - mcall = match.call(definition=sys.function(parent), call=sys.call(parent)) - L = as.list(mcall)[-1] - fun.list = L[-which(names(L)%in%names(formals()))] - no.fun = names(fun.list)=="" - names(fun.list)[no.fun] = sapply(fun.list[no.fun], paste) - list.of.vectors = strsplit(cols, sep, fixed=TRUE) - vector.lengths = sapply(list.of.vectors, length) - n.groups = max(vector.lengths) - if (n.groups != length(fun.list)) { - stop( - "number of arguments to sep_* =", length(fun.list), - " must be same as max number of items after splitting column names =", n.groups) + # 5. compute measure.vars list or vector. + if ("column" %in% names(fun.list)) {# multiple output columns. + is.other = names(group.dt) != "column" + other.values = lapply(group.dt[, ..is.other], unique) + other.values$stringsAsFactors = FALSE + other.dt = data.table(do.call(expand.grid, other.values)) + measure.list = structure(list(), variable.name=other.dt) + column.values = unique(group.dt[["column"]]) + for(column.val in column.values){ + select.dt = data.table(column=column.val, other.dt) + measure.list[[column.val]] = data.table( + measure.vec, group.dt + )[select.dt, measure.vec, on=names(select.dt)] + } + measure.list + } else {# single output column. + structure(measure.vec, variable.name=group.dt) } - measure.vars = which(vector.lengths==n.groups) - mat = do.call(rbind, list.of.vectors[measure.vars]) - colnames(mat) = names(fun.list) - group.dt = data.table(mat) - some.funs = fun.list[which(!no.fun)] - list(measure.vars=measure.vars, group.dt=group_funs(group.dt, some.funs)) -} - -sep_list = function(..., sep="_", cols){ - sep.list = sep_call_info(sep, cols) - do.call(info_to_list, sep.list) -} - -sep_vec = function(..., sep="_", cols){ - sep.list = sep_call_info(sep, cols) - do.call(info_to_vec, sep.list) } melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a7f807e9fc..d3d9940adf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17163,10 +17163,10 @@ exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) # new variable.name attribute for measure.vars -pattern_list = function(cols)cols#user-defined function for computing measure.vars, same name as data.table:::pattern_list but user-defined version should be used. -test(2156.0, melt(DT.wide, measure.vars=pattern_list()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) -pattern_list = list("foo", "bar")#pattern_list below should not use this since it is not a function. -test(2156.1, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +measure = function(cols)cols#user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. +test(2156.0, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +measure = list("foo", "bar")#measure below should not use this since it is not a function. +test(2156.1, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2156.2, melt(DTid, measure.vars=structure(list(1, 2), variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") test(2156.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") test(2156.4, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") @@ -17178,29 +17178,27 @@ schools.wide <- data.table( read_2 = c(1.2,2.2), math_1 = c(10.1,20.1), math_1_sp = c(T,T), math_2 = c(NA,20.2), math_2_sp = c(NA,F)) -schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=pattern_list("(?[^_]+)_(?[12])(?.*)", number=as.integer, column=function(x)ifelse(x=="", "score", "sp"))) +schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=measure(subject, number=as.integer, column=function(x)ifelse(x=="", "score", "sp"), pattern="([^_]+)_([12])(.*)")) schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) test(2156.7, schools.tall, schools.expected) who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) -test(2156.8, melt(who, measure.vars=pattern_vec("new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))", ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)))), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +test(2156.8, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))")), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) -# sep_ funs. -test(2157.1, melt(wide.again, na.rm=TRUE, measure.vars=sep_list(column, subject, number=as.integer))[order(score)], schools.expected) +# measure with sep= +test(2157.1, melt(wide.again, na.rm=TRUE, measure.vars=measure(column, subject, number=as.integer, sep="_"))[order(score)], schools.expected) iris.dt = data.table(datasets::iris) -test(2157.2, names(melt(iris.dt, measure.vars=sep_list(column, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) -test(2157.3, names(melt(iris.dt, measure.vars=sep_list(part, column, sep="."))), c("Species", "part", "Length", "Width")) -test(2157.4, names(melt(iris.dt, measure.vars=sep_vec(part, dim, sep="."))), c("Species", "part", "dim", "value")) +test(2157.2, names(melt(iris.dt, measure.vars=measure(column, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) +test(2157.3, names(melt(iris.dt, measure.vars=measure(part, column, sep="."))), c("Species", "part", "Length", "Width")) +test(2157.4, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) test(2157.5, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) -# pattern_ errors -test(2158.01, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", identity)), error="each argument must be named") -test(2158.02, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", foo=identity)), error="each argument name must be one of the capture group names, problem: foo") -test(2158.03, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr="bar")), error="each argument must be a function with at least one argument, problem: istr") -test(2158.04, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function()1)), error="each argument must be a function with at least one argument, problem: istr") -test(2158.05, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=function(x)1)), error="each argument must be a function that returns an atomic vector with same length as its first argument, problem: istr") -myfac = function(x)factor(x) -test(2158.06, melt(iris.dt, measure.vars=sep_list(column, dim="bar", sep=".")), error="each argument must be a function with at least one argument, problem: dim") -test(2158.07, melt(iris.dt, measure.vars=sep_list(column, dim, baz, sep=".")), error="number of arguments to sep_* =3 must be same as max number of items after splitting column names =2") -test(2158.08, melt(DTid, measure.vars=pattern_list("(?[ab])([12])", istr=as.integer)), error="each capture group needs a name (?pattern)") -test(2158.09, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", istr=as.integer)), error="need capture group named column") -# factors -test(2159.00, melt(DTid, measure.vars=pattern_list("(?[ab])(?[12])", letter=factor)), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) +# measure with pattern= +test(2158.03, melt(DTid, measure.vars=measure(column, istr="bar", pattern="([ab])([12])")), error="each argument must be a function with at least one argument, problem: istr") +test(2158.04, melt(DTid, measure.vars=measure(column, istr=function()1, pattern="([ab])([12])")), error="each argument must be a function with at least one argument, problem: istr") +test(2158.05, melt(DTid, measure.vars=measure(column, istr=function(x)1, pattern="([ab])([12])")), error="each argument must be a function that returns an atomic vector with same length as its first argument, problem: istr") +test(2158.06, melt(iris.dt, measure.vars=measure(column, dim="bar", sep=".")), error="each argument must be a function with at least one argument, problem: dim") +test(2158.07, melt(iris.dt, measure.vars=measure(column, dim, baz, sep=".")), error="number of arguments to sep_* =3 must be same as max number of items after splitting column names =2") +test(2158.08, melt(iris.dt, measure.vars=measure(column, dim)), error="neither sep nor pattern arguments used in measure; must use either sep or pattern") +test(2158.09, melt(iris.dt, measure.vars=measure(column, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") +# measure with factor conversion. +myfac = function(x)factor(x)#user-defined conversion function. +test(2159.00, melt(DTid, measure.vars=measure(letter=myfac, column, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) From 20f88a8053fc5c867b4901efd3a37f3ce737d314 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 20:00:26 -0700 Subject: [PATCH 30/57] multiple.keyword="value.name" by default --- R/fmelt.R | 30 ++++++++++++++------------ inst/tests/tests.Rraw | 50 +++++++++++++++++++++++-------------------- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index f09080551b..afbc9e1657 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -34,7 +34,7 @@ patterns = function(..., cols=character(0L)) { matched } -measure = function(..., sep, pattern, cols){ +measure = function(..., sep, pattern, cols, multiple.keyword="value.name") { # 1. error checking on sep/pattern args. if (missing(sep) && missing(pattern)) { stop( @@ -57,9 +57,13 @@ measure = function(..., sep, pattern, cols){ match.vec = regexpr(pattern, cols, perl=TRUE) measure.vec = which(0 < match.vec) start = attr(match.vec, "capture.start")[measure.vec,] + if (ncol(start) != length(fun.list)) { + stop( + "number of ... arguments to measure =", length(fun.list), + " must be same as number of capture groups in pattern =", ncol(start)) + } end = attr(match.vec, "capture.length")[measure.vec,]+start-1L - matched.names = cols[measure.vec] - names.mat = matrix(matched.names, nrow(start), ncol(start)) + names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) substr(names.mat, start, end) } else { list.of.vectors = strsplit(cols, sep, fixed=TRUE) @@ -67,7 +71,7 @@ measure = function(..., sep, pattern, cols){ n.groups = max(vector.lengths) if (n.groups != length(fun.list)) { stop( - "number of arguments to sep_* =", length(fun.list), + "number of ... arguments to measure =", length(fun.list), " must be same as max number of items after splitting column names =", n.groups) } measure.vec = which(vector.lengths==n.groups) @@ -79,31 +83,29 @@ measure = function(..., sep, pattern, cols){ for (group.i in which(!no.fun)) { group.name = names(fun.list)[[group.i]] if (is.null(group.name) || nchar(group.name)==0) { - stop("each argument must be named") - } - if (! group.name %in% names(group.dt)) { - stop("each argument name must be one of the capture group names, problem: ", group.name) + stop("each ... argument to measure must be named") } fun = eval(fun.list[[group.name]], parent.frame(1L)) if (!is.function(fun) || (!is.primitive(fun) && length(formals(fun))==0)) { - stop("each argument must be a function with at least one argument, problem: ", group.name) + stop("each ... argument to measure must be a function with at least one argument, problem: ", group.name) } group.val = fun(group.dt[[group.name]]) if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { - stop("each argument must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) + stop("each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) } set(group.dt, j=group.name, value=group.val) } # 5. compute measure.vars list or vector. - if ("column" %in% names(fun.list)) {# multiple output columns. - is.other = names(group.dt) != "column" + if (multiple.keyword %in% names(fun.list)) {# multiple output columns. + is.other = names(group.dt) != multiple.keyword other.values = lapply(group.dt[, ..is.other], unique) other.values$stringsAsFactors = FALSE other.dt = data.table(do.call(expand.grid, other.values)) measure.list = structure(list(), variable.name=other.dt) - column.values = unique(group.dt[["column"]]) + column.values = unique(group.dt[[multiple.keyword]]) for(column.val in column.values){ - select.dt = data.table(column=column.val, other.dt) + select.dt = data.table(other.dt) + set(select.dt, j=multiple.keyword, value=column.val) measure.list[[column.val]] = data.table( measure.vec, group.dt )[select.dt, measure.vec, on=names(select.dt)] diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d3d9940adf..7dbd7692a3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17164,41 +17164,45 @@ test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) # new variable.name attribute for measure.vars measure = function(cols)cols#user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. -test(2156.0, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +test(2156.00, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) measure = list("foo", "bar")#measure below should not use this since it is not a function. -test(2156.1, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) -test(2156.2, melt(DTid, measure.vars=structure(list(1, 2), variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") -test(2156.3, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") -test(2156.4, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") -test(2156.5, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") -test(2156.6, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +test(2156.01, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2156.02, melt(DTid, measure.vars=structure(list(1, 2), variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") +test(2156.03, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") +test(2156.04, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") +test(2156.05, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") +test(2156.06, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +# general measure errors. +test(2156.07, melt(iris.dt, measure.vars=measure(value.name, dim)), error="neither sep nor pattern arguments used in measure; must use either sep or pattern") +test(2156.08, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") +# school example. schools.wide <- data.table( school = c("A","B"), read_1 = c(1.1,2.1), read_1_sp = c(T,T), read_2 = c(1.2,2.2), math_1 = c(10.1,20.1), math_1_sp = c(T,T), math_2 = c(NA,20.2), math_2_sp = c(NA,F)) -schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=measure(subject, number=as.integer, column=function(x)ifelse(x=="", "score", "sp"), pattern="([^_]+)_([12])(.*)")) +schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=measure(subject, number=as.integer, value.name=function(x)ifelse(x=="", "score", "sp"), pattern="([^_]+)_([12])(.*)")) schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) -test(2156.7, schools.tall, schools.expected) +test(2156.09, schools.tall, schools.expected) who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) -test(2156.8, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))")), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +test(2156.10, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))")), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # measure with sep= -test(2157.1, melt(wide.again, na.rm=TRUE, measure.vars=measure(column, subject, number=as.integer, sep="_"))[order(score)], schools.expected) +test(2156.11, melt(wide.again, na.rm=TRUE, measure.vars=measure(value.name, subject, number=as.integer, sep="_"))[order(score)], schools.expected) iris.dt = data.table(datasets::iris) -test(2157.2, names(melt(iris.dt, measure.vars=measure(column, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) -test(2157.3, names(melt(iris.dt, measure.vars=measure(part, column, sep="."))), c("Species", "part", "Length", "Width")) -test(2157.4, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) -test(2157.5, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) +test(2156.12, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) +test(2156.13, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width")) +test(2156.14, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) +test(2156.15, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim") +test(2156.16, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2") +# patterns with iris data. +test(2156.20, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) # measure with pattern= -test(2158.03, melt(DTid, measure.vars=measure(column, istr="bar", pattern="([ab])([12])")), error="each argument must be a function with at least one argument, problem: istr") -test(2158.04, melt(DTid, measure.vars=measure(column, istr=function()1, pattern="([ab])([12])")), error="each argument must be a function with at least one argument, problem: istr") -test(2158.05, melt(DTid, measure.vars=measure(column, istr=function(x)1, pattern="([ab])([12])")), error="each argument must be a function that returns an atomic vector with same length as its first argument, problem: istr") -test(2158.06, melt(iris.dt, measure.vars=measure(column, dim="bar", sep=".")), error="each argument must be a function with at least one argument, problem: dim") -test(2158.07, melt(iris.dt, measure.vars=measure(column, dim, baz, sep=".")), error="number of arguments to sep_* =3 must be same as max number of items after splitting column names =2") -test(2158.08, melt(iris.dt, measure.vars=measure(column, dim)), error="neither sep nor pattern arguments used in measure; must use either sep or pattern") -test(2158.09, melt(iris.dt, measure.vars=measure(column, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") +test(2156.30, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2156.31, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2156.32, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: istr") +test(2156.33, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. -test(2159.00, melt(DTid, measure.vars=measure(letter=myfac, column, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) +test(2156.40, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) From ba3174cfe429cd085363daa640d2f3214a16cc0b Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 20:01:57 -0700 Subject: [PATCH 31/57] move iris.dt def up --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7dbd7692a3..f8a7e4f5c1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17173,6 +17173,7 @@ test(2156.04, melt(DTid, measure.vars=structure(1:3, variable.name=data.table()) test(2156.05, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") test(2156.06, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") # general measure errors. +iris.dt = data.table(datasets::iris) test(2156.07, melt(iris.dt, measure.vars=measure(value.name, dim)), error="neither sep nor pattern arguments used in measure; must use either sep or pattern") test(2156.08, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") # school example. @@ -17190,7 +17191,6 @@ test(2156.10, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.nu wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # measure with sep= test(2156.11, melt(wide.again, na.rm=TRUE, measure.vars=measure(value.name, subject, number=as.integer, sep="_"))[order(score)], schools.expected) -iris.dt = data.table(datasets::iris) test(2156.12, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) test(2156.13, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width")) test(2156.14, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) From 6c8dc8161419fd66f815a9793d62c9b06713866d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 21:54:48 -0700 Subject: [PATCH 32/57] measure docs --- vignettes/datatable-reshape.Rmd | 74 ++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index c9fb43dabd..f3c297a08f 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -204,10 +204,80 @@ Usually in these problems, the columns we'd like to melt can be distinguished by ```{r} DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "gender")) -DT.m2 +print(DT.m2, class=TRUE) +``` + +#### - Using `measure()` + +If, as in the data above, the input columns to melt have regular +names, then we can use `measure`. The `sep` argument is used with +`strsplit` on each column name, and the measured columns are defined +by the maximum number of items after the split: + +```{r} +DT.m3 = melt(DT, measure = measure(value.name, child=as.integer, sep="_child")) +print(DT.m3, class=TRUE) +``` + +In the code above we used `sep="_child"` which results in melting only +the columns which contain that string (six column names split into two +groups). The other arguments are used to name each group; these names +define the output variable column names. The special sentinel +`value.name` is used to indicate that unique values of the first group +should be used to define multiple value columns (one for each unique +value). The `child=as.integer` argument means the second group will +result in an output column named `child` with values defined by +plugging the groups into the function `as.integer`. + +Another example is the iris data, + +```{r} +(two.iris = data.table(datasets::iris)[c(1,150)]) +``` + +To melt the iris data into a single value column, we simply avoid +using `value.name` in the group names below, + +```{r} +melt(two.iris, measure.vars = measure(part, dim, sep=".")) ``` -That's it! +If we want two value columns, one for each part, we can do + +```{r} +melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) +``` + +If we want two value columns, one for each dim, we can do + +```{r} +melt(two.iris, measure.vars = measure(part, value.name, sep=".")) +``` + +Finally we consider an example (borrowed from tidyr package) where we +need to define the groups using a regular expression rather than a +separator. + +```{r} +(who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) +melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)(.*)")) +``` + +When using the `pattern` argument, it must be a Perl-compatible +regular expression containing the same number of capture groups +(parenthesized sub-expressions) as the number other arguments (group +names). The code below shows how to use a more complex regex with five +groups, two numeric output columns, and a custom type conversion +function, + +```{r} +print(melt(who, measure.vars = measure( + diagnosis, gender, ages, + ymin=as.numeric, + ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), + pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))" +)), class=TRUE) +``` #### {.bs-callout .bs-callout-info} From f8ef31b4b484dadda3dbb42d4cfc659abb72572a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 21:55:20 -0700 Subject: [PATCH 33/57] measure docs --- man/measure.Rd | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 man/measure.Rd diff --git a/man/measure.Rd b/man/measure.Rd new file mode 100644 index 0000000000..d2f1a8e28f --- /dev/null +++ b/man/measure.Rd @@ -0,0 +1,58 @@ +\name{measure} +\alias{measure} +\title{Specify measure.vars via regex or separator} +\description{ +\code{measure} computes an integer vector or list which can be passed as +the \code{measure.vars} argument to \code{melt}. +See the \code{Efficient reshaping using +data.tables} vignette linked below to learn more. +} +\usage{ +measure(\dots, sep, pattern, cols, multiple.keyword="value.name") +} +\arguments{ + \item{\dots}{One or more (1) symbols (without argument name; symbol + is used for output variable column name) or (2) + functions (with argument name that is used for output variable + column name). Must have same number of arguments as groups that are + specified by either \code{sep} or \code{pattern} arguments.} + \item{sep}{Separator to split \code{cols} into groups. Columns that + result in the maximum number of groups are considered measure variables.} + \item{pattern}{Perl-compatible regex with capture groups to match to + \code{cols}. Columns that match the regex are considered measure variables.} + \item{cols}{A character vector of column names.} + \item{multiple.keyword}{A string, if used in \code{\dots}, then + measure returns a list and melt returns multiple + value columns (with names defined by the unique values in that + group). Otherwise if the string not used in \code{\dots}, then + measure returns a vector and melt returns a single value column.} +} +\seealso{ + \code{\link{melt}}, + \url{https://github.com/Rdatatable/data.table/wiki/Getting-started} +} +\examples{ +(two.iris = data.table(datasets::iris)[c(1,150)]) +# melt into a single value column. +melt(two.iris, measure.vars = measure(part, dim, sep=".")) +# melt into two value columns, one for each part. +melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) +# melt into two value columns, one for each dim. +melt(two.iris, measure.vars = measure(part, value.name, sep=".")) +# melt using either sep or pattern, converting child number to integer. +(two.families = data.table(sex_child1="M", sex_child2="F", age_child1=10, age_child2=20)) +print(melt(two.families, measure.vars = measure(value.name, child=as.integer, sep="_child")), class=TRUE) +print(melt(two.families, measure.vars = measure(value.name, child=as.integer, pattern="(.*)_child(.)")), class=TRUE) +# inspired by data(who, package="tidyr") +(who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) +# melt to three variable columns, all character. +melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)(.*)")) +# melt to five variable columns, two numeric (with custom conversion). +print(melt(who, measure.vars = measure( + diagnosis, gender, ages, + ymin=as.numeric, + ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), + pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))" +)), class=TRUE) +} +\keyword{data} From b95fb7dae322cfe322f67558889ac2a8522cb8cc Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 22:03:25 -0700 Subject: [PATCH 34/57] document variable_table attribute --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 060feb6150..bcf861c906 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -31,7 +31,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } -\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors.} +\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors. If \code{measure.vars} has attribute \code{variable_table} then it must be a data table with nrow = length of \code{measure.vars} vector(s), each row describing the corresponding measured variables(s), (typically created via \code{measure}) and its columns will be output instead of the \code{variable.name} column.} \item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. } \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten data.} From 100a7a8a518780a1899d84138e5eb951bf9b520d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 22:05:07 -0700 Subject: [PATCH 35/57] variable_table, check all list element sizes, sep default --- R/fmelt.R | 16 +++++++--------- inst/tests/tests.Rraw | 16 +++++++++------- src/fmelt.c | 14 ++++++++------ 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index afbc9e1657..0dbc0b7773 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -34,13 +34,8 @@ patterns = function(..., cols=character(0L)) { matched } -measure = function(..., sep, pattern, cols, multiple.keyword="value.name") { +measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { # 1. error checking on sep/pattern args. - if (missing(sep) && missing(pattern)) { - stop( - "neither sep nor pattern arguments used in measure; ", - "must use either sep or pattern") - } if (!missing(sep) && !missing(pattern)) { stop( "both sep and pattern arguments used in measure; ", @@ -52,7 +47,7 @@ measure = function(..., sep, pattern, cols, multiple.keyword="value.name") { fun.list = L[-which(names(L)%in%names(formals()))] no.fun = names(fun.list)=="" names(fun.list)[no.fun] = sapply(fun.list[no.fun], paste) - # 3. compute initial group data table, used as variable.name attribute. + # 3. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { match.vec = regexpr(pattern, cols, perl=TRUE) measure.vec = which(0 < match.vec) @@ -69,6 +64,9 @@ measure = function(..., sep, pattern, cols, multiple.keyword="value.name") { list.of.vectors = strsplit(cols, sep, fixed=TRUE) vector.lengths = sapply(list.of.vectors, length) n.groups = max(vector.lengths) + if (n.groups == 1) { + stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") + } if (n.groups != length(fun.list)) { stop( "number of ... arguments to measure =", length(fun.list), @@ -101,7 +99,7 @@ measure = function(..., sep, pattern, cols, multiple.keyword="value.name") { other.values = lapply(group.dt[, ..is.other], unique) other.values$stringsAsFactors = FALSE other.dt = data.table(do.call(expand.grid, other.values)) - measure.list = structure(list(), variable.name=other.dt) + measure.list = structure(list(), variable_table=other.dt) column.values = unique(group.dt[[multiple.keyword]]) for(column.val in column.values){ select.dt = data.table(other.dt) @@ -112,7 +110,7 @@ measure = function(..., sep, pattern, cols, multiple.keyword="value.name") { } measure.list } else {# single output column. - structure(measure.vec, variable.name=group.dt) + structure(measure.vec, variable_table=group.dt) } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f8a7e4f5c1..b824e155ed 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17167,14 +17167,15 @@ measure = function(cols)cols#user-defined function for computing measure.vars, s test(2156.00, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) measure = list("foo", "bar")#measure below should not use this since it is not a function. test(2156.01, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) -test(2156.02, melt(DTid, measure.vars=structure(list(1, 2), variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") -test(2156.03, melt(DTid, measure.vars=structure(1:3, variable.name="foo")), error="variable.name attribute of measure.vars should be either NULL or a data table") -test(2156.04, melt(DTid, measure.vars=structure(1:3, variable.name=data.table())), error="variable.name attribute of measure.vars should be a data table with at least one column") -test(2156.05, melt(DTid, measure.vars=structure(1:3, variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") -test(2156.06, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable.name=data.table(x=1))), error="variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +test(2156.015, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword +test(2156.02, melt(DTid, measure.vars=structure(list(1, 2), variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") +test(2156.03, melt(DTid, measure.vars=structure(1:3, variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") +test(2156.04, melt(DTid, measure.vars=structure(1:3, variable_table=data.table())), error="variable_table attribute of measure.vars should be a data table with at least one column") +test(2156.05, melt(DTid, measure.vars=structure(1:3, variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") +test(2156.06, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +test(2156.061, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2")#make sure to check each list element, not just the first. # general measure errors. iris.dt = data.table(datasets::iris) -test(2156.07, melt(iris.dt, measure.vars=measure(value.name, dim)), error="neither sep nor pattern arguments used in measure; must use either sep or pattern") test(2156.08, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") # school example. schools.wide <- data.table( @@ -17190,12 +17191,13 @@ who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) test(2156.10, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))")), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) # measure with sep= -test(2156.11, melt(wide.again, na.rm=TRUE, measure.vars=measure(value.name, subject, number=as.integer, sep="_"))[order(score)], schools.expected) +test(2156.11, melt(wide.again, na.rm=TRUE, measure.vars=measure(value.name, subject, number=as.integer))[order(score)], schools.expected)#should work without sep due to same default _ as dcast. test(2156.12, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) test(2156.13, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width")) test(2156.14, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) test(2156.15, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim") test(2156.16, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2") +test(2156.17, melt(iris.dt, measure.vars=measure()), error="each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") # patterns with iris data. test(2156.20, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) # measure with pattern= diff --git a/src/fmelt.c b/src/fmelt.c index be008eaf51..bda95daee9 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -366,7 +366,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SET_VECTOR_ELT(data->RCHK, 1, data->naidx = allocVector(VECSXP, data->lmax)); } // TDH 1 Oct 2020 variable table. - if (VarNameSymbol == NULL) VarNameSymbol = install("variable.name"); + if (VarNameSymbol == NULL) VarNameSymbol = install("variable_table"); data->variable_table = getAttrib(measure, VarNameSymbol); if (isNull(data->variable_table)) { // We need to include this check first because isNewList(NULL) == @@ -375,14 +375,16 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna } else if (isNewList(data->variable_table)) { data->lvars = length(data->variable_table); if (data->lvars == 0) { - error(_("variable.name attribute of measure.vars should be a data table with at least one column")); + error(_("variable_table attribute of measure.vars should be a data table with at least one column")); } - int nrow = length(VECTOR_ELT(data->variable_table, 0)); - if (data->lmax != nrow) { - error(_("variable.name attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); + for (i=0; ivariable_table); i++) { + int nrow = length(VECTOR_ELT(data->variable_table, i)); + if (data->lmax != nrow) { + error(_("variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); + } } } else {//neither NULL nor DT. - error(_("variable.name attribute of measure.vars should be either NULL or a data table")); + error(_("variable_table attribute of measure.vars should be either NULL or a data table")); } } From 7a73a777544621cdd0ad97d80b48e88427a327bb Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 22:06:53 -0700 Subject: [PATCH 36/57] line break to avoid NOTE --- man/measure.Rd | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/man/measure.Rd b/man/measure.Rd index d2f1a8e28f..964660b6f8 100644 --- a/man/measure.Rd +++ b/man/measure.Rd @@ -41,8 +41,14 @@ melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) melt(two.iris, measure.vars = measure(part, value.name, sep=".")) # melt using either sep or pattern, converting child number to integer. (two.families = data.table(sex_child1="M", sex_child2="F", age_child1=10, age_child2=20)) -print(melt(two.families, measure.vars = measure(value.name, child=as.integer, sep="_child")), class=TRUE) -print(melt(two.families, measure.vars = measure(value.name, child=as.integer, pattern="(.*)_child(.)")), class=TRUE) +print(melt(two.families, measure.vars = measure( + value.name, child=as.integer, + sep="_child" +)), class=TRUE) +print(melt(two.families, measure.vars = measure( + value.name, child=as.integer, + pattern="(.*)_child(.)" +)), class=TRUE) # inspired by data(who, package="tidyr") (who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) # melt to three variable columns, all character. From 4c47a7f527192fe6fcc854f40b1f8d96bd3d52c6 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 22:43:23 -0700 Subject: [PATCH 37/57] melt variable_table measure fun --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8bd968bab2..739f22695c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ 1. `melt.data.table()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data tables with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing the changes to fmelt. +2. `melt.data.table()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`. It should be a data table with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is completely backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that have several distinct pieces of information encoded in each column name (for details see new `?measure` and new section in reshape vignette). Thanks to TODO for reporting, #TODO, and to @tdhock for implementing the new features. + ## BUG FIXES 1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. From c02fa9e8e6016986bbad3113c149d68104d70bff Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 23:20:39 -0700 Subject: [PATCH 38/57] changes to increase coverage --- R/fmelt.R | 14 ++++++++------ inst/tests/tests.Rraw | 6 +++++- src/fmelt.c | 15 ++------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 0dbc0b7773..f98dccb333 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -45,8 +45,13 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { mcall = match.call() L = as.list(mcall)[-1] fun.list = L[-which(names(L)%in%names(formals()))] - no.fun = names(fun.list)=="" - names(fun.list)[no.fun] = sapply(fun.list[no.fun], paste) + user.named = names(fun.list)!="" + is.symb = sapply(fun.list, is.symbol) + bad.i = which((!user.named) & (!is.symb)) + if (length(bad.i)) { + stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) + } + names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) # 3. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { match.vec = regexpr(pattern, cols, perl=TRUE) @@ -78,11 +83,8 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { colnames(group.mat) = names(fun.list) group.dt = data.table(group.mat) # 4. apply conversion functions to group data table. - for (group.i in which(!no.fun)) { + for (group.i in which(user.named)) { group.name = names(fun.list)[[group.i]] - if (is.null(group.name) || nchar(group.name)==0) { - stop("each ... argument to measure must be named") - } fun = eval(fun.list[[group.name]], parent.frame(1L)) if (!is.function(fun) || (!is.primitive(fun) && length(formals(fun))==0)) { stop("each ... argument to measure must be a function with at least one argument, problem: ", group.name) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b824e155ed..62e60b196f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17164,8 +17164,10 @@ test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) # new variable.name attribute for measure.vars measure = function(cols)cols#user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. -test(2156.00, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +test(2156.000, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) measure = list("foo", "bar")#measure below should not use this since it is not a function. +test(2156.001, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2156.002, melt(DTid, measure.vars=structure(list(a=c(NA,"a2"),b=c("b1","b2")), variable_table=data.table(number=as.complex(1:2)))), error="variable_table does not support column type 'complex' for column 'number'") test(2156.01, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2156.015, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword test(2156.02, melt(DTid, measure.vars=structure(list(1, 2), variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") @@ -17205,6 +17207,8 @@ test(2156.30, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="( test(2156.31, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2156.32, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: istr") test(2156.33, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") +test(2156.34, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") +test(2156.35, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. test(2156.40, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) diff --git a/src/fmelt.c b/src/fmelt.c index bda95daee9..0671a94566 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -649,28 +649,17 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels for (int k=0; klmax) { - // data->narm is true and there are some all-NA items causing at least one 'if (thislen==0) continue' above - // shrink the levels - SEXP newlevels = PROTECT(allocVector(STRSXP, nlevel)); protecti++; - for (int i=0; ilvars; out_col_i++) { SEXP out_col = VECTOR_ELT(data->variable_table, out_col_i); SET_VECTOR_ELT(ansvars, out_col_i, target=allocVector(TYPEOF(out_col), data->totlen)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; switch (TYPEOF(target)) { - case VECSXP : - for (int k=0; kvariable_table, R_NamesSymbol), out_col_i))); + error(_("variable_table does not support column type '%s' for column '%s'."), type2char(TYPEOF(out_col)), CHAR(STRING_ELT(getAttrib(data->variable_table, R_NamesSymbol), out_col_i))); } } } From c7a9aaed560fcb9caa457397d5760c1085f21fbf Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 6 Oct 2020 21:59:38 -0700 Subject: [PATCH 39/57] check for duplicate names --- R/fmelt.R | 8 +++++++- inst/tests/tests.Rraw | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/R/fmelt.R b/R/fmelt.R index f98dccb333..4549b09df4 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -22,7 +22,7 @@ melt = function(data, ..., na.rm = FALSE, value.name = "value") { patterns = function(..., cols=character(0L)) { # if ... has no names, names(list(...)) will be ""; # this assures they'll be NULL instead - L <- list(...) + L = list(...) p = unlist(L, use.names = any(nzchar(names(L)))) if (!is.character(p)) stop("Input patterns must be of type character.") @@ -52,6 +52,12 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) } names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) + name.tab = table(names(fun.list)) + bad.counts = name.tab[1 < name.tab] + if(length(bad.counts)){ + bad.str = paste(names(bad.counts), collapse=",") + stop("measure group names should be unique, problems: ", bad.str) + } # 3. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { match.vec = regexpr(pattern, cols, perl=TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 62e60b196f..b842b8dd3e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17209,6 +17209,7 @@ test(2156.32, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pat test(2156.33, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") test(2156.34, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") test(2156.35, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") +test(2156.36, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="measure group names should be unique, problems: value.name") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. test(2156.40, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) From c081cda9a45a251aa433010ccca62685d231dfc0 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 6 Oct 2020 21:59:56 -0700 Subject: [PATCH 40/57] explain measure using iris example first --- vignettes/datatable-reshape.Rmd | 79 +++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index f3c297a08f..a8a3661d4d 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -198,6 +198,12 @@ DT.m2 str(DT.m2) ## col type is preserved ``` +#### {.bs-callout .bs-callout-info} + +* We can remove the `variable` column if necessary. + +* The functionality is implemented entirely in C, and is therefore both *fast* and *memory efficient* in addition to being *straightforward*. + #### - Using `patterns()` Usually in these problems, the columns we'd like to melt can be distinguished by a common pattern. We can use the function `patterns()`, implemented for convenience, to provide regular expressions for the columns to be combined together. The above operation can be rewritten as: @@ -207,67 +213,78 @@ DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "g print(DT.m2, class=TRUE) ``` -#### - Using `measure()` +#### - Using `measure()` to specify `measure.vars` via separator or pattern If, as in the data above, the input columns to melt have regular -names, then we can use `measure`. The `sep` argument is used with -`strsplit` on each column name, and the measured columns are defined -by the maximum number of items after the split: - -```{r} -DT.m3 = melt(DT, measure = measure(value.name, child=as.integer, sep="_child")) -print(DT.m3, class=TRUE) -``` - -In the code above we used `sep="_child"` which results in melting only -the columns which contain that string (six column names split into two -groups). The other arguments are used to name each group; these names -define the output variable column names. The special sentinel -`value.name` is used to indicate that unique values of the first group -should be used to define multiple value columns (one for each unique -value). The `child=as.integer` argument means the second group will -result in an output column named `child` with values defined by -plugging the groups into the function `as.integer`. - -Another example is the iris data, +names, then we can use `measure`, which allows specifying the columns +to melt via a separator or a regex. For example consider the iris +data, ```{r} (two.iris = data.table(datasets::iris)[c(1,150)]) ``` -To melt the iris data into a single value column, we simply avoid -using `value.name` in the group names below, +The iris data has four numeric columns with a regular structure: first +the flower part, then a period, then the measurement dimension. To +specify that we want to melt those four columns, we can use `measure` +with `sep="."` which means to use `strsplit` on all column names; the +columns which result in the maximum number of items after splitting +will be used as `measure.vars`: -```{r} +```r melt(two.iris, measure.vars = measure(part, dim, sep=".")) ``` -If we want two value columns, one for each part, we can do +The first two arguments to `measure` in the code above (`part` and +`dim`) are used to name the output columns; the number of arguments +must equal the number of groups after splitting with `sep`. + +If we want two value columns, one for each part, we can use the +special `value.name` keyword, which means to output a value column +for each unique name found in that group: ```{r} melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) ``` -If we want two value columns, one for each dim, we can do +Using the code above we get one value column per flower part. If we +instead want a value column for each measurement dimension, we can do ```{r} melt(two.iris, measure.vars = measure(part, value.name, sep=".")) ``` +Going back to the example of the data with families and children, we +can see a more complex usage of `measure`, involving a function which +is used to convert the `child` string values to integers: + +```{r} +DT.m3 = melt(DT, measure = measure(value.name, child=as.integer, sep="_child")) +print(DT.m3, class=TRUE) +``` + +In the code above we used `sep="_child"` which results in melting only +the columns which contain that string (six column names split into two +groups each). The `child=as.integer` argument means the second group +will result in an output column named `child` with values defined by +plugging the character strings from that group into the function +`as.integer`. + Finally we consider an example (borrowed from tidyr package) where we need to define the groups using a regular expression rather than a separator. ```{r} (who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) -melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)(.*)")) +melt(who, measure.vars = measure( + diagnosis, gender, ages, pattern="new_?(.*)_(.)(.*)")) ``` When using the `pattern` argument, it must be a Perl-compatible regular expression containing the same number of capture groups (parenthesized sub-expressions) as the number other arguments (group names). The code below shows how to use a more complex regex with five -groups, two numeric output columns, and a custom type conversion +groups, two numeric output columns, and an anonymous type conversion function, ```{r} @@ -279,12 +296,6 @@ print(melt(who, measure.vars = measure( )), class=TRUE) ``` -#### {.bs-callout .bs-callout-info} - -* We can remove the `variable` column if necessary. - -* The functionality is implemented entirely in C, and is therefore both *fast* and *memory efficient* in addition to being *straightforward*. - ### b) Enhanced `dcast` Okay great! We can now melt into multiple columns simultaneously. Now given the data set `DT.m2` as shown above, how can we get back to the same format as the original data we started with? From 44049f654b340e61a84601091d32a49f77d60723 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 6 Oct 2020 22:31:37 -0700 Subject: [PATCH 41/57] with=FALSE instead of ..is.other --- R/fmelt.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fmelt.R b/R/fmelt.R index 4549b09df4..6cfae9a18a 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -104,7 +104,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { # 5. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. is.other = names(group.dt) != multiple.keyword - other.values = lapply(group.dt[, ..is.other], unique) + other.values = lapply(group.dt[, is.other, with=FALSE], unique) other.values$stringsAsFactors = FALSE other.dt = data.table(do.call(expand.grid, other.values)) measure.list = structure(list(), variable_table=other.dt) From 7b2a6880040bbf2cef40cb7724004771b420e03b Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 6 Oct 2020 22:32:01 -0700 Subject: [PATCH 42/57] measure fun sep/regex examples --- man/melt.data.table.Rd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index bcf861c906..f3fe26b038 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -142,6 +142,12 @@ melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) +# specifying columns to melt via separator. +melt(DT.missing.cols, measure=measure(value.name, number=as.integer, sep="_")) + +# specifying columns to melt via regex. +melt(DT.missing.cols, measure=measure(value.name, number=as.integer, pattern="(.)_(.)")) + } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} From dfae9e2d060a138e8f2c27b7046ffc570254a36b Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 6 Oct 2020 22:34:13 -0700 Subject: [PATCH 43/57] {r} to execute R code --- vignettes/datatable-reshape.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index a8a3661d4d..84f8cfa3ca 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -231,7 +231,7 @@ with `sep="."` which means to use `strsplit` on all column names; the columns which result in the maximum number of items after splitting will be used as `measure.vars`: -```r +```{r} melt(two.iris, measure.vars = measure(part, dim, sep=".")) ``` From 9c96302e6a1b3d0e77850b9801e8bffc0f0d9bec Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 15 Oct 2020 16:49:23 -0700 Subject: [PATCH 44/57] minor typos --- vignettes/datatable-reshape.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index 84f8cfa3ca..d54320da4c 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -228,7 +228,7 @@ The iris data has four numeric columns with a regular structure: first the flower part, then a period, then the measurement dimension. To specify that we want to melt those four columns, we can use `measure` with `sep="."` which means to use `strsplit` on all column names; the -columns which result in the maximum number of items after splitting +columns which result in the maximum number of groups after splitting will be used as `measure.vars`: ```{r} @@ -237,7 +237,7 @@ melt(two.iris, measure.vars = measure(part, dim, sep=".")) The first two arguments to `measure` in the code above (`part` and `dim`) are used to name the output columns; the number of arguments -must equal the number of groups after splitting with `sep`. +must equal the max number of groups after splitting with `sep`. If we want two value columns, one for each part, we can use the special `value.name` keyword, which means to output a value column From cf11f67cdd8c5e6fd25402315780c12d72489eb0 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 15 Oct 2020 17:00:12 -0700 Subject: [PATCH 45/57] measure error messages --- R/fmelt.R | 14 ++++++++++++++ inst/tests/tests.Rraw | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/R/fmelt.R b/R/fmelt.R index 6cfae9a18a..dd53c16f1b 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -86,6 +86,13 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { measure.vec = which(vector.lengths==n.groups) do.call(rbind, list.of.vectors[measure.vec]) } + uniq.mat <- unique(group.mat) + if (nrow(uniq.mat) < nrow(group.mat)) { + stop( + "number of unique column IDs =", nrow(uniq.mat), + " is less than number of melted columns =", nrow(group.mat), + "; fix by changing pattern/sep") + } colnames(group.mat) = names(fun.list) group.dt = data.table(group.mat) # 4. apply conversion functions to group data table. @@ -99,8 +106,15 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { stop("each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) } + if (all(is.na(group.val))) { + stop(group.name, " conversion function returned vector of all NA") + } set(group.dt, j=group.name, value=group.val) } + group.uniq <- unique(group.dt) + if (nrow(group.uniq) < nrow(group.dt)) { + stop("number of unique groups after applying type conversion functions less than number of groups, change type conversion") + } # 5. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. is.other = names(group.dt) != multiple.keyword diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b842b8dd3e..18906f6c28 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17213,3 +17213,12 @@ test(2156.36, melt(iris.dt, measure.vars=measure(value.name, value.name, sep="." # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. test(2156.40, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) +# measure errors. +iris.i <- 1 +iris.num <- datasets::iris[iris.i, 1:4] +iris.days <- data.table( + day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) +test(2156.50, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning="NAs introduced by coercion") +test(2156.51, melt(iris.days, measure.vars=measure(before=function(x)rep(4, length(x)), value.name, dim, sep=".")), error="number of unique groups after applying type conversion functions less than number of groups, change type conversion") +test(2156.52, melt(iris.days, measure.vars=measure(before, value.name, dim, pattern="(day)[12][.](.*)[.](.*)")), error="number of unique column IDs =4 is less than number of melted columns =8; fix by changing pattern/sep") +test(2156.53, melt(iris.days, measure.vars=measure(day=as.integer, value.name, dim, pattern="day(.)[.](.*)[.](.*)")), data.table(Species=factor("setosa"), day=as.integer(c(1,2,1,2)), dim=c("Length","Length","Width","Width"), Sepal=c(5.1,5.1,3.5,3.5), Petal=c(1.4,1.4,0.2,0.2))) From 608910eeefec4de51e623a4cbd37ef1a43b77bcd Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 18 Oct 2020 22:47:12 -0700 Subject: [PATCH 46/57] more errors --- R/fmelt.R | 36 ++++++++++++++++++++++++++++++------ inst/tests/tests.Rraw | 8 ++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index dd53c16f1b..56094d4fb3 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -52,17 +52,29 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) } names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) - name.tab = table(names(fun.list)) - bad.counts = name.tab[1 < name.tab] - if(length(bad.counts)){ - bad.str = paste(names(bad.counts), collapse=",") - stop("measure group names should be unique, problems: ", bad.str) + err.names.unique <- function(err.what, name.vec) { + name.tab = table(name.vec) + bad.counts = name.tab[1 < name.tab] + if (length(bad.counts)) { + stop( + err.what, " names should be unique, problems: ", + paste(names(bad.counts), collapse=",") + ) + } } + err.names.unique("measure group", names(fun.list)) # 3. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { match.vec = regexpr(pattern, cols, perl=TRUE) measure.vec = which(0 < match.vec) - start = attr(match.vec, "capture.start")[measure.vec,] + if (length(measure.vec) == 0L) { + stop( + "pattern did not match any cols, so nothing would be melted; fix by changing pattern") + } + start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] + if (is.null(start)) { + stop("pattern must contain at least one capture group (parenthesized sub-pattern)") + } if (ncol(start) != length(fun.list)) { stop( "number of ... arguments to measure =", length(fun.list), @@ -86,6 +98,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { measure.vec = which(vector.lengths==n.groups) do.call(rbind, list.of.vectors[measure.vec]) } + err.names.unique("measured column", cols[measure.vec]) uniq.mat <- unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { stop( @@ -117,7 +130,18 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { } # 5. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. + if (!is.character(group.dt[[multiple.keyword]])) { + stop( + multiple.keyword, " column class=", + class(group.dt[[multiple.keyword]])[[1L]], + " after applying conversion function, but must be character") + } is.other = names(group.dt) != multiple.keyword + if (!any(is.other)) { + stop( + multiple.keyword, " is the only group; ", + "fix by creating at least one more group") + } other.values = lapply(group.dt[, is.other, with=FALSE], unique) other.values$stringsAsFactors = FALSE other.dt = data.table(do.call(expand.grid, other.values)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 18906f6c28..b1aa2b8217 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17222,3 +17222,11 @@ test(2156.50, melt(iris.days, measure.vars=measure(before=as.integer, value.name test(2156.51, melt(iris.days, measure.vars=measure(before=function(x)rep(4, length(x)), value.name, dim, sep=".")), error="number of unique groups after applying type conversion functions less than number of groups, change type conversion") test(2156.52, melt(iris.days, measure.vars=measure(before, value.name, dim, pattern="(day)[12][.](.*)[.](.*)")), error="number of unique column IDs =4 is less than number of melted columns =8; fix by changing pattern/sep") test(2156.53, melt(iris.days, measure.vars=measure(day=as.integer, value.name, dim, pattern="day(.)[.](.*)[.](.*)")), data.table(Species=factor("setosa"), day=as.integer(c(1,2,1,2)), dim=c("Length","Length","Width","Width"), Sepal=c(5.1,5.1,3.5,3.5), Petal=c(1.4,1.4,0.2,0.2))) +test(2156.54, melt(iris.days, measure.vars=measure(pattern="day")), error="pattern must contain at least one capture group (parenthesized sub-pattern)") +test(2156.55, melt(iris.days, measure.vars=measure(value.name, pattern="(.*)")), error="value.name is the only group; fix by creating at least one more group") +test(2156.56, melt(iris.days, measure.vars=measure(foo, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") +test(2156.57, melt(iris.days, measure.vars=measure(value.name, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") +test(2157.58, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured column names should be unique, problems: ff") +test(2157.59, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured column names should be unique, problems: f_f") +test(2157.60, melt(iris.days, measure.vars=measure(value.name=as.integer, variable, pattern="day(.)[.](.*)")), error="value.name column class=integer after applying conversion function, but must be character") +test(2157.61, melt(data.table(ff=1, ff=2, a=3, b=4), measure.vars=measure(letter, pattern="([ab])"), id.vars="ff"), data.table(ff=1, letter=c("a","b"), value=c(3,4)))#duplicate column names are fine if they are not matched by pattern. From 5045e6fa33ffe6160347818786e6031d52bb0e6c Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 22 Oct 2020 10:32:15 -0700 Subject: [PATCH 47/57] more unusual type and arg name errors --- R/fmelt.R | 30 ++++++++++++++++++++++++++---- inst/tests/tests.Rraw | 20 +++++++++++++++----- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 56094d4fb3..4a1840e8bf 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -35,16 +35,23 @@ patterns = function(..., cols=character(0L)) { } measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { - # 1. error checking on sep/pattern args. + # 1. basic error checking. if (!missing(sep) && !missing(pattern)) { stop( "both sep and pattern arguments used in measure; ", "must use either sep or pattern (not both)") } + if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { + stop("multiple.keyword must be a character string with nchar>0") + } + if (!is.character(cols)) { + stop("cols must be a character vector of column names") + } # 2. compute conversion function list with group names. mcall = match.call() L = as.list(mcall)[-1] - fun.list = L[-which(names(L)%in%names(formals()))] + formal.names <- names(formals()) + fun.list = L[-which(names(L)%in%formal.names)] user.named = names(fun.list)!="" is.symb = sapply(fun.list, is.symbol) bad.i = which((!user.named) & (!is.symb)) @@ -52,19 +59,31 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) } names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) + # group names error checking. + group.is.formal <- names(fun.list) %in% formal.names + if (any(group.is.formal)) { + bad.names <- names(fun.list)[group.is.formal] + stop( + "group names specified in ... conflict with measure argument names;", + " please fix by changing group names: ", + paste(bad.names, collapse=",")) + } err.names.unique <- function(err.what, name.vec) { name.tab = table(name.vec) bad.counts = name.tab[1 < name.tab] if (length(bad.counts)) { stop( err.what, " names should be unique, problems: ", - paste(names(bad.counts), collapse=",") + paste(names(bad.counts), collapse=",") ) } } err.names.unique("measure group", names(fun.list)) # 3. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { + if (!is.character(pattern)) { + stop("pattern must be character string") + } match.vec = regexpr(pattern, cols, perl=TRUE) measure.vec = which(0 < match.vec) if (length(measure.vec) == 0L) { @@ -83,7 +102,10 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { end = attr(match.vec, "capture.length")[measure.vec,]+start-1L names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) substr(names.mat, start, end) - } else { + } else { #pattern not specified, so split using sep. + if (!is.character(sep)) { + stop("sep must be character string") + } list.of.vectors = strsplit(cols, sep, fixed=TRUE) vector.lengths = sapply(list.of.vectors, length) n.groups = max(vector.lengths) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b1aa2b8217..f838b62862 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10444,7 +10444,7 @@ test(1728.12, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong if (test_longdouble) { #3258 old = options(datatable.verbose=FALSE) # capture.output() exact tests must not be polluted with verbosity - + test(1729.01, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))), output="V1,V2\n1,10") test(1729.02, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))), @@ -10524,8 +10524,8 @@ if (test_longdouble) { #3258 # 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 test(1729.12, typeof(DT[[1L]]), "double") test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) - - options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on + + options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on } if (test_bit64) { @@ -17145,7 +17145,7 @@ test(2153.2, DT[, .(list(.GRP)), by=x], data.table(x=1:2, V1=as.list(1:2))) test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L))) test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result - data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) + data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) # warning message segfault when no column names present, #4644 test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0), @@ -17183,7 +17183,7 @@ test(2156.08, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", patte schools.wide <- data.table( school = c("A","B"), read_1 = c(1.1,2.1), read_1_sp = c(T,T), - read_2 = c(1.2,2.2), + read_2 = c(1.2,2.2), math_1 = c(10.1,20.1), math_1_sp = c(T,T), math_2 = c(NA,20.2), math_2_sp = c(NA,F)) schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=measure(subject, number=as.integer, value.name=function(x)ifelse(x=="", "score", "sp"), pattern="([^_]+)_([12])(.*)")) @@ -17230,3 +17230,13 @@ test(2157.58, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, test(2157.59, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured column names should be unique, problems: f_f") test(2157.60, melt(iris.days, measure.vars=measure(value.name=as.integer, variable, pattern="day(.)[.](.*)")), error="value.name column class=integer after applying conversion function, but must be character") test(2157.61, melt(data.table(ff=1, ff=2, a=3, b=4), measure.vars=measure(letter, pattern="([ab])"), id.vars="ff"), data.table(ff=1, letter=c("a","b"), value=c(3,4)))#duplicate column names are fine if they are not matched by pattern. +test(2157.62, melt(DTid, measure.vars=measure(letter, multiple.keyword, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: multiple.keyword") +test(2157.63, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=as.integer, pattern="([ab])([12])")), error="multiple.keyword must be a character string") +test(2157.64, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=NA_character_, pattern="([ab])([12])")), error="multiple.keyword must be a character string") +test(2157.65, melt(DTid, measure.vars=measure(letter, number, multiple.keyword="", pattern="([ab])([12])")), error="multiple.keyword must be a character string with nchar>0") +test(2157.66, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: cols") +test(2157.67, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="cols must be a character vector of column names") +test(2157.68, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer)), error="pattern must be character string") +test(2157.69, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") +##melt(DTid, measure.vars=measure(letter, number, sep=NA_character_) +##melt(DTid, measure.vars=measure(letter, number, sep=character()) From eed3129ccabfe028f8b8b88a62064f9a59f9a417 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 22 Jan 2021 14:22:15 -0700 Subject: [PATCH 48/57] err fun --- R/fmelt.R | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 56094d4fb3..07ceeda4d3 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -62,24 +62,27 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { ) } } + err.args.groups <- function(type, N){ + if (N != length(fun.list)) { + stop( + "number of ... arguments to measure =", length(fun.list), + " must be same as number of ", type, + " =", N) + } + } err.names.unique("measure group", names(fun.list)) # 3. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { match.vec = regexpr(pattern, cols, perl=TRUE) measure.vec = which(0 < match.vec) if (length(measure.vec) == 0L) { - stop( - "pattern did not match any cols, so nothing would be melted; fix by changing pattern") + stop("pattern did not match any cols, so nothing would be melted; fix by changing pattern") } start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] if (is.null(start)) { stop("pattern must contain at least one capture group (parenthesized sub-pattern)") } - if (ncol(start) != length(fun.list)) { - stop( - "number of ... arguments to measure =", length(fun.list), - " must be same as number of capture groups in pattern =", ncol(start)) - } + err.args.groups("capture groups in pattern", ncol(start)) end = attr(match.vec, "capture.length")[measure.vec,]+start-1L names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) substr(names.mat, start, end) @@ -90,11 +93,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { if (n.groups == 1) { stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } - if (n.groups != length(fun.list)) { - stop( - "number of ... arguments to measure =", length(fun.list), - " must be same as max number of items after splitting column names =", n.groups) - } + err.args.groups("items after splitting column names", n.groups) measure.vec = which(vector.lengths==n.groups) do.call(rbind, list.of.vectors[measure.vec]) } From 40d27895cd46eb14a96f4f15ed323b02c743a728 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 22 Jan 2021 14:50:11 -0700 Subject: [PATCH 49/57] fix stop --- R/fmelt.R | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/R/fmelt.R b/R/fmelt.R index 9eea58c63e..c2610bf942 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -37,9 +37,7 @@ patterns = function(..., cols=character(0L)) { measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { # 1. basic error checking. if (!missing(sep) && !missing(pattern)) { - stop( - "both sep and pattern arguments used in measure; ", - "must use either sep or pattern (not both)") + stop("both sep and pattern arguments used in measure; must use either sep or pattern (not both)") } if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { stop("multiple.keyword must be a character string with nchar>0") @@ -51,8 +49,8 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { mcall = match.call() L = as.list(mcall)[-1] formal.names <- names(formals()) - fun.list = L[-which(names(L)%in%formal.names)] - user.named = names(fun.list)!="" + fun.list = L[-which(names(L) %in% formal.names)] + user.named = names(fun.list) != "" is.symb = sapply(fun.list, is.symbol) bad.i = which((!user.named) & (!is.symb)) if (length(bad.i)) { @@ -63,27 +61,18 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { group.is.formal <- names(fun.list) %in% formal.names if (any(group.is.formal)) { bad.names <- names(fun.list)[group.is.formal] - stop( - "group names specified in ... conflict with measure argument names;", - " please fix by changing group names: ", - paste(bad.names, collapse=",")) + stop("group names specified in ... conflict with measure argument names; please fix by changing group names: ", paste(bad.names, collapse=",")) } err.names.unique <- function(err.what, name.vec) { name.tab = table(name.vec) bad.counts = name.tab[1 < name.tab] if (length(bad.counts)) { - stop( - err.what, " names should be unique, problems: ", - paste(names(bad.counts), collapse=",") - ) + stop(err.what, " names should be unique, problems: ", paste(names(bad.counts), collapse=",")) } } err.args.groups <- function(type, N){ if (N != length(fun.list)) { - stop( - "number of ... arguments to measure =", length(fun.list), - " must be same as number of ", type, - " =", N) + stop("number of ... arguments to measure =", length(fun.list), " must be same as ", type, " =", N) } } err.names.unique("measure group", names(fun.list)) @@ -101,7 +90,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { if (is.null(start)) { stop("pattern must contain at least one capture group (parenthesized sub-pattern)") } - err.args.groups("capture groups in pattern", ncol(start)) + err.args.groups("number of capture groups in pattern", ncol(start)) end = attr(match.vec, "capture.length")[measure.vec,]+start-1L names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) substr(names.mat, start, end) @@ -115,17 +104,14 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { if (n.groups == 1) { stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } - err.args.groups("items after splitting column names", n.groups) + err.args.groups("max number of items after splitting column names", n.groups) measure.vec = which(vector.lengths==n.groups) do.call(rbind, list.of.vectors[measure.vec]) } err.names.unique("measured column", cols[measure.vec]) uniq.mat <- unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { - stop( - "number of unique column IDs =", nrow(uniq.mat), - " is less than number of melted columns =", nrow(group.mat), - "; fix by changing pattern/sep") + stop("number of unique column IDs =", nrow(uniq.mat), " is less than number of melted columns =", nrow(group.mat), "; fix by changing pattern/sep") } colnames(group.mat) = names(fun.list) group.dt = data.table(group.mat) @@ -152,16 +138,11 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { # 5. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. if (!is.character(group.dt[[multiple.keyword]])) { - stop( - multiple.keyword, " column class=", - class(group.dt[[multiple.keyword]])[[1L]], - " after applying conversion function, but must be character") + stop(multiple.keyword, " column class=", class(group.dt[[multiple.keyword]])[[1L]], " after applying conversion function, but must be character") } is.other = names(group.dt) != multiple.keyword if (!any(is.other)) { - stop( - multiple.keyword, " is the only group; ", - "fix by creating at least one more group") + stop(multiple.keyword, " is the only group; fix by creating at least one more group") } other.values = lapply(group.dt[, is.other, with=FALSE], unique) other.values$stringsAsFactors = FALSE From 870bd835a7e3cc5cbdcdc02666f4aa07fad1a038 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 12 Feb 2021 20:37:23 -0700 Subject: [PATCH 50/57] simplify using args --- R/fmelt.R | 2 +- inst/tests/tests.Rraw | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fmelt.R b/R/fmelt.R index c2610bf942..266b243c6f 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -119,7 +119,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { for (group.i in which(user.named)) { group.name = names(fun.list)[[group.i]] fun = eval(fun.list[[group.name]], parent.frame(1L)) - if (!is.function(fun) || (!is.primitive(fun) && length(formals(fun))==0)) { + if (!is.function(fun) || length(formals(args(fun)))==0) { stop("each ... argument to measure must be a function with at least one argument, problem: ", group.name) } group.val = fun(group.dt[[group.name]]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f838b62862..48a636ebfa 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17205,6 +17205,7 @@ test(2156.20, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", " # measure with pattern= test(2156.30, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2156.31, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2156.3105, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2156.32, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: istr") test(2156.33, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") test(2156.34, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") From fafafc291032373ed5e3c3e01c487df6db4785ca Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 01:26:06 -0600 Subject: [PATCH 51/57] news tweak --- NEWS.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2294ad05c4..ab3d1d97d3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,7 +56,7 @@ 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. -9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`. It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to TODO for reporting, #TODO, and to @tdhock for implementing. +9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing. ## BUG FIXES @@ -178,8 +178,6 @@ 5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. -4. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. - ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. From 87c73ad1ba2fa63e393d09df5123420a40f08253 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 01:31:03 -0600 Subject: [PATCH 52/57] merge follow up --- src/fmelt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 39fc96bc26..71af5c6d3c 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -375,7 +375,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna if (data->lvars == 0) { error(_("variable_table attribute of measure.vars should be a data table with at least one column")); } - for (i=0; ivariable_table); i++) { + for (int i=0; ivariable_table); ++i) { int nrow = length(VECTOR_ELT(data->variable_table, i)); if (data->lmax != nrow) { error(_("variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); @@ -670,11 +670,11 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str case LGLSXP : for (int k=0; kvariable_table, R_NamesSymbol), out_col_i))); From dcca6fb1a4555f404bfc10885eb613687c18310f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 01:38:11 -0600 Subject: [PATCH 53/57] merge follow up: confirmed that eval_with_cols() replaced do_patterns() --- R/utils.R | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/R/utils.R b/R/utils.R index 3ab5b79513..45678f5a4d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -105,6 +105,7 @@ brackify = function(x, quote=FALSE) { } # patterns done via NSE in melt.data.table and .SDcols in `[.data.table` +# was called do_patterns() before PR#4731 eval_with_cols = function(orig_call, all_cols) { parent = parent.frame(2L) fun_uneval = orig_call[[1L]] @@ -127,22 +128,6 @@ eval_with_cols = function(orig_call, all_cols) { eval(named_call, parent) } } -do_patterns = function(pat_sub, all_cols) { - # received as substitute(patterns(...)) - pat_sub = as.list(pat_sub)[-1L] - # identify cols = argument if present - idx = which(names(pat_sub) == "cols") - if (length(idx)) { - cols = eval(pat_sub[["cols"]], parent.frame(2L)) - pat_sub = pat_sub[-idx] - } else cols = all_cols - pats = lapply(pat_sub, eval, parent.frame(2L)) - matched = patterns(pats, cols=cols) - # replace with lengths when R 3.2.0 dependency arrives - if (length(idx <- which(sapply(matched, length) == 0L))) - stop('Pattern', if (length(idx) > 1L) 's', ' not found: ', brackify(pats[idx])) - return(matched) -} # check UTC status is_utc = function(tz) { From e3b05824f68bfe5849db8363d5ea4b87f484156e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 01:45:26 -0600 Subject: [PATCH 54/57] merge follow up: remove chmatch_na --- src/fmelt.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 71af5c6d3c..74bfc71e84 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -99,21 +99,6 @@ static const char *concat(SEXP vec, SEXP idx) { return ans; } -// input: character vector of column names (maybe missing), output: -// integer vector of column indices with NA_INTEGER in the positions -// with missing inputs. -SEXP chmatch_na(SEXP x, SEXP table){ - SEXP ans; - PROTECT(ans = chmatch(x, table, 0)); - for(int i=0; i Date: Sun, 9 May 2021 02:08:09 -0600 Subject: [PATCH 55/57] whitespace --- src/fmelt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 74bfc71e84..734bbf0287 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -255,7 +255,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { tmp = PROTECT(unlist_(tmp2)); protecti++; } for (int i=0; imaxtype[out_col], data->nrow); -} +} SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { for (int i=0; ilvalues; ++i) { @@ -572,7 +572,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str // data->nrow * data->lmax == data->totlen int protecti=0; SEXP ansvars=PROTECT(allocVector(VECSXP, data->lvars)); protecti++; - SEXP target; + SEXP target; if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { @@ -647,12 +647,12 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str for (int k=0; k Date: Sun, 9 May 2021 02:22:12 -0600 Subject: [PATCH 56/57] VarNameSymbol moved to init.c --- src/data.table.h | 1 + src/fmelt.c | 5 +---- src/init.c | 2 ++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 67b4fbe82b..6cb5413918 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -101,6 +101,7 @@ extern SEXP sym_inherits; extern SEXP sym_datatable_locked; extern SEXP sym_tzone; extern SEXP sym_old_fread_datetime_character; +extern SEXP sym_variable_table; extern double NA_INT64_D; extern long long NA_INT64_LL; extern Rcomplex NA_CPLX; // initialized in init.c; see there for comments diff --git a/src/fmelt.c b/src/fmelt.c index 734bbf0287..08a5a37ce1 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -3,8 +3,6 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -static SEXP VarNameSymbol = NULL; - // generate from 1 to n (a simple fun for melt, vecseq is convenient from R due to SEXP inputs) SEXP seq_int(int n, int start) { if (n <= 0) return(R_NilValue); @@ -349,8 +347,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SET_VECTOR_ELT(data->RCHK, 1, data->naidx = allocVector(VECSXP, data->lmax)); } // TDH 1 Oct 2020 variable table. - if (VarNameSymbol == NULL) VarNameSymbol = install("variable_table"); - data->variable_table = getAttrib(measure, VarNameSymbol); + data->variable_table = getAttrib(measure, sym_variable_table); if (isNull(data->variable_table)) { // We need to include this check first because isNewList(NULL) == // TRUE diff --git a/src/init.c b/src/init.c index 979cd7b050..f168a5b8be 100644 --- a/src/init.c +++ b/src/init.c @@ -34,6 +34,7 @@ SEXP sym_inherits; SEXP sym_datatable_locked; SEXP sym_tzone; SEXP sym_old_fread_datetime_character; +SEXP sym_variable_table; double NA_INT64_D; long long NA_INT64_LL; Rcomplex NA_CPLX; @@ -359,6 +360,7 @@ void attribute_visible R_init_datatable(DllInfo *info) sym_datatable_locked = install(".data.table.locked"); sym_tzone = install("tzone"); sym_old_fread_datetime_character = install("datatable.old.fread.datetime.character"); + sym_variable_table = install("variable_table"); initDTthreads(); avoid_openmp_hang_within_fork(); From c927a52909090fc3b1357bf6964d788a9812460e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 02:35:51 -0600 Subject: [PATCH 57/57] PROTECT not needed --- src/fmelt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 08a5a37ce1..3a1da3bdc8 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -587,9 +587,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; char buff[20]; snprintf(buff, 20, "%d", level++); - SEXP str = PROTECT(mkChar(buff)); - for (int k=0; k