diff --git a/.Rbuildignore b/.Rbuildignore index 22a3a807fa..1e99a9004b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,7 +12,6 @@ ^\.ci$ ^\.dev$ -^\.devcontainer$ ^\.graphics$ ^\.github$ @@ -23,7 +22,6 @@ ^NEWS\.0\.md$ ^_pkgdown\.yml$ ^src/Makevars$ -^CODEOWNERS$ ^\.RData$ ^\.Rhistory$ @@ -41,4 +39,3 @@ ^pkgdown$ ^lib$ ^library$ -^devwd$ diff --git a/.ci/Dockerfile.in b/.ci/Dockerfile.in new file mode 100644 index 0000000000..559bb9a40a --- /dev/null +++ b/.ci/Dockerfile.in @@ -0,0 +1,9 @@ +FROM registry.gitlab.com/jangorecki/dockerfiles/SRC_IMAGE_NAME + +MAINTAINER Jan Gorecki j.gorecki@wit.edu.pl + +COPY bus/build/cran/ /cran/ + +RUN Rscript -e 'install.packages("data.table", repos=file.path("file:","cran"))' + +CMD ["R"] diff --git a/.ci/README.md b/.ci/README.md index d684a598e3..3f303e34ac 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,50 +1,72 @@ # data.table continuous integration and deployment -On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI nightly. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch every night. It tests more environments and different configurations. It publish variety of artifacts. +On each Pull Request opened in GitHub we run Travis CI and Appveyor to provide prompt feedback about the status of the PR. Our main CI pipeline runs on GitLab CI. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch. It tests more environments and different configurations. It publishes a variety of artifacts. ## Environments ### [GitLab CI](./../.gitlab-ci.yml) Test jobs: -- `test-lin-rel` - `r-release` on Linux, most comprehensive test environment, force all suggests, `-O3 -flto=auto -fno-common -Wunused-result`, test for no compilation warnings. 
-- `test-lin-rel-vanilla` - `r-release` on Linux, no suggested deps, no zlib, no OpenMP, flags `-g -O0 -fno-openmp`, skip manual and vignettes. -- `test-lin-rel-cran` - `--as-cran` on Linux, strict test for final status of `R CMD check`. -- `test-lin-dev-gcc-strict-cran` - `--as-cran` on Linux, `r-devel` built with `-enable-strict-barrier --disable-long-double`, test for compilation warnings, test for new NOTEs/WARNINGs from `R CMD check`. -- `test-lin-dev-clang-cran` - same as `gcc-strict` job but R built with `clang` and no `--enable-strict-barrier --disable-long-double` flags. -- `test-lin-310-cran` - R 3.1.0 on Linux, stated R dependency version. -- `test-win-rel` - `r-release` on Windows. -- `test-win-dev` - `r-devel` on Windows. -- `test-win-old` - `r-oldrel` on Windows. -- `test-mac-rel` - macOS build not yet available, see [#3326](https://github.com/Rdatatable/data.table/issues/3326) for status - -Tests jobs are allowed to fail, summary and logs of test jobs are later published at _CRAN-like checks_ page, see artifacts below. +- `test-rel-lin` - `r-release` on Linux, most comprehensive test environment, `-O3 -flto -fno-common -Wunused-result`, extra check for no compilation warnings, includes testing [_with other packages_](./../inst/tests/other.Rraw) +- `test-rel-cran-lin` - `--as-cran` on Linux, `-g0`, extra check for final status of `R CMD check` where we allow one NOTE (_size of tarball_). 
+- `test-dev-cran-lin` - `r-devel` and `--as-cran` on Linux, `--with-recommended-packages --enable-strict-barrier --disable-long-double`, tests for compilation warnings in pkg install and new NOTEs/Warnings in pkg check, and because it is R-devel it is marked as allow_failure +- `test-rel-vanilla-lin` - `r-release` on Linux, no suggested deps, no OpenMP, `-O0`, tracks memory usage during tests +- `test-310-cran-lin` - R 3.1.0 on Linux +- `test-344-cran-lin` - R 3.4.4 on Linux +- `test-350-cran-lin` - R 3.5.0 on Linux, no `r-recommended` +- `test-rel-win` - `r-release` on Windows +- `test-dev-win` - `r-devel` on Windows +- `test-old-win` - `r-oldrel` on Windows +- `test-rel-osx` - MacOSX build not yet deployed, see [#3326](https://github.com/Rdatatable/data.table/issues/3326) for status Artifacts: - [homepage](https://rdatatable.gitlab.io/data.table) - made with [pkgdown](https://github.com/r-lib/pkgdown) - [html manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/00Index.html) - [pdf manual](https://rdatatable.gitlab.io/data.table/web/packages/data.table/data.table.pdf) - [html vignettes](https://rdatatable.gitlab.io/data.table/library/data.table/doc/index.html) -- R packages repository for `data.table` and all _Suggests_ dependencies, url: `https://rdatatable.gitlab.io/data.table` +- R packages repository for `data.table` and all _Suggests_ dependencies, url: `https://Rdatatable.gitlab.io/data.table` - sources - Windows binaries for `r-release`, `r-devel` and `r-oldrel` - [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) -- [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) +- [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) - note that all artifacts, including check results page, are being published only when all test jobs successfully pass, thus one will not see an _ERROR_ status 
there (unless an error happened on a job marked as `allow_failure`). +- [docker images](https://gitlab.com/Rdatatable/data.table/container_registry) - copy/paste-able `docker pull` commands can be found at the bottom of our [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) -### [GitHub Actions](./../.github/workflows) +### [Travis CI](./../.travis.yml) -TODO document +Test jobs: +- `r-release` on Linux, includes code coverage check +- _(might be disabled)_ `r-release` on OSX + +Artifacts: +- R packages repository having `data.table` sources only, url: `https://Rdatatable.github.io/data.table` +- code coverage stats pushed to [codecov.io/gh/Rdatatable/data.table](https://codecov.io/gh/Rdatatable/data.table) ### [Appveyor](./../.appveyor.yml) -TODO document +Test jobs: +- Windows `r-release` +- _(might be disabled)_ Windows `r-devel` + +Artifacts: +- Windows `r-release` binaries accessed only via web UI -## CI tools +## Tools ### [`ci.R`](./ci.R) -Base R implemented helper script, [originally proposed to base R](https://svn.r-project.org/R/branches/tools4pkgs/src/library/tools/R/packages.R), that ease the process of extracting dependency information from description files, and to mirror packages and their recursive dependencies from CRAN to local CRAN-like directory. It is used in [GitLab CI pipeline](./../.gitlab-ci.yml). +Base R implemented helper script, [originally proposed to R](https://svn.r-project.org/R/branches/tools4pkgs/src/library/tools/R/packages.R), that eases the process of extracting dependency information from description files, and of mirroring packages and their recursive dependencies from CRAN to a local CRAN-like directory. It is widely used in our [GitLab CI pipeline](./../.gitlab-ci.yml). ### [`publish.R`](./publish.R) -Base R implemented helper script to orchestrate generation of most artifacts and to arrange them nicely. 
It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). +Base R implemented helper script to orchestrate generation of most artifacts. It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). + +### [`Dockerfile.in`](./Dockerfile.in) + +Template file to produce `Dockerfile` for, as of now, three docker images. Docker images are being built and published in [_deploy_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). +- `r-base-dev` using `r-release`: publish docker image of `data.table` on R-release +- `r-builder` using `r-release`: publish on R-release and OS dependencies for building Rmarkdown vignettes +- `r-devel`: publish docker image of `data.table` on R-devel built with `--with-recommended-packages --enable-strict-barrier --disable-long-double` + +### [`deploy.sh`](./deploy.sh) + +Script used on Travis CI to publish CRAN-like repository of `data.table` sources. It publishes to `gh-pages` branch in GitHub repository. It depends on a token, which is provided based on `secure` environment variable in [.travis.yml](./../.travis.yml). It has been generated by @jangorecki. diff --git a/.ci/ci.R b/.ci/ci.R index f3a4285660..a165de8189 100644 --- a/.ci/ci.R +++ b/.ci/ci.R @@ -185,3 +185,7 @@ function(pkgs, dp } +## set repositories for CI tests +if (as.logical(Sys.getenv("GITLAB_CI","false")) && identical(Sys.getenv("CI_PROJECT_NAME"), "data.table")) { + options("repos" = if (.Platform$OS.type == "windows") file.path("file://",getwd(),"bus/mirror-packages/cran") else file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE))) +} diff --git a/.ci/deploy.sh b/.ci/deploy.sh new file mode 100644 index 0000000000..6f01ef136f --- /dev/null +++ b/.ci/deploy.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -o errexit -o nounset +PKG_REPO=$PWD +PKG_TARBALL=$(ls -1t *.tar.gz | head -n 1) +cd .. 
+ +addToDrat(){ + mkdir drat; cd drat + + ## Set up Repo parameters + git init + git config user.name "addToDrat" + git config user.email "addToDrat@travis.ci" + + ## Get drat repo + git remote add upstream "https://$GH_TOKEN@github.com/Rdatatable/data.table.git" 2>err.txt + git fetch upstream gh-pages 2>err.txt + git checkout gh-pages 2>err.txt + git reset --hard "88000defd316538c37af4c8dc842e73e7953f4e2" 2>err.txt + + Rscript -e "drat::insertPackage('$PKG_REPO/$PKG_TARBALL', \ + repodir = '.', \ + commit='Travis publish data.table: build $TRAVIS_COMMIT', \ + addFiles=TRUE, fields='Revision')" + git push --force upstream gh-pages 2>err.txt + +} + +addToDrat diff --git a/.ci/publish.R b/.ci/publish.R index 0657790d25..526d9bd80d 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -91,17 +91,26 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { ) vign = tools::getVignetteInfo(pkg, lib.loc=lib.loc) r_rel_ver = Sys.getenv("R_REL_VERSION") - r_dev_ver = Sys.getenv("R_DEV_VERSION") - r_old_ver = Sys.getenv("R_OLD_VERSION") - stopifnot(nzchar(r_rel_ver), nzchar(r_dev_ver), nzchar(r_old_ver)) + r_devel_ver = Sys.getenv("R_DEVEL_VERSION") + r_oldrel_ver = Sys.getenv("R_OLDREL_VERSION") + stopifnot(nzchar(r_rel_ver), nzchar(r_devel_ver), nzchar(r_oldrel_ver)) cran.home = "../../.." tbl.dl = c( sprintf(" Reference manual: %s.pdf, 00Index.html ", pkg, pkg, cran.home, pkg), if (nrow(vign)) sprintf("Vignettes:%s", paste(sprintf("%s
", cran.home, vign[,"PDF"], vign[,"Title"]), collapse="\n")), # location unline cran web/pkg/vignettes to not duplicate content, documentation is in ../../../library sprintf(" Package source: %s_%s.tar.gz ", cran.home,pkg, version, pkg, version), - sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release","r-oldrel"), bin_ver=c(r_dev_ver, r_rel_ver, r_old_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), - sprintf(" macOS binaries: %s ", format.bins(ver=c("r-release","r-oldrel"), bin_ver=c(r_rel_ver, r_old_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) + sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release","r-oldrel"), bin_ver=c(r_devel_ver, r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), + sprintf(" macOS binaries: %s ", format.bins(ver=c("r-release","r-oldrel"), bin_ver=c(r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) ) + if (pkg=="data.table") { ## docker images + registry = Sys.getenv("CI_REGISTRY", "registry.gitlab.com") + namespace = Sys.getenv("CI_PROJECT_NAMESPACE", "Rdatatable") + project = Sys.getenv("CI_PROJECT_NAME", "data.table") + images = c("r-release","r-devel","r-release-builder") + images.title = c("Base R release", "Base R development", "R release package builder") + tags = rep("latest", 3) + docker.dl = sprintf(" %s:
docker pull %s/%s/%s/%s:%s
", images.title, tolower(registry), tolower(namespace), tolower(project), tolower(images), tags) + } index.file = file.path(repodir, "web/packages", pkg, "index.html") if (!dir.exists(dirname(index.file))) dir.create(dirname(index.file), recursive=TRUE) writeLines(c( @@ -122,6 +131,11 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { sprintf("", pkg), tbl.dl, "
", + if (pkg=="data.table") + c("

Docker images:

", + sprintf("", pkg), + docker.dl, + "
"), "", "" ), index.file) @@ -134,7 +148,7 @@ lib.copy <- function(lib.from, repodir="bus/integration/cran"){ pkg.copy <- function(pkg.from, lib.to) { pkg<-basename(pkg.from); dir.create(file.path(lib.to, pkg), recursive=TRUE) - lib.dirs<-intersect(c("help","html","doc"), all.lib.dirs<-list.dirs(pkg.from, full.names=FALSE)) + lib.dirs<-intersect(c("html","doc"), all.lib.dirs<-list.dirs(pkg.from, full.names=FALSE)) ans1<-setNames(file.copy(file.path(pkg.from, lib.dirs), file.path(lib.to, pkg), recursive=TRUE), lib.dirs) lib.files<-setdiff(list.files(pkg.from), all.lib.dirs) ans2<-setNames(file.copy(file.path(pkg.from, lib.files), file.path(lib.to, pkg)), lib.files) @@ -155,30 +169,24 @@ plat <- function(x) if (grepl("^.*win", x)) "Windows" else if (grepl("^.*mac", x r.ver <- function(x) { tmp = strsplit(x, "-", fixed=TRUE)[[1L]] - if (length(tmp) < 3L) stop("test job names must be test-[lin|win|mac]-[r.version]-...") - v = tmp[3L] + if (length(tmp) < 2L) stop("test job names must be test-[r.version]-...") + v = tmp[2L] if (identical(v, "rel")) "r-release" else if (identical(v, "dev")) "r-devel" else if (identical(v, "old")) "r-oldrel" else { - if (grepl("\\D", v)) stop("third word in test job name must be rel/dev/old or numbers of R version") + if (grepl("\\D", v)) stop("second word in test job name must be rel/dev/old or numbers of R version") paste0("r-", paste(strsplit(v, "")[[1L]], collapse=".")) } } # this for now is constant but when we move to independent pipelines (commit, daily, weekly) those values can be different pkg.version <- function(job, pkg) { - Rcheck = file.path("bus", job, paste(pkg, "Rcheck", sep=".")) - if (!dir.exists(Rcheck)) - return(NA_character_) - dcf = read.dcf(file.path(Rcheck, "00_pkg_src", pkg, "DESCRIPTION")) + dcf = read.dcf(file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "DESCRIPTION")) dcf[,"Version"] } pkg.revision <- function(job, pkg) { - Rcheck = file.path("bus", job, paste(pkg, "Rcheck", sep=".")) - if 
(!dir.exists(Rcheck)) - return(NA_character_) - dcf = read.dcf(file.path(Rcheck, "00_pkg_src", pkg, "DESCRIPTION")) + dcf = read.dcf(file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "DESCRIPTION")) if ("Revision" %in% colnames(dcf)) { proj.url = Sys.getenv("CI_PROJECT_URL", "") if (!nzchar(proj.url)) { @@ -190,10 +198,7 @@ pkg.revision <- function(job, pkg) { } else "" } pkg.flags <- function(job, pkg) { - Rcheck = file.path("bus", job, paste(pkg, "Rcheck", sep=".")) - if (!dir.exists(Rcheck)) - return(NA_character_) - cc = file.path(Rcheck, pkg, "cc") ## data.table style cc file + cc = file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "cc") ## data.table style cc file if (file.exists(cc)) { d = readLines(cc) w.cflags = substr(d, 1, 7)=="CFLAGS=" @@ -263,34 +268,6 @@ check.flavors <- function(jobs, repodir="bus/integration/cran") { setNames(file.exists(file), file) } -log.copy <- function(job, repodir="bus/integration/cran") { - dir.create(job.checks<-file.path(repodir, "web", "checks", pkg<-"data.table", job), recursive=TRUE, showWarnings=FALSE) - to = file.path(job.checks, "log") - if (!file.exists(job_id_file <- file.path("bus", job, "id"))) - return(setNames(file.exists(to), "log")) - job_id = readLines(job_id_file, warn=FALSE)[1L] - from = sprintf("https://gitlab.com/Rdatatable/data.table/-/jobs/%s/raw", job_id) - download.file(from, to, method="wget", quiet=TRUE) - Sys.sleep(0.1) ## to not get ban from gitlab.com - setNames(file.exists(to), "log") -} - -ci.status <- function(job) { - if (!file.exists(status_file <- file.path("bus", job, "status"))) - return(NA_character_) - readLines(status_file, warn=FALSE)[1L] -} - -ci.log <- function(jobs, repodir="bus/integration/cran") { - pkg = "data.table" - ans = vector("character", length(jobs)) - logs = sapply(jobs, log.copy, repodir=repodir) - statuses = sapply(jobs, ci.status) - ans[!logs] = statuses[!logs] - ans[logs] = sprintf('%s', pkg[any(logs)], jobs[logs], statuses[logs]) - ans -} - 
check.index <- function(pkg, jobs, repodir="bus/integration/cran") { status = function(x) if (grepl("^.*ERROR", x)) "ERROR" else if (grepl("^.*WARNING", x)) "WARNING" else if (grepl("^.*NOTE", x)) "NOTE" else if (grepl("^.*OK", x)) "OK" else NA_character_ test.files = function(job, files, trim.name=FALSE, trim.exts=0L, pkg="data.table") { @@ -331,18 +308,17 @@ check.index <- function(pkg, jobs, repodir="bus/integration/cran") { } memouts }) - th = "FlavorVersionRevisionInstallStatusFlagsRout.failLogMemtest" + th = "FlavorVersionRevisionInstallStatusFlagsRout.failMemtest" tbl = sprintf( - "%s%s%sout%s%s%s%s%s", - sub("test-", "", jobs, fixed=TRUE), ## Flavor - sapply(jobs, pkg.version, pkg), ## Version - sapply(jobs, pkg.revision, pkg), ## Revision - pkg, jobs, ## Install - pkg, jobs, sapply(sapply(jobs, check.test, pkg="data.table"), status), ## Status - sapply(jobs, pkg.flags, pkg), ## Flags - mapply(test.files, jobs, routs, trim.exts=2L), ## Rout.fail: 1st fail, 2nd Rout, keep just: tests_x64/main - ci.log(jobs), ## CI job logs - mapply(test.files, jobs, memouts, trim.name=TRUE) ## Memtest // currently not used + "%s%s%sout%s%s%s%s", + sub("test-", "", jobs, fixed=TRUE), + sapply(jobs, pkg.version, pkg), + sapply(jobs, pkg.revision, pkg), + pkg, jobs, ## install + pkg, jobs, sapply(sapply(jobs, check.test, pkg="data.table"), status), ## check + sapply(jobs, pkg.flags, pkg), + mapply(test.files, jobs, routs, trim.exts=2L), # 1st fail, 2nd Rout, keep just: tests_x64/main + mapply(test.files, jobs, memouts, trim.name=TRUE) ) file = file.path(repodir, "web/checks", sprintf("check_results_%s.html", pkg)) writeLines(c( @@ -378,8 +354,7 @@ check.test <- function(job, pkg) { check[length(check)] } -move.bin <- function(job, bin.version, os.type, file="DESCRIPTION", silent=TRUE) { - ## currently not used, if not used for macos in future then can be removed +move.bin <- function(job, bin.version, os.type, file="DESCRIPTION", silent=FALSE) { if (os.type=="unix") { 
stop("publish of linux binaries not supported") } else if (os.type=="windows") { diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 928f0e07fc..3d46c94d6e 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -8,8 +8,6 @@ alias gdm='git difftool master &> /dev/null' # If meld has scrolling issues, turn off GTK animation which I don't need: # https://gitlab.gnome.org/GNOME/meld/-/issues/479#note_866040 -alias perfbar=~/build/gtk_perfbar/linux_perfbar # revdep.R; https://github.com/tomkraljevic/gtk_perfbar - alias Rdevel='~/build/R-devel/bin/R --vanilla' alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' @@ -17,7 +15,7 @@ alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' -alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=NULL && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' # use ~/build/R-devel/bin/R at the end of revdepr to use R-devel instead of R-release. # If so, doing a `rm -rf *` in revdeplib first to rebuild everything is easiest way to avoid potential problems later. A full rebuild is a good idea periodically anyway. Packages in diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 94a4a17ec3..b010d175f4 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -185,7 +185,7 @@ grep -En "for\s*[(]\s*[a-zA-Z0-9_]+\s*=" src/*.c | grep -Fv "#loop_counter_not_l cd .. 
R -cc(test=TRUE, clean=TRUE, CC="gcc-12") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html +cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html saf = options()$stringsAsFactors options(stringsAsFactors=!saf) # check tests (that might be run by user) are insensitive to option, #2718 test.data.table() @@ -195,15 +195,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. -R CMD check data.table_1.14.99.tar.gz --as-cran -R CMD INSTALL data.table_1.14.99.tar.gz --html +R CMD check data.table_1.14.1.tar.gz --as-cran +R CMD INSTALL data.table_1.14.1.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.14.99.tar.gz +R CMD check data.table_1.14.1.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -220,24 +220,16 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. 
-PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.99.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.1.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.99.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.1.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) require(data.table) -f1 = tempfile() -f2 = tempfile() -suppressWarnings(try(rm(list=c(".Last",".Random.seed")))) -save.image(f1) test.data.table(script="other.Rraw") test.data.table(script="*.Rraw") test.data.table(verbose=TRUE) # since main.R no longer tests verbose mode -suppressWarnings(try(rm(list=c(".Last",".Random.seed")))) -save.image(f2) -system(paste("diff",f1,f2)) # to detect any changes to .GlobalEnv, #5514 -# print(load(f1)); print(load(f2)) # run if diff found any difference # check example() works on every exported function, with these sticter options too, and also that all help pages have examples options(warn=2, warnPartialMatchArgs=TRUE, warnPartialMatchAttr=TRUE, warnPartialMatchDollar=TRUE) @@ -266,7 +258,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.14.99.tar.gz +R310 CMD INSTALL ./data.table_1.14.1.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +270,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . 
-R CMD INSTALL data.table_1.14.99.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.14.1.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +278,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.14.99.tar.gz +R CMD check data.table_1.14.1.tar.gz ##################################################### @@ -297,30 +289,25 @@ cd ~/build wget -N https://stat.ethz.ch/R/daily/R-devel.tar.gz rm -rf R-devel rm -rf R-devel-strict-* -tar xf R-devel.tar.gz +tar xvf R-devel.tar.gz mv R-devel R-devel-strict-gcc -tar xf R-devel.tar.gz +tar xvf R-devel.tar.gz mv R-devel R-devel-strict-clang -tar xf R-devel.tar.gz +tar xvf R-devel.tar.gz -sudo apt-get -y build-dep r-base cd R-devel # may be used for revdep testing: .dev/revdep.R. # important to change directory name before building not after because the path is baked into the build, iiuc ./configure CFLAGS="-O0 -Wall -pedantic" make # use latest available `apt-cache search gcc-` or `clang-` -# wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - -# sudo add-apt-repository 'deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-15 main' -# sudo apt-get install clang-15 - cd ~/build/R-devel-strict-clang -./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-15 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-sanitize=alignment -fno-omit-frame-pointer" CFLAGS="-g -O3 -Wall -pedantic" +./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make cd ~/build/R-devel-strict-gcc -# gcc-10 failed to build R-devel at some point, so using regular gcc-9 (9.3.0 as per 
focal/Pop!_OS 20.04) -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +# gcc-10 (in dev currently) failed to build R, so using regular gcc-9 (9.3.0 as per focal/Pop!_OS 20.04) +./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-9 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make # See R-exts#4.3.3 @@ -341,23 +328,15 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.99.tar.gz -# Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be -# passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R -# So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz -# Use the (failed) output to get the list of currently needed packages and install them -Rdevel-strict-[gcc|clang] +Rdevel-strict-gcc CMD INSTALL data.table_1.14.1.tar.gz +Rdevel-strict-clang CMD INSTALL data.table_1.14.1.tar.gz +# Check UBSAN and ASAN flags appear in compiler output above. 
Rdevel was compiled with them so should be passed through to here +Rdevel-strict-gcc +Rdevel-strict-clang # repeat below with clang and gcc isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested options(repos = "http://cloud.r-project.org") -install.packages(c("bit64", "bit", "R.utils", "xts", "zoo", "yaml", "knitr", "markdown"), - Ncpus=4) -# Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check -q("no") -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz -# UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. -find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; - +install.packages(c("bit64","xts","nanotime","R.utils","yaml")) # minimum packages needed to not skip any tests in test.data.table() +# install.packages(c("curl","knitr")) # for `R CMD check` when not strict. Too slow to install when strict require(data.table) test.data.table(script="*.Rraw") # 7 mins (vs 1min normally) under UBSAN, ASAN and --strict-barrier # without the fix in PR#3515, the --disable-long-double lumped into this build does now work and correctly reproduces the noLD problem @@ -372,7 +351,7 @@ print(Sys.time()); started.at<-proc.time(); try(test.data.table()); print(Sys.ti ## apt-get update ## apt-get install libc6:i386 libstdc++6:i386 gcc-multilib g++-multilib gfortran-multilib libbz2-dev:i386 liblzma-dev:i386 libpcre3-dev:i386 libcurl3-dev:i386 libstdc++-7-dev:i386 ## sudo apt-get purge libcurl4-openssl-dev # cannot coexist, it seems -## sudo apt-get install libcurl4-openssl-dev:i386 ## may not be needed anymore as we dropped dependency on curl, try and update when reproducing +## sudo apt-get install libcurl4-openssl-dev:i386 ## cd ~/build/32bit/R-devel ## ./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --without-readline --without-x CC="gcc -m32" CXX="g++ -m32" 
F77="gfortran -m32" FC=${F77} OBJC=${CC} LDFLAGS="-L/usr/local/lib" LIBnn=lib LIBS="-lpthread" CFLAGS="-O0 -g -Wall -pedantic" ## @@ -391,7 +370,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.14.99.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.14.1.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -429,7 +408,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.14.99.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.14.1.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . 
../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -490,15 +469,14 @@ shutdown now # doesn't return you to host prompt properly so just kill the win # Downstream dependencies ############################################### -# IF NOT ALREADY INSTALLED, OR AFTER AN OS UPGRADE -# No harm rerunning these commands; they do not reinstall if already latest version +# IF NOT ALREADY INSTALLED sudo apt-get update sudo apt-get -y install htop sudo apt-get -y install r-base r-base-dev sudo apt-get -y build-dep r-base-dev sudo apt-get -y build-dep qpdf sudo apt-get -y install aptitude -sudo apt-get -y build-dep r-cran-rgl # leads to libglu1-mesa-dev +sudo aptitude -y build-dep r-cran-rgl # leads to libglu1-mesa-dev sudo apt-get -y build-dep r-cran-rmpi sudo apt-get -y build-dep r-cran-cairodevice sudo apt-get -y build-dep r-cran-tkrplot @@ -546,8 +524,6 @@ sudo apt-get -y install libgit2-dev # for gert sudo apt-get -y install cmake # for symengine for RxODE sudo apt-get -y install libxslt1-dev # for xslt sudo apt-get -y install flex # for RcppCWB -sudo apt-get -y install libavfilter-dev libsodium-dev libgmp-dev libssh-dev librdf0-dev -sudo apt-get -y install libmariadb-dev mariadb-client # RMySQL for xQTLbiolinks sudo R CMD javareconf # ENDIF @@ -556,7 +532,6 @@ inst() # *** ensure latest dev version of data.table installed into revdeplib run() # prints menu of options status() # includes timestamp of installed data.table that is being tested. log() # cats all fail logs to ~/fail.log -cran() # compare packages with error or warning to their status on CRAN # Once all issues resolved with CRAN packages, tackle long-term unfixed bioconductor packages as follows. # 1. Note down all error and warning bioc packages @@ -594,7 +569,7 @@ du -k inst/tests # 0.75MB after R CMD build . export GITHUB_PAT="f1c.. 
github personal access token ..7ad" Rdevel -q -e "packageVersion('xml2')" # ensure installed -Rdevel CMD check data.table_1.16.0.tar.gz --as-cran # use latest Rdevel as it may have extra checks +Rdevel CMD check data.table_1.14.0.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -614,30 +589,15 @@ If it's evening, SLEEP. It can take a few days for CRAN's checks to run. If any issues arise, backport locally. Resubmit the same even version to CRAN. CRAN's first check is automatic and usually received within an hour. WAIT FOR THAT EMAIL. When CRAN's email contains "Pretest results OK pending a manual inspection" (or similar), or if not and it is known why not and ok, then bump dev. - -###### Bump dev for NON-PATCH RELEASE -0. Close milestone to prevent new issues being tagged with it. The final 'release checks' issue can be left open in a closed milestone. +###### Bump dev +0. Close milestone to prevent new issues being tagged with it. Update its name to the even release. The final 'release checks' issue can be left open in a closed milestone. 1. Check that 'git status' shows 4 files in modified and uncommitted state: DESCRIPTION, NEWS.md, init.c and this .dev/CRAN_Release.cmd -2. Bump minor version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. +2. Bump version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. -4. Bump minor version in dllVersion() in init.c -5. Bump 3 minor version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.99 to 1.15.99 inc below, 1.15.0 to 1.16.0 above, 1.14.0 to 1.15.0 below +4. Bump dllVersion() in init.c +5. 
Bump 3 version numbers in Makefile +6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.7 to 1.14.1, and 1.13.6 to 1.14.0 (e.g. in step 8 and 9 below) 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.15.0 on CRAN. Bump to 1.14.10" -9. Take sha from step 8 and run `git tag 1.15.0 96c..sha..d77` then `git push origin 1.15.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.14.0 on CRAN. Bump to 1.14.1" +9. Take sha from step 8 and run `git tag 1.14.0 96c..sha..d77` then `git push origin 1.14.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### - -###### Bump dev for PATCH RELEASE -## WARNING: review this process during the next first patch release (x.y.2) from a regular release (x,y,0), possibly during 1.15.2 release. -0. Close milestone to prevent new issues being tagged with it. The final 'release checks' issue can be left open in a closed milestone. -1. Check that 'git status' shows 4 files in modified and uncommitted state: DESCRIPTION, NEWS.md, init.c and this .dev/CRAN_Release.cmd -2. Bump patch version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. -3. Add new heading in NEWS for the next dev PATCH version. Add "(submitted to CRAN on )" on the released heading. -4. Bump patch version in dllVersion() in init.c -5. Bump 3 patch version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.9 to 1.14.11 inc below, 1.14.10 to 1.14.12 above, 1.14.8 to 1.14.10 below -7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. 
Push to master with this consistent commit message: "1.14.8 on CRAN. Bump to 1.14.10" -9. Take sha from step 8 and run `git tag 1.14.8 96c..sha..d77` then `git push origin 1.14.8` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) -###### \ No newline at end of file diff --git a/.dev/cc.R b/.dev/cc.R index a092aba351..bc15b6765f 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -61,8 +61,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys if (debug) { ret = system(sprintf("MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f%sopenmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data_table.so *.c", CC, OMP)) } else { - ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -Wstrict-prototypes\\ -isystem\\ /usr/share/R/include\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) - # the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem. + ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) # TODO add -Wextra too? 
} if (ret) return() diff --git a/.dev/revdep.R b/.dev/revdep.R index 0b949da361..10af35b553 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -13,10 +13,9 @@ options(error=quote(utils::dump.frames())) options(width=200) # for cran() output not to wrap # Check that env variables have been set correctly: -# export R_LIBS_SITE=NULL # R 4.2.0 changed to NULL but it doesn't appear to work +# export R_LIBS_SITE=none # export R_LIBS=~/build/revdeplib/ # export _R_CHECK_FORCE_SUGGESTS_=true -if (length(.libPaths())==3L) .libPaths(.libPaths()[-2L], include.site=FALSE) # workaround as I couldn't get R_LIBS_SITE=NULL to be effective stopifnot(identical(length(.libPaths()), 2L)) # revdeplib writeable by me, and the pre-installed recommended R library (sudo writeable) stopifnot(identical(.libPaths()[1L], getwd())) tt = file.info(.libPaths())[,"uname"] @@ -97,33 +96,10 @@ update.packages(ask=FALSE, checkBuilt=TRUE) avail = available.packages() # includes CRAN and Bioc, from getOption("repos") set above -avail = avail[!rownames(avail) %in% c("cplexAPI","Rcplex"), ] +avail = avail[!rownames(avail) %in% "cplexAPI", ] # cplexAPI is suggested by revdeps ivmte and prioritizr. I haven't succeeded to install IBM ILOG CPLEX which requires a license, # so consider cplexAPI not available when resolving missing suggests at the end of status(). -# Update: cplexAPI was removed from CRAN on 5 Nov 2021 so this is now redundant, but leave it in place for future use. -# Update: Rcplex is on CRAN as of 20 Nov 2022 but with install errors, therefore treat it as not available. - -# The presence of packages here in revdeplib which no longer exist on CRAN could explain differences to CRAN. A revdep -# could be running tests using that package when available and failing which may be the very reason that package was removed from CRAN. -# When it is removed from revdeplib to match CRAN, then the revdep might then pass as it will skip its tests using that package. 
-x = installed.packages() -tt = match(rownames(x), rownames(avail)) -removed = rownames(x)[is.na(tt) & is.na(x[,"Priority"])] -cat("Removing",length(removed),"packages which are no longer available on CRAN/Bioc:", paste(removed, collapse=","), "\n") -stopifnot(all(x[removed,"LibPath"] == .libPaths()[1])) -oldn = nrow(x) -remove.packages(removed, .libPaths()[1]) -x = installed.packages() -stopifnot(nrow(x) == oldn-length(removed)) - -# Ensure all installed packages were built with this x.y release of R; i.e. that checkBuilt=TRUE worked above -cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") -cat("Previously installed packages were built using:\n") -print(tt <- table(x[,"Built"], dnn=NULL)) -minorR = paste(strsplit(as.character(getRversion()), split="[.]")[[1]][c(1,2)], collapse=".") -if (any(w<-names(tt)ip[deps]; install.packages(names(pkgs[pkgs|is.na(pkgs)]), INSTALL_opts="--html") }' - - name: build - run: | - echo "Revision:" $GITHUB_SHA >> ./DESCRIPTION - R CMD build . 
- - name: check - run: | - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - - name: manual - if: github.ref == 'refs/heads/master' - run: | - cp -R ${{ env.R_LIBS_USER }} library - R CMD INSTALL --library="library" $(ls -1t data.table_*.tar.gz | head -n 1) --html - mkdir -p doc/html - cp /usr/share/R/doc/html/{left.jpg,up.jpg,Rlogo.svg,R.css,index.html} doc/html - Rscript -e 'utils::make.packages.html("library", docdir="doc")' - sed -i "s|file://|../..|g" doc/html/packages.html - mkdir -p public - mv doc public/doc - cp -r --parents library/*/{html,help,doc,demo,DESCRIPTION,README,NEWS,README.md,NEWS.md} public 2>/dev/null || : - sed -i 's|"/doc/html/|"/data.table/doc/html/|g' public/library/data.table/doc/index.html 2>/dev/null || : - - name: repo - if: github.ref == 'refs/heads/master' - run: | - mkdir -p public/src/contrib - mv $(ls -1t data.table_*.tar.gz | head -n 1) public/src/contrib - Rscript -e 'tools::write_PACKAGES("public/src/contrib", fields="Revision")' - - name: upload - if: github.ref == 'refs/heads/master' - uses: actions/upload-pages-artifact@v1 - with: - path: "public" - - name: deploy - if: github.ref == 'refs/heads/master' - id: deployment - uses: actions/deploy-pages@v1 diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 3e59198933..ba1f94fded 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v2 + - uses: r-lib/actions/setup-r@v1 - uses: r-lib/actions/setup-pandoc@v1 diff --git a/.gitignore b/.gitignore index 559df7b9de..00d0d0e8be 100644 --- a/.gitignore +++ b/.gitignore @@ -10,9 +10,6 @@ data.table_*.tar.gz data.table.Rcheck src/Makevars -# Package install -inst/cc - # Emacs IDE files .emacs.desktop .emacs.desktop.lock @@ -41,8 +38,6 @@ vignettes/plots/figures .Renviron lib library -devwd -dev.R *.csv *.csvy *.RDS diff --git 
a/.gitlab-ci.yml b/.gitlab-ci.yml index 099f399772..419741f6c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,3 @@ -workflow: - rules: - - if: '$CI_PIPELINE_SOURCE=="schedule" && $CI_COMMIT_REF_NAME=="master"' ## nightly scheduled pipeline at 4:15 UTC - - if: '$CI_PIPELINE_SOURCE=="web"' ## manually started from web UI - - if: '$CI_PIPELINE_SOURCE=="push" && $CI_COMMIT_REF_NAME!="master"' ## branches pushed to GL directly, mirror is set for master branch only - variables: CRAN_MIRROR: "https://cloud.r-project.org" _R_CHECK_FORCE_SUGGESTS_: "false" @@ -12,18 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. - R_REL_VERSION: "4.3" - R_REL_WIN_BIN: "https://cloud.r-project.org/bin/windows/base/old/4.3.2/R-4.3.2-win.exe" - RTOOLS_REL_BIN: "https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5863-5818.exe" - RTOOLS43_HOME: "/c/rtools" - R_DEV_VERSION: "4.4" - R_DEV_WIN_BIN: "https://cloud.r-project.org/bin/windows/base/R-devel-win.exe" - RTOOLS_DEV_BIN: "https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5863-5818.exe" - RTOOLS44_HOME: "" ## in case R-devel will use new Rtools toolchain, now it uses 4.3 env var - R_OLD_VERSION: "4.2" - R_OLD_WIN_BIN: "https://cloud.r-project.org/bin/windows/base/old/4.2.3/R-4.2.3-win.exe" - RTOOLS_OLD_BIN: "https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe" - RTOOLS42_HOME: "/c/rtools" + R_REL_VERSION: "4.2" + R_DEVEL_VERSION: "4.3" + R_OLDREL_VERSION: "4.1" stages: - dependencies @@ -37,307 +22,343 @@ stages: expire_in: 2 weeks when: always paths: - - bus/$CI_JOB_NAME + - bus -## mirror packages -# download all recursive dependencies once to be used across multiple test jobs -# sources and 
binaries for r-release, r-devel and r-oldrel -# cache between runs -mirror-packages: +mirror-packages: ## mirror all recursive dependencies, source and win.binary of data.table suggests from DESCRIPTION stage: dependencies tags: - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-minimal + image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev cache: paths: - - bus/$CI_JOB_NAME/cran + - bus/$CI_BUILD_NAME/cran script: - echo 'source(".ci/ci.R")' >> .Rprofile - - mkdir -p bus/$CI_JOB_NAME/cran/src/contrib + - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - Rscript -e 'mirror.packages(dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran")' - - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEV_VERSION","R_OLD_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' + - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works + - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -## install deps alias -.test-install-deps: &install-deps - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), repos=file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE)), quiet=TRUE)' +# mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw; off now #5274 +# stage: dependencies +# tags: +# - linux +# image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev +# cache: +# paths: +# - bus/$CI_BUILD_NAME/cran 
+# script: +# - echo 'source(".ci/ci.R")' >> .Rprofile +# - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib +# - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' +# <<: *artifacts -## build -# sources as tar.gz archive -# build vignettes -build: +build: ## build data.table sources as tar.gz archive stage: build tags: - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc + image: registry.gitlab.com/jangorecki/dockerfiles/r-builder needs: ["mirror-packages"] before_script: - - *install-deps + - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus + - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION script: - - sed -i '/^[[:space:]]*$/d' ./DESCRIPTION ## make last line end abruptly; i.e. without a final \n - - echo "Revision:" $CI_COMMIT_SHA >> ./DESCRIPTION - R CMD build . - - mkdir -p bus/$CI_JOB_NAME/ - - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/ + - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib + - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib/. + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/build/cran"), fields="Revision", addFiles=TRUE)' + - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works <<: *artifacts +.test-install-deps: &install-deps + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' + +.test-cp-src: &cp-src + - cp $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) . +.test-cp-src-win: &cp-src-win + - cp.exe $(ls.exe -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head.exe -n 1) . 
+ +.test-mv-src: &mv-src + - mkdir -p bus/$CI_BUILD_NAME && mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME +.test-mv-src-win: &mv-src-win + - mkdir.exe -p bus/$CI_BUILD_NAME; mv.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) bus/$CI_BUILD_NAME + +.test-rm-src: &rm-src + - rm $(ls -1t data.table_*.tar.gz | head -n 1) +.test-rm-src-win: &rm-src-win + - rm.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + +.test-mv-bin-win: &mv-bin-win + - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION + +.test-install-r-rel-win: &install-r-rel-win + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.2.1-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated +.test-install-r-devel-win: &install-r-devel-win + - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait +.test-install-r-oldrel-win: &install-r-oldrel-win + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + +.test-install-rtools-win: &install-rtools-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5253-5107-signed.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + .test-template: &test stage: test needs: ["mirror-packages","build"] - allow_failure: true <<: *artifacts .test-lin-template: &test-lin <<: *test tags: - linux + +.test-cran-lin-template: &test-cran-lin + <<: *test-lin + variables: 
+ _R_CHECK_CRAN_INCOMING_: "TRUE" + _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" before_script: - - cp $(ls -1t bus/build/data.table_*.tar.gz | head -n 1) . - - mkdir -p ~/.R - after_script: - - mkdir -p bus/$CI_JOB_NAME - - echo $CI_JOB_ID > bus/$CI_JOB_NAME/id - - echo $CI_JOB_STATUS > bus/$CI_JOB_NAME/status - - echo $CI_JOB_IMAGE > bus/$CI_JOB_NAME/image - - '[ -d data.table.Rcheck ] && mv data.table.Rcheck bus/$CI_JOB_NAME/' - -## most comprehensive tests -# force all suggests -# flags: gcc -O3 -flto=auto -fno-common -Wunused-result -# tests for compilation warnings -test-lin-rel: + - *install-deps + - *cp-src + - rm -r bus + script: + - *mv-src + - cd bus/$CI_BUILD_NAME + - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src + +.test-win-template: &test-win + <<: *test + tags: + - windows + - shared-windows + +#.test-mac-template: &test-mac +# <<: *test +# tags: +# - macosx + +test-rel-lin: ## most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common -Wunused-result <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table + image: registry.gitlab.com/jangorecki/dockerfiles/r-builder + needs: ["mirror-packages","build"] # "mirror-other-packages" variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" + TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "FALSE" #5274 + before_script: + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), quiet=TRUE)' ## does seem to be needed despite 'needs mirror-packages' + ## - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(pkgs, quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' + - *cp-src + - rm -r bus + - mkdir -p ~/.R + - echo 'CFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp 
-Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars script: - - *install-deps - - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - *mv-src + - cd bus/$CI_BUILD_NAME - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src - (! grep "warning:" data.table.Rcheck/00install.out) -## vanilla minimal -# no zlib -# no suggested deps -# no vignettes or manuals -# no openmp -# flags: gcc -O0 -fno-openmp -test-lin-rel-vanilla: +test-rel-vanilla-lin: ## minimal, no suggested deps, no vignettes or manuals, measure memory, using gcc -O0 -fno-openmp <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc - script: + image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev + variables: + TEST_DATA_TABLE_MEMTEST: "TRUE" + before_script: + - *cp-src + - rm -r bus + - mkdir -p ~/.R - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + script: + - *mv-src + - cd bus/$CI_BUILD_NAME - R CMD check --no-manual --ignore-vignettes $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src -## R-release on Linux -# strict checks for 0 NOTEs -# extra NOTEs check and build pdf manual thus not from cran-lin template -test-lin-rel-cran: +test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual thus not from cran-lin template <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-base + image: registry.gitlab.com/jangorecki/dockerfiles/r-builder variables: 
_R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 - _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes - _R_CHECK_PKG_SIZES_THRESHOLD_: "7" ## MB 'checking installed package size' NOTE - script: + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0 + before_script: - *install-deps - - echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - *cp-src + - rm -r bus + - mkdir -p ~/.R + - echo 'CFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2'> ~/.R/Makevars ## -g0 because -g increases datatable.so size from 0.5MB to 1.5MB and breaches 'installed package size <= 5MB' note + - echo 'CXXFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + script: + - *mv-src + - cd bus/$CI_BUILD_NAME - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " but ", shQuote(l)) else q("no")' -## R-devel on Linux gcc strict -# R built with --enable-strict-barrier --disable-long-double -# tests for compilation warnings -# tests for new notes -test-lin-dev-gcc-strict-cran: +test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc-strict + image: registry.gitlab.com/jangorecki/dockerfiles/r-devel + 
allow_failure: true variables: _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" ## detects S3 method lookup found on search path #4777 _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" - script: - - echo 'CFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + before_script: - *install-deps - - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - - (! grep "warning:" data.table.Rcheck/00install.out) - - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (size of tarball, installed package size) but ", shQuote(l)) else q("no")' - -## R-devel on Linux clang -# R compiled with clang -# tests for compilation warnings -# tests for new notes -test-lin-dev-clang-cran: - <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-clang - variables: - _R_CHECK_CRAN_INCOMING_: "TRUE" - _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" - _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" - _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" + - *cp-src + - rm -r bus script: - - echo 'CFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - - *install-deps - - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) + - *mv-src + - cd bus/$CI_BUILD_NAME + - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src - (! 
grep "warning:" data.table.Rcheck/00install.out) - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (size of tarball, installed package size) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, installed package size, top-level files) but ", shQuote(l)) else q("no")' -## R 3.1.0 -# stated dependency on R -test-lin-310-cran: +test-310-cran-lin: ## R-3.1.0 on Linux, stated dependency of R + <<: *test-cran-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-3.1.0 - <<: *test-lin - script: - - *install-deps - - R CMD check --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) -.test-win-template: &test-win - <<: *test - tags: - - shared-windows +test-344-cran-lin: ## R-3.4.4 on Linux, last R non-altrep version + <<: *test-cran-lin + image: registry.gitlab.com/jangorecki/dockerfiles/r-3.4.4 + +test-350-cran-lin: ## R-3.5.0 on Linux, first R altrep version + <<: *test-cran-lin + image: registry.gitlab.com/jangorecki/dockerfiles/r-3.5.0 + +test-rel-win: ## R-release on Windows, test and build binaries + <<: *test-win + variables: + R_VERSION: "$R_REL_VERSION" before_script: - - curl.exe -s -o ../R-win.exe $R_BIN; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait - - curl.exe -s -o ../rtools.exe $RTOOLS_BIN; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools" -NoNewWindow -Wait - - $env:PATH = "C:\R\bin;C:\rtools\usr\bin;$env:PATH" - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='all'), repos=file.path('file://',getwd(),'bus/mirror-packages/cran'), quiet=TRUE)" - - cp.exe $(ls.exe -1t 
bus/build/data.table_*.tar.gz | head.exe -n 1) . + - *install-r-rel-win + - *install-rtools-win + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" + - *cp-src-win + - rm.exe -r bus script: + - *mv-src-win + - cd bus/$CI_BUILD_NAME - R.exe CMD check --no-manual $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - -not (grep.exe "warning:" data.table.Rcheck\00install.out) - after_script: - - $env:PATH = "C:\R\bin;C:\rtools\usr\bin;$env:PATH" - - mkdir.exe -p bus/$CI_JOB_NAME - - Rscript.exe -e "cat(Sys.getenv('CI_JOB_ID'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'id'))" - - Rscript.exe -e "cat(Sys.getenv('CI_JOB_STATUS'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'status'))" - - Rscript.exe -e "cat(Sys.getenv('CI_JOB_IMAGE'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'image'))" - - Rscript.exe -e "to<-file.path('bus', Sys.getenv('CI_JOB_NAME'), 'data.table.Rcheck'); if (dir.exists(from<-'data.table.Rcheck')) invisible(file.rename(from, to)); dir.exists(to)" - - Rscript.exe -e "from<-tail(list.files(pattern='^data\\.table_.*\\.zip$'), 1L); to<-file.path('bus', Sys.getenv('CI_JOB_NAME'), from); if (length(from)) invisible(file.rename(from, to)); length(to)&&file.exists(to)" - -## R-release on Windows -# test and build binaries -test-win-rel: - <<: *test-win - variables: - R_VERSION: "$R_REL_VERSION" - R_BIN: "$R_REL_WIN_BIN" - RTOOLS_BIN: "$RTOOLS_REL_BIN" + - *rm-src-win + - *mv-bin-win -## R-devel on Windows -# test and build binaries -test-win-dev: +test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related to UCRT and Rtools42 <<: *test-win variables: - R_VERSION: "$R_DEV_VERSION" - R_BIN: "$R_DEV_WIN_BIN" - RTOOLS_BIN: "$RTOOLS_DEV_BIN" + R_VERSION: "$R_DEVEL_VERSION" + before_script: + - *install-r-devel-win + - 
*install-rtools-win + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 + - *cp-src-win + - rm.exe -r bus + script: + - *mv-src-win + - cd bus/$CI_BUILD_NAME + - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - *rm-src-win + - *mv-bin-win -## R-oldrel on Windows -# test and build binaries -test-win-old: +test-old-win: ## R-oldrel on Windows <<: *test-win variables: - R_VERSION: "$R_OLD_VERSION" - R_BIN: "$R_OLD_WIN_BIN" - RTOOLS_BIN: "$RTOOLS_OLD_BIN" - -.test-mac-template: &test-mac - <<: *test - tags: - - saas-macos-medium-m1 + R_VERSION: "$R_OLDREL_VERSION" before_script: - - *install-deps - - cp $(ls -1t bus/build/data.table_*.tar.gz | head -n 1) . - after_script: - - mkdir -p bus/$CI_JOB_NAME - - '[ -d data.table.Rcheck ] && mv data.table.Rcheck bus/$CI_JOB_NAME/' - - '[ -f data.table_*.tgz ] && mv $(ls -1t data.table_*.tgz | head -n 1) bus/$CI_JOB_NAME/' - - echo $CI_JOB_ID > bus/$CI_JOB_NAME/id - - echo $CI_JOB_STATUS > bus/$CI_JOB_NAME/status - - echo $CI_JOB_IMAGE > bus/$CI_JOB_NAME/image - -## R-release on MacOS -# no macosx runner set yet -.test-mac-rel: - <<: *test-mac - variables: - R_VERSION: "$R_REL_VERSION" + - *install-r-oldrel-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait + ## rtools42 doesn't support 32bit so oldrel-win (currently R 4.1) needs rtools40. 
Can use install-rtools-win again here when oldrel is R 4.2+ + - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 + - *cp-src-win + - rm.exe -r bus script: - - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) - -## integrate artifacts -# merging package tarballs and binaries into single R repository -# rendering documentation -# setting up CRAN-like structure -# generating pkgdown website -integration: + - *mv-src-win + - cd bus/$CI_BUILD_NAME + - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - *rm-src-win + - *mv-bin-win + +#test-rel-mac: ## R-release on MacOS, no macosx runner yet +# <<: *test-mac +# variables: +# R_VERSION: "$R_REL_VERSION" +# before_script: +# - *install-deps +# - *cp-src +# - rm -r bus +# script: +# - *mv-src +# - cd bus/$CI_BUILD_NAME +# - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) +# - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) +# - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_VERSION +# - mv $(ls -1t data.table_*.tgz | head -n 1) cran/bin/macosx/el-capitan/contrib/$R_VERSION +# - *rm-src +# - *mv-bin-mac + +integration: ## merging all artifacts to produce single R repository, documentation and website stage: integration image: registry.gitlab.com/jangorecki/dockerfiles/r-pkgdown tags: - linux only: - master - needs: ["mirror-packages","build","test-lin-rel","test-lin-rel-cran","test-lin-dev-gcc-strict-cran","test-lin-dev-clang-cran","test-lin-rel-vanilla","test-lin-310-cran","test-win-rel","test-win-dev" ,"test-win-old"] + - tags + needs: 
["mirror-packages","build","test-rel-lin","test-rel-cran-lin","test-dev-cran-lin","test-rel-vanilla-lin","test-310-cran-lin","test-344-cran-lin","test-350-cran-lin","test-rel-win","test-dev-win","test-old-win"] script: - - R --version - - *install-deps ## markdown pkg not present in r-pkgdown image - - rm -rf ./vignettes ## r-lib/pkgdown#2383 - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' ## html manual, vignettes, repos, cran_web, cran_checks - echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile ## list of available test-* jobs dynamically based on bus/test-* directories - Rscript -e 'cat("\ntest.jobs <- c(\n"); cat(paste0(" \"",list.files("bus",pattern="^test-"),"\" = \"data.table\""), sep=",\n"); cat(")\n")' >> .Rprofile - Rscript -e 'sapply(names(test.jobs), check.test, pkg="data.table", simplify=FALSE)' - - mkdir -p bus/$CI_JOB_NAME + - mkdir -p bus/$CI_BUILD_NAME ## delete any existing non-dev version of data.table - rm -f bus/mirror-packages/cran/src/contrib/data.table_*.tar.gz - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_REL_VERSION/data.table_*.zip - - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEV_VERSION/data.table_*.zip - - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_OLD_VERSION/data.table_*.zip + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEVEL_VERSION/data.table_*.zip + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_OLDREL_VERSION/data.table_*.zip #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_REL_VERSION/data.table_*.tgz - #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEV_VERSION/data.table_*.tgz - #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_OLD_VERSION/data.table_*.tgz + #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEVEL_VERSION/data.table_*.tgz + #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_OLDREL_VERSION/data.table_*.tgz ## merge mirror-packages and 
R devel packages - - mv bus/mirror-packages/cran bus/$CI_JOB_NAME/ + - mv bus/mirror-packages/cran bus/$CI_BUILD_NAME/ ## publish package sources - - mkdir -p bus/$CI_JOB_NAME/cran/library bus/$CI_JOB_NAME/cran/doc - - mv $(ls -1t bus/build/data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/cran/src/contrib + - mkdir -p bus/$CI_BUILD_NAME/cran/library bus/$CI_BUILD_NAME/cran/doc + - mv $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="source"), type="source", fields="Revision", addFiles=TRUE)' ## publish binaries - - mkdir -p bus/integration/cran/bin/windows/contrib/$R_REL_VERSION/ - - mkdir -p bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION/ - - mkdir -p bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION/ - - '[ -f bus/test-win-rel/data.table_*.zip ] && cp bus/test-win-rel/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_REL_VERSION/' - - ls -1 "bus/integration/cran/bin/windows/contrib/$R_REL_VERSION"/data.table_*.zip || true - - '[ -f bus/test-win-dev/data.table_*.zip ] && cp bus/test-win-dev/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION/' - - ls -1 "bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION"/data.table_*.zip || true - - '[ -f bus/test-win-old/data.table_*.zip ] && cp bus/test-win-old/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION/' - - ls -1 "bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION"/data.table_*.zip || true + - Rscript -e 'move.bin("test-rel-win", Sys.getenv("R_REL_VERSION"), os.type="windows")' + - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_VERSION"), os.type="windows")' + - Rscript -e 'move.bin("test-old-win", Sys.getenv("R_OLDREL_VERSION"), os.type="windows")' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_REL_VERSION")), type="win.binary", 
fields="Revision", addFiles=TRUE)' - - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEV_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_OLD_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'move.bin("test-mac-rel", Sys.getenv("R_REL_VERSION"), os.type="macosx")' - #- Rscript -e 'move.bin("test-mac-dev", Sys.getenv("R_DEV_VERSION"), os.type="macosx")' - #- Rscript -e 'move.bin("test-mac-old", Sys.getenv("R_OLD_VERSION"), os.type="macosx")' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEVEL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_OLDREL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'move.bin("test-rel-mac", Sys.getenv("R_REL_VERSION"), os.type="macosx")' + #- Rscript -e 'move.bin("test-dev-mac", Sys.getenv("R_DEVEL_VERSION"), os.type="macosx")' + #- Rscript -e 'move.bin("test-old-mac", Sys.getenv("R_OLDREL_VERSION"), os.type="macosx")' #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_REL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_DEV_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_OLD_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", 
type="mac.binary.el-capitan", ver=Sys.getenv("R_DEVEL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_OLDREL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' ## install all pkgs to render html and double check successful installation of all devel packages - mkdir -p /tmp/opencran/library /tmp/opencran/doc/html ## reset R_LIBS_USER to re-install all with html because pkgdown image has pre installed curl knitr - R_LIBS_USER="" Rscript -e 'install.packages("data.table", dependencies=TRUE, lib="/tmp/opencran/library", repos=file.path("file:",normalizePath("bus/integration/cran")), INSTALL_opts="--html", quiet=TRUE)' @@ -353,31 +374,80 @@ integration: - mv /tmp/opencran/doc bus/integration/cran/ ## library html manual, vignettes - Rscript -e 'lib.copy(lib.from="/tmp/opencran/library")' - ## web/checks/$pkg/$job 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png ## memtest not available for now #5764 + ## web/checks/$pkg/$job 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png - Rscript -e 'sapply(names(test.jobs), check.copy, simplify=FALSE)' ## web/packages/$pkg/$pkg.pdf - - Rscript -e 'pdf.copy("data.table", "test-lin-rel")' + - Rscript -e 'pdf.copy("data.table", "test-rel-lin")' ## web/checks/check_results_$pkg.html - Rscript -e 'check.index("data.table", names(test.jobs))' ## web/checks/check_flavors.html - Rscript -e 'check.flavors(names(test.jobs))' - ## pkgdown vignettes workaround r-lib/pkgdown#2383 - - mkdir -p pkgdown/articles - - cp bus/integration/cran/library/data.table/doc/*.html pkgdown/articles/. 
- - rm pkgdown/articles/index.html ## pkgdown merge - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ - ## add plausible.io stats - - find bus/integration/cran -type f -iname "*.html" | xargs sed -i 's!!!g' + ## cleanup artifacts from other jobs + - mkdir tmpbus + - mv bus/$CI_BUILD_NAME tmpbus + - rm -r bus + - mv tmpbus bus <<: *artifacts -## publish -# R repository -# test jobs summaries -# html documentation of all packages in repo -# pkgdown website -pages: +.docker-template: &docker + stage: deploy + tags: + - linux + image: docker + services: + - docker:dind + needs: + - job: build + - job: integration + artifacts: false + before_script: + - sed "s/SRC_IMAGE_NAME/$SRC_IMAGE_NAME/" < .ci/Dockerfile.in > Dockerfile + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY + script: + - docker build --pull -t "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" -f Dockerfile . 
+ - docker run --rm "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" Rscript -e 'cat(R.version.string, "\ndata.table revision", read.dcf(system.file("DESCRIPTION", package="data.table"), fields="Revision")[[1L]], "\n"); require(data.table); test.data.table()' + - docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" + +docker-r-release: ## data.table on R-release + only: + - master + variables: + SRC_IMAGE_NAME: "r-base-dev" + IMAGE_NAME: "r-release" + IMAGE_TAG: "latest" + <<: *docker + +docker-r-release-builder: ## data.table on R-release extended for Rmd vignettes build dependencies + only: + - master + variables: + SRC_IMAGE_NAME: "r-builder" + IMAGE_NAME: "r-release-builder" + IMAGE_TAG: "latest" + <<: *docker + +docker-r-devel: ## data.table on R-devel + only: + - master + variables: + SRC_IMAGE_NAME: "r-devel" + IMAGE_NAME: "r-devel" + IMAGE_TAG: "latest" + <<: *docker + +docker-tags: ## data.table on R-release fixed version images + only: + - tags + variables: + SRC_IMAGE_NAME: "r-base-dev" + IMAGE_NAME: "r-release" + IMAGE_TAG: $CI_COMMIT_TAG + <<: *docker + +pages: ## publish R repository, test jobs summaries, html documentation of all packages in repo, pkgdown stage: deploy environment: production tags: @@ -390,7 +460,7 @@ pages: - mkdir -p public - cp -r bus/integration/cran/* public - cat public/src/contrib/PACKAGES - artifacts: + artifacts: ## publish only when no failure expire_in: 2 weeks paths: - public diff --git a/CODEOWNERS b/CODEOWNERS deleted file mode 100644 index 5d98e02422..0000000000 --- a/CODEOWNERS +++ /dev/null @@ -1,45 +0,0 @@ -# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners -* @mattdowle - -# melt -/R/fmelt.R @tdhock -/src/fmelt.c @tdhock -/man/melt.data.table.Rd @tdhock -/vignettes/datatable-reshape.Rmd @tdhock - -# rolling statistics -/R/froll.R @jangorecki -/man/froll.Rd @jangorecki -/src/froll.c @jangorecki -/src/frollR.c @jangorecki 
-/src/frolladaptive.c @jangorecki - -# meta-programming -/R/programming.R @jangorecki -/man/substitute2.Rd @jangorecki -/src/programming.c @jangorecki -/vignettes/datatable-programming.Rmd @jangorecki - -# GForce groupby -/src/gsumm.c @ben-schwen -# datetime classes -/R/IDateTime.R @ben-schwen @michaelchirico -/src/idatetime.c @ben-schwen @michaelchirico -/man/IDateTime.Rd @ben-schwen @michaelchirico - -# shift -/R/shift.R @ben-schwen @michaelchirico -/src/shift.c @ben-schwen @michaelchirico -/man/shift.Rd @ben-schwen @michaelchirico - -# translations -/inst/po/ @michaelchirico -/po/ @michaelchirico -/R/translation.R @michaelchirico -/src/po.h @michaelchirico - -# printing -/R/print.data.table.R @michaelchirico - -# .SD vignette -/vignettes/datatable-sd-usage.Rmd @michaelchirico diff --git a/DESCRIPTION b/DESCRIPTION index a59298fcbe..586ef0f308 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,18 +1,8 @@ Package: data.table -Version: 1.14.99 +Version: 1.14.3 Title: Extension of `data.frame` -Depends: R (>= 3.1.0) -Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), R.utils, xts, zoo (>= 1.8-1), yaml, knitr, markdown -Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
-License: MPL-2.0 | file LICENSE -URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table -BugReports: https://github.com/Rdatatable/data.table/issues -VignetteBuilder: knitr -ByteCompile: TRUE Authors@R: c( - person("Tyson","Barrett", role=c("aut","cre"), email="t.barrett88@gmail.com"), - person("Matt","Dowle", role="aut", email="mattjdowle@gmail.com"), + person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), person("Arun","Srinivasan", role="aut", email="asrini@pm.me"), person("Jan","Gorecki", role="ctb"), person("Michael","Chirico", role="ctb"), @@ -60,6 +50,7 @@ Authors@R: c( person("Davis","Vaughan", role="ctb"), person("Toby","Hocking", role="ctb"), person("Leonardo","Silvestri", role="ctb"), + person("Tyson","Barrett", role="ctb"), person("Jim","Hester", role="ctb"), person("Anthony","Damico", role="ctb"), person("Sebastian","Freundt", role="ctb"), @@ -81,5 +72,14 @@ Authors@R: c( person("Olivier","Delmarcell", role="ctb"), person("Josh","O'Brien", role="ctb"), person("Dereck","de Mezquita", role="ctb"), - person("Michael","Czekanski", role="ctb") - ) + person("Michael","Czekanski", role="ctb")) +Depends: R (>= 3.1.0) +Imports: methods +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown +SystemRequirements: zlib +Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
+License: MPL-2.0 | file LICENSE +URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table +BugReports: https://github.com/Rdatatable/data.table/issues +VignetteBuilder: knitr +ByteCompile: TRUE diff --git a/Makefile b/Makefile index 45fb6203b9..50a919440e 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.99.tar.gz + $(RM) data.table_1.14.3.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.99.tar.gz + $(R) CMD INSTALL data.table_1.14.3.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.99.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.3.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NAMESPACE b/NAMESPACE index 75b490068f..dec4beee18 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,13 +51,14 @@ S3method(cube, data.table) S3method(rollup, data.table) export(frollmean) export(frollsum) +export(frollmax) export(frollapply) export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) export(substitute2) -#export(DT) # mtcars |> DT(i,j,by) #4872 #5472 +export(DT) # mtcars |> DT(i,j,by) #4872 S3method("[", data.table) export("[.data.table") # so that functional DT() finds it; PR#5176 @@ -90,18 +91,19 @@ if (getRversion() >= "4.0.0") { # if we register these (new in v1.12.6) methods always though, the previous workaround no longer works in R<4.0.0. Hence only register in R>=4.0.0. 
S3method(cbind, data.table) S3method(rbind, data.table) -} else { - # and if we export but don't register in R < 4.0.0 we get this note: - # > Found the following apparent S3 methods exported but not registered: - # > cbind.data.table rbind.data.table - # in addition to errors in tests 324, 326, 414.1, 414.2, 442, 445, 451 - # export(cbind.data.table) - # export(rbind.data.table) - # A revdep using rbind.data.frame() directly before (which data.table changed in base) should change to rbind() generic and that should work - # in all combinations of R before/after 4.0.0 and data.table before/after 1.12.6, so long as data.table is installed using the same major - # version of R (and that is checked in .onLoad with error if not). - export(.rbind.data.table) # only export in R<4.0.0 where it is still used; R-devel now detects it is missing doc, #5600 } +# else { +# # and if we export but don't register in R < 4.0.0 we get this note: +# # > Found the following apparent S3 methods exported but not registered: +# # > cbind.data.table rbind.data.table +# # in addition to errors in tests 324, 326, 414.1, 414.2, 442, 445, 451 +# export(cbind.data.table) +# export(rbind.data.table) +# # A revdep using rbind.data.frame() directly before (which data.table changed in base) should change to rbind() generic and that should work +# # in all combinations of R before/after 4.0.0 and data.table before/after 1.12.6, so long as data.table is installed using the same major +# # version of R (and that is checked in .onLoad with error if not). 
+# } +export(.rbind.data.table) # continue to export for now because it has been exported in the past so it may be depended on S3method(dim, data.table) S3method(dimnames, data.table) S3method("dimnames<-", data.table) @@ -158,7 +160,6 @@ S3method(as.IDate, Date) S3method(as.IDate, POSIXct) S3method(as.IDate, default) S3method(as.IDate, numeric) -S3method(as.IDate, IDate) S3method(as.ITime, character) S3method(as.ITime, default) S3method(as.ITime, POSIXct) diff --git a/NEWS.1.md b/NEWS.1.md deleted file mode 100644 index 249f349926..0000000000 --- a/NEWS.1.md +++ /dev/null @@ -1,1549 +0,0 @@ - -**This is OLD NEWS. Latest news is on GitHub [here](https://github.com/Rdatatable/data.table/blob/master/NEWS.md).** - -# data.table [v1.14.10](https://github.com/Rdatatable/data.table/milestone/20?closed=1) (8 Dec 2023) - -## NOTES - -1. Maintainer of the package for CRAN releases is from now on Tyson Barrett (@tysonstanley), [#5710](https://github.com/Rdatatable/data.table/issues/5710). - -2. Updated internal code for breaking change of `is.atomic(NULL)` in R-devel, [#5691](https://github.com/Rdatatable/data.table/pull/5691). Thanks to Martin Maechler for the patch. - -3. Fix multiple test concerning coercion to missing complex numbers, [#5695](https://github.com/Rdatatable/data.table/issues/5695) and [#5748](https://github.com/Rdatatable/data.table/issues/5748). Thanks to @MichaelChirico and @ben-schwen for the patches. - -4. Fix multiple format warnings (e.g., -Wformat) [#5712](https://github.com/Rdatatable/data.table/pull/5712), [#5781](https://github.com/Rdatatable/data.table/pull/5781), [#5880](https://github.com/Rdatatable/data.table/pull/5800), [#5786](https://github.com/Rdatatable/data.table/pull/5786). Thanks to @MichaelChirico and @jangorecki for the patches. - - -# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) (17 Feb 2023) - -## NOTES - -1. 
Test 1613.605 now passes changes to `as.data.frame()` in R-devel, [#5597](https://github.com/Rdatatable/data.table/pull/5597). Thanks to Avraham Adler for reporting. - -2. An out of bounds read when combining non-equi join with `by=.EACHI` has been found and fixed thanks to clang ASAN, [#5598](https://github.com/Rdatatable/data.table/issues/5598). There was no bug or consequence because the read was followed (now preceded) by a bounds test. - -3. `.rbind.data.table` (note the leading `.`) is no longer exported when `data.table` is installed in R>=4.0.0 (Apr 2020), [#5600](https://github.com/Rdatatable/data.table/pull/5600). It was never documented which R-devel now detects and warns about. It is only needed by `data.table` internals to support R<4.0.0; see note 1 in v1.12.6 (Oct 2019) below in this file for more details. - - -# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) - -## BUG FIXES - -1. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now highlights this issue on startup, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8 year old R 3.1.0 (April 2014) to 5 year old R 3.4.0 (April 2017). - -## NOTES - -1. Test 1962.098 has been modified to pass latest changes to `POSIXt` in R-devel. - -2. `test.data.table()` no longer creates `DT` in `.GlobalEnv`, a CRAN policy violation, [#5514](https://github.com/Rdatatable/data.table/issues/5514). No other writes occurred to `.GlobalEnv` and release procedures have been improved to prevent this happening again. - -3. The memory usage of the test suite has been halved, [#5507](https://github.com/Rdatatable/data.table/issues/5507). 
- - -# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) - -## NOTES - -1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked, [#5476](https://github.com/Rdatatable/data.table/pull/5476). Thanks to CRAN for testing latest versions of compilers. - -2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. - -3. Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN, [#5477](https://github.com/Rdatatable/data.table/pull/5477). - -4. `write.csv` in R-devel no longer responds to `getOption("digits.secs")` for `POSIXct`, [#5478](https://github.com/Rdatatable/data.table/issues/5478). This caused our tests of `fwrite(, dateTimeAs="write.csv")` to fail on CRAN's daily checks using latest daily R-devel. While R-devel discussion continues, and currently it seems like the change is intended with further changes possible, this `data.table` release massages our tests to pass on latest R-devel. The idea is to try to get out of the way of R-devel changes in this regard until the new behavior of `write.csv` is released and confirmed. Package updates are not accepted on CRAN if they do not pass the latest daily version of R-devel, even if R-devel changes after the package update is submitted. 
If the change to `write.csv()` stands, then a future release of `data.table` will be needed to make `fwrite(, dateTimeAs="write.csv")` match `write.csv()` output again in that future version of R onwards. If you use an older version of `data.table` than said future one in the said future version of R, then `fwrite(, dateTimeAs="write.csv")` may not match `write.csv()` if you are using `getOption("digits.secs")` too. However, you can always check that your installation of `data.table` works in your version of R on your platform by simply running `test.data.table()` yourself. Doing so would detect such a situation for you: test 1741 would fail in this case. `test.data.table()` runs the entire suite of tests and is always available to you locally. This way you do not need to rely on our statements about which combinations of versions of R and `data.table` on which platforms we have tested and support; just run `test.data.table()` yourself. Having said that, because test 1741 has been relaxed in this release in order to be accepted on CRAN to pass latest R-devel, this won't be true for this particular release in regard to this particular test. - - ```R - $ R --vanilla - R version 4.2.1 (2022-06-23) -- "Funny-Looking Kid" - > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) - > options(digits.secs=0) - > write.csv(DF) - "","A" - "1",2022-10-01 01:23:45 - > options(digits.secs=3) - > write.csv(DF) - "","A" - "1",2022-10-01 01:23:45.012 - - $ Rdevel --vanilla - R Under development (unstable) (2022-10-06 r83040) -- "Unsuffered Consequences" - > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) - > options(digits.secs=0) - > write.csv(DF) - "","A" - "1",2022-10-01 01:23:45.012 - ``` - -5. 
Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). - -6. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. - - > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. - - -# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) - -## NOTES - -1. clang 13.0.0 (Sep 2021) requires the system header `omp.h` to be included before R's headers, [#5122](https://github.com/Rdatatable/data.table/issues/5122). Many thanks to Prof Ripley for testing and providing a patch file. - - -# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) - -## POTENTIALLY BREAKING CHANGES - -1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. 
We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. - - At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://posit.co/resources/videos/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). - - `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. 
This migration option is temporary and will be removed in the near future. - - The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. - -## BUG FIXES - -1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. - -2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. - -## NOTES - -1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. - -2. `r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage being forwarded to being `https:` for many years. 
Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. - - -# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) - -## BUG FIXES - -1. Grouping could throw an error `Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. - -2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. 
If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. - - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. - -## NOTES - -1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmission reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. 
Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. - - -# data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) - -## BUG FIXES - -1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. - -2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the dynamic schedule. Although never guaranteed by the OpenMP standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, so `fsort` now checks that threads are receiving iterations monotonically and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. 
Oddly, the `x.y` version number is not provided by the OpenMP API. OpenMP 4.5 may be enabled in some compilers using `-fopenmp-version=45`. Otherwise, if you need to upgrade compiler, https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. - -3. Columns containing functions that don't inherit the class `'function'` would fail to group, [#4814](https://github.com/Rdatatable/data.table/issues/4814). Thanks @mb706 for reporting, @ecoRoland2 for helping investigate, and @Coorsaa for a follow-up example involving environments. - -## NOTES - -1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/2016/behind-the-scenes-of-cran/). - -2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. 
Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. - -3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example which has also been added to the test suite. - - -# data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) - -## BUG FIXES - -1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. - -2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. - -3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). 
Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks, and thanks to Venelin Mitov for assistance in tracing the memory fault. Thanks also to Hongyuan Jia and @ben-schwen for assistance in debugging the fix in dev to pass reverse dependency testing which highlighted, before release, that package `eplusr` would fail. Its good usage has been added to `data.table`'s test suite. - -4. `fread("1.2\n", colClasses='integer')` (note no columns names in the data) would segfault when creating a warning message, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present however, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. - -5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. - -## NOTES - -1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. 
If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. - - The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. - - We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. - -2. `?.NGRP` now displays the help page as intended, [#4649](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. - -3. `test.data.table()` failed in non-English locales such as `LC_TIME=fr_FR.UTF-8` due to `Jan` vs `janv.` in tests 168 and 2042, [#3450](https://github.com/Rdatatable/data.table/issues/3450). Thanks to @shrektan for reporting, and @tdhock for making the tests locale-aware. - -4. User-supplied `PKG_LIBS` and `PKG_CFLAGS` are now retained and the suggestion in https://mac.r-project.org/openmp/; i.e., - `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_.tar.gz` -has a better chance of working on Mac. - - -# data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) - -## POTENTIALLY BREAKING CHANGES - -1.
`fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then be read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. - - Since this is a potentially breaking change, i.e.
existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. - - The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. - -## NEW FEATURES - -1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). - -2. `CsubsetDT` C function is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organization of our C interface will be changed in future. - -3. `print` method for `data.table` gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. 
When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. - -4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion. - -5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. - -6. New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details. - - ```R - # Lazy evaluation - x = 1:10 - data.table::fcase( - x < 5L, 1L, - x >= 5L, 3L, - x == 5L, stop("provided value is an unexpected one!") - ) - # [1] 1 1 1 1 3 3 3 3 3 3 - - dplyr::case_when( - x < 5L ~ 1L, - x >= 5L ~ 3L, - x == 5L ~ stop("provided value is an unexpected one!") - ) - # Error in eval_tidy(pair$rhs, env = default_env) : - # provided value is an unexpected one! 
- - # Benchmark - x = sample(1:100, 3e7, replace = TRUE) # 114 MB - microbenchmark::microbenchmark( - dplyr::case_when( - x < 10L ~ 0L, - x < 20L ~ 10L, - x < 30L ~ 20L, - x < 40L ~ 30L, - x < 50L ~ 40L, - x < 60L ~ 50L, - x > 60L ~ 60L - ), - data.table::fcase( - x < 10L, 0L, - x < 20L, 10L, - x < 30L, 20L, - x < 40L, 30L, - x < 50L, 40L, - x < 60L, 50L, - x > 60L, 60L - ), - times = 5L, - unit = "s") - # Unit: seconds - # expr min lq mean median uq max neval - # dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 - # data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 - ``` - -7. `.SDcols=is.numeric` now works; i.e., `.SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `.SDcols=patterns(...)` can still be used for filtering based on the column names. - -8. Compiler support for OpenMP is now detected during installation, which allows `data.table` to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. - -9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. - -10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. - -11.
`frank` gains `ties.method='last'`, paralleling the same in `base::order` which has been available since R 3.3.0 (April 2016), [#1689](https://github.com/Rdatatable/data.table/issues/1689). Thanks @abudis for the encouragement to accommodate this. - -12. The `keep.rownames` argument in `as.data.table.xts` now accepts a string, which can be used for specifying the column name of the index of the xts input, [#4232](https://github.com/Rdatatable/data.table/issues/4232). Thanks to @shrektan for the request and the PR. - -13. New symbol `.NGRP` available in `j`, [#1206](https://github.com/Rdatatable/data.table/issues/1206). `.GRP` (the group number) was already available taking values from `1` to `.NGRP`. The number of groups, `.NGRP`, might be useful in `j` to calculate a percentage of groups processed so far, or to do something different for the last or penultimate group, for example. - -14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. - -15. A new throttle feature has been introduced to speed up small data tasks that are repeated in a loop, [#3175](https://github.com/Rdatatable/data.table/issues/3175) [#3438](https://github.com/Rdatatable/data.table/issues/3438) [#3205](https://github.com/Rdatatable/data.table/issues/3205) [#3735](https://github.com/Rdatatable/data.table/issues/3735) [#3739](https://github.com/Rdatatable/data.table/issues/3739) [#4284](https://github.com/Rdatatable/data.table/issues/4284) [#4527](https://github.com/Rdatatable/data.table/issues/4527) [#4294](https://github.com/Rdatatable/data.table/issues/4294) [#1120](https://github.com/Rdatatable/data.table/issues/1120). The default throttle of 1024 means that a single thread will be used when nrow<=1024, two threads when nrow<=2048, etc. To change the default, use `setDTthreads(throttle=)`. 
Or use the new environment variable `R_DATATABLE_THROTTLE`. If you use `Sys.setenv()` in a running R session to change this environment variable, be sure to run an empty `setDTthreads()` call afterwards for the change to take effect; see `?setDTthreads`. The word *throttle* is used to convey that the number of threads is restricted (throttled) for small data tasks. Reducing throttle to 1 will turn off throttling and should revert behaviour to past versions (i.e. using many threads even for small data). Increasing throttle to, say, 65536 will utilize multi-threading only for larger datasets. The value 1024 is a guess. We welcome feedback and test results indicating what the best default should be. - -## BUG FIXES - -1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). - -2. `DT[i]` could segfault when `i` is a zero-column `data.table`, [#4060](https://github.com/Rdatatable/data.table/issues/4060). Thanks @shrektan for reporting and fixing. - -3. Dispatch of `first` and `last` functions now properly works again for `xts` objects, [#4053](https://github.com/Rdatatable/data.table/issues/4053). Thanks to @ethanbsmith for reporting. - -4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks also to @mllg for detecting a problem with the initial fix here during the dev release [#4173](https://github.com/Rdatatable/data.table/issues/4173). - -5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks @hongyuanjia and @ColeMiller1 for helping debug an issue in dev with the original fix before release, [#4612](https://github.com/Rdatatable/data.table/issues/4612). - -6. 
`all.equal(DT, y)` no longer errors when `y` is not a data.table, [#4042](https://github.com/Rdatatable/data.table/issues/4042). Thanks to @d-sci for reporting and the PR. - -7. A length 1 `colClasses=NA_character_` would cause `fread` to incorrectly coerce all columns to character, [#4237](https://github.com/Rdatatable/data.table/issues/4237). - -8. An `fwrite` error message could include a garbled number and cause test 1737.5 to fail, [#3492](https://github.com/Rdatatable/data.table/issues/3492). Thanks to @QuLogic for debugging the issue on ARMv7hl, and the PR fixing it. - -9. `fread` improves handling of very small (<1e-300) or very large (>1e+300) floating point numbers on non-x86 architectures (specifically ppc64le and armv7hl). Thanks to @QuLogic for reporting and fixing, [PR#4165](https://github.com/Rdatatable/data.table/pull/4165). - -10. When updating by reference, the use of `get` could result in columns being re-ordered silently, [#4089](https://github.com/Rdatatable/data.table/issues/4089). Thanks to @dmongin for reporting and Cole Miller for the fix. - -11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. - -12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. - -13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). - -14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. - -15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422). - -16. 
Non-equi joins now automatically set `allow.cartesian=TRUE`, [#4489](https://github.com/Rdatatable/data.table/issues/4489). Thanks to @Henrik-P for reporting. - -17. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. - -18. `by=col1:col4` gave an incorrect result if `key(DT)==c("col1","col4")`, [#4285](https://github.com/Rdatatable/data.table/issues/4285). Thanks to @cbilot for reporting, and Cole Miller for the PR. - -19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. - -20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). - -## NOTES - -0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN.
All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. - -1. `as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). - -2. `set2key`, `set2keyv`, and `key2` have been removed, as they have been warning since v1.9.8 (Nov 2016) and halting with helpful message since v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' and quickly superseded by `setindex` and `indices`. - -3. `data.table` now supports messaging in simplified Chinese (locale `zh_CN`). This was the result of a monumental collaboration to translate `data.table`'s roughly 1400 warnings, errors, and verbose messages (about 16,000 words/100,000 characters) over the course of two months from volunteer translators in at least 4 time zones, most of whom are first-time `data.table` contributors and many of whom are first-time OSS contributors! - - A big thanks goes out to @fengqifang, @hongyuanjia, @biobai, @zhiiiyang, @Leo-Lee15, @soappp9527, @amy17519, @Zachary-Wu, @caiquanyou, @dracodoc, @JulianYlli12, @renkun-ken, @Xueliang24, @koohoko, @KingdaShi, @gaospecial, @shrektan, @sunshine1126, @shawnchen1996, @yc0802, @HesperusArcher, and @Emberwhirl, all of whom took time from their busy schedules to translate and review others' translations. Especial thanks goes to @zhiiiyang and @hongyuanjia who went above and beyond in helping to push the project over the finish line, and to @GuangchuangYu who helped to organize the volunteer pool. 
- - `data.table` joins `lubridate` and `nlme` as the only of the top 200 most-downloaded community packages on CRAN to offer non-English messaging, and is the only of the top 50 packages to offer complete support of all messaging. We hope this is a first step in broadening the reach and accessibility of the R ecosystem to more users globally and look forward to working with other maintainers looking to bolster the portability of their packages by offering advice on learnings from this undertaking. - - We would be remiss not to mention the laudable lengths to which the R core team goes to maintain the _much_ larger repository (about 6,000 messages in more than 10 languages) of translations for R itself. - - We will evaluate the feasibility (in terms of maintenance difficulty and CRAN package size limits) of offering support for other languages in later releases. - -4. `fifelse` and `fcase` now notify users that S4 objects (except `nanotime`) are not supported [#4135](https://github.com/Rdatatable/data.table/issues/4135). Thanks to @torema-ed for bringing it to our attention and Morgan Jacob for the PR. - -5. `frank(..., ties.method="random", na.last=NA)` now returns the same random ordering that `base::rank` does, [#4243](https://github.com/Rdatatable/data.table/pull/4243). - -6. The error message when mistakenly using `:=` in `i` instead of `j` has been much improved, [#4227](https://github.com/Rdatatable/data.table/issues/4227). Thanks to Hugh Parsonage for the detailed suggestion. - - ```R - > DT = data.table(A=1:2) - > DT[B:=3] - Error: Operator := detected in i, the first argument inside DT[...], but is only valid in - the second argument, j. Most often, this happens when forgetting the first comma - (e.g. DT[newvar:=5] instead of DT[, new_var:=5]). Please double-check the - syntax. Run traceback(), and debugger() to get a line number. - > DT[, B:=3] - > DT - A B - - 1: 1 3 - 2: 2 3 - ``` - -7. 
Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363). - -8. Changes upstream in R have been accommodated; e.g. `c.POSIXct` now raises `'origin' must be supplied` which impacted `foverlaps`, [#4428](https://github.com/Rdatatable/data.table/pull/4428). - -9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. - -10. `data.table` package binaries built by R version 3 (R3) should only be installed in R3, and similarly `data.table` package binaries built by R4 should only be installed in R4. Otherwise, `package ‘data.table’ was built under R version...` warning will occur which should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds, see news item in v1.12.6 below in this file. To continue to support both R3 and R4, `data.table`'s NAMESPACE file contains a condition on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). - -11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. - -12. The `datatable.old.unique.by.key` option has been removed as per the 4 year schedule detailed in note 10 of v1.12.4 (Oct 2019), note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016).
It has been generating a helpful warning for 2 years, and helpful error for 1 year. - - -# data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) - -## NEW FEATURES - -1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061). - -## BUG FIXES - -1. `frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing. - -2. `DT[, sum(grp), by=grp]` (i.e. aggregating the same column being grouped) could error with `object 'grp' not found`, [#3103](https://github.com/Rdatatable/data.table/issues/3103). Thanks to @cbailiss for reporting. - -## NOTES - -1. Links in the manual were creating warnings when installing HTML, [#4000](https://github.com/Rdatatable/data.table/issues/4000). Thanks to Morgan Jacob. - -2. Adjustments for R-devel (R 4.0.0) which now has reference counting turned on, [#4058](https://github.com/Rdatatable/data.table/issues/4058) [#4093](https://github.com/Rdatatable/data.table/issues/4093). This motivated early release to CRAN because every day CRAN tests every package using the previous day's changes in R-devel; a much valued feature of the R ecosystem. It helps R-core if packages can pass changes in R-devel as soon as possible. 
Thanks to Luke Tierney for the notice, and for implementing reference counting which we look forward to very much. - -3. C internals have been standardized to use `PRI[u|d]64` to print `[u]int64_t`. This solves new warnings from `gcc-8` on Windows with `%lld`, [#4062](https://github.com/Rdatatable/data.table/issues/4062), in many cases already working around `snprintf` on Windows not supporting `%zu`. Release procedures have been augmented to prevent any internal use of `llu`, `lld`, `zu` or `zd`. - -4. `test.data.table()` gains `showProgress=interactive()` to suppress the thousands of `Running test id ...` lines displayed by CRAN checks when there are warnings or errors. - - -# data.table [v1.12.6](https://github.com/Rdatatable/data.table/milestone/18?closed=1) (18 Oct 2019) - -## BUG FIXES - -1. `shift()` on a `nanotime` with the default `fill=NA` now fills a `nanotime` missing value correctly, [#3945](https://github.com/Rdatatable/data.table/issues/3945). Thanks to @mschubmehl for reporting and fixing in PR [#3942](https://github.com/Rdatatable/data.table/pull/3942). - -2. Compilation failed on CRAN's MacOS due to an older version of `zlib.h/zconf.h` which did not have `z_const` defined, [#3939](https://github.com/Rdatatable/data.table/issues/3939). Other open-source projects unrelated to R have experienced this problem on MacOS too. We have followed the common practice of removing `z_const` to support the older `zlib` versions, and data.table's release procedures have gained a `grep` to ensure `z_const` isn't used again by accident in future. The library `zlib` is used for `fwrite`'s new feature of multithreaded compression on-the-fly; see item 3 of 1.12.4 below. - -3. 
A runtime error in `fwrite`'s compression, but only observed so far on Solaris 10 32bit with zlib 1.2.8 (Apr 2013), [#3931](https://github.com/Rdatatable/data.table/issues/3931): `Error -2: one or more threads failed to allocate buffers or there was a compression error.` In case it happens again, this area has been made more robust and the error more detailed. As is often the case, investigating the Solaris problem revealed secondary issues in the same area of the code. In this case, some `%d` in verbose output should have been `%lld`. This obliquity that CRAN's Solaris provides is greatly appreciated. - -4. A leak could occur in the event of an unsupported column type error, or if working memory could only partially be allocated; [#3940](https://github.com/Rdatatable/data.table/issues/3940). Found thanks to `clang`'s Leak Sanitizer (prompted by CRAN's diligent use of latest tools), and two tests in the test suite which tested the unsupported-type error. - -## NOTES - -1. Many thanks to Kurt Hornik for fixing R's S3 dispatch of `rbind` and `cbind` methods, [#3948](https://github.com/Rdatatable/data.table/issues/3948). With `R>=4.0.0` (current R-devel), `data.table` now registers the S3 methods `cbind.data.table` and `rbind.data.table`, and no longer applies the workaround documented in FAQ 2.24. - - -# data.table [v1.12.4](https://github.com/Rdatatable/data.table/milestone/16?closed=1) (03 Oct 2019) - -## NEW FEATURES - -1. `rleid()` functions now support long vectors (length > 2 billion). - -2. `fread()`: - * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, Roy Storey for the initial PR, and Bingjie Qian for testing this feature on a very complicated real-world file. 
- * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR. - * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR. - * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR. - * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/gesistsa/rio) for the inspiration and @MichaelChirico for implementing. - * `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses` it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. 
For example: - - ```R - fread(file, select=c(colD="character", # returns 2 columns: colD,colA - colA="integer64")) - fread(file, select=list(character="colD", # returns 5 columns: colD,8,9,10,colA - integer= 8:10, - character="colA")) - ``` - * gains `tmpdir=` argument which is passed to `tempfile()` whenever a temporary file is needed. Thanks to @mschubmehl for the PR. As before, setting `TMPDIR` (to `/dev/shm` for example) before starting the R session still works too; see `?base::tempdir`. - -3. `fwrite()`: - * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example: - - ```R - DT = data.table(A=rep(1:2, 100e6), B=rep(1:4, 50e6)) - fwrite(DT, "data.csv") # 763MB; 1.3s - fwrite(DT, "data.csv.gz") # 2MB; 1.6s - identical(fread("data.csv.gz"), DT) - ``` - - Note that compression is handled using `zlib` library. In the unlikely event of missing `zlib.h`, on a machine that is compiling `data.table` from sources, one may get `fwrite.c` compilation error `zlib.h: No such file or directory`. As of now, the easiest solution is to install missing library using `sudo apt install zlib1g-dev` (Debian/Ubuntu). Installing R (`r-base-dev`) depends on `zlib1g-dev` so this should be rather uncommon. If it happens to you please upvote related issue [#3872](https://github.com/Rdatatable/data.table/issues/3872). - - * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. 
- - * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing. - - * Now supports type `complex`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). - - * Gains `scipen` [#2020](https://github.com/Rdatatable/data.table/issues/2020), the number 1 most-requested feature [#3189](https://github.com/Rdatatable/data.table/issues/3189). The default is `getOption("scipen")` so that `fwrite` will now respect R's option in the same way as `base::write.csv` and `base::format`, as expected. The parameter and option name have been kept the same as base R's `scipen` for consistency and to aid online search. It stands for 'scientific penalty'; i.e., the number of characters to add to the width within which non-scientific number format is used if it will fit. A high penalty essentially turns off scientific format. We believe that common practice is to use a value of 999, however, if you do use 999, because your data _might_ include very long numbers such as `10^300`, `fwrite` needs to account for the worst case field width in its buffer allocation per thread. This may impact space or time. If you experience slowdowns or unacceptable memory usage, please pass `verbose=TRUE` to `fwrite`, inspect the output, and report the issue. A workaround, until we can determine the best strategy, may be to pass a smaller value to `scipen`, such as 50. We have observed that `fwrite(DT, scipen=50)` appears to write `10^50` accurately, unlike base R. However, this may be a happy accident and not apply generally. Further work may be needed in this area. 
- - ```R - DT = data.table(a=0.0001, b=1000000) - fwrite(DT) - # a,b - # 1e-04,1e+06 - fwrite(DT,scipen=1) - # a,b - # 0.0001,1e+06 - fwrite(DT,scipen=2) - # a,b - # 0.0001,1000000 - - 10^50 - # [1] 1e+50 - options(scipen=50) - 10^50 - # [1] 100000000000000007629769841091887003294964970946560 - fwrite(data.table(A=10^50)) - # A - # 100000000000000000000000000000000000000000000000000 - ``` - -4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). - - ```R - > DT = data.table(A=1:3, B=list(1:2,"foo",3:5)) - > DT - A B - - 1: 1 1,2 - 2: 2 foo - 3: 3 3,4,5 - > - # The following all accomplish the same assignment: - > DT[2, B:=letters[9:13]] # was error, now works - > DT[2, B:=.(letters[9:13])] # was error, now works - > DT[2, B:=.(list(letters[9:13]))] # .(list()) was needed, still works - > DT - A B - - 1: 1 1,2 - 2: 2 i,j,k,l,m - 3: 3 3,4,5 - ``` - -5. `print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. - -6. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. - - ```R - DT = setDT(lapply(1:100, function(i) sample(c(rnorm(9e6), rep(NA_real_, 1e6))))) - format(object.size(DT), units="GB") ## 7.5 Gb - zoo::na.locf(DT, na.rm=FALSE) ## zoo 53.518s - setDTthreads(1L) - nafill(DT, "locf") ## DT 1 thread 7.562s - setDTthreads(0L) - nafill(DT, "locf") ## DT 40 threads 0.605s - setnafill(DT, "locf") ## DT in-place 0.367s - ``` - -7. New variable `.Last.updated` (similar to R's `.Last.value`) contains the number of rows affected by the most recent `:=` or `set()`, [#1885](https://github.com/Rdatatable/data.table/issues/1885). For details see `?.Last.updated`. - -8. 
`between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). `between()` gains `check=` which checks `any(lower>upper)`; off by default for speed in particular for type character. - -9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. - -10. `on=.NATURAL` (or alternatively `X[on=Y]` [#3621](https://github.com/Rdatatable/data.table/issues/3621)) joins two tables on their common column names, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. As before, when `on=` is not provided, `X` must have a key and the key columns are used to join (like rownames, but multi-column and multi-type). - -11. `as.data.table` gains `key` argument mirroring its use in `setDT` and `data.table`, [#890](https://github.com/Rdatatable/data.table/issues/890). As a byproduct, the arguments of `as.data.table.array` have changed order, which could affect code relying on positional arguments to this method. Thanks @cooldome for the suggestion and @MichaelChirico for implementation. - -12. `merge.data.table` is now exported, [#2618](https://github.com/Rdatatable/data.table/pull/2618). We realize that S3 methods should not ordinarily be exported. Rather, the method should be invoked via S3 dispatch. 
But users continue to request its export, perhaps because of intricacies relating to the fact that data.table inherits from data.frame, there are two arguments to `merge()` but S3 dispatch applies just to the first, and a desire to explicitly call `data.table::merge.data.table` from package code. Thanks to @AndreMikulec for the most recent request. - -13. New rolling function to calculate rolling sum has been implemented and exported, see `?frollsum`, [#2778](https://github.com/Rdatatable/data.table/issues/2778). - -14. `setkey` to an existing index now uses the index, [#2889](https://github.com/Rdatatable/data.table/issues/2889). Thanks to @MichaelChirico for suggesting and @saraswatmks for the PR. - -15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`. - -16. `as.data.table` now unpacks columns in a `data.frame` which are themselves a `data.frame` or `matrix`. This need arises when parsing JSON, a corollary in [#3369](https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752). Bug fix 19 in v1.12.2 (see below) added a helpful error (rather than segfault) to detect such invalid `data.table`, and promised that `as.data.table()` would unpack these columns in the next release (i.e. this release) so that the invalid `data.table` is not created in the first place. Further, `setDT` now warns if it observes such columns and suggests using `as.data.table` instead, [#3760](https://github.com/Rdatatable/data.table/issues/3760). - -17. `CJ` has been ported to C and parallelized, thanks to a PR by Michael Chirico, [#3596](https://github.com/Rdatatable/data.table/pull/3596). 
All types benefit, but, as in many `data.table` operations, factors benefit more than character. - - ```R - # default 4 threads on a laptop with 16GB RAM and 8 logical CPU - - ids = as.vector(outer(LETTERS, LETTERS, paste0)) - system.time( CJ(ids, 1:500000) ) # 3.9GB; 340m rows - # user system elapsed (seconds) - # 3.000 0.817 3.798 # was - # 1.800 0.832 2.190 # now - - # ids = as.factor(ids) - system.time( CJ(ids, 1:500000) ) # 2.6GB; 340m rows - # user system elapsed (seconds) - # 1.779 0.534 2.293 # was - # 0.357 0.763 0.292 # now - ``` - -18. New function `fcoalesce(...)` has been written in C, and is multithreaded for `numeric` and `factor`. It replaces missing values according to a prioritized list of candidates (as per SQL COALESCE, `dplyr::coalesce`, and `hutils::coalesce`), [#3424](https://github.com/Rdatatable/data.table/issues/3424). It accepts any number of vectors in several forms. For example, given three vectors `x`, `y`, and `z`, where each `NA` in `x` is to be replaced by the corresponding value in `y` if that is non-NA, else the corresponding value in `z`, the following equivalent forms are all accepted: `fcoalesce(x,y,z)`, `fcoalesce(x,list(y,z))`, and `fcoalesce(list(x,y,z))`. Being a new function, its behaviour is subject to change particularly for type `list`, [#3712](https://github.com/Rdatatable/data.table/issues/3712). - - ```R - # default 4 threads on a laptop with 16GB RAM and 8 logical CPU - N = 100e6 - x = replicate(5, {x=sample(N); x[sample(N, N/2)]=NA; x}, simplify=FALSE) # 2GB - y1 = do.call(dplyr::coalesce, x) - y2 = do.call(hutils::coalesce, x) - y3 = do.call(data.table::fcoalesce, x) - # user system elapsed (seconds) - # 4.935 1.876 6.810 # dplyr::coalesce - # 3.122 0.831 3.956 # hutils::coalesce - # 0.915 0.099 0.379 # data.table::fcoalesce - identical(y1,y2) && identical(y1,y3) - # TRUE - ``` - -19.
Type `complex` is now supported by `setkey`, `setorder`, `:=`, `by=`, `keyby=`, `shift`, `dcast`, `frank`, `rowid`, `rleid`, `CJ`, `fcoalesce`, `unique`, and `uniqueN`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). Thanks to Gareth Ward and Elio Campitelli for their reports and input. Sorting `complex` is achieved the same way as base R; i.e., first by the real part then by the imaginary part (as if the `complex` column were two separate columns of `double`). There is no plan to support joining/merging on `complex` columns until a user demonstrates a need for that. - -20. `setkey`, `[key]by=` and `on=` in verbose mode (`options(datatable.verbose=TRUE)`) now detect any columns inheriting from `Date` which are stored as 8 byte double, test if any fractions are present, and if not suggest using a 4 byte integer instead (such as `data.table::IDate`) to save space and time, [#1738](https://github.com/Rdatatable/data.table/issues/1738). In future this could be upgraded to `message` or `warning` depending on feedback. - -21. New function `fifelse(test, yes, no, na)` has been implemented in C by Morgan Jacob, [#3657](https://github.com/Rdatatable/data.table/issues/3657) and [#3753](https://github.com/Rdatatable/data.table/issues/3753). It is comparable to `base::ifelse`, `dplyr::if_else`, `hutils::if_else`, and (forthcoming) [`vctrs::if_else()`](https://vctrs.r-lib.org/articles/stability.html#ifelse). It returns a vector of the same length as `test` but unlike `base::ifelse` the output type is consistent with those of `yes` and `no`. Please see `?data.table::fifelse` for more details. 
- - ```R - # default 4 threads on a laptop with 16GB RAM and 8 logical CPU - x = sample(c(TRUE,FALSE), 3e8, replace=TRUE) # 1GB - microbenchmark::microbenchmark( - base::ifelse(x, 7L, 11L), - dplyr::if_else(x, 7L, 11L), - hutils::if_else(x, 7L, 11L), - data.table::fifelse(x, 7L, 11L), - times = 5L, unit="s" - ) - # Unit: seconds - # expr min med max neval - # base::ifelse(x, 7L, 11L) 8.5 8.6 8.8 5 - # dplyr::if_else(x, 7L, 11L) 9.4 9.5 9.7 5 - # hutils::if_else(x, 7L, 11L) 2.6 2.6 2.7 5 - # data.table::fifelse(x, 7L, 11L) 1.5 1.5 1.6 5 # setDTthreads(1) - # data.table::fifelse(x, 7L, 11L) 0.8 0.8 0.9 5 # setDTthreads(2) - # data.table::fifelse(x, 7L, 11L) 0.4 0.4 0.5 5 # setDTthreads(4) - ``` - -22. `transpose` gains `keep.names=` and `make.names=` arguments, [#1886](https://github.com/Rdatatable/data.table/issues/1886). Previously, column names were dropped and there was no way to keep them. `keep.names="rn"` keeps the column names and puts them in the `"rn"` column of the result. Similarly, `make.names="rn"` uses column `"rn"` as the column names of the result. Both arguments are `NULL` by default for backwards compatibility. As these new arguments are new, they are subject to change in future according to community feedback. Thanks to @ghost for the request. - -23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). - -24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. - -25. `setnames(DT, old, new)` now omits any `old==new` to save redundant key and index name updates, [#3783](https://github.com/Rdatatable/data.table/issues/3783). `setnames(DT, new)` (i.e. not providing `old`) already omitted any column name updates where `names(DT)==new`; e.g. `setnames(DT, gsub('^_', '', names(DT)))` exits early if no columns start with `_`. - -26. 
`[[` by group is now optimized for regular vectors (not type list), [#3209](https://github.com/Rdatatable/data.table/issues/3209). Thanks @renkun-ken for the suggestion. `[` by group was already optimized. Please file a feature request if you would like this optimization for list columns. - -27. New function `frollapply` for rolling computation of arbitrary R functions (caveat: input `x` is coerced to numeric beforehand, and the function must return a scalar numeric value). The API is consistent to extant rolling functions `frollmean` and `frollsum`; note that it will generally be slower than those functions because (1) the known functions use our optimized internal C implementation and (2) there is no thread-safe API to R's C `eval`. Nevertheless `frollapply` is faster than corresponding `base`-only and `zoo` versions: - - ```R - set.seed(108) - x = rnorm(1e6); n = 1e3 - base_rollapply = function(x, n, FUN) { - nx = length(x) - ans = rep(NA_real_, nx) - for (i in n:nx) ans[i] = FUN(x[(i-n+1):i]) - ans - } - system.time(base_rollapply(x, n, mean)) - system.time(zoo::rollapplyr(x, n, function(x) mean(x), fill=NA)) - system.time(zoo::rollmeanr(x, n, fill=NA)) - system.time(frollapply(x, n, mean)) - system.time(frollmean(x, n)) - - ### fun mean sum median - # base_rollapply 8.815 5.151 60.175 - # zoo::rollapply 34.373 27.837 88.552 - # zoo::roll[fun] 0.215 0.185 NA ## median not fully supported - # frollapply 5.404 1.419 56.475 - # froll[fun] 0.003 0.002 NA ## median not yet supported - ``` - -28. `setnames()` now accepts functions in `old=` and `new=`, [#3703](https://github.com/Rdatatable/data.table/issues/3703). Thanks @smingerson for the feature request and @shrektan for the PR. - - ```R - DT = data.table(a=1:3, b=4:6, c=7:9) - setnames(DT, toupper) - names(DT) - # [1] "A" "B" "C" - setnames(DT, c(1,3), tolower) - names(DT) - # [1] "a" "B" "c" - ``` - -29. `:=` and `set()` now use zero-copy type coercion. 
Accordingly, `DT[..., integerColumn:=0]` and `set(DT,i,j,0)` no longer warn about the `0` ('numeric') needing to be `0L` ('integer') because there is no longer any time or space used for this coercion. The old long warning was off-putting to new users ("what and why L?"), whereas advanced users appreciated the old warning so they could avoid the coercion. Although the time and space for one coercion in a single call is unmeasurably small, when placed in a loop the small overhead of any allocation on R's heap could start to become noticeable (more so for `set()` whose purpose is low-overhead looping). Further, when assigning a value across columns of varying types, it could be inconvenient to supply the correct type for every column. Hence, zero-copy coercion was introduced to satisfy all these requirements. A warning is still issued, as before, when fractional data is discarded; e.g. when 3.14 is assigned to an integer column. Zero-copy coercion applies to length>1 vectors as well as length-1 vectors. - -## BUG FIXES - -1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. - -2. `keyby=colName` could use the wrong index and return incorrect results if both `colName` and `colNameExtra` (where `colName` is a leading subset of characters of `colNameExtra`) are column names and an index exists on `colNameExtra`, [#3498](https://github.com/Rdatatable/data.table/issues/3498). Thanks to Xianying Tan for the detailed report and pinpointing the source line at fault. - -3. 
A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report. - -4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288). - -5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example. - -6. Printing could occur unexpectedly when code is run with `source`, [#2369](https://github.com/Rdatatable/data.table/issues/2369). Thanks to @jan-glx for the report and reproducible example. - -7. Grouping by `NULL` on zero rows `data.table` now behaves consistently to non-zero rows `data.table`, [#3530](https://github.com/Rdatatable/data.table/issues/3530). Thanks to @SymbolixAU for the report and reproducible example. - -8. GForce optimization of `median` did not retain the class; e.g. `median` of `Date` or `POSIXct` would return a raw number rather than retain the date class, [#3079](https://github.com/Rdatatable/data.table/issues/3079). Thanks to @Henrik-P for reporting. - -9. `DT[, format(mean(date),"%b-%Y"), by=group]` could fail with `invalid 'trim' argument`, [#1876](https://github.com/Rdatatable/data.table/issues/1876). Thanks to Ross Holmberg for reporting. - -10.
`externalVar=1:5; DT[, mean(externalVar), by=group]` could return incorrect results rather than a constant (`3` in this example) for each group, [#875](https://github.com/Rdatatable/data.table/issues/875). GForce optimization was being applied incorrectly to the `mean` without realizing `externalVar` was not a column. - -11. `test.data.table()` now passes in non-English R sessions, [#630](https://github.com/Rdatatable/data.table/issues/630) [#3039](https://github.com/Rdatatable/data.table/issues/3039). Each test still checks that the number of warnings and/or errors produced is correct. However, a message is displayed suggesting to restart R with `LANGUAGE=en` in order to test that the text of the warning and/or error messages are as expected, too. - -12. Joining a double column in `i` containing say 1.3, with an integer column in `x` containing say 1, would result in the 1.3 matching to 1, [#2592](https://github.com/Rdatatable/data.table/issues/2592), and joining a factor column to an integer column would match the factor's integers rather than error. The type coercion logic has been revised and strengthened. Many thanks to @MarkusBonsch for reporting and fixing. Joining a character column in `i` to a factor column in `x` is now faster and retains the character column in the result rather than coercing it to factor. Joining an integer column in `i` to a double column in `x` now retains the integer type in the result rather than coercing the integers into the double type. Logical columns may now only be joined to logical columns, other than all-NA columns which are coerced to the matching column's type. All coercions are reported in verbose mode: `options(datatable.verbose=TRUE)`. - -13. Attempting to recycle 2 or more items into an existing `list` column now gives the intended helpful error rather than `Internal error: recycle length error not caught earlier.`, [#3543](https://github.com/Rdatatable/data.table/issues/3543). 
Thanks to @MichaelChirico for finding and reporting. - -14. Subassigning using `$<-` to a `data.table` embedded in a list column of a single-row `data.table` could fail, [#3474](https://github.com/Rdatatable/data.table/issues/3474). Note that `$<-` is not recommended; please use `:=` instead which already worked in this case. Thanks to Jakob Richter for reporting. - -15. `rbind` and `rbindlist` of zero-row items now retain (again) the unused levels of any (zero-length) factor columns, [#3508](https://github.com/Rdatatable/data.table/issues/3508). This was a regression in v1.12.2 just for zero-row items. Unused factor levels were already retained for items having `nrow>=1`. Thanks to Gregory Demin for reporting. - -16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a regression in v1.12.2. Thanks to Damian Betebenner for reporting. Also a related segfault when recycling a length-1 factor column, [#3662](https://github.com/Rdatatable/data.table/issues/3662). - -17. `example(":=", local=TRUE)` now works rather than error, [#2972](https://github.com/Rdatatable/data.table/issues/2972). Thanks @vlulla for the report. - -18. `rbind.data.frame` on `IDate` columns changed the column from `integer` to `double`, [#2008](https://github.com/Rdatatable/data.table/issues/2008). Thanks to @rmcgehee for reporting. - -19. `merge.data.table` now retains any custom classes of the first argument, [#1378](https://github.com/Rdatatable/data.table/issues/1378). Thanks to @michaelquinn32 for reopening. - -20. `c`, `seq` and `mean` of `ITime` objects now retain the `ITime` class via new `ITime` methods, [#3628](https://github.com/Rdatatable/data.table/issues/3628). Thanks @UweBlock for reporting.
The `cut` and `split` methods for `ITime` have been removed since the default methods work, [#3630](https://github.com/Rdatatable/data.table/pull/3630). - -21. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). - -22. Adding a `list` column using `cbind`, `as.data.table`, or `data.table` now works rather than treating the `list` as if it were a set of columns and introducing an invalid NA column name, [#3471](https://github.com/Rdatatable/data.table/pull/3471). However, please note that using `:=` to add columns is preferred. - - ```R - cbind( data.table(1:2), list(c("a","b"),"a") ) - # V1 V2 NA # v1.12.2 and before - # - # 1: 1 a a - # 2: 2 b a - # - # V1 V2 # v1.12.4+ - # - # 1: 1 a,b - # 2: 2 a - ``` - -23. Incorrect sorting/grouping results due to a bug in Intel's `icc` compiler 2019 (Version 19.0.4.243 Build 20190416) has been worked around thanks to a report and fix by Sebastian Freundt, [#3647](https://github.com/Rdatatable/data.table/issues/3647). Please run `data.table::test.data.table()`. If that passes, your installation does not have the problem. - -24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. - -25. Slight fix to the logic for auto-naming the `by` clause for using a custom function like `evaluate` to now be named `evaluate` instead of the name of the first symbolic argument, [#3758](https://github.com/Rdatatable/data.table/issues/3758). - -26. Column binding of zero column `data.table` will now work as expected, [#3334](https://github.com/Rdatatable/data.table/issues/3334). Thanks to @kzenstratus for the report. - -27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). 
Thanks to @mlandry22-h2o for the report. - -28. From v1.12.0 `between()` and `%between%` interpret missing values in `lower=` or `upper=` as unlimited bounds. A new parameter `NAbounds` has been added to achieve the old behaviour of returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks @cguill95 for reporting. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported too. - -29. `integer64` defined on a subset of a new column would leave "gibberish" on the remaining rows, [#3723](https://github.com/Rdatatable/data.table/issues/3723). A bug in `rbindlist` with the same root cause was also fixed, [#1459](https://github.com/Rdatatable/data.table/issues/1459). Thanks @shrektan and @jangorecki for the reports. - -30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. - -31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue. - -32. `x[, round(.SD, 1)]` and similar operations on the whole of `.SD` could return a locked result, incorrectly preventing `:=` on the result, [#2245](https://github.com/Rdatatable/data.table/issues/2245). Thanks @grayskripko for raising. - -33. 
Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), and [#2036](https://github.com/Rdatatable/data.table/issues/2036). Thanks @franknarf1, @MichaelChirico, and @TonyBonen, for the reports. - -34. `DT[, i-1L, with=FALSE]` would misinterpret the minus sign and return an incorrect result, [#2019](https://github.com/Rdatatable/data.table/issues/2109). Thanks @cguill95 for the report. - -35. `DT[id==1, DT2[.SD, on="id"]]` (i.e. joining from `.SD` in `j`) could incorrectly fail in some cases due to `.SD` being locked, [#1926](https://github.com/Rdatatable/data.table/issues/1926), and when updating-on-join with factors [#3559](https://github.com/Rdatatable/data.table/issues/3559) [#2099](https://github.com/Rdatatable/data.table/issues/2099). Thanks @franknarf1 and @Henrik-P for the reports and for diligently tracking use cases for almost 3 years! - -36. `as.IDate.POSIXct` returned `NA` for UTC times before Dec 1901 and after Jan 2038, [#3780](https://github.com/Rdatatable/data.table/issues/3780). Thanks @gschett for the report. - -37. `rbindlist` now returns correct idcols for lists with different length vectors, [#3785](https://github.com/Rdatatable/data.table/issues/3785), [#3786](https://github.com/Rdatatable/data.table/pull/3786). Thanks to @shrektan for the report and fix. - -38. `DT[ , !rep(FALSE, ncol(DT)), with=FALSE]` correctly returns the full table, [#3013](https://github.com/Rdatatable/data.table/issues/3013) and [#2917](https://github.com/Rdatatable/data.table/issues/2917). Thanks @alexnss and @DavidArenburg for the reports. - -39. `shift(x, 0:1, type='lead', give.names=TRUE)` uses `lead` in all returned column names, [#3832](https://github.com/Rdatatable/data.table/issues/3832). Thanks @daynefiler for the report. - -40. 
Subtracting two `POSIXt` objects by group could lead to incorrect results because the `base` method internally calls `difftime` with `units='auto'`; `data.table` does not notice if the chosen units differ by group and only the last group's `units` attribute was retained, [#3694](https://github.com/Rdatatable/data.table/issues/3694) and [#761](https://github.com/Rdatatable/data.table/issues/761). To surmount this, we now internally force `units='secs'` on all `POSIXt-POSIXt` calls (reported when `verbose=TRUE`); generally we recommend calling `difftime` directly instead. Thanks @oliver-oliver and @boethian for the reports. - -41. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), [#2036](https://github.com/Rdatatable/data.table/issues/2036), and [#2946](https://github.com/Rdatatable/data.table/issues/2946). Thanks @franknarf1, @MichaelChirico, @TonyBonen, and Steffen J. (StackOverflow) for the reports. - -42. `DT[...,by={...}]` now handles expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. - -43. `:=` could change a `data.table` creation statement in the body of the function calling it, or a variable in calling scope, [#3890](https://github.com/Rdatatable/data.table/issues/3890). Many thanks to @kirillmayantsev for the detailed reports. - -44. Grouping could create a `malformed factor` and/or segfault when the factors returned by each group did not have identical levels, [#2199](https://github.com/Rdatatable/data.table/issues/2199) and [#2522](https://github.com/Rdatatable/data.table/issues/2522). Thanks to Václav Hausenblas, @franknarf1, @ben519, and @Henrik-P for reporting. - -45. 
`rbindlist` (and printing a `data.table` with over 100 rows because that uses `rbindlist(head, tail)`) could error with `malformed factor` for unordered factor columns containing a used `NA_character_` level, [#3915](https://github.com/Rdatatable/data.table/issues/3915). This is an unusual input for unordered factors because NA_integer_ is recommended by default in R. Thanks to @sindribaldur for reporting. - -46. Adding a `list` column containing an item of type `list` to a one row `data.table` could fail, [#3626](https://github.com/Rdatatable/data.table/issues/3626). Thanks to Jakob Richter for reporting. - -## NOTES - -1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. - -2. Adding a new column by reference using `set()` on a `data.table` loaded from binary file now gives a more helpful error message, [#2996](https://github.com/Rdatatable/data.table/issues/2996). Thanks to Joseph Burling for reporting. - - ``` - This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed - manually (e.g. using structure()). Please run setDT() or alloc.col() on it first (to pre-allocate - space for new columns) before adding new columns by reference to it. - ``` - -3. `setorder` on a superset of a keyed `data.table`'s key now retains its key, [#3456](https://github.com/Rdatatable/data.table/issues/3456). For example, if `a` is the key of `DT`, `setorder(DT, a, -v)` will leave `DT` keyed by `a`. - -4. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`.
We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. - -5. When loading a `data.table` from disk (e.g. with `readRDS`), best practice is to run `setDT()` on the new object to assure it is correctly allocated memory for new column pointers. Barring this, unexpected behavior can follow; for example, if you assign a new column to `DT` from a function `f`, the new columns will only be assigned within `f` and `DT` will be unchanged. The `verbose` messaging in this situation is now more helpful, [#1729](https://github.com/Rdatatable/data.table/issues/1729). Thanks @vspinu for sharing his experience to spur this. - -6. New vignette _Using `.SD` for Data Analysis_, a deep dive into use cases for the `.SD` variable to help illuminate this topic which we've found to be a sticking point for beginning and intermediate `data.table` users, [#3412](https://github.com/Rdatatable/data.table/issues/3412). - -7. Added a note to `?frank` clarifying that ranking is being done according to C sorting (i.e., like `forder`), [#2328](https://github.com/Rdatatable/data.table/issues/2328). Thanks to @cguill95 for the request. - -8. Historically, `dcast` and `melt` were built as enhancements to `reshape2`'s own `dcast`/`melt`. We removed dependency on `reshape2` in v1.9.6 but maintained some backward compatibility. As that package has been superseded since December 2017, we will begin to formally complete the split from `reshape2` by removing some last vestiges. In particular we now warn when redirecting to `reshape2` methods and will later error before ultimately completing the split; see [#3549](https://github.com/Rdatatable/data.table/issues/3549) and [#3633](https://github.com/Rdatatable/data.table/issues/3633). 
We thank the `reshape2` authors for their original inspiration for these functions, and @ProfFancyPants for testing and reporting regressions in dev which have been fixed before release. - -9. `DT[col]` where `col` is a column containing row numbers of itself to select, now suggests the correct syntax (`DT[(col)]` or `DT[DT$col]`), [#697](https://github.com/Rdatatable/data.table/issues/697). This expands the message introduced in [#1884](https://github.com/Rdatatable/data.table/issues/1884) for the case where `col` is type `logical` and `DT[col==TRUE]` is suggested. - -10. The `datatable.old.unique.by.key` option has been warning for 1 year that it is deprecated: `... Please stop using it and pass by=key(DT) instead for clarity ...`. This warning is now upgraded to error as per the schedule in note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). In June 2020 the option will be removed. - -11. We intend to deprecate the `datatable.nomatch` option, [more info](https://github.com/Rdatatable/data.table/pull/3578/files). A message is now printed upon use of the option (once per session) as a first step. It asks you to please stop using the option and to pass `nomatch=NULL` explicitly if you require inner join. Outer join (`nomatch=NA`) has always been the default because it is safer; it does not drop missing data silently. The problem is that the option is global; i.e., if a user changes the default using this option for their own use, that can change the behavior of joins inside packages that use `data.table` too. This is the only `data.table` option with this concern. - -12. The test suite of 9k tests now runs with three R options on: `warnPartialMatchArgs`, `warnPartialMatchAttr`, and `warnPartialMatchDollar`. This ensures that we don't rely on partial argument matching in internal code, for robustness and efficiency, and so that users can turn these options on for their code in production, [#3664](https://github.com/Rdatatable/data.table/issues/3664). 
Thanks to Vijay Lulla for the suggestion, and Michael Chirico for fixing 48 internal calls to `attr()` which were missing `exact=TRUE`, for example. Thanks to R-core for adding these options to R 2.6.0 (Oct 2007). - -13. `test.data.table()` could fail if the `datatable.integer64` user option was set, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for reporting. - -14. The warning message when using `keyby=` together with `:=` is clearer, [#2763](https://github.com/Rdatatable/data.table/issues/2763). Thanks to @eliocamp. - -15. `first` and `last` gain an explicit `n=1L` argument so that it's clear the default is 1, and their almost identical manual pages have been merged into one. - -16. Rolling functions (`?froll`) coerce `logical` input to `numeric` (instead of failing) to mimic the behavior of `integer` input. - -17. The warning message when using `strptime` in `j` has been improved, [#2068](https://github.com/Rdatatable/data.table/issues/2068). Thanks to @tdhock for the report. - -18. Added a note to `?setkey` clarifying that `setkey` always uses C-locale sorting (as has been noted in `?setorder`). Thanks @JBreidaks for the report in [#2114](https://github.com/Rdatatable/data.table/issues/2114). - -19. `hour()`/`minute()`/`second()` are much faster for `ITime` input, [#3518](https://github.com/Rdatatable/data.table/issues/3158). - -20. New alias `setalloccol` for `alloc.col`, [#3475](https://github.com/Rdatatable/data.table/issues/3475). For consistency with `set*` prefixes for functions that operate in-place (like `setkey`, `setorder`, etc.). `alloc.col` is not going to be deprecated but we recommend using `setalloccol`. - -21. `dcast` no longer emits a message when `value.var` is missing but `fun.aggregate` is explicitly set to `length` (since `value.var` is arbitrary in this case), [#2980](https://github.com/Rdatatable/data.table/issues/2980). - -22. 
Optimized `mean` of `integer` columns no longer warns about a coercion to numeric, [#986](https://github.com/Rdatatable/data.table/issues/986). Thanks @dgrtwo for his [YouTube tutorial at 3:01](https://youtu.be/AmE4LXPQErM?t=175) where the warning occurs. - -23. Using `first` and `last` function on `POSIXct` object no longer loads `xts` namespace, [#3857](https://github.com/Rdatatable/data.table/issues/3857). `first` on empty `data.table` returns empty `data.table` now [#3858](https://github.com/Rdatatable/data.table/issues/3858). - -24. Added some clarifying details about what happens when a shell command is used in `fread`, [#3877](https://github.com/Rdatatable/data.table/issues/3877). Thanks Brian for the StackOverflow question which highlighted the lack of explanation here. - -25. We continue to encourage packages to `Import` rather than `Depend` on `data.table`, [#3076](https://github.com/Rdatatable/data.table/issues/3076). To prevent the growth rate in new packages using `Depend`, we have requested that CRAN apply a small patch we provided to prevent new submissions using `Depend`. If this is accepted, the error under `--as-cran` will be as follows. The existing 73 packages using `Depend` will continue to pass OK until they next update, at which point they will be required to change from `Depend` to `Import`. - - ``` - R CMD check --as-cran - ... - * checking package dependencies ... ERROR - - data.table should be in Imports not Depends. Please contact its - maintainer for more information. - ``` - - -# data.table [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) - -## NEW FEATURES - -1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. 
In rare cases where you need to recycle a length>1 vector, please use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 774 CRAN and Bioconductor packages using `data.table` were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead, [#3310](https://github.com/Rdatatable/data.table/pull/3310). Upon agreement we went ahead. Many thanks to all those maintainers for already updating on CRAN, [#3347](https://github.com/Rdatatable/data.table/pull/3347). - -2. `foverlaps` now supports `type="equal"`, [#3416](https://github.com/Rdatatable/data.table/issues/3416) and part of [#3002](https://github.com/Rdatatable/data.table/issues/3002). - -3. The number of logical CPUs used by default has been reduced from 100% to 50%. The previous 100% default was reported to cause significant slow downs when other non-trivial processes were also running, [#3395](https://github.com/Rdatatable/data.table/issues/3395) [#3298](https://github.com/Rdatatable/data.table/issues/3298). Two new optional environment variables (`R_DATATABLE_NUM_PROCS_PERCENT` & `R_DATATABLE_NUM_THREADS`) control this default. `setDTthreads()` gains `percent=` and `?setDTthreads` has been significantly revised. The output of `getDTthreads(verbose=TRUE)` has been expanded. The environment variable `OMP_THREAD_LIMIT` is now respected ([#3300](https://github.com/Rdatatable/data.table/issues/3300)) in addition to `OMP_NUM_THREADS` as before. - -4. `rbind` and `rbindlist` now retain the position of duplicate column names rather than grouping them together [#3373](https://github.com/Rdatatable/data.table/issues/3373), fill length 0 columns (including NULL) with NA with warning [#1871](https://github.com/Rdatatable/data.table/issues/1871), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524). 
Thanks to Kun Ren for the requests which arose when parsing JSON. - -5. `rbindlist`'s `use.names=` default has changed from `FALSE` to `"check"`. This emits a message if the column names of each item are not identical and then proceeds as if `use.names=FALSE` for backwards compatibility; i.e., bind by column position not by column name. The `rbind` method for `data.table` already sets `use.names=TRUE` so this change affects `rbindlist` only and not `rbind.data.table`. To stack differently named columns together silently (the previous default behavior of `rbindlist`), it is now necessary to specify `use.names=FALSE` for clarity to readers of your code. Thanks to Clayton Stanley who first raised the issue [here](https://lists.r-forge.r-project.org/pipermail/datatable-help/2014-April/002480.html). To aid pinpointing the calls to `rbindlist` that need attention, the message can be turned to error using `options(datatable.rbindlist.check="error")`. This option also accepts `"warning"`, `"message"` and `"none"`. In this release the message is suppressed for default column names (`"V[0-9]+"`); the next release will emit the message for those too. In 6 months the default will be upgraded from message to warning. There are two slightly different messages. They are helpful, include context and point to this news item : - - ``` - Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with - NA (NULL for list columns), or use.names=FALSE to ignore column names. - See news item 5 in v1.12.2 for options to control this message. - - Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE - to match by column name, or use.names=FALSE to ignore column names. - See news item 5 in v1.12.2 for options to control this message. - ``` - -6. `fread` gains `keepLeadingZeros`, [#2999](https://github.com/Rdatatable/data.table/issues/2999). 
By default `FALSE` so that, as before, a field containing `001` is interpreted as the integer 1, otherwise the character string `"001"`. The default may be changed using `options(datatable.keepLeadingZeros=TRUE)`. Many thanks to @marc-outins for the PR. - -## BUG FIXES - -1. `rbindlist()` of a malformed factor which is missing a levels attribute is now a helpful error rather than a cryptic error about `STRING_ELT`, [#3315](https://github.com/Rdatatable/data.table/issues/3315). Thanks to Michael Chirico for reporting. - -2. Forgetting `type=` in `shift(val, "lead")` would segfault, [#3354](https://github.com/Rdatatable/data.table/issues/3354). A helpful error is now produced to indicate `"lead"` is being passed to `n=` rather than the intended `type=` argument. Thanks to @SymbolixAU for reporting. - -3. The default print output (top 5 and bottom 5 rows) when ncol>255 could display the columns in the wrong order, [#3306](https://github.com/Rdatatable/data.table/issues/3306). Thanks to Kun Ren for reporting. - -4. Grouping by unusual column names such as `by='string_with_\\'` and `keyby="x y"` could fail, [#3319](https://github.com/Rdatatable/data.table/issues/3319) [#3378](https://github.com/Rdatatable/data.table/issues/3378). Thanks to @HughParsonage for reporting and @MichaelChirico for the fixes. - -5. `foverlaps()` could return incorrect results for `POSIXct <= 1970-01-01`, [#3349](https://github.com/Rdatatable/data.table/issues/3349). Thanks to @lux5 for reporting. - -6. `dcast.data.table` now handles functions passed to `fun.aggregate=` via a variable; e.g., `funs <- list(sum, mean); dcast(..., fun.aggregate=funs)`, [#1974](https://github.com/Rdatatable/data.table/issues/1974) [#1369](https://github.com/Rdatatable/data.table/issues/1369) [#2064](https://github.com/Rdatatable/data.table/issues/2064) [#2949](https://github.com/Rdatatable/data.table/issues/2949). Thanks to @sunbee, @Ping2016, @smidelius and @d0rg0ld for reporting. - -7.
Some non-equijoin cases could segfault, [#3401](https://github.com/Rdatatable/data.table/issues/3401). Thanks to @Gayyam for reporting. - -8. `dcast.data.table` could sort rows containing `NA` incorrectly, [#2202](https://github.com/Rdatatable/data.table/issues/2202). Thanks to @Galileo-Galilei for the report. - -9. Sorting, grouping and finding unique values of a numeric column containing at most one finite value (such as `c(Inf,0,-Inf)`) could return incorrect results, [#3372](https://github.com/Rdatatable/data.table/issues/3372) [#3381](https://github.com/Rdatatable/data.table/issues/3381); e.g., `data.table(A=c(Inf,0,-Inf), V=1:3)[,sum(V),by=A]` would treat the 3 rows as one group. This was a regression in 1.12.0. Thanks to Nicolas Ampuero for reporting. - -10. `:=` with quoted expression and dot alias now works as expected, [#3425](https://github.com/Rdatatable/data.table/pull/3425). Thanks to @franknarf1 for raising and @jangorecki for the PR. - -11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause. - -12. `rbind` and `rbindlist(..., use.names=TRUE)` with over 255 columns could return the columns in a random order, [#3373](https://github.com/Rdatatable/data.table/issues/3373). The contents and name of each column were correct but the order that the columns appeared in the result might not have matched the original input. - -13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), and support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819). - -14.
`NULL` columns are caught and error appropriately rather than segfault in some cases, [#2303](https://github.com/Rdatatable/data.table/issues/2303) [#2305](https://github.com/Rdatatable/data.table/issues/2305). Thanks to Hugh Parsonage and @franknarf1 for reporting. - -15. `melt` would error with 'factor malformed' or segfault in the presence of duplicate column names, [#1754](https://github.com/Rdatatable/data.table/issues/1754). Many thanks to @franknarf1, William Marble, wligtenberg and Toby Dylan Hocking for reproducible examples. All examples have been added to the test suite. - -16. Removing a column from a null (0-column) data.table is now a (standard and simpler) warning rather than error, [#2335](https://github.com/Rdatatable/data.table/issues/2335). It is no longer an error to add a column to a null (0-column) data.table. - -17. Non-UTF8 strings were not always sorted correctly on Windows (a regression in v1.12.0), [#3397](https://github.com/Rdatatable/data.table/issues/3397) [#3451](https://github.com/Rdatatable/data.table/issues/3451). Many thanks to @shrektan for reporting and fixing. - -18. `cbind` with a null (0-column) `data.table` now works as expected, [#3445](https://github.com/Rdatatable/data.table/issues/3445). Thanks to @mb706 for reporting. - -19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. - -## NOTES - -1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. 
We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. - -2. v1.12.0 checked itself on loading using `tools::checkMD5sums("data.table")` but this check failed under the `packrat` package manager on Windows because `packrat` appears to modify the DESCRIPTION file of packages it has snapshot, [#3329](https://github.com/Rdatatable/data.table/issues/3329). This check is now removed. The `CdllVersion` check was introduced after the `checkMD5sums()` attempt and is better; e.g., reliable on all platforms. - -3. As promised in new feature 6 of v1.11.6 Sep 2018 (see below in this news file), the `datatable.CJ.names` option's default is now `TRUE`. In v1.13.0 it will be removed. - -4. Travis CI gains OSX using homebrew llvm for OpenMP support, [#3326](https://github.com/Rdatatable/data.table/issues/3326). Thanks @marcusklik for the PR. - -5. Calling `data.table:::print.data.table()` directly (i.e. bypassing method dispatch by using 3 colons) and passing it a 0-column `data.frame` (not `data.table`) now works, [#3363](https://github.com/Rdatatable/data.table/pull/3363). Thanks @heavywatal for the PR. - -6. v1.12.0 did not compile on Solaris 10 using Oracle Developer Studio 12.6, [#3285](https://github.com/Rdatatable/data.table/issues/3285). Many thanks to Prof Ripley for providing and testing a patch. For future reference and other package developers, a `const` variable should not be passed to OpenMP's `num_threads()` directive otherwise `left operand must be modifiable lvalue` occurs. This appears to be a compiler bug which is why the specific versions are mentioned in this note. - -7. `foverlaps` provides clearer error messages w.r.t. 
factor and POSIXct interval columns, [#2645](https://github.com/Rdatatable/data.table/issues/2645) [#3007](https://github.com/Rdatatable/data.table/issues/3007) [#1143](https://github.com/Rdatatable/data.table/issues/1143). Thanks to @sritchie73, @msummersgill and @DavidArenburg for the reports. - -8. `unique(DT)` checks up-front the types of all the columns and will fail if any column is type `list` even though those `list` columns may not be needed to establish uniqueness. Use `unique(DT, by=...)` to specify columns that are not type `list`. v1.11.8 and before would also correctly fail with the same error, but not when uniqueness had been established in prior columns: it would stop early, not look at the `list` column and return the correct result. Checking up-front was necessary for some internal optimizations and it's probably best to be explicit anyway. Thanks to James Lamb for reporting, [#3332](https://github.com/Rdatatable/data.table/issues/3332). The error message has been embellished : - - ``` - Column 2 of by= (2) is type 'list', not yet supported. Please use the by= argument to specify - columns with types that are supported. - ``` - -9. Reminder that note 11 in v1.11.0 (May 2018) warned that `set2key()` and `key2()` will be removed in May 2019. They have been warning since v1.9.8 (Nov 2016) and their warnings were upgraded to errors in v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental'. - -10. The `key(DT)<-` form of `setkey()` has been warning since at least 2012 to use `setkey()`. The warning is now stronger: `key(x)<-value is deprecated and not supported. Please change to use setkey().`. This warning will be upgraded to error in one year. - - -# data.table v1.12.0 (13 Jan 2019) - -## NEW FEATURES - -1. `setDTthreads()` gains `restore_after_fork=`, [#2885](https://github.com/Rdatatable/data.table/issues/2885). The default `NULL` leaves the internal option unchanged which by default is `TRUE`. 
`data.table` has always switched to single-threaded mode on fork. It used to restore multithreading after a fork too but problems were reported on Mac and Intel OpenMP library (see 1.10.4 notes below). We are now trying again thanks to suggestions and success reported by Kun Ren and Mark Klik in package `fst`. If you experience problems with multithreading after a fork, please restart R and call `setDTthreads(restore_after_fork=FALSE)`. - -2. Subsetting, ordering and grouping now use more parallelism. See benchmarks [here](https://h2oai.github.io/db-benchmark/) and Matt Dowle's presentation in October 2018 on YouTube [here](https://youtu.be/Ddr8N9STSuI). These internal changes gave rise to 4 regressions which were found before release thanks to Kun Ren, [#3211](https://github.com/Rdatatable/data.table/issues/3211). He kindly volunteers to 'go-first' and runs data.table through his production systems before release. We are looking for a 'go-second' volunteer please. A request to test before release was tweeted on 17 Dec [here](https://twitter.com/MattDowle/status/1074746218645938176). As usual, all CRAN and Bioconductor packages using data.table (currently 750) have been tested against this release, [#3233](https://github.com/Rdatatable/data.table/issues/3233). There are now 8,000 tests in 13,000 lines of test code; more lines of test code than there is code. Overall coverage has increased to 94% thanks to Michael Chirico. - -3. New `frollmean` has been added by Jan Gorecki to calculate _rolling mean_, see `?froll` for documentation. Function name and arguments are experimental. Related to [#2778](https://github.com/Rdatatable/data.table/issues/2778) (and [#624](https://github.com/Rdatatable/data.table/issues/624), [#626](https://github.com/Rdatatable/data.table/issues/626), [#1855](https://github.com/Rdatatable/data.table/issues/1855)). Other rolling statistics will follow. - -4. 
`fread()` can now read a remote compressed file in one step; `fread("https://domain.org/file.csv.bz2")`. The `file=` argument now supports `.gz` and `.bz2` too; i.e. `fread(file="file.csv.gz")` works now where only `fread("file.csv.gz")` worked in 1.11.8. - -5. `nomatch=NULL` now does the same as `nomatch=0L` in both `DT[...]` and `foverlaps()`; i.e. discards missing values silently (inner join). The default is still `nomatch=NA` (outer join) for statistical safety so that missing values are retained by default. After several years have elapsed, we will start to deprecate `0L`; please start using `NULL`. In future `nomatch=.(0)` (note that `.()` creates a `list` type and is different to `nomatch=0`) will fill with `0` to save replacing `NA` with `0` afterwards, [#857](https://github.com/Rdatatable/data.table/issues/857). - -6. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR. - -7. `NA` in `between()` and `%between%`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than `NA`. This is now documented. - -8. `shift()` now interprets negative values of `n` to mean the opposite `type=`, [#1708](https://github.com/Rdatatable/data.table/issues/1708). When `give.names=TRUE` the result is named using a positive `n` with the appropriate `type=`. Alternatively, a new `type="shift"` names the result using a signed `n` and constant type. - - ```R - shift(x, n=-5:5, give.names=TRUE) => "_lead_5" ... "_lag_5" - shift(x, n=-5:5, type="shift", give.names=TRUE) => "_shift_-5" ... "_shift_5" - ``` - -9. `fwrite()` now accepts `matrix`, [#2613](https://github.com/Rdatatable/data.table/issues/2613). Thanks to Michael Chirico for the suggestion and Felipe Parages for implementing. 
For now matrix input is converted to data.table (which can be costly) before writing. - -10. `fread()` and `fwrite()` can now handle file names in native and UTF-8 encoding, [#3078](https://github.com/Rdatatable/data.table/issues/3078). Thanks to Daniel Possenriede (@dpprdan) for reporting and fixing. - -11. `DT[i]` and `DT[i,cols]` now call internal parallel subsetting code, [#2951](https://github.com/Rdatatable/data.table/issues/2951). Subsetting is significantly faster (as are many other operations) with factor columns rather than character. - - ```R - N = 2e8 # 4GB data on 4-core CPU with 16GB RAM - DT = data.table(ID = sample(LETTERS,N,TRUE), - V1 = sample(5,N,TRUE), - V2 = runif(N)) - w = which(DT$V1 > 3) # select 40% of rows - # v1.12.0 v1.11.8 - system.time(DT[w]) # 0.8s 2.6s - DT[, ID := as.factor(ID)] - system.time(DT[w]) # 0.4s 2.3s - system.time(DT[w, c("ID","V2")]) # 0.3s 1.9s - ``` - -12. `DT[..., .SDcols=]` now accepts `patterns()`; e.g. `DT[..., .SDcols=patterns("^V")]`, for filtering columns according to a pattern (as in `melt.data.table`), [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples. - -13. `split` data.table method will now preserve attributes, closes [#2047](https://github.com/Rdatatable/data.table/issues/2047). Thanks to @caneff for reporting. - -14. `DT[i,j]` now retains user-defined and inherited attributes, [#995](https://github.com/Rdatatable/data.table/issues/995); e.g. - - ```R - attr(datasets::BOD,"reference") # "A1.4, p. 270" - attr(as.data.table(datasets::BOD)[2],"reference") # was NULL now "A1.4, p. 270" - ``` - - If a superclass defines attributes that may not be valid after a `[` subset then the superclass should implement its own `[` method to manage those after calling `NextMethod()`. - -## BUG FIXES - -1. 
Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. - -2. Column names that look like expressions (e.g. `"a<=colB"`) caused an error when used in `on=` even when wrapped with backticks, [#3092](https://github.com/Rdatatable/data.table/issues/3092). Additionally, `on=` now supports white spaces around operators; e.g. `on = "colA == colB"`. Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. - -3. Unmatched `patterns` in `measure.vars` fail early and with feedback, [#3106](https://github.com/Rdatatable/data.table/issues/3106). - -4. `fread(..., skip=)` now skips non-standard `\r` and `\n\r` line endings properly again, [#3006](https://github.com/Rdatatable/data.table/issues/3006). Standard line endings (`\n` Linux/Mac and `\r\n` Windows) were skipped ok. Thanks to @brattono and @tbrycekelly for providing reproducible examples, and @st-pasha for fixing. - -5. `fread(..., colClasses=)` could return a corrupted result when a lower type was requested for one or more columns (e.g. reading "3.14" as integer), [#2922](https://github.com/Rdatatable/data.table/issues/2922) [#2863](https://github.com/Rdatatable/data.table/issues/2863) [#3143](https://github.com/Rdatatable/data.table/issues/3143). It now ignores the request as documented and the helpful message in verbose mode is upgraded to warning. In future, coercing to a lower type might be supported (with warning if any accuracy is lost). `"NULL"` is recognized again in both vector and list mode; e.g. `colClasses=c("integer","NULL","integer")` and `colClasses=list(NULL=2, integer=10:40)`. Thanks to Arun Srinivasan, Kun Ren, Henri Ståhl and @kszela24 for reporting. - -6. 
`cube()` will now produce expected order of results, [#3179](https://github.com/Rdatatable/data.table/issues/3179). Thanks to @Henrik-P for reporting. - -7. `groupingsets()` groups by empty column set and constant value in `j`, [#3173](https://github.com/Rdatatable/data.table/issues/3173). - -8. `split.data.table()` failed if `DT` had a factor column named `"x"`, [#3151](https://github.com/Rdatatable/data.table/issues/3151). Thanks to @tdeenes for reporting and fixing. - -9. `fsetequal` now handles properly datasets having last column a character, closes [#2318](https://github.com/Rdatatable/data.table/issues/2318). Thanks to @pschil and @franknarf1 for reporting. - -10. `DT[..., .SDcols=integer(0L)]` could fail, [#3185](https://github.com/Rdatatable/data.table/issues/3185). An empty `data.table` is now returned correctly. - -11. `as.data.table.default` method will now always copy its input, closes [#3230](https://github.com/Rdatatable/data.table/issues/3230). Thanks to @NikdAK for reporting. - -12. `DT[..., .SDcols=integer()]` failed with `.SDcols is numeric but has both +ve and -ve indices`, [#1789](https://github.com/Rdatatable/data.table/issues/1789) and [#3185](https://github.com/Rdatatable/data.table/issues/3185). It now functions as `.SDcols=character()` has done and creates an empty `.SD`. Thanks to Gabor Grothendieck and Hugh Parsonage for reporting. A related issue with empty `.SDcols` was fixed in development before release thanks to Kun Ren's testing, [#3211](https://github.com/Rdatatable/data.table/issues/3211). - -13. Multithreaded stability should be much improved with R 3.5+. Many thanks to Luke Tierney for pinpointing a memory issue with package `constellation` caused by `data.table` and his advice, [#3165](https://github.com/Rdatatable/data.table/issues/3165). Luke also added an extra check to R-devel when compiled with `--enable-strict-barrier`. 
The test suite is run through latest daily R-devel after every commit as usual, but now with `--enable-strict-barrier` on too via GitLab CI ("Extra" badge on the `data.table` homepage) thanks to Jan Gorecki. - -14. Fixed an edge-case bug of platform-dependent output of `strtoi("", base = 2L)` on which `groupingsets` had relied, [#3267](https://github.com/Rdatatable/data.table/issues/3267). - -## NOTES - -1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary version which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. - -2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. - -3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR. - -4. `update.dev.pkg()` gains `type=` to specify if update should be made from binaries, sources or both. [#3148](https://github.com/Rdatatable/data.table/issues/3148). 
Thanks to Reino Bruner for the detailed suggestions. - -5. `setDT()` improves feedback when passed a ragged list (i.e. where all columns in the list are not the same length), [#3121](https://github.com/Rdatatable/data.table/issues/3121). Thanks @chuk-yong for highlighting. - -6. The one and only usage of `UNPROTECT_PTR()` has been removed, [#3232](https://github.com/Rdatatable/data.table/issues/3232). Thanks to Tomas Kalibera's investigation and advice here: https://developer.r-project.org/Blog/public/2018/12/10/unprotecting-by-value/index.html - - -# data.table v1.11.8 (30 Sep 2018) - -## NEW FEATURES - -1. `fread()` can now read `.gz` and `.bz2` files directly: `fread("file.csv.gz")`, [#717](https://github.com/Rdatatable/data.table/issues/717) [#3058](https://github.com/Rdatatable/data.table/issues/3058). It uses `R.utils::decompressFile` to decompress to a `tempfile()` which is then read by `fread()` in the usual way. For greater speed on large-RAM servers, it is recommended to use ramdisk for temporary files by setting `TMPDIR` to `/dev/shm` before starting R; see `?tempdir`. The decompressed temporary file is removed as soon as `fread` completes even if there is an error reading the file. Reading a remote compressed file in one step will be supported in the next version; e.g. `fread("https://domain.org/file.csv.bz2")`. - -## BUG FIXES - -1. Joining two keyed tables using `on=` to columns not forming a leading subset of `key(i)` could result in an invalidly keyed result, [#3061](https://github.com/Rdatatable/data.table/issues/3061). Subsequent queries on the result could then return incorrect results. A warning `longer object length is not a multiple of shorter object length` could also occur. Thanks to @renkun-ken for reporting and the PR. - -2. 
`keyby=` on columns for which an index exists now uses the index (new feature 7 in v1.11.6 below) but if an `i` subset is present in the same query then it could segfault, [#3062](https://github.com/Rdatatable/data.table/issues/3062). Again thanks to @renkun-ken for reporting. - -3. Assigning an out-of-range integer to an item in a factor column (a rare operation) correctly created an `NA` in that spot with warning, but now no longer also corrupts the variable being assigned, [#2984](https://github.com/Rdatatable/data.table/issues/2984). Thanks to @radfordneal for reporting and @MarkusBonsch for fixing. Assigning a string which is missing from the factor levels continues to automatically append the string to the factor levels. - -4. Assigning a sequence to a column using base R methods (e.g. `DT[["foo"]] = 1:10`) could cause subsetting to fail with `Internal error in subset.c: column is an ALTREP vector`, [#3051](https://github.com/Rdatatable/data.table/issues/3051). Thanks to Michel Lang for reporting. - -5. `as.data.table` `matrix` method now properly handles rownames for 0 column data.table output. Thanks @mllg for reporting. Closes [#3149](https://github.com/Rdatatable/data.table/issues/3149). - -## NOTES - -1. The test suite now turns on R's new _R_CHECK_LENGTH_1_LOGIC2_ to catch when internal use of `&&` or `||` encounter arguments of length more than one. Thanks to Hugh Parsonage for implementing and fixing the problems caught by this. - -2. Some namespace changes have been made with respect to melt, dcast and xts. No change is expected but if you do have any trouble, please file an issue. - -3. `split.data.table` was exported in v1.11.6 in addition to being registered using `S3method(split, data.table)`. The export has been removed again. It had been added because a user said they found it difficult to find, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But S3 methods are not normally exported explicitly by packages. 
The proper way to access the `split.data.table` method is to call `split(DT)` where `DT` is a `data.table`. The generic (`base::split` in this case) then dispatches to the `split.data.table` method. v1.11.6 was not on CRAN very long (1 week) so we think it's better to revert this change quickly. To know what methods exist, R provides the `methods()` function. - - ```R - methods(split) # all the methods for the split generic - methods(class="data.table") # all the generics that data.table has a method for (47 currently) - ``` - - -# data.table v1.11.6 (19 Sep 2018) - -## NEW FEATURES - -1. For convenience when some of the files in `fnams` are empty in `rbindlist(lapply(fnams,fread))`, `fread` now reads empty input as a null-data.table with warning rather than error, [#2898](https://github.com/Rdatatable/data.table/issues/2898). For consistency, `fwrite(data.table(NULL))` now creates an empty file and warns instead of error, too. - -2. `setcolorder(DT)` without further arguments now defaults to moving the key columns to be first, [#2895](https://github.com/Rdatatable/data.table/issues/2895). Thanks to @jsams for the PR. - -3. Attempting to subset on `col` when the column is actually called `Col` will still error, but the error message will helpfully suggest similarly-spelled columns, [#2887](https://github.com/Rdatatable/data.table/issues/2887). This is experimental, applies just to `i` currently, and we look forward to feedback. Thanks to Michael Chirico for the suggestion and PR. - -4. `fread()` has always accepted literal data; e.g. `fread("A,B\n1,2\n3,4")`. It now gains explicit `text=`; e.g. `fread(text="A,B\n1,2\n3,4")`. Unlike the first general purpose `input=` argument, the `text=` argument accepts multi-line input; e.g. `fread(text=c("A,B","1,2","3,4"))`, [#1423](https://github.com/Rdatatable/data.table/issues/1423). Thanks to Douglas Clark for the request and Hugh Parsonage for the PR. - -5. `fread()` has always accepted system commands; e.g. 
`fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app is not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvalds' advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. 
Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. - -6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. Current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x` and is more akin to a bug fix due to the inconsistency with `data.table()`. - -7. If an appropriate index exists, `keyby=` will now use it. For example, given `setindex(DT,colA,colB)`, both `DT[,j,keyby=colA]` (a leading subset of the index columns) and `DT[,j,keyby=.(colA,colB)]` will use the index, but not `DT[,j,keyby=.(colB,colA)]`. The option `options(datatable.use.index=FALSE)` will turn this feature off. Please always use `keyby=` unless you wish to retain the order of groups by first-appearance order (in which case use `by=`). Also, both `keyby=` and `by=` already used the key where possible but are now faster when using just the first column of the key. As usual, setting `verbose=TRUE` either per-query or globally using `options(datatable.verbose=TRUE)` will report what's being done internally. - -## BUG FIXES - -1. `fread` now respects the order of columns passed to `select=` when column numbers are used, [#2986](https://github.com/Rdatatable/data.table/issues/2986). It already respected the order when column names are used. 
Thanks @privefl for raising the issue. - -2. `gmin` and `gmax` no longer fail on _ordered_ factors, [#1947](https://github.com/Rdatatable/data.table/issues/1947). Thanks to @mcieslik-mctp for identifying and @mbacou for the nudge. - -3. `as.ITime.character` now properly handles NA when attempting to detect the format of non-NA values in vector. Thanks @polyjian for reporting, closes [#2940](https://github.com/Rdatatable/data.table/issues/2940). - -4. `as.matrix(DT, rownames="id")` now works when `DT` has a single row, [#2930](https://github.com/Rdatatable/data.table/issues/2930). Thanks to @malcook for reporting and @sritchie73 for fixing. The root cause was the dual meaning of the `rownames=` argument: i) a single column name/number (most common), or ii) rowname values length 1 for the single row. For clarity and safety, `rownames.value=` has been added. Old usage (i.e. `length(rownames)>1`) continues to work for now but will issue a warning in a future release, and then error in a release after that. - -5. Fixed regression in v1.11.0 (May 2018) caused by PR [#2389](https://github.com/Rdatatable/data.table/pull/2389) which introduced partial key retainment on `:=` assigns. This broke the joining logic that assumed implicitly that assigning always drops keys completely. Consequently, join and subset results could be wrong when matching character to factor columns with existing keys, [#2881](https://github.com/Rdatatable/data.table/issues/2881). Thanks to @ddong63 for reporting and to @MarkusBonsch for fixing. Missing test added to ensure this doesn't arise again. - -6. `as.IDate.numeric` no longer ignores "origin", [#2880](https://github.com/Rdatatable/data.table/issues/2880). Thanks to David Arenburg for reporting and fixing. - -7. `as.ITime.times` was rounding fractional seconds while other methods were truncating, [#2870](https://github.com/Rdatatable/data.table/issues/2870). 
The `as.ITime` method gains `ms=` taking `"truncate"` (default), `"nearest"` and `"ceil"`. Thanks to @rossholmberg for reporting and Michael Chirico for fixing. - -8. `fwrite()` now writes POSIXct dates after 2038 correctly, [#2995](https://github.com/Rdatatable/data.table/issues/2995). Thanks to Manfred Zorn for reporting and Philippe Chataignon for the PR fixing it. - -9. `fsetequal` gains the `all` argument to make it consistent with the other set operator functions `funion`, `fsetdiff` and `fintersect` [#2968](https://github.com/Rdatatable/data.table/issues/2968). When `all = FALSE` `fsetequal` will treat rows as elements in a set when checking whether two `data.tables` are equal (i.e. duplicate rows will be ignored). For now the default value is `all = TRUE` for backwards compatibility, but this will be changed to `all = FALSE` in a future release to make it consistent with the other set operation functions. Thanks to @franknarf1 for reporting and @sritchie73 for fixing. - -10. `fintersect` failed on tables with a column called `y`, [#3034](https://github.com/Rdatatable/data.table/issues/3034). Thanks to Maxim Nazarov for reporting. - -11. Compilation fails in AIX because NAN and INFINITY macros definition in AIX make them not constant literals, [#3043](https://github.com/Rdatatable/data.table/pull/3043). Thanks to Ayappan for reporting and fixing. - -12. The introduction of altrep in R 3.5.0 caused some performance regressions of about 20% in some cases, [#2962](https://github.com/Rdatatable/data.table/issues/2962). Investigating this led to some improvements to grouping which are faster than before R 3.5.0 in some cases. Thanks to Nikolay S. for reporting. The work to accommodate altrep is not complete but it is better and it is highly recommended to upgrade to this update. - -13. Fixed 7 memory faults thanks to CRAN's [`rchk`](https://github.com/kalibera/rchk) tool by Tomas Kalibera, [#3033](https://github.com/Rdatatable/data.table/pull/3033). 
- -## NOTES - -1. The type coercion warning message has been improved, [#2989](https://github.com/Rdatatable/data.table/pull/2989). Thanks to @sarahbeeysian on Twitter for highlighting. For example, given the following statements: - - ```R - DT = data.table(id=1:3) - DT[2, id:="foo"] - ``` - - the warning message has changed from : - - ``` - Coerced character RHS to integer to match the column's type. Either change the target column - ['id'] to character first (by creating a new character vector length 3 (nrows of entire table) and - assign that; i.e. 'replace' column), or coerce RHS to integer (e.g. 1L, NA_[real|integer]_, as.*, - etc) to make your intent clear and for speed. Or, set the column type correctly up front when you - create the table and stick to it, please. - ``` - - to : - - ``` - Coerced character RHS to integer to match the type of the target column (column 1 named 'id'). If - the target column's type integer is correct, it's best for efficiency to avoid the coercion and - create the RHS as type integer. To achieve that consider the L postfix: typeof(0L) vs typeof(0), - and typeof(NA) vs typeof(NA_integer_) vs typeof(NA_real_). Wrapping the RHS with as.integer() will - avoid this warning but still perform the coercion. If the target column's type is not correct, it - is best to revisit where the DT was created and fix the column type there; e.g., by using - colClasses= in fread(). Otherwise, you can change the column type now by plonking a new column (of - the desired type) over the top of it; e.g. DT[, `id`:=as.character(`id`)]. If the RHS of := has - nrow(DT) elements then the assignment is called a column plonk and is the way to change a column's - type. Column types can be observed with sapply(DT,typeof). - ``` - - Further, if a coercion from double to integer is performed, fractional data such as 3.14 is now detected and the truncation to 3 is warned about if and only if truncation has occurred. 
- - ```R - DT = data.table(v=1:3) - DT[2, v:=3.14] - Warning message: - Coerced double RHS to integer to match the type of the target column (column 1 named 'v'). One - or more RHS values contain fractions which have been lost; e.g. item 1 with value 3.140000 has - been truncated to 3. - ``` - -2. `split.data.table` method is now properly exported, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But we don't recommend it because `split` copies all the pieces into new memory. - -3. Setting indices on columns which are part of the key will now create those indices. - -4. `hour`, `minute`, and `second` utility functions use integer arithmetic when the input is already (explicitly) UTC-based `POSIXct` for 4-10x speedup vs. using `as.POSIXlt`. - -5. Error added for incorrect usage of `%between%`, with some helpful diagnostic hints, [#3014](https://github.com/Rdatatable/data.table/issues/3014). Thanks @peterlittlejohn for offering his user experience and providing the impetus. - - -# data.table v1.11.4 (27 May 2018) - -1. Empty RHS of `:=` is no longer an error when the `i` clause returns no rows to assign to anyway, [#2829](https://github.com/Rdatatable/data.table/issues/2829). Thanks to @cguill95 for reporting and to @MarkusBonsch for fixing. - -2. Fixed runaway memory usage with R-devel (R > 3.5.0), [#2882](https://github.com/Rdatatable/data.table/pull/2882). Thanks to many people but in particular to Trang Nguyen for making the breakthrough reproducible example, Paul Bailey for liaising, and Luke Tierney for then pinpointing the issue. It was caused by an interaction of two or more data.table threads operating on new compact vectors in the ALTREP framework, such as the sequence `1:n`. This interaction could result in R's garbage collector turning off, and hence the memory explosion. Problems may occur in R 3.5.0 too but we were only able to reproduce in R > 3.5.0. 
The R code in data.table's implementation benefits from ALTREP (`for` loops in R no longer allocate their range vector input, for example) but are not so appropriate as data.table columns. Sequences such as `1:n` are common in test data but not very common in real-world datasets. Therefore, there is no need for data.table to support columns which are ALTREP compact sequences. The `data.table()` function already expanded compact vectors (by happy accident) but `setDT()` did not (it now does). If, somehow, a compact vector still reaches the internal parallel regions, a helpful error will now be generated. If this happens, please report it as a bug. - -3. Tests 1590.3 & 1590.4 now pass when users run `test.data.table()` on Windows, [#2856](https://github.com/Rdatatable/data.table/pull/2856). Thanks to Avraham Adler for reporting. Those tests were passing on AppVeyor, win-builder and CRAN's Windows because `R CMD check` sets `LC_COLLATE=C` as documented in R-exts$1.3.1, whereas by default on Windows `LC_COLLATE` is usually a regional Windows-1252 dialect such as `English_United States.1252`. - -4. Around 1 billion very small groups (of size 1 or 2 rows) could result in `"Failed to realloc working memory"` even when plenty of memory is available, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks once again to @jsams for the detailed report as a follow up to bug fix 40 in v1.11.0. - - -# data.table v1.11.2 (08 May 2018) - -1. `test.data.table()` created/overwrote variable `x` in `.GlobalEnv`, [#2828](https://github.com/Rdatatable/data.table/issues/2828); i.e. a modification of user's workspace which is not allowed. Thanks to @etienne-s for reporting. - -2. `as.chron` methods for `IDate` and `ITime` have been removed, [#2825](https://github.com/Rdatatable/data.table/issues/2825). `as.chron` still works since `IDate` inherits from `Date`. We are not sure why we had specific methods in the first place. 
It may have been from a time when `IDate` did not inherit from `Date`, perhaps. Note that we don't use `chron` ourselves in our own work. - -3. Fixed `SETLENGTH() cannot be applied to an ALTVEC object` starting in R-devel (R 3.6.0) on 1 May 2018, a few hours after 1.11.0 was accepted on CRAN, [#2820](https://github.com/Rdatatable/data.table/issues/2820). Many thanks to Luke Tierney for pinpointing the problem. - -4. Fixed some rare memory faults in `fread()` and `rbindlist()` found with `gctorture2()` and [`rchk`](https://github.com/kalibera/rchk), [#2841](https://github.com/Rdatatable/data.table/issues/2841). - - -# data.table v1.11.0 (01 May 2018) - -## NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES - -1. `fread()`'s `na.strings=` argument : - - ```R - "NA" # old default - getOption("datatable.na.strings", "NA") # this release; i.e. the same; no change yet - getOption("datatable.na.strings", "") # future release - ``` - - This option controls how `,,` is read in character columns. It does not affect numeric columns which read `,,` as `NA` regardless. We would like `,,`=>`NA` for consistency with numeric types, and `,"",`=>empty string to be the standard default for `fwrite/fread` character columns so that `fread(fwrite(DT))==DT` without needing any change to any parameters. `fwrite` has never written `NA` as `"NA"` in case `"NA"` is a valid string in the data; e.g., 2 character id columns sometimes do. Instead, `fwrite` has always written `,,` by default for an `<NA>` in a character column. The use of R's `getOption()` allows users to move forward now, using `options(datatable.fread.na.strings="")`, or restore old behaviour when the default's default is changed in future, using `options(datatable.fread.na.strings="NA")`. - -2. `fread()` and `fwrite()`'s `logical01=` argument : - - ```R - logical01 = FALSE # old default - getOption("datatable.logical01", FALSE) # this release; i.e. 
the same; no change yet - getOption("datatable.logical01", TRUE) # future release - ``` - - This option controls whether a column of all 0's and 1's is read as `integer`, or `logical` directly to avoid needing to change the type afterwards to `logical` or use `colClasses`. `0/1` is smaller and faster than `"TRUE"/"FALSE"`, which can make a significant difference to space and time the more `logical` columns there are. When the default's default changes to `TRUE` for `fread` we do not expect much impact since all arithmetic operators that are currently receiving 0's and 1's as type `integer` (think `sum()`) but instead could receive `logical`, would return exactly the same result on the 0's and 1's as `logical` type. However, code that is manipulating column types using `is.integer` or `is.logical` on `fread`'s result, could require change. It could be painful if `DT[(logical_column)]` (i.e. `DT[logical_column==TRUE]`) changed behaviour due to `logical_column` no longer being type `logical` but `integer`. But that is not the change proposed. The change is the other way around; i.e., a previously `integer` column holding only 0's and 1's would now be type `logical`. Since it's that way around, we believe the scope for breakage is limited. We think a lot of code is converting 0/1 integer columns to logical anyway, either using `colClasses=` or afterwards with an assign. For `fwrite`, the level of breakage depends on the consumer of the output file. We believe `0/1` is a better more standard default choice to move to. See notes below about improvements to `fread`'s sampling for type guessing, and automatic rereading in the rare cases of out-of-sample type surprises. - - -These options are meant for temporary use to aid your migration, [#2652](https://github.com/Rdatatable/data.table/pull/2652). You are not meant to set them to the old default and then not migrate your code that is dependent on the default. 
Either set the argument explicitly so your code is not dependent on the default, or change the code to cope with the new default. Over the next few years we will slowly start to remove these options, warning you if you are using them, and return to a simple default. See the history of NEWS and NEWS.0 for past migrations that have, generally speaking, been successfully managed in this way. For example, at the end of NOTES for this version (below in this file) is a note about the usage of `datatable.old.unique.by.key` now warning, as you were warned it would do over a year ago. When that change was introduced, the default was changed and that option provided an option to restore the old behaviour. These `fread`/`fwrite` changes are even more cautious and not even changing the default's default yet. Giving you extra warning by way of this notice to move forward. And giving you a chance to object. - -## NEW FEATURES - -1. `fread()`: - * Efficiency savings at C level including **parallelization** announced [here](https://github.com/Rdatatable/data.table/wiki/talks/BARUG_201704_ParallelFread.pdf); e.g. a 9GB 2 column integer csv input is **50s down to 12s** to cold load on a 4 core laptop with 16GB RAM and SSD. Run `echo 3 >/proc/sys/vm/drop_caches` first to measure cold load time. Subsequent load time (after file has been cached by OS on the first run) **40s down to 6s**. - * The [fread for small data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread) page has been revised. - * Memory maps lazily; e.g. reading just the first 10 rows with `nrow=10` is **12s down to 0.01s** from cold for the 9GB file. Large files close to your RAM limit may work more reliably too. The progress meter will commence sooner and more consistently. - * `fread` has always jumped to the middle and to the end of the file for a much improved column type guess. 
The sample size is increased from 100 rows at 10 jump points (1,000 rows) to 100 rows at 100 jump points (10,000 row sample). In the rare case of there still being out-of-sample type exceptions, those columns are now *automatically reread* so you don't have to use `colClasses` yourself. - * Large number of columns support; e.g. **12,000 columns** tested. - * **Quoting rules** are more robust and flexible. See point 10 on the wiki page [here](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread#10-automatic-quote-escape-method-detection-including-no-escape). - * Numeric data that has been quoted is now detected and read as numeric. - * The ability to position `autostart` anywhere inside one of multiple tables in a single file is removed with warning. It used to search upwards from that line to find the start of the table based on a consistent number of columns. People appear to be using `skip="string"` or `skip=nrow` to find the header row exactly, which is retained and simpler. It was too difficult to retain search-upwards-autostart together with skipping/filling blank lines, filling incomplete rows and parallelization too. If there is any header info above the column names, it is still auto detected and auto skipped (particularly useful when loading a set of files where the column names start on different lines due to a varying height messy header). - * `dec=','` is now implemented directly so there is no dependency on locale. The options `datatable.fread.dec.experiment` and `datatable.fread.dec.locale` have been removed. - * `\\r\\r\\n` line endings are now handled such as produced by `base::download.file()` when it doubles up `\\r`. Other rare line endings (`\\r` and `\\n\\r`) are now more robust. - * Mixed line endings are now handled; e.g. a file formed by concatenating a Unix file and a Windows file so that some lines end with `\\n` while others end with `\\r\\n`.
- * Improved automatic detection of whether the first row is column names by comparing the types of the fields on the first row against the column types ascertained by the 10,000 rows sample (or `colClasses` if provided). If a numeric column has a string value at the top, then column names are deemed present. - * Detects GB-18030 and UTF-16 encodings and in verbose mode prints a message about BOM detection. - * Detects and ignores trailing ^Z end-of-file control character sometimes created on MS DOS/Windows, [#1612](https://github.com/Rdatatable/data.table/issues/1612). Thanks to Gergely Daróczi for reporting and providing a file. - * Added ability to recognize and parse hexadecimal floating point numbers, as used for example in Java. Thanks for @scottstanfield [#2316](https://github.com/Rdatatable/data.table/issues/2316) for the report. - * Now handles floating-point NaN values in a wide variety of formats, including `NaN`, `sNaN`, `1.#QNAN`, `NaN1234`, `#NUM!` and others, [#1800](https://github.com/Rdatatable/data.table/issues/1800). Thanks to Jori Liesenborgs for highlighting and the PR. - * If negative numbers are passed to `select=` the out-of-range error now suggests `drop=` instead, [#2423](https://github.com/Rdatatable/data.table/issues/2423). Thanks to Michael Chirico for the suggestion. - * `sep=NULL` or `sep=""` (i.e., no column separator) can now be used to specify single column input reliably like `base::readLines`, [#1616](https://github.com/Rdatatable/data.table/issues/1616). `sep='\\n'` still works (even on Windows where line ending is actually `\\r\\n`) but `NULL` or `""` are now documented and recommended. Thanks to Dmitriy Selivanov for the pull request and many others for comments. As before, `sep=NA` is not valid; use the default `"auto"` for automatic separator detection. `sep='\\n'` is now deprecated and in future will start to warn when used. 
- * Single-column input with blank lines is now valid and the blank lines are significant (representing `NA`). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing `NA` which are written as blank. There is no change when `ncol>1`; i.e., input stops with detailed warning at the first blank line, because a blank line when `ncol>1` is invalid input due to no separators being present. Thanks to @skanskan, Michael Chirico, @franknarf1 and Pasha for the testing and discussions, [#2106](https://github.com/Rdatatable/data.table/issues/2106). - * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. - * `skip=` and `nrow=` are more reliable and are no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). - * Ram disk (`/dev/shm`) is no longer used for the output of system command input. Although faster when it worked, it was causing too many device full errors; e.g., [#1139](https://github.com/Rdatatable/data.table/issues/1139) and [zUMIs/19](https://github.com/sdparekh/zUMIs/issues/19). Thanks to Kyle Chung for reporting. Standard `tempdir()` is now used. If you wish to use ram disk, set TEMPDIR to `/dev/shm`; see `?tempdir`. - * Detecting whether a very long input string is a file name or data is now much faster, [#2531](https://github.com/Rdatatable/data.table/issues/2531). Many thanks to @javrucebo for the detailed report, benchmarks and suggestions. 
- * A column of `TRUE/FALSE`s is ok, as well as `True/False`s and `true/false`s, but mixing styles (e.g. `TRUE/false`) is not and will be read as type `character`. - * New argument `index` to compliment the existing `key` argument for applying secondary orderings out of the box for convenience, [#2633](https://github.com/Rdatatable/data.table/issues/2633). - * A warning is now issued whenever incorrectly quoted fields have been detected and fixed using a non-standard quote rule. `fread` has always used these advanced rules but now it warns that it is using them. Most file writers correctly quote fields if the field contains the field separator, but a common error is not to also quote fields that contain a quote and then escape those quotes, particularly if that quote occurs at the start of the field. The ability to detect and fix such files is referred to as self-healing. Ambiguities are resolved using the knowledge that the number of columns is constant, and therefore this ability is not available when `fill=TRUE`. This feature can be improved in future by using column type consistency as well as the number of fields. 
For example: - - ```R - txt = 'A,B\n1,hello\n2,"howdy" said Joe\n3,bonjour\n' - cat(txt) - # A,B - # 1,hello - # 2,"howdy" said Joe - # 3,bonjour - fread(txt) - A B - - 1: 1 hello - 2: 2 "howdy" said Joe - 3: 3 bonjour - Warning message: - In fread(txt) : Found and resolved improper quoting - ``` - - * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney, Ananda Mahto, @memoryfull, @brandenkmurray for testing dev and reporting these regressions before release to CRAN: #1464, #1671, #1888, #1895, #2070, #2073, #2087, #2091, #2092, #2107, #2118, #2123, #2167, #2194, #2196, #2201, #2222, #2228, #2238, #2246, #2251, #2265, #2267, #2285, #2287, #2299, #2322, #2347, #2352, #2370, #2371, #2395, #2404, #2446, #2453, #2457, #2464, #2481, #2499, #2512, #2515, #2516, #2518, #2520, #2523, #2526, #2535, #2542, #2548, #2561, #2600, #2625, #2666, #2697, #2735, #2744. - -2. `fwrite()`: - * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). - * `logical01` has been added and the old name `logicalAsInt` retained. Please move to the new name when convenient for you. The old argument name (`logicalAsInt`) will slowly be deprecated over the next few years. The default is unchanged: `FALSE`, so `logical` is still written as `"TRUE"`/`"FALSE"` in full by default.
We intend to change the default's default in future to `TRUE`; see the notice at the top of these release notes. - -3. Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR. - -4. `tables` gains `index` argument for supplementary metadata about `data.table`s in memory (or any optionally specified environment), part of [#1648](https://github.com/Rdatatable/data.table/issues/1648). Thanks due variously to @jangorecki, @rsaporta, @MichaelChirico for ideas and work towards PR. - -5. Improved auto-detection of `character` inputs' formats to `as.ITime` to mirror the logic in `as.POSIXlt.character`, [#1383](https://github.com/Rdatatable/data.table/issues/1383). Thanks @franknarf1 for identifying a discrepancy and @MichaelChirico for investigating. - -6. `setcolorder()` now accepts less than `ncol(DT)` columns to be moved to the front, [#592](https://github.com/Rdatatable/data.table/issues/592). Thanks @MichaelChirico for the PR. This also incidentally fixed [#2007](https://github.com/Rdatatable/data.table/issues/2007) whereby explicitly setting `select = NULL` in `fread` errored; thanks to @rcapell for reporting that and @dselivanov and @MichaelChirico for investigating and providing a new test. - -7. Three new *Grouping Sets* functions: `rollup`, `cube` and `groupingsets`, [#1377](https://github.com/Rdatatable/data.table/issues/1377). Allows aggregation on various grouping levels at once, producing sub-totals and a grand total. - -8. `as.data.table()` gains new method for `array`s to return a useful data.table, [#1418](https://github.com/Rdatatable/data.table/issues/1418). - -9.
`print.data.table()` (all via master issue [#1523](https://github.com/Rdatatable/data.table/issues/1523)): - - * gains `print.keys` argument, `FALSE` by default, which displays the keys and/or indices (secondary keys) of a `data.table`. Thanks @MichaelChirico for the PR, Yike Lu for the suggestion and Arun for honing that idea to its present form. - - * gains `col.names` argument, `"auto"` by default, which toggles which registers of column names to include in printed output. `"top"` forces `data.frame`-like behavior where column names are only ever included at the top of the output, as opposed to the default behavior which appends the column names below the output as well for longer (>20 rows) tables. `"none"` shuts down column name printing altogether. Thanks @MichaelChirico for the PR, Oleg Bondar for the suggestion, and Arun for guiding commentary. - - * list columns would print the first 6 items in each cell followed by a comma if there are more than 6 in that cell. Now it ends ",..." to make it clearer, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). Thanks to @franknarf1 for drawing attention to an issue raised on Stack Overflow by @TMOTTM [here](https://stackoverflow.com/q/47679701). - -10. `setkeyv` accelerated if key already exists [#2331](https://github.com/Rdatatable/data.table/issues/2331). Thanks to @MarkusBonsch for the PR. - -11. Keys and indexes are now partially retained up to the key column assigned to with ':=' [#2372](https://github.com/Rdatatable/data.table/issues/2372). They used to be dropped completely if any one of the columns was affected by `:=`. Thanks to @MarkusBonsch for the PR. - -12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR. - -13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013).
Thanks to Michael Chirico for the PR, and thanks to @mgahan for pointing out a reversion in `na.omit.data.table` before release, [#2660](https://github.com/Rdatatable/data.table/issues/2660#issuecomment-371027948). - -14. `uniqueN()` is now faster on logical vectors. Thanks to Hugh Parsonage for [PR#2648](https://github.com/Rdatatable/data.table/pull/2648). - - ```R - N = 1e9 - # was now - x = c(TRUE,FALSE,NA,rep(TRUE,N)) # - uniqueN(x) == 3 # 5.4s 0.00s - x = c(TRUE,rep(FALSE,N), NA) # - uniqueN(x,na.rm=TRUE) == 2 # 5.4s 0.00s - x = c(rep(TRUE,N),FALSE,NA) # - uniqueN(x) == 3 # 6.7s 0.38s - ``` - -15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). -Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation. - -16. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. - -17. `update.dev.pkg` is new function to update package from development repository, it will download package sources only when newer commit is available in repository. `data.table::update.dev.pkg()` defaults updates `data.table`, but any package can be used. - -18. Item 1 in NEWS for [v1.10.2](https://github.com/Rdatatable/data.table/blob/master/NEWS.md#changes-in-v1102--on-cran-31-jan-2017) on CRAN in Jan 2017 included : - - > When j is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. - > When you see the `..` prefix think one-level-up, like the directory `..` in all operating systems means the parent directory. 
- > In future the `..` prefix could be made to work on all symbols appearing anywhere inside `DT[...]`. - - The response has been positive ([this tweet](https://twitter.com/MattDowle/status/967290562725359617) and [FR#2655](https://github.com/Rdatatable/data.table/issues/2655)) and so this prefix is now expanded to all symbols appearing in `j=` as a first step; e.g. - - ```R - cols = "colB" - DT[, c(..cols, "colC")] # same as DT[, .(colB,colC)] - DT[, -..cols] # all columns other than colB - ``` - - Thus, `with=` should no longer be needed in any cases. Please change to using the `..` prefix and over the next few years we will start to formally deprecate and remove the `with=` parameter. If this is well received, the `..` prefix could be expanded to symbols appearing in `i=` and `by=`, too. Note that column names should not now start with `..`. If a symbol `..var` is used in `j=` but `..var` exists as a column name, the column still takes precedence, for backwards compatibility. Over the next few years, data.table will start issuing warnings/errors when it sees column names starting with `..`. This affects one CRAN package out of 475 using data.table, so we do not believe this restriction to be unreasonable. Our main focus here which we believe `..` achieves is to resolve the more common ambiguity when `var` is in calling scope and `var` is a column name too. Further, we have not forgotten that in the past we recommended prefixing the variable in calling scope with `..` yourself. If you did that and `..var` exists in calling scope, that still works, provided neither `var` exists in calling scope nor `..var` exists as a column name. Please now remove the `..` prefix on `..var` in calling scope to tidy this up. In future data.table will start to warn/error on such usage. - -19. `setindexv` can now assign multiple (separate) indices by accepting a `list` in the `cols` argument. - -20.
`as.matrix.data.table` method now has an additional `rownames` argument allowing for a single column to be used as the `rownames` after conversion to a `matrix`. Thanks to @sritchie73 for the suggestion, use cases, [#2692](https://github.com/Rdatatable/data.table/issues/2692) and implementation [PR#2702](https://github.com/Rdatatable/data.table/pull/2702) and @MichaelChirico for additional use cases. - -## BUG FIXES - -1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding - Great Stocks\"", I discussed the value of stock screeners as a powerful tool"`, [#2051](https://github.com/Rdatatable/data.table/issues/2051). Thanks to @scarrascoso for reporting. Example file added to test suite. - -2. `fwrite()` creates a file with permissions that now play correctly with `Sys.umask()`, [#2049](https://github.com/Rdatatable/data.table/issues/2049). Thanks to @gnguy for reporting. - -3. `fread()` no longer holds an open lock on the file when a line outside the large sample has too many fields and generates an error, [#2044](https://github.com/Rdatatable/data.table/issues/2044). Thanks to Hugh Parsonage for reporting. - -4. Setting `j = {}` no longer results in an error, [#2142](https://github.com/Rdatatable/data.table/issues/2142). Thanks Michael Chirico for the pull request. - -5. Segfault in `rbindlist()` when one or more items are empty, [#2019](https://github.com/Rdatatable/data.table/issues/2019). Thanks Michael Lang for the pull request. Another segfault if the result would be more than 2bn rows, thanks to @jsams's comment in [#2340](https://github.com/Rdatatable/data.table/issues/2340#issuecomment-331505494). - -6. Error printing 0-length `ITime` and `NA` objects, [#2032](https://github.com/Rdatatable/data.table/issues/2032) and [#2171](https://github.com/Rdatatable/data.table/issues/2171). 
Thanks Michael Chirico for the pull requests and @franknarf1 for pointing out a shortcoming of the initial fix. - -7. `as.IDate.POSIXct` error with `NULL` timezone, [#1973](https://github.com/Rdatatable/data.table/issues/1973). Thanks @lbilli for reporting and Michael Chirico for the pull request. - -8. Printing a null `data.table` with `print` no longer visibly outputs `NULL`, [#1852](https://github.com/Rdatatable/data.table/issues/1852). Thanks @aaronmcdaid for spotting and @MichaelChirico for the PR. - -9. `data.table` now works with Shiny Reactivity / Flexdashboard. The error was typically something like `col not found` in `DT[col==val]`. Thanks to Dirk Eddelbuettel leading Matt through reproducible steps and @sergeganakou and Richard White for reporting. Closes [#2001](https://github.com/Rdatatable/data.table/issues/2001) and [shiny/#1696](https://github.com/rstudio/shiny/issues/1696). - -10. The `as.IDate.POSIXct` method passed `tzone` along but was not exported. So `tzone` is now taken into account by `as.IDate` too as well as `IDateTime`, [#977](https://github.com/Rdatatable/data.table/issues/977) and [#1498](https://github.com/Rdatatable/data.table/issues/1498). Tests added. - -11. Named logical vector now select rows as expected from single row data.table. Thanks to @skranz for reporting. Closes [#2152](https://github.com/Rdatatable/data.table/issues/2152). - -12. `fread()`'s rare `Internal error: Sampling jump point 10 is before the last jump ended` has been fixed, [#2157](https://github.com/Rdatatable/data.table/issues/2157). Thanks to Frank Erickson and Artem Klevtsov for reporting with example files which are now added to the test suite. - -13. `CJ()` no longer loses attribute information, [#2029](https://github.com/Rdatatable/data.table/issues/2029). Thanks to @MarkusBonsch and @royalts for the pull request. - -14. `split.data.table` respects `factor` ordering in `by` argument, [#2082](https://github.com/Rdatatable/data.table/issues/2082). 
Thanks to @MichaelChirico for identifying and fixing the issue. - -15. `.SD` would incorrectly include symbol on lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR, and @ProfFancyPants for reporting a regression introduced in the PR. Closes [#2326](https://github.com/Rdatatable/data.table/issues/2326) and [#2338](https://github.com/Rdatatable/data.table/issues/2338). - -16. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250). - -17. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). - -18. `isoweek` calculation is correct regardless of local timezone setting (`Sys.timezone()`), [#2407](https://github.com/Rdatatable/data.table/issues/2407). Thanks to @MoebiusAV and @SimonCoulombe for reporting and @MichaelChirico for fixing. - -19. Fixed `as.xts.data.table` to support all xts supported time based index classes [#2408](https://github.com/Rdatatable/data.table/issues/2408). Thanks to @ebs238 for reporting and for the PR. - -20. A memory leak when a very small number such as `0.58E-2141` is bumped to type `character` is resolved, [#918](https://github.com/Rdatatable/data.table/issues/918). - -21. The edge case `setnames(data.table(), character(0))` now works rather than error, [#2452](https://github.com/Rdatatable/data.table/issues/2452). - -22. Order of rows returned in non-equi joins were incorrect in certain scenarios as reported under [#1991](https://github.com/Rdatatable/data.table/issues/1991). This is now fixed. Thanks to @Henrik-P for reporting. - -23.
Non-equi joins work as expected when `x` in `x[i, on=...]` is a 0-row data.table. Closes [#1986](https://github.com/Rdatatable/data.table/issues/1986). - -24. Non-equi joins along with `by=.EACHI` returned incorrect result in some rare cases as reported under [#2360](https://github.com/Rdatatable/data.table/issues/2360). This is fixed now. This fix also takes care of [#2275](https://github.com/Rdatatable/data.table/issues/2275). Thanks to @ebs238 for the nice minimal reproducible report, @Mihael for asking on SO and to @Frank for following up on SO and filing an issue. - -25. `by=.EACHI` works now when `list` columns are being returned and some join values are missing, [#2300](https://github.com/Rdatatable/data.table/issues/2300). Thanks to @jangorecki and @franknarf1 for the reproducible examples which have been added to the test suite. - -26. Indices are now retrieved by exact name, [#2465](https://github.com/Rdatatable/data.table/issues/2465). This prevents usage of wrong indices as well as unexpected row reordering in join results. Thanks to @pannnda for reporting and providing a reproducible example and to @MarkusBonsch for fixing. - -27. `setnames` of whole table when original table had `NA` names skipped replacing those, [#2475](https://github.com/Rdatatable/data.table/issues/2475). Thanks to @franknarf1 and [BenoitLondon on StackOverflow](https://stackoverflow.com/questions/47228836/) for the report and @MichaelChirico for fixing. - -28. `CJ()` works with multiple empty vectors now [#2511](https://github.com/Rdatatable/data.table/issues/2511). Thanks to @MarkusBonsch for fixing. - -29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). 
This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again. - -30. "Negative length vectors not allowed" error when grouping `median` and `var` fixed, [#2046](https://github.com/Rdatatable/data.table/issues/2046) and [#2111](https://github.com/Rdatatable/data.table/issues/2111). Thanks to @caneff and @osofr for reporting and to @kmillar for debugging and explaining the cause. - -31. Fixed a bug on Windows where `data.table`s containing non-UTF8 strings in `key`s were not properly sorted, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing. - -32. `x.` prefixes during joins sometimes resulted in a "column not found" error. This is now fixed. Closes [#2313](https://github.com/Rdatatable/data.table/issues/2313). Thanks to @franknarf1 for the MRE. - -33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing. - -34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`. -`merge.data.table()` gains the `no.dups` argument (default TRUE) to match the corresponding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)` the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons. -In addition, where duplicate column names arise anyway (i.e.
`suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. -Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653) - -35. `CJ()` now fails with proper error message when results would exceed max integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636). - -36. `NA` in character columns now display as `` just like base R to distinguish from `""` and `"NA"`. - -37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708). - -38. Fixed a bug on Windows that `data.table` may break if the garbage collecting was triggered when sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674). - -39. Internal aliasing of `.` to `list` was over-aggressive in applying `list` even when `.` was intended within `bquote`, [#1912](https://github.com/Rdatatable/data.table/issues/1912). Thanks @MichaelChirico for reporting/filing and @ecoRoland for suggesting and testing a fix. - -40. Attempt to allocate a wildly large amount of RAM (16EB) when grouping by key and there are close to 2 billion 1-row groups, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks to @jsams for the detailed report. - -41. Fix a bug that `print(dt, class=TRUE)` shows only `topn - 1` rows. Thanks to @heavywatal for reporting [#2803](https://github.com/Rdatatable/data.table/issues/2803) and filing [PR#2804](https://github.com/Rdatatable/data.table/pull/2804). - -## NOTES - -0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. 
[PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. - -1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing. - -2. Test 1675.1 updated to cope with a change in R-devel in June 2017 related to `factor()` and `NA` levels. - -3. Package `ezknitr` has been added to the whitelist of packages that run user code and should be consider data.table-aware, [#2266](https://github.com/Rdatatable/data.table/issues/2266). Thanks to Matt Mills for testing and reporting. - -4. Printing with `quote = TRUE` now quotes column names as well, [#1319](https://github.com/Rdatatable/data.table/issues/1319). Thanks @jan-glx for the suggestion and @MichaelChirico for the PR. - -5. Added a blurb to `?melt.data.table` explicating the subtle difference in behavior of the `id.vars` argument vis-a-vis its analog in `reshape2::melt`, [#1699](https://github.com/Rdatatable/data.table/issues/1699). Thanks @MichaelChirico for uncovering and filing. - -6. Added some clarification about the usage of `on` to `?data.table`, [#2383](https://github.com/Rdatatable/data.table/issues/2383). Thanks to @peterlittlejohn for volunteering his confusion and @MichaelChirico for brushing things up. - -7. Clarified that "data.table always sorts in `C-locale`" means that upper-case letters are sorted before lower-case letters by ordering in data.table (e.g. `setorder`, `setkey`, `DT[order(...)]`). Thanks to @hughparsonage for the pull request editing the documentation. Note this makes no difference in most cases of data; e.g. ids where only uppercase or lowercase letters are used (`"AB123"<"AC234"` is always true, regardless), or country names and words which are consistently capitalized. 
For example, `"America" < "Brazil"` is not affected (it's always true), and neither is `"america" < "brazil"` (always true too); since the first letter is consistently capitalized. But, whether `"america" < "Brazil"` (the words are not consistently capitalized) is true or false in base R depends on the locale of your R session. In America it is true by default and false if you i) type `Sys.setlocale(locale="C")`, ii) the R session has been started in a C locale for you which can happen on servers/services (the locale comes from the environment the R session is started in). However, `"america" < "Brazil"` is always, consistently false in data.table which can be a surprise because it differs to base R by default in most regions. It is false because `"B"<"a"` is true because all upper-case letters come first, followed by all lower case letters (the ascii number of each letter determines the order, which is what is meant by `C-locale`). - -8. `data.table`'s dependency has been moved forward from R 3.0.0 (Apr 2013) to R 3.1.0 (Apr 2014; i.e. 3.5 years old). We keep this dependency as old as possible for as long as possible as requested by users in managed environments. Thanks to Jan Gorecki, the test suite from latest dev now runs on R 3.1.0 continuously, as well as R-release (currently 3.4.2) and latest R-devel snapshot. The primary motivation for the bump to R 3.1.0 was allowing one new test which relies on better non-copying behaviour in that version, [#2484](https://github.com/Rdatatable/data.table/issues/2484). It also allows further internal simplifications. Thanks to @MichaelChirico for fixing another test that failed on R 3.1.0 due to slightly different behaviour of `base::read.csv` in R 3.1.0-only which the test was comparing to, [#2489](https://github.com/Rdatatable/data.table/pull/2489). - -9. New vignette added: _Importing data.table_ - focused on using data.table as a dependency in R packages. Answers most commonly asked questions and promote good practices.
- -10. As warned in v1.9.8 release notes below in this file (25 Nov 2016) it has been 1 year since then and so use of `options(datatable.old.unique.by.key=TRUE)` to restore the old default is now deprecated with warning. The new warning states that this option still works and repeats the request to pass `by=key(DT)` explicitly to `unique()`, `duplicated()`, `uniqueN()` and `anyDuplicated()` and to stop using this option. In another year, this warning will become error. Another year after that the option will be removed. - -11. As `set2key()` and `key2()` have been warning since v1.9.8 (Nov 2016), their warnings have now been upgraded to errors. Note that when they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' in NEWS item 4. They will be removed in one year. - - ``` - Was warning: set2key() will be deprecated in the next relase. Please use setindex() instead. - Now error: set2key() is now deprecated. Please use setindex() instead. - ``` - -12. The option `datatable.showProgress` is no longer set to a default value when the package is loaded. Instead, the `default=` argument of `getOption` is used by both `fwrite` and `fread`. The default is the result of `interactive()` at the time of the call. Using `getOption` in this way is intended to be more helpful to users looking at `args(fread)` and `?fread`. - -13. `print.data.table()` invisibly returns its first argument instead of `NULL`. This behavior is compatible with the standard `print.data.frame()` and tibble's `print.tbl_df()`. Thanks to @heavywatal for [PR#2807](https://github.com/Rdatatable/data.table/pull/2807) - - -# data.table v1.10.4-3 (20 Oct 2017) - -1. Fixed crash/hang on MacOS when `parallel::mclapply` is used and data.table is merely loaded, [#2418](https://github.com/Rdatatable/data.table/issues/2418). Oddly, all tests including test 1705 (which tests `mclapply` with data.table) passed fine on CRAN. 
It appears to be some versions of MacOS or some versions of libraries on MacOS, perhaps. Many thanks to Martin Morgan for reporting and confirming this fix works. Thanks also to @asenabouth, Joe Thorley and Danton Noriega for testing, debugging and confirming that automatic parallelism inside data.table (such as `fwrite`) works well even on these MacOS installations. See also news items below for 1.10.4-1 and 1.10.4-2. - - -# data.table v1.10.4-2 (12 Oct 2017) - -1. OpenMP on MacOS is now supported by CRAN and included in CRAN's package binaries for Mac. But installing v1.10.4-1 from source on MacOS failed when OpenMP was not enabled at compile time, [#2409](https://github.com/Rdatatable/data.table/issues/2409). Thanks to Liz Macfie and @fupangpangpang for reporting. The startup message when OpenMP is not enabled has been updated. - -2. Two rare potential memory faults fixed, thanks to CRAN's automated use of latest compiler tools; e.g. clang-5 and gcc-7 - - -# data.table v1.10.4-1 (09 Oct 2017) - -1. The `nanotime` v0.2.0 update (June 2017) changed from `integer64` to `S4` and broke `fwrite` of `nanotime` columns. Fixed to work with `nanotime` both before and after v0.2.0. - -2. Pass R-devel changes related to `deparse(,backtick=)` and `factor()`. - -3. Internal `NAMED()==2` now `MAYBE_SHARED()`, [#2330](https://github.com/Rdatatable/data.table/issues/2330). Back-ported to pass under the stated dependency, R 3.0.0. - -4. Attempted improvement on Mac-only when the `parallel` package is used too (which forks), [#2137](https://github.com/Rdatatable/data.table/issues/2137). Intel's OpenMP implementation appears to leave threads running after the OpenMP parallel region (inside data.table) has finished unlike GNU libgomp. So, if and when `parallel`'s `fork` is invoked by the user after data.table has run in parallel already, instability occurs. The problem only occurs with Mac package binaries from CRAN because they are built by CRAN with Intel's OpenMP library. 
No known problems on Windows or Linux and no known problems on any platform when `parallel` is not used. If this Mac-only fix still doesn't work, call `setDTthreads(1)` immediately after `library(data.table)` which has been reported to fix the problem by putting `data.table` into single threaded mode earlier. - -5. When `fread()` and `print()` see `integer64` columns are present but package `bit64` is not installed, the warning is now displayed as intended. Thanks to a question by Santosh on r-help and forwarded by Bill Dunlap. - - -# data.table v1.10.4 (01 Feb 2017) - -## BUG FIXES - -1. The new specialized `nanotime` writer in `fwrite()` type punned using `*(long long *)&REAL(column)[i]` which, strictly, is undefined behavour under C standards. It passed a plethora of tests on linux (gcc 5.4 and clang 3.8), win-builder and 6 out 10 CRAN flavours using gcc. But failed (wrong data written) with the newest version of clang (3.9.1) as used by CRAN on the failing flavors, and solaris-sparc. Replaced with the union method and added a grep to CRAN_Release.cmd. - - -# data.table v1.10.2 (31 Jan 2017) - -## NEW FEATURES - -1. When `j` is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. - - ```R - myCols = c("colA","colB") - DT[, myCols, with=FALSE] - DT[, ..myCols] # same - ``` - - When you see the `..` prefix think _one-level-up_ like the directory `..` in all operating systems meaning the parent directory. In future the `..` prefix could be made to work on all symbols apearing anywhere inside `DT[...]`. It is intended to be a convenient way to protect your code from accidentally picking up a column name. Similar to how `x.` and `i.` prefixes (analogous to SQL table aliases) can already be used to disambiguate the same column name present in both `x` and `i`. 
A symbol prefix rather than a `..()` _function_ will be easier for us to optimize internally and more convenient if you have many variables in calling scope that you wish to use in your expressions safely. This feature was first raised in 2012 and long wished for, [#633](https://github.com/Rdatatable/data.table/issues/633). It is experimental. - -2. When `fread()` or `print()` see `integer64` columns are present, `bit64`'s namespace is now automatically loaded for convenience. - -3. `fwrite()` now supports the new [`nanotime`](https://cran.r-project.org/package=nanotime) type by Dirk Eddelbuettel, [#1982](https://github.com/Rdatatable/data.table/issues/1982). Aside: `data.table` already automatically supported `nanotime` in grouping and joining operations via longstanding support of its underlying `integer64` type. - -4. `indices()` gains a new argument `vectors`, default `FALSE`. This strsplits the index names by `__` for you, [#1589](https://github.com/Rdatatable/data.table/issues/1589). - - ```R - DT = data.table(A=1:3, B=6:4) - setindex(DT, B) - setindex(DT, B, A) - indices(DT) - [1] "B" "B__A" - indices(DT, vectors=TRUE) - [[1]] - [1] "B" - [[2]] - [1] "B" "A" - ``` - -## BUG FIXES - -1. Some long-standing potential instability has been discovered and resolved many thanks to a detailed report from Bill Dunlap and Michael Sannella. At C level any call of the form `setAttrib(x, install(), allocVector())` can be unstable in any R package. Despite `setAttrib()` PROTECTing its inputs, the 3rd argument (`allocVector`) can be executed first only for its result to to be released by `install()`'s potential GC before reaching `setAttrib`'s PROTECTion of its inputs. Fixed by either PROTECTing or pre-`install()`ing. Added to CRAN_Release.cmd procedures: i) `grep`s to prevent usage of this idiom in future and ii) running data.table's test suite with `gctorture(TRUE)`. - -2. 
A new potential instability introduced in the last release (v1.10.0) in GForce optimized grouping has been fixed by reverting one change from malloc to R_alloc. Thanks again to Michael Sannella for the detailed report. - -3. `fwrite()` could write floating point values incorrectly, [#1968](https://github.com/Rdatatable/data.table/issues/1968). A thread-local variable was incorrectly thread-global. This variable's usage lifetime is only a few clock cycles so it needed large data and many threads for several threads to overlap their usage of it and cause the problem. Many thanks to @mgahan and @jmosser for finding and reporting. - -## NOTES - -1. `fwrite()`'s `..turbo` option has been removed as the warning message warned. If you've found a problem, please [report it](https://github.com/Rdatatable/data.table/issues). - -2. No known issues have arisen due to `DT[,1]` and `DT[,c("colA","colB")]` now returning columns as introduced in v1.9.8. However, as we've moved forward by setting `options('datatable.WhenJisSymbolThenCallingScope'=TRUE)` introduced then too, it has become clear a better solution is needed. All 340 CRAN and Bioconductor packages that use data.table have been checked with this option on. 331 lines would need to be changed in 59 packages. Their usage is elegant, correct and recommended, though. Examples are `DT[1, encoding]` in quanteda and `DT[winner=="first", freq]` in xgboost. These are looking up the columns `encoding` and `freq` respectively and returning them as vectors. But if, for some reason, those columns are removed from `DT` and `encoding` or `freq` are still variables in calling scope, their values in calling scope would be returned. Which cannot be what was intended and could lead to silent bugs. That was the risk we were trying to avoid.
-`options('datatable.WhenJisSymbolThenCallingScope')` is now removed. A migration timeline is no longer needed. The new strategy needs no code changes and has no breakage. It was proposed and discussed in point 2 [here](https://github.com/Rdatatable/data.table/issues/1188#issuecomment-127824969), as follows.
-When `j` is a symbol (as in the quanteda and xgboost examples above) it will continue to be looked up as a column name and returned as a vector, as has always been the case. If it's not a column name however, it is now a helpful error explaining that data.table is different to data.frame and what to do instead (use `..` prefix or `with=FALSE`). The old behaviour of returning the symbol's value in calling scope can never have been useful to anybody and therefore not depended on. Just as the `DT[,1]` change could be made in v1.9.8, this change can be made now. This change increases robustness with no downside. Rerunning all 340 CRAN and Bioconductor package checks reveal 2 packages throwing the new error: partools and simcausal. Their maintainers have been informed that there is a likely bug on those lines due to data.table's (now remedied) weakness. This is exactly what we wanted to reveal and improve. - -3. As before, and as we can see is in common use in CRAN and Bioconductor packages using data.table, `DT[,myCols,with=FALSE]` continues to lookup `myCols` in calling scope and take its value as column names or numbers. You can move to the new experimental convenience feature `DT[, ..myCols]` if you wish at leisure. - - -# data.table v1.10.0 (03 Dec 2016) - -## BUG FIXES - -1. `fwrite(..., quote='auto')` already quoted a field if it contained a `sep` or `\n`, or `sep2[2]` when `list` columns are present. Now it also quotes a field if it contains a double quote (`"`) as documented, [#1925](https://github.com/Rdatatable/data.table/issues/1925). Thanks to Aki Matsuo for reporting. Tests added. The `qmethod` tests did test escaping embedded double quotes, but only when `sep` or `\n` was present in the field as well to trigger the quoting of the field. - -2. Fixed 3 test failures on Solaris only, [#1934](https://github.com/Rdatatable/data.table/issues/1934). 
Two were on both sparc and x86 and related to a `tzone` attribute difference between `as.POSIXct` and `as.POSIXlt` even when passed the default `tz=""`. The third was on sparc only: a minor rounding issue in `fwrite()` of 1e-305. - -3. Regression crash fixed when 0's occur at the end of a non-empty subset of an empty table, [#1937](https://github.com/Rdatatable/data.table/issues/1937). Thanks Arun for tracking down. Tests added. For example, subsetting the empty `DT=data.table(a=character())` with `DT[c(1,0)]` should return a 1 row result with one `NA` since 1 is past the end of `nrow(DT)==0`, the same result as `DT[1]`. - -4. Fixed newly reported crash that also occurred in old v1.9.6 when `by=.EACHI`, `nomatch=0`, the first item in `i` has no match AND `j` has a function call that is passed a key column, [#1933](https://github.com/Rdatatable/data.table/issues/1933). Many thanks to Reino Bruner for finding and reporting with a reproducible example. Tests added. - -5. Fixed `fread()` error occurring for a subset of Windows users: `showProgress is not type integer but type 'logical'.`, [#1944](https://github.com/Rdatatable/data.table/issues/1944) and [#1111](https://github.com/Rdatatable/data.table/issues/1111). Our tests cover this usage (it is just default usage), pass on AppVeyor (Windows), win-builder (Windows) and CRAN's Windows so perhaps it only occurs on a specific and different version of Windows to all those. Thanks to @demydd for reporting. Fixed by using strictly `logical` type at R level and `Rboolean` at C level, consistently throughout. - -6. Combining `on=` (new in v1.9.6) with `by=` or `keyby=` gave incorrect results, [#1943](https://github.com/Rdatatable/data.table/issues/1943). Many thanks to Henrik-P for the detailed and reproducible report. Tests added. - -7. New function `rleidv` was ignoring its `cols` argument, [#1942](https://github.com/Rdatatable/data.table/issues/1942). Thanks Josh O'Brien for reporting. Tests added. - -## NOTES - -1. 
It seems OpenMP is not available on CRAN's Mac platform; NOTEs appeared in [CRAN checks](https://cran.r-project.org/web/checks/check_results_data.table.html) for v1.9.8. Moved `Rprintf` from `init.c` to `packageStartupMessage` to avoid the NOTE as requested urgently by Professor Ripley. Also fixed the bad grammar of the message: 'single threaded' now 'single-threaded'. If you have a Mac and run macOS or OS X on it (I run Ubuntu on mine) please contact CRAN maintainers and/or Apple if you'd like CRAN's Mac binary to support OpenMP. Otherwise, please follow [these instructions for OpenMP on Mac](https://github.com/Rdatatable/data.table/wiki/Installation) which people have reported success with. - -2. Just to state explicitly: data.table does not now depend on or require OpenMP. If you don't have it (as on CRAN's Mac it appears but not in general on Mac) then data.table should build, run and pass all tests just fine. - -3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://app.codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. - -4. The FAQ vignette has been revised given the changes in v1.9.8. In particular, the very first FAQ. - -5. With hindsight, the last release v1.9.8 should have been named v1.10.0 to convey it wasn't just a patch release from .6 to .8 owing to the 'potentially breaking changes' items. Thanks to @neomantic for correctly pointing out. The best we can do now is now bump to 1.10.0. 
- - -# data.table v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) diff --git a/NEWS.md b/NEWS.md index 48f7c529e8..7f54fdec70 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,12 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -# data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) +**Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -## BREAKING CHANGE +# data.table [v1.14.3](https://github.com/Rdatatable/data.table/milestone/20) (in development) -1. `shift` and `nafill` will now raise error `input must not be matrix or array` when `matrix` or `array` is provided on input, rather than giving useless result, [#5287](https://github.com/Rdatatable/data.table/issues/5287). Thanks to @ethanbsmith for reporting. +## POTENTIALLY BREAKING CHANGES + +1. Rolling functions `frollmean` and `frollsum` used to treat `Inf` and `-Inf` as `NA` when using the default `algo="fast"`. This has now been changed and infinite values are no longer treated as `NA`. If your input to those functions contains `Inf` or `-Inf` then you will be affected by this change, [#5441](https://github.com/Rdatatable/data.table/pull/5441). ## NEW FEATURES @@ -111,8 +113,15 @@ 21. `melt()` was pseudo generic in that `melt(DT)` would dispatch to the `melt.data.table` method but `melt(not-DT)` would explicitly redirect to `reshape2`. Now `melt()` is standard generic so that methods can be developed in other packages, [#4864](https://github.com/Rdatatable/data.table/pull/4864). Thanks to @odelmarcelle for suggesting and implementing. +22. `DT(i, j, by, ...)` has been added, i.e.
functional form of a `data.table` query, [#641](https://github.com/Rdatatable/data.table/issues/641) [#4872](https://github.com/Rdatatable/data.table/issues/4872). Thanks to Yike Lu and Elio Campitelli for filing requests, many others for comments and suggestions, and Matt Dowle for the PR. This enables the `data.table` general form query to be invoked on a `data.frame` without converting it to a `data.table` first. The class of the input object is retained. Thanks to Mark Fairbanks and Boniface Kamgang for testing and reporting problems that have been fixed before release, [#5106](https://github.com/Rdatatable/data.table/issues/5106) [#5107](https://github.com/Rdatatable/data.table/issues/5107). + + ```R + mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) + ``` + + When `data.table` queries (either `[...]` or `|> DT(...)`) receive a `data.table`, the operations maintain `data.table`'s attributes such as its key and any indices. For example, if a `data.table` is reordered by `data.table`, or a key column has a value changed by `:=` in `data.table`, its key and indices will either be dropped or reordered appropriately. Some `data.table` operations automatically add and store an index on a `data.table` for reuse in future queries, if `options(datatable.auto.index=TRUE)`, which is `TRUE` by default. `data.table`'s are also over-allocated, which means there are spare column pointer slots allocated in advance so that a `data.table` in the `.GlobalEnv` can have a column added to it truly by reference, like an in-memory database with multiple client sessions connecting to one server R process, as a `data.table` video has shown in the past. But because R and other packages don't maintain `data.table`'s attributes or over-allocation (e.g. 
a subset or reorder by R or another package will create invalid `data.table` attributes) `data.table` cannot use these attributes when it detects that base R or another package has touched the `data.table` in the meantime, even if the attributes may sometimes still be valid. So, please realize that, `DT()` on a `data.table` should realize better speed and memory usage than `DT()` on a `data.frame`. `DT()` on a `data.frame` may still be useful to use `data.table`'s syntax (e.g. sub-queries within group: `|> DT(i, .SD[sub-query], by=grp)`) without needing to convert to a `data.table` first. -22. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. +23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. ```R DT = data.table(A=1:3) @@ -130,13 +139,13 @@ # 2: 3 ``` -23. 
`DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. +24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. -24. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. +25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. -25. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. +26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. -26. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). 
Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. +27. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. ```R # Usage @@ -164,11 +173,11 @@ # c(tail(x, 1), head(x, -1)) 6.96 7.16 7.49 7.32 7.64 8.60 10 ``` -27. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. +28. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. -28. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. +29. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. -29. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). 
Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. +30. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. ```R N = 1e7 @@ -202,7 +211,7 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` -30. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`, [#5444](https://github.com/Rdatatable/data.table/issues/5444). Thanks to @sindribaldur for testing dev and filing a bug report which was fixed before release. +31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.` ```R DT1 @@ -246,12 +255,12 @@ # 3: 3 NA # 4: 4 NA ``` + +32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. -31. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. - -32. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). 
Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. +33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. -33. `DT[, let(...)]` is a new alias for the functional form of `:=`; i.e. `DT[, ':='(...)]`, [#3795](https://github.com/Rdatatable/data.table/issues/3795). Thanks to Elio Campitelli for requesting, and Benjamin Schwendinger for the PR. +34. `DT[, let(...)]` is a new alias for the functional form of `:=`; i.e. `DT[, ':='(...)]`, [#3795](https://github.com/Rdatatable/data.table/issues/3795). Thanks to Elio Campitelli for requesting, and Benjamin Schwendinger for the PR. ```R DT = data.table(A=1:2) @@ -262,16 +271,16 @@ # 1: 1 3 a # 2: 2 4 b ``` + +35. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. -34. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. +36. `as.xts.data.table()` now supports non-numeric xts coredata matrices, [#5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric-only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatibility and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now use `data.table::as.matrix`, with all its performance benefits.
Thanks to @ethanbsmith for the report and fix. -35. `as.xts.data.table()` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. +37. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. -36. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. +38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. Thanks to @clerousset, @dcaseykc, @OfekShilon, and @SeanShao98 for testing dev and filing detailed bug reports which were fixed before release and their tests added to the test suite. -37. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. 
Thanks to @clerousset, @dcaseykc, @OfekShilon, and @SeanShao98 for testing dev and filing detailed bug reports which were fixed before release and their tests added to the test suite. - -38. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. +39. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. ```R DT @@ -279,7 +288,7 @@ # # 1: 3 5 # 2: 4 6 - + DT[, sum(.SD), by=.I] # I V1 # @@ -287,11 +296,47 @@ # 2: 2 10 ``` -39. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. Thanks to @berg-michael for testing dev and filing a bug report for special case of missing values which was fixed before release. - -40. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. - -41. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). 
`mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. +40. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. + +41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. + +42. Multiple improvements have been added to rolling functions. The request came from @gpierard who needed a left-aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). There was no `frollmax` function yet. Adaptive rolling functions did not have support for `align="left"`. `frollapply` did not support `adaptive=TRUE`. Available alternatives were base R `mapply` or self-join using `max` and grouping `by=.EACHI`. As a follow-up to his request, the following features have been added: +- new function `frollmax`, applies `max` over a rolling window. +- support for `align="left"` for adaptive rolling function. +- support for `adaptive=TRUE` in `frollapply`. +- `partial` argument to trim window width to available observations rather than returning `NA` whenever window is not complete.
+- `give.names` argument that can be used to automatically give the names based on the names of `x` and `n`. +- `frollmean` and `frollsum` no longer treat `Inf` and `-Inf` as `NA`s as was previously the case for `algo="fast"` (breaking change). +- `hasNA` argument has been renamed to `has.nf` to convey that it is not only related to `NA/NaN` but other non-finite values (`Inf/-Inf`) as well. + +For a comprehensive description of all available features see the `?froll` manual. + +Adaptive `frollmax` has been observed to be up to 50 times faster than the second fastest solution (data.table self-join using `max` and grouping `by=.EACHI`). Note that an important factor in performance is the width of the rolling window. Code for the benchmark below has been taken from [this SO answer](https://stackoverflow.com/a/73408459/2490497). +```r +set.seed(108) +setDTthreads(8) +x = data.table( + value = cumsum(rnorm(1e6, 0.1)), + end_window = 1:1e6 + sample(50:500, 1e6, TRUE), + row = 1:1e6 +)[, "end_window" := pmin(end_window, .N) + ][, "len_window" := end_window-row+1L] + +baser = function(x) x[, mapply(function(from, to) max(value[from:to]), row, end_window)] +sj = function(x) x[x, max(value), on=.(row >= row, row <= end_window), by=.EACHI]$V1 +frmax = function(x) x[, frollmax(value, len_window, adaptive=TRUE, align="left", has.nf=FALSE)] +frapply = function(x) x[, frollapply(value, len_window, max, adaptive=TRUE, align="left")] +microbenchmark::microbenchmark( + baser(x), sj(x), frmax(x), frapply(x), + times=10, check="identical" +) +#Unit: milliseconds +# expr min lq mean median uq max neval +# baser(x) 5181.36076 5417.57505 5537.2929 5494.73652 5706.2721 5818.6627 10 +# sj(x) 4608.28940 4627.57186 4792.4031 4785.35306 4856.4475 5054.3301 10 +# frmax(x) 70.41253 75.28659 91.3774 91.40227 102.0248 116.8622 10 +# frapply(x) 713.23108 742.34657 865.2524 848.31641 965.3599 1114.0531 10 +``` ## BUG FIXES @@ -331,7 +376,7 @@ 18.
`as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR. -19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. +19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. 20. `uniqueN(DT, by=character())` is now equivalent to `uniqueN(DT)` rather than internal error `'by' is either not integer or is length 0`, [#4594](https://github.com/Rdatatable/data.table/issues/4594). Thanks Marco Colombo for the report, and Michael Chirico for the PR. Similarly for `unique()`, `duplicated()` and `anyDuplicated()`. @@ -549,17 +594,6 @@ 53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). 
Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance. -54. `data.table(...)` unnamed arguments are deparsed in an attempt to name the columns but when called from `do.call()` the input data itself was deparsed taking a very long time, [#5501](https://github.com/Rdatatable/data.table/pull/5501). Many thanks to @OfekShilon for the report and fix, and @michaelchirico for guidance. Unnamed arguments to `data.table(...)` may now be faster in other cases not involving `do.call()` too; e.g. expressions spanning a lot of lines or other function call constructions that led to the data itself being deparsed. - - ```R - DF = data.frame(a=runif(1e6), b=runif(1e6)) - DT1 = data.table(DF) # 0.02s before and after - DT2 = do.call(data.table, list(DF)) # 3.07s before, 0.02s after - identical(DT1, DT2) # TRUE - ``` - -55. `fread(URL)` with `https:` and `ftps:` could timeout if proxy settings were not guessed right by `curl::curl_download`, [#1686](https://github.com/Rdatatable/data.table/issues/1686). `fread(URL)` now uses `download.file()` as default for downloading files from urls. Thanks to @cderv for the report and Benjamin Schwendinger for the fix. - ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. 
For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -594,7 +628,11 @@ 12. `?merge` and `?setkey` have been updated to clarify that the row order is retained when `sort=FALSE`, and why `NA`s are always first when `sort=TRUE`, [#2574](https://github.com/Rdatatable/data.table/issues/2574) [#2594](https://github.com/Rdatatable/data.table/issues/2594). Thanks to Davor Josipovic and Markus Bonsch for the reports, and Jan Gorecki for the PR. -13. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : +13. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. + + > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. + +14. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : ``` The option 'datatable.nomatch' is being used and is not set to the default NA. This option @@ -605,9 +643,1476 @@ The message is now upgraded to warning that the option is now ignored. -14. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default.
They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. +15. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). + +16. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. + +17. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has also passed all tests. As such we don't expect any backwards compatibility concerns. + + +# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) + +## NOTES + +1. clang 13.0.0 (Sep 2021) requires the system header `omp.h` to be included before R's headers, [#5122](https://github.com/Rdatatable/data.table/issues/5122). Many thanks to Prof Ripley for testing and providing a patch file. + + +# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) + +## POTENTIALLY BREAKING CHANGES + +1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. 
We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. + + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://www.rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + + `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. 
As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. + + The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. + +## BUG FIXES + +1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. + +2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. + +## NOTES + +1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. + +2. `r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. 
The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage being forwarded to being `https:` for many years. Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. + + +# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) + +## BUG FIXES + +1. Grouping could throw an error `Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. + +2. `fwrite()`'s multithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source.
The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. + + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. + +## NOTES + +1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmission reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. 
In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. + + +# data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) + +## BUG FIXES + +1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. + +2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the dynamic schedule. Although never guaranteed by the OpenMP standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, so `fsort` now checks that threads are receiving iterations monotonically and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report. 
To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. OpenMP 4.5 may be enabled in some compilers using `-fopenmp-version=45`. Otherwise, if you need to upgrade compiler, https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. + +3. Columns containing functions that don't inherit the class `'function'` would fail to group, [#4814](https://github.com/Rdatatable/data.table/issues/4814). Thanks @mb706 for reporting, @ecoRoland2 for helping investigate, and @Coorsaa for a follow-up example involving environments. + +## NOTES + +1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). + +2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. 
The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. + +3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example which has also been added to the test suite. + + +# data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) + +## BUG FIXES + +1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. + +2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. + +3. Operating on columns of type `list`, e.g. 
`dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks, and thanks to Venelin Mitov for assistance in tracing the memory fault. Thanks also to Hongyuan Jia and @ben-schwen for assistance in debugging the fix in dev to pass reverse dependency testing which highlighted, before release, that package `eplusr` would fail. Its good usage has been added to `data.table`'s test suite. + +4. `fread("1.2\n", colClasses='integer')` (note no column names in the data) would segfault when creating a warning message, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present however, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <<A>> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting.
+ +5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. + +## NOTES + +1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. + + The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. + + We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. + +2. `?.NGRP` now displays the help page as intended, [#4649](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file.
+ +3. `test.data.table()` failed in non-English locales such as `LC_TIME=fr_FR.UTF-8` due to `Jan` vs `janv.` in tests 168 and 2042, [#3450](https://github.com/Rdatatable/data.table/issues/3450). Thanks to @shrektan for reporting, and @tdhock for making the tests locale-aware. + +4. User-supplied `PKG_LIBS` and `PKG_CFLAGS` are now retained and the suggestion in https://mac.r-project.org/openmp/; i.e., + `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_<ver>.tar.gz` +has a better chance of working on Mac. + + +# data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) + +## POTENTIALLY BREAKING CHANGES + +1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then be read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly.
You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. + + Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. + + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. + +## NEW FEATURES + +1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). + +2. `CsubsetDT` C function is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. 
This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organization of our C interface will be changed in future. + +3. `print` method for `data.table` gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. + +4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion. + +5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. + +6. 
New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details. + + ```R + # Lazy evaluation + x = 1:10 + data.table::fcase( + x < 5L, 1L, + x >= 5L, 3L, + x == 5L, stop("provided value is an unexpected one!") + ) + # [1] 1 1 1 1 3 3 3 3 3 3 + + dplyr::case_when( + x < 5L ~ 1L, + x >= 5L ~ 3L, + x == 5L ~ stop("provided value is an unexpected one!") + ) + # Error in eval_tidy(pair$rhs, env = default_env) : + # provided value is an unexpected one! + + # Benchmark + x = sample(1:100, 3e7, replace = TRUE) # 114 MB + microbenchmark::microbenchmark( + dplyr::case_when( + x < 10L ~ 0L, + x < 20L ~ 10L, + x < 30L ~ 20L, + x < 40L ~ 30L, + x < 50L ~ 40L, + x < 60L ~ 50L, + x > 60L ~ 60L + ), + data.table::fcase( + x < 10L, 0L, + x < 20L, 10L, + x < 30L, 20L, + x < 40L, 30L, + x < 50L, 40L, + x < 60L, 50L, + x > 60L, 60L + ), + times = 5L, + unit = "s") + # Unit: seconds + # expr min lq mean median uq max neval + # dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 + # data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 + ``` + +7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `.SDcols=patterns(...)` can still be used for filtering based on the column names. + +8. 
Compiler support for OpenMP is now detected during installation, which allows `data.table` to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. + +9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. + +10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. + +11. `frank` gains `ties.method='last'`, paralleling the same in `base::order` which has been available since R 3.3.0 (April 2016), [#1689](https://github.com/Rdatatable/data.table/issues/1689). Thanks @abudis for the encouragement to accommodate this. + +12. The `keep.rownames` argument in `as.data.table.xts` now accepts a string, which can be used for specifying the column name of the index of the xts input, [#4232](https://github.com/Rdatatable/data.table/issues/4232). Thanks to @shrektan for the request and the PR. + +13. New symbol `.NGRP` available in `j`, [#1206](https://github.com/Rdatatable/data.table/issues/1206). `.GRP` (the group number) was already available taking values from `1` to `.NGRP`. The number of groups, `.NGRP`, might be useful in `j` to calculate a percentage of groups processed so far, or to do something different for the last or penultimate group, for example. + +14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. + +15. 
A new throttle feature has been introduced to speed up small data tasks that are repeated in a loop, [#3175](https://github.com/Rdatatable/data.table/issues/3175) [#3438](https://github.com/Rdatatable/data.table/issues/3438) [#3205](https://github.com/Rdatatable/data.table/issues/3205) [#3735](https://github.com/Rdatatable/data.table/issues/3735) [#3739](https://github.com/Rdatatable/data.table/issues/3739) [#4284](https://github.com/Rdatatable/data.table/issues/4284) [#4527](https://github.com/Rdatatable/data.table/issues/4527) [#4294](https://github.com/Rdatatable/data.table/issues/4294) [#1120](https://github.com/Rdatatable/data.table/issues/1120). The default throttle of 1024 means that a single thread will be used when nrow<=1024, two threads when nrow<=2048, etc. To change the default, use `setDTthreads(throttle=)`. Or use the new environment variable `R_DATATABLE_THROTTLE`. If you use `Sys.setenv()` in a running R session to change this environment variable, be sure to run an empty `setDTthreads()` call afterwards for the change to take effect; see `?setDTthreads`. The word *throttle* is used to convey that the number of threads is restricted (throttled) for small data tasks. Reducing throttle to 1 will turn off throttling and should revert behaviour to past versions (i.e. using many threads even for small data). Increasing throttle to, say, 65536 will utilize multi-threading only for larger datasets. The value 1024 is a guess. We welcome feedback and test results indicating what the best default should be. + +## BUG FIXES + +1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`), [#4085](https://github.com/Rdatatable/data.table/issues/4085). + +2. `DT[i]` could segfault when `i` is a zero-column `data.table`, [#4060](https://github.com/Rdatatable/data.table/issues/4060). Thanks @shrektan for reporting and fixing. + +3.
Dispatch of `first` and `last` functions now properly works again for `xts` objects, [#4053](https://github.com/Rdatatable/data.table/issues/4053). Thanks to @ethanbsmith for reporting. + +4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks also to @mllg for detecting a problem with the initial fix here during the dev release [#4173](https://github.com/Rdatatable/data.table/issues/4173). + +5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks @hongyuanjia and @ColeMiller1 for helping debug an issue in dev with the original fix before release, [#4612](https://github.com/Rdatatable/data.table/issues/4612). + +6. `all.equal(DT, y)` no longer errors when `y` is not a data.table, [#4042](https://github.com/Rdatatable/data.table/issues/4042). Thanks to @d-sci for reporting and the PR. + +7. A length 1 `colClasses=NA_character_` would cause `fread` to incorrectly coerce all columns to character, [#4237](https://github.com/Rdatatable/data.table/issues/4237). + +8. An `fwrite` error message could include a garbled number and cause test 1737.5 to fail, [#3492](https://github.com/Rdatatable/data.table/issues/3492). Thanks to @QuLogic for debugging the issue on ARMv7hl, and the PR fixing it. + +9. `fread` improves handling of very small (<1e-300) or very large (>1e+300) floating point numbers on non-x86 architectures (specifically ppc64le and armv7hl). Thanks to @QuLogic for reporting and fixing, [PR#4165](https://github.com/Rdatatable/data.table/pull/4165). + +10. When updating by reference, the use of `get` could result in columns being re-ordered silently, [#4089](https://github.com/Rdatatable/data.table/issues/4089). Thanks to @dmongin for reporting and Cole Miller for the fix. + +11. 
`copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. + +12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. + +13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). + +14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. + +15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422). + +16. Non-equi joins now automatically set `allow.cartesian=TRUE`, [#4489](https://github.com/Rdatatable/data.table/issues/4489). Thanks to @Henrik-P for reporting. + +17. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. + +18. `by=col1:col4` gave an incorrect result if `key(DT)==c("col1","col4")`, [#4285](https://github.com/Rdatatable/data.table/issues/4285). Thanks to @cbilot for reporting, and Cole Miller for the PR. + +19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. + +20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514).
It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). + +## NOTES + +0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. + +1. `as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). + +2. `set2key`, `set2keyv`, and `key2` have been removed, as they have been warning since v1.9.8 (Nov 2016) and halting with helpful message since v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' and quickly superseded by `setindex` and `indices`. + +3. `data.table` now supports messaging in simplified Chinese (locale `zh_CN`). 
This was the result of a monumental collaboration to translate `data.table`'s roughly 1400 warnings, errors, and verbose messages (about 16,000 words/100,000 characters) over the course of two months from volunteer translators in at least 4 time zones, most of whom are first-time `data.table` contributors and many of whom are first-time OSS contributors! + + A big thanks goes out to @fengqifang, @hongyuanjia, @biobai, @zhiiiyang, @Leo-Lee15, @soappp9527, @amy17519, @Zachary-Wu, @caiquanyou, @dracodoc, @JulianYlli12, @renkun-ken, @Xueliang24, @koohoko, @KingdaShi, @gaospecial, @shrektan, @sunshine1126, @shawnchen1996, @yc0802, @HesperusArcher, and @Emberwhirl, all of whom took time from their busy schedules to translate and review others' translations. Especial thanks goes to @zhiiiyang and @hongyuanjia who went above and beyond in helping to push the project over the finish line, and to @GuangchuangYu who helped to organize the volunteer pool. + + `data.table` joins `lubridate` and `nlme` as the only of the top 200 most-downloaded community packages on CRAN to offer non-English messaging, and is the only of the top 50 packages to offer complete support of all messaging. We hope this is a first step in broadening the reach and accessibility of the R ecosystem to more users globally and look forward to working with other maintainers looking to bolster the portability of their packages by offering advice on learnings from this undertaking. + + We would be remiss not to mention the laudable lengths to which the R core team goes to maintain the _much_ larger repository (about 6,000 messages in more than 10 languages) of translations for R itself. + + We will evaluate the feasibility (in terms of maintenance difficulty and CRAN package size limits) of offering support for other languages in later releases. + +4. `fifelse` and `fcase` now notify users that S4 objects (except `nanotime`) are not supported [#4135](https://github.com/Rdatatable/data.table/issues/4135). 
Thanks to @torema-ed for bringing it to our attention and Morgan Jacob for the PR. + +5. `frank(..., ties.method="random", na.last=NA)` now returns the same random ordering that `base::rank` does, [#4243](https://github.com/Rdatatable/data.table/pull/4243). + +6. The error message when mistakenly using `:=` in `i` instead of `j` has been much improved, [#4227](https://github.com/Rdatatable/data.table/issues/4227). Thanks to Hugh Parsonage for the detailed suggestion. + + ```R + > DT = data.table(A=1:2) + > DT[B:=3] + Error: Operator := detected in i, the first argument inside DT[...], but is only valid in + the second argument, j. Most often, this happens when forgetting the first comma + (e.g. DT[newvar:=5] instead of DT[, new_var:=5]). Please double-check the + syntax. Run traceback(), and debugger() to get a line number. + > DT[, B:=3] + > DT + A B + + 1: 1 3 + 2: 2 3 + ``` + +7. Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363). + +8. Changes upstream in R have been accommodated; e.g. `c.POSIXct` now raises `'origin' must be supplied` which impacted `foverlaps`, [#4428](https://github.com/Rdatatable/data.table/pull/4428). + +9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. + +10. `data.table` package binaries built by R version 3 (R3) should only be installed in R3, and similarly `data.table` package binaries built by R4 should only be installed in R4. Otherwise, `package ‘data.table’ was built under R version...` warning will occur which should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds, see news item in v1.12.6 below in this file.
To continue to support both R3 and R4, `data.table`'s NAMESPACE file contains a condition on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). + +11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. + +12. The `datatable.old.unique.by.key` option has been removed as per the 4 year schedule detailed in note 10 of v1.12.4 (Oct 2019), note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). It has been generating a helpful warning for 2 years, and helpful error for 1 year. + + +# data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) + +## NEW FEATURES + +1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061). + +## BUG FIXES + +1. 
`frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing. + +2. `DT[, sum(grp), by=grp]` (i.e. aggregating the same column being grouped) could error with `object 'grp' not found`, [#3103](https://github.com/Rdatatable/data.table/issues/3103). Thanks to @cbailiss for reporting. + +## NOTES + +1. Links in the manual were creating warnings when installing HTML, [#4000](https://github.com/Rdatatable/data.table/issues/4000). Thanks to Morgan Jacob. + +2. Adjustments for R-devel (R 4.0.0) which now has reference counting turned on, [#4058](https://github.com/Rdatatable/data.table/issues/4058) [#4093](https://github.com/Rdatatable/data.table/issues/4093). This motivated early release to CRAN because every day CRAN tests every package using the previous day's changes in R-devel; a much valued feature of the R ecosystem. It helps R-core if packages can pass changes in R-devel as soon as possible. Thanks to Luke Tierney for the notice, and for implementing reference counting which we look forward to very much. + +3. C internals have been standardized to use `PRI[u|d]64` to print `[u]int64_t`. This solves new warnings from `gcc-8` on Windows with `%lld`, [#4062](https://github.com/Rdatatable/data.table/issues/4062), in many cases already working around `snprintf` on Windows not supporting `%zu`. Release procedures have been augmented to prevent any internal use of `llu`, `lld`, `zu` or `zd`. + +4. `test.data.table()` gains `showProgress=interactive()` to suppress the thousands of `Running test id ...` lines displayed by CRAN checks when there are warnings or errors. + + +# data.table [v1.12.6](https://github.com/Rdatatable/data.table/milestone/18?closed=1) (18 Oct 2019) + +## BUG FIXES + +1. `shift()` on a `nanotime` with the default `fill=NA` now fills a `nanotime` missing value correctly, [#3945](https://github.com/Rdatatable/data.table/issues/3945). 
Thanks to @mschubmehl for reporting and fixing in PR [#3942](https://github.com/Rdatatable/data.table/pull/3942). + +2. Compilation failed on CRAN's MacOS due to an older version of `zlib.h/zconf.h` which did not have `z_const` defined, [#3939](https://github.com/Rdatatable/data.table/issues/3939). Other open-source projects unrelated to R have experienced this problem on MacOS too. We have followed the common practice of removing `z_const` to support the older `zlib` versions, and data.table's release procedures have gained a `grep` to ensure `z_const` isn't used again by accident in future. The library `zlib` is used for `fwrite`'s new feature of multithreaded compression on-the-fly; see item 3 of 1.12.4 below. + +3. A runtime error in `fwrite`'s compression, but only observed so far on Solaris 10 32bit with zlib 1.2.8 (Apr 2013), [#3931](https://github.com/Rdatatable/data.table/issues/3931): `Error -2: one or more threads failed to allocate buffers or there was a compression error.` In case it happens again, this area has been made more robust and the error more detailed. As is often the case, investigating the Solaris problem revealed secondary issues in the same area of the code. In this case, some `%d` in verbose output should have been `%lld`. This obliquity that CRAN's Solaris provides is greatly appreciated. + +4. A leak could occur in the event of an unsupported column type error, or if working memory could only partially be allocated; [#3940](https://github.com/Rdatatable/data.table/issues/3940). Found thanks to `clang`'s Leak Sanitizer (prompted by CRAN's diligent use of latest tools), and two tests in the test suite which tested the unsupported-type error. + +## NOTES + +1. Many thanks to Kurt Hornik for fixing R's S3 dispatch of `rbind` and `cbind` methods, [#3948](https://github.com/Rdatatable/data.table/issues/3948). 
With `R>=4.0.0` (current R-devel), `data.table` now registers the S3 methods `cbind.data.table` and `rbind.data.table`, and no longer applies the workaround documented in FAQ 2.24. + + +# data.table [v1.12.4](https://github.com/Rdatatable/data.table/milestone/16?closed=1) (03 Oct 2019) + +## NEW FEATURES + +1. `rleid()` functions now support long vectors (length > 2 billion). + +2. `fread()`: + * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, Roy Storey for the initial PR, and Bingjie Qian for testing this feature on a very complicated real-world file. + * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR. + * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR. + * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR. + * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). 
See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/leeper/rio) for the inspiration and @MichaelChirico for implementing. + * `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses` it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. For example: + + ```R + fread(file, select=c(colD="character", # returns 2 columns: colD,colA + colA="integer64")) + fread(file, select=list(character="colD", # returns 5 columns: colD,8,9,10,colA + integer= 8:10, + character="colA")) + ``` + * gains `tmpdir=` argument which is passed to `tempfile()` whenever a temporary file is needed. Thanks to @mschubmehl for the PR. As before, setting `TMPDIR` (to `/dev/shm` for example) before starting the R session still works too; see `?base::tempdir`. + +3. `fwrite()`: + * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example: + + ```R + DT = data.table(A=rep(1:2, 100e6), B=rep(1:4, 50e6)) + fwrite(DT, "data.csv") # 763MB; 1.3s + fwrite(DT, "data.csv.gz") # 2MB; 1.6s + identical(fread("data.csv.gz"), DT) + ``` + + Note that compression is handled using `zlib` library. In the unlikely event of missing `zlib.h`, on a machine that is compiling `data.table` from sources, one may get `fwrite.c` compilation error `zlib.h: No such file or directory`. 
As of now, the easiest solution is to install missing library using `sudo apt install zlib1g-dev` (Debian/Ubuntu). Installing R (`r-base-dev`) depends on `zlib1g-dev` so this should be rather uncommon. If it happens to you please upvote related issue [#3872](https://github.com/Rdatatable/data.table/issues/3872). + + * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. + + * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing. + + * Now supports type `complex`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). + + * Gains `scipen` [#2020](https://github.com/Rdatatable/data.table/issues/2020), the number 1 most-requested feature [#3189](https://github.com/Rdatatable/data.table/issues/3189). The default is `getOption("scipen")` so that `fwrite` will now respect R's option in the same way as `base::write.csv` and `base::format`, as expected. The parameter and option name have been kept the same as base R's `scipen` for consistency and to aid online search. It stands for 'scientific penalty'; i.e., the number of characters to add to the width within which non-scientific number format is used if it will fit. A high penalty essentially turns off scientific format. We believe that common practice is to use a value of 999, however, if you do use 999, because your data _might_ include very long numbers such as `10^300`, `fwrite` needs to account for the worst case field width in its buffer allocation per thread. This may impact space or time. 
If you experience slowdowns or unacceptable memory usage, please pass `verbose=TRUE` to `fwrite`, inspect the output, and report the issue. A workaround, until we can determine the best strategy, may be to pass a smaller value to `scipen`, such as 50. We have observed that `fwrite(DT, scipen=50)` appears to write `10^50` accurately, unlike base R. However, this may be a happy accident and not apply generally. Further work may be needed in this area. + + ```R + DT = data.table(a=0.0001, b=1000000) + fwrite(DT) + # a,b + # 1e-04,1e+06 + fwrite(DT,scipen=1) + # a,b + # 0.0001,1e+06 + fwrite(DT,scipen=2) + # a,b + # 0.0001,1000000 + + 10^50 + # [1] 1e+50 + options(scipen=50) + 10^50 + # [1] 100000000000000007629769841091887003294964970946560 + fwrite(data.table(A=10^50)) + # A + # 100000000000000000000000000000000000000000000000000 + ``` + +4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). + + ```R + > DT = data.table(A=1:3, B=list(1:2,"foo",3:5)) + > DT + A B + + 1: 1 1,2 + 2: 2 foo + 3: 3 3,4,5 + > + # The following all accomplish the same assignment: + > DT[2, B:=letters[9:13]] # was error, now works + > DT[2, B:=.(letters[9:13])] # was error, now works + > DT[2, B:=.(list(letters[9:13]))] # .(list()) was needed, still works + > DT + A B + + 1: 1 1,2 + 2: 2 i,j,k,l,m + 3: 3 3,4,5 + ``` + +5. `print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. + +6. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. 
+ + ```R + DT = setDT(lapply(1:100, function(i) sample(c(rnorm(9e6), rep(NA_real_, 1e6))))) + format(object.size(DT), units="GB") ## 7.5 Gb + zoo::na.locf(DT, na.rm=FALSE) ## zoo 53.518s + setDTthreads(1L) + nafill(DT, "locf") ## DT 1 thread 7.562s + setDTthreads(0L) + nafill(DT, "locf") ## DT 40 threads 0.605s + setnafill(DT, "locf") ## DT in-place 0.367s + ``` + +7. New variable `.Last.updated` (similar to R's `.Last.value`) contains the number of rows affected by the most recent `:=` or `set()`, [#1885](https://github.com/Rdatatable/data.table/issues/1885). For details see `?.Last.updated`. + +8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). `between()` gains `check=` which checks `any(lower>upper)`; off by default for speed in particular for type character. + +9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. + +10. `on=.NATURAL` (or alternatively `X[on=Y]` [#3621](https://github.com/Rdatatable/data.table/issues/3621)) joins two tables on their common column names, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. As before, when `on=` is not provided, `X` must have a key and the key columns are used to join (like rownames, but multi-column and multi-type). + +11. 
`as.data.table` gains `key` argument mirroring its use in `setDT` and `data.table`, [#890](https://github.com/Rdatatable/data.table/issues/890). As a byproduct, the arguments of `as.data.table.array` have changed order, which could affect code relying on positional arguments to this method. Thanks @cooldome for the suggestion and @MichaelChirico for implementation. + +12. `merge.data.table` is now exported, [#2618](https://github.com/Rdatatable/data.table/pull/2618). We realize that S3 methods should not ordinarily be exported. Rather, the method should be invoked via S3 dispatch. But users continue to request its export, perhaps because of intricacies relating to the fact that data.table inherits from data.frame, there are two arguments to `merge()` but S3 dispatch applies just to the first, and a desire to explicitly call `data.table::merge.data.table` from package code. Thanks to @AndreMikulec for the most recent request. + +13. New rolling function to calculate rolling sum has been implemented and exported, see `?frollsum`, [#2778](https://github.com/Rdatatable/data.table/issues/2778). + +14. `setkey` to an existing index now uses the index, [#2889](https://github.com/Rdatatable/data.table/issues/2889). Thanks to @MichaelChirico for suggesting and @saraswatmks for the PR. + +15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`. + +16. `as.data.table` now unpacks columns in a `data.frame` which are themselves a `data.frame` or `matrix`. This need arises when parsing JSON, a corollary in [#3369](https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752). 
Bug fix 19 in v1.12.2 (see below) added a helpful error (rather than segfault) to detect such invalid `data.table`, and promised that `as.data.table()` would unpack these columns in the next release (i.e. this release) so that the invalid `data.table` is not created in the first place. Further, `setDT` now warns if it observes such columns and suggests using `as.data.table` instead, [#3760](https://github.com/Rdatatable/data.table/issues/3760). + +17. `CJ` has been ported to C and parallelized, thanks to a PR by Michael Chirico, [#3596](https://github.com/Rdatatable/data.table/pull/3596). All types benefit, but, as in many `data.table` operations, factors benefit more than character. + + ```R + # default 4 threads on a laptop with 16GB RAM and 8 logical CPU + + ids = as.vector(outer(LETTERS, LETTERS, paste0)) + system.time( CJ(ids, 1:500000) ) # 3.9GB; 340m rows + # user system elapsed (seconds) + # 3.000 0.817 3.798 # was + # 1.800 0.832 2.190 # now + + # ids = as.factor(ids) + system.time( CJ(ids, 1:500000) ) # 2.6GB; 340m rows + # user system elapsed (seconds) + # 1.779 0.534 2.293 # was + # 0.357 0.763 0.292 # now + ``` + +18. New function `fcoalesce(...)` has been written in C, and is multithreaded for `numeric` and `factor`. It replaces missing values according to a prioritized list of candidates (as per SQL COALESCE, `dplyr::coalesce`, and `hutils::coalesce`), [#3424](https://github.com/Rdatatable/data.table/issues/3424). It accepts any number of vectors in several forms. For example, given three vectors `x`, `y`, and `z`, where each `NA` in `x` is to be replaced by the corresponding value in `y` if that is non-NA, else the corresponding value in `z`, the following equivalent forms are all accepted: `fcoalesce(x,y,z)`, `fcoalesce(x,list(y,z))`, and `fcoalesce(list(x,y,z))`. Being a new function, its behaviour is subject to change particularly for type `list`, [#3712](https://github.com/Rdatatable/data.table/issues/3712). 
+ + ```R + # default 4 threads on a laptop with 16GB RAM and 8 logical CPU + N = 100e6 + x = replicate(5, {x=sample(N); x[sample(N, N/2)]=NA; x}, simplify=FALSE) # 2GB + y1 = do.call(dplyr::coalesce, x) + y2 = do.call(hutils::coalesce, x) + y3 = do.call(data.table::fcoalesce, x) + # user system elapsed (seconds) + # 4.935 1.876 6.810 # dplyr::coalesce + # 3.122 0.831 3.956 # hutils::coalesce + # 0.915 0.099 0.379 # data.table::fcoalesce + identical(y1,y2) && identical(y1,y3) + # TRUE + ``` + +19. Type `complex` is now supported by `setkey`, `setorder`, `:=`, `by=`, `keyby=`, `shift`, `dcast`, `frank`, `rowid`, `rleid`, `CJ`, `fcoalesce`, `unique`, and `uniqueN`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). Thanks to Gareth Ward and Elio Campitelli for their reports and input. Sorting `complex` is achieved the same way as base R; i.e., first by the real part then by the imaginary part (as if the `complex` column were two separate columns of `double`). There is no plan to support joining/merging on `complex` columns until a user demonstrates a need for that. + +20. `setkey`, `[key]by=` and `on=` in verbose mode (`options(datatable.verbose=TRUE)`) now detect any columns inheriting from `Date` which are stored as 8 byte double, test if any fractions are present, and if not suggest using a 4 byte integer instead (such as `data.table::IDate`) to save space and time, [#1738](https://github.com/Rdatatable/data.table/issues/1738). In future this could be upgraded to `message` or `warning` depending on feedback. + +21. New function `fifelse(test, yes, no, na)` has been implemented in C by Morgan Jacob, [#3657](https://github.com/Rdatatable/data.table/issues/3657) and [#3753](https://github.com/Rdatatable/data.table/issues/3753). It is comparable to `base::ifelse`, `dplyr::if_else`, `hutils::if_else`, and (forthcoming) [`vctrs::if_else()`](https://vctrs.r-lib.org/articles/stability.html#ifelse).
It returns a vector of the same length as `test` but unlike `base::ifelse` the output type is consistent with those of `yes` and `no`. Please see `?data.table::fifelse` for more details. + + ```R + # default 4 threads on a laptop with 16GB RAM and 8 logical CPU + x = sample(c(TRUE,FALSE), 3e8, replace=TRUE) # 1GB + microbenchmark::microbenchmark( + base::ifelse(x, 7L, 11L), + dplyr::if_else(x, 7L, 11L), + hutils::if_else(x, 7L, 11L), + data.table::fifelse(x, 7L, 11L), + times = 5L, unit="s" + ) + # Unit: seconds + # expr min med max neval + # base::ifelse(x, 7L, 11L) 8.5 8.6 8.8 5 + # dplyr::if_else(x, 7L, 11L) 9.4 9.5 9.7 5 + # hutils::if_else(x, 7L, 11L) 2.6 2.6 2.7 5 + # data.table::fifelse(x, 7L, 11L) 1.5 1.5 1.6 5 # setDTthreads(1) + # data.table::fifelse(x, 7L, 11L) 0.8 0.8 0.9 5 # setDTthreads(2) + # data.table::fifelse(x, 7L, 11L) 0.4 0.4 0.5 5 # setDTthreads(4) + ``` + +22. `transpose` gains `keep.names=` and `make.names=` arguments, [#1886](https://github.com/Rdatatable/data.table/issues/1886). Previously, column names were dropped and there was no way to keep them. `keep.names="rn"` keeps the column names and puts them in the `"rn"` column of the result. Similarly, `make.names="rn"` uses column `"rn"` as the column names of the result. Both arguments are `NULL` by default for backwards compatibility. As these new arguments are new, they are subject to change in future according to community feedback. Thanks to @ghost for the request. + +23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). + +24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. + +25. `setnames(DT, old, new)` now omits any `old==new` to save redundant key and index name updates, [#3783](https://github.com/Rdatatable/data.table/issues/3783). 
`setnames(DT, new)` (i.e. not providing `old`) already omitted any column name updates where `names(DT)==new`; e.g. `setnames(DT, gsub('^_', '', names(DT)))` exits early if no columns start with `_`. + +26. `[[` by group is now optimized for regular vectors (not type list), [#3209](https://github.com/Rdatatable/data.table/issues/3209). Thanks @renkun-ken for the suggestion. `[` by group was already optimized. Please file a feature request if you would like this optimization for list columns. + +27. New function `frollapply` for rolling computation of arbitrary R functions (caveat: input `x` is coerced to numeric beforehand, and the function must return a scalar numeric value). The API is consistent to extant rolling functions `frollmean` and `frollsum`; note that it will generally be slower than those functions because (1) the known functions use our optimized internal C implementation and (2) there is no thread-safe API to R's C `eval`. Nevertheless `frollapply` is faster than corresponding `base`-only and `zoo` versions: + + ```R + set.seed(108) + x = rnorm(1e6); n = 1e3 + base_rollapply = function(x, n, FUN) { + nx = length(x) + ans = rep(NA_real_, nx) + for (i in n:nx) ans[i] = FUN(x[(i-n+1):i]) + ans + } + system.time(base_rollapply(x, n, mean)) + system.time(zoo::rollapplyr(x, n, function(x) mean(x), fill=NA)) + system.time(zoo::rollmeanr(x, n, fill=NA)) + system.time(frollapply(x, n, mean)) + system.time(frollmean(x, n)) + + ### fun mean sum median + # base_rollapply 8.815 5.151 60.175 + # zoo::rollapply 34.373 27.837 88.552 + # zoo::roll[fun] 0.215 0.185 NA ## median not fully supported + # frollapply 5.404 1.419 56.475 + # froll[fun] 0.003 0.002 NA ## median not yet supported + ``` + +28. `setnames()` now accepts functions in `old=` and `new=`, [#3703](https://github.com/Rdatatable/data.table/issues/3703). Thanks @smingerson for the feature request and @shrektan for the PR. 
+ + ```R + DT = data.table(a=1:3, b=4:6, c=7:9) + setnames(DT, toupper) + names(DT) + # [1] "A" "B" "C" + setnames(DT, c(1,3), tolower) + names(DT) + # [1] "a" "B" "c" + ``` + +29. `:=` and `set()` now use zero-copy type coercion. Accordingly, `DT[..., integerColumn:=0]` and `set(DT,i,j,0)` no longer warn about the `0` ('numeric') needing to be `0L` ('integer') because there is no longer any time or space used for this coercion. The old long warning was off-putting to new users ("what and why L?"), whereas advanced users appreciated the old warning so they could avoid the coercion. Although the time and space for one coercion in a single call is unmeasurably small, when placed in a loop the small overhead of any allocation on R's heap could start to become noticeable (more so for `set()` whose purpose is low-overhead looping). Further, when assigning a value across columns of varying types, it could be inconvenient to supply the correct type for every column. Hence, zero-copy coercion was introduced to satisfy all these requirements. A warning is still issued, as before, when fractional data is discarded; e.g. when 3.14 is assigned to an integer column. Zero-copy coercion applies to length>1 vectors as well as length-1 vectors. + +## BUG FIXES + +1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. + +2. `keyby=colName` could use the wrong index and return incorrect results if both `colName` and `colNameExtra` (where `colName` is a leading subset of characters of `colNameExtra`) are column names and an index exists on `colNameExtra`, [#3498](https://github.com/Rdatatable/data.table/issues/3498). Thanks to Xianying Tan for the detailed report and pinpointing the source line at fault. + +3. 
A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report. + +4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288). + +5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example. + +6. Printing could occur unexpectedly when code is run with `source`, [#2369](https://github.com/Rdatatable/data.table/issues/2369). Thanks to @jan-glx for the report and reproducible example. + +7. Grouping by `NULL` on zero rows `data.table` now behaves consistently to non-zero rows `data.table`, [#3530](https://github.com/Rdatatable/data.table/issues/3530). Thanks to @SymbolixAU for the report and reproducible example. + +8. GForce optimization of `median` did not retain the class; e.g. `median` of `Date` or `POSIXct` would return a raw number rather than retain the date class, [#3079](https://github.com/Rdatatable/data.table/issues/3079). Thanks to @Henrik-P for reporting. + +9. `DT[, format(mean(date),"%b-%Y"), by=group]` could fail with `invalid 'trim' argument`, [#1876](https://github.com/Rdatatable/data.table/issues/1876). Thanks to Ross Holmberg for reporting. + +10.
`externalVar=1:5; DT[, mean(externalVar), by=group]` could return incorrect results rather than a constant (`3` in this example) for each group, [#875](https://github.com/Rdatatable/data.table/issues/875). GForce optimization was being applied incorrectly to the `mean` without realizing `externalVar` was not a column. + +11. `test.data.table()` now passes in non-English R sessions, [#630](https://github.com/Rdatatable/data.table/issues/630) [#3039](https://github.com/Rdatatable/data.table/issues/3039). Each test still checks that the number of warnings and/or errors produced is correct. However, a message is displayed suggesting to restart R with `LANGUAGE=en` in order to test that the text of the warning and/or error messages are as expected, too. + +12. Joining a double column in `i` containing say 1.3, with an integer column in `x` containing say 1, would result in the 1.3 matching to 1, [#2592](https://github.com/Rdatatable/data.table/issues/2592), and joining a factor column to an integer column would match the factor's integers rather than error. The type coercion logic has been revised and strengthened. Many thanks to @MarkusBonsch for reporting and fixing. Joining a character column in `i` to a factor column in `x` is now faster and retains the character column in the result rather than coercing it to factor. Joining an integer column in `i` to a double column in `x` now retains the integer type in the result rather than coercing the integers into the double type. Logical columns may now only be joined to logical columns, other than all-NA columns which are coerced to the matching column's type. All coercions are reported in verbose mode: `options(datatable.verbose=TRUE)`. + +13. Attempting to recycle 2 or more items into an existing `list` column now gives the intended helpful error rather than `Internal error: recycle length error not caught earlier.`, [#3543](https://github.com/Rdatatable/data.table/issues/3543). 
Thanks to @MichaelChirico for finding and reporting. + +14. Subassigning using `$<-` to a `data.table` embedded in a list column of a single-row `data.table` could fail, [#3474](https://github.com/Rdatatable/data.table/issues/3474). Note that `$<-` is not recommended; please use `:=` instead which already worked in this case. Thanks to Jakob Richter for reporting. + +15. `rbind` and `rbindlist` of zero-row items now retain (again) the unused levels of any (zero-length) factor columns, [#3508](https://github.com/Rdatatable/data.table/issues/3508). This was a regression in v1.12.2 just for zero-row items. Unused factor levels were already retained for items having `nrow>=1`. Thanks to Gregory Demin for reporting. + +16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a regression in v1.12.2. Thanks to Damian Betebenner for reporting. Also a related segfault when recycling a length-1 factor column, [#3662](https://github.com/Rdatatable/data.table/issues/3662). + +17. `example(":=", local=TRUE)` now works rather than error, [#2972](https://github.com/Rdatatable/data.table/issues/2972). Thanks @vlulla for the report. + +18. `rbind.data.frame` on `IDate` columns changed the column from `integer` to `double`, [#2008](https://github.com/Rdatatable/data.table/issues/2008). Thanks to @rmcgehee for reporting. + +19. `merge.data.table` now retains any custom classes of the first argument, [#1378](https://github.com/Rdatatable/data.table/issues/1378). Thanks to @michaelquinn32 for reopening. + +20. `c`, `seq` and `mean` of `ITime` objects now retain the `ITime` class via new `ITime` methods, [#3628](https://github.com/Rdatatable/data.table/issues/3628). Thanks @UweBlock for reporting.
The `cut` and `split` methods for `ITime` have been removed since the default methods work, [#3630](https://github.com/Rdatatable/data.table/pull/3630). + +21. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). + +22. Adding a `list` column using `cbind`, `as.data.table`, or `data.table` now works rather than treating the `list` as if it were a set of columns and introducing an invalid NA column name, [#3471](https://github.com/Rdatatable/data.table/pull/3471). However, please note that using `:=` to add columns is preferred. + + ```R + cbind( data.table(1:2), list(c("a","b"),"a") ) + # V1 V2 NA # v1.12.2 and before + # + # 1: 1 a a + # 2: 2 b a + # + # V1 V2 # v1.12.4+ + # + # 1: 1 a,b + # 2: 2 a + ``` + +23. Incorrect sorting/grouping results due to a bug in Intel's `icc` compiler 2019 (Version 19.0.4.243 Build 20190416) has been worked around thanks to a report and fix by Sebastian Freundt, [#3647](https://github.com/Rdatatable/data.table/issues/3647). Please run `data.table::test.data.table()`. If that passes, your installation does not have the problem. + +24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. + +25. Slight fix to the logic for auto-naming the `by` clause for using a custom function like `evaluate` to now be named `evaluate` instead of the name of the first symbolic argument, [#3758](https://github.com/Rdatatable/data.table/issues/3758). + +26. Column binding of zero column `data.table` will now work as expected, [#3334](https://github.com/Rdatatable/data.table/issues/3334). Thanks to @kzenstratus for the report. + +27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). 
Thanks to @mlandry22-h2o for the report. + +28. From v1.12.0 `between()` and `%between%` interpret missing values in `lower=` or `upper=` as unlimited bounds. A new parameter `NAbounds` has been added to achieve the old behaviour of returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks @cguill95 for reporting. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported too. + +29. `integer64` defined on a subset of a new column would leave "gibberish" on the remaining rows, [#3723](https://github.com/Rdatatable/data.table/issues/3723). A bug in `rbindlist` with the same root cause was also fixed, [#1459](https://github.com/Rdatatable/data.table/issues/1459). Thanks @shrektan and @jangorecki for the reports. + +30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. + +31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue. + +32. `x[, round(.SD, 1)]` and similar operations on the whole of `.SD` could return a locked result, incorrectly preventing `:=` on the result, [#2245](https://github.com/Rdatatable/data.table/issues/2245). Thanks @grayskripko for raising. + +33. 
Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), and [#2036](https://github.com/Rdatatable/data.table/issues/2036). Thanks @franknarf1, @MichaelChirico, and @TonyBonen, for the reports. + +34. `DT[, i-1L, with=FALSE]` would misinterpret the minus sign and return an incorrect result, [#2019](https://github.com/Rdatatable/data.table/issues/2109). Thanks @cguill95 for the report. + +35. `DT[id==1, DT2[.SD, on="id"]]` (i.e. joining from `.SD` in `j`) could incorrectly fail in some cases due to `.SD` being locked, [#1926](https://github.com/Rdatatable/data.table/issues/1926), and when updating-on-join with factors [#3559](https://github.com/Rdatatable/data.table/issues/3559) [#2099](https://github.com/Rdatatable/data.table/issues/2099). Thanks @franknarf1 and @Henrik-P for the reports and for diligently tracking use cases for almost 3 years! + +36. `as.IDate.POSIXct` returned `NA` for UTC times before Dec 1901 and after Jan 2038, [#3780](https://github.com/Rdatatable/data.table/issues/3780). Thanks @gschett for the report. + +37. `rbindlist` now returns correct idcols for lists with different length vectors, [#3785](https://github.com/Rdatatable/data.table/issues/3785), [#3786](https://github.com/Rdatatable/data.table/pull/3786). Thanks to @shrektan for the report and fix. + +38. `DT[ , !rep(FALSE, ncol(DT)), with=FALSE]` correctly returns the full table, [#3013](https://github.com/Rdatatable/data.table/issues/3013) and [#2917](https://github.com/Rdatatable/data.table/issues/2917). Thanks @alexnss and @DavidArenburg for the reports. + +39. `shift(x, 0:1, type='lead', give.names=TRUE)` uses `lead` in all returned column names, [#3832](https://github.com/Rdatatable/data.table/issues/3832). Thanks @daynefiler for the report. + +40. 
Subtracting two `POSIXt` objects by group could lead to incorrect results because the `base` method internally calls `difftime` with `units='auto'`; `data.table` does not notice if the chosen units differ by group and only the last group's `units` attribute was retained, [#3694](https://github.com/Rdatatable/data.table/issues/3694) and [#761](https://github.com/Rdatatable/data.table/issues/761). To surmount this, we now internally force `units='secs'` on all `POSIXt-POSIXt` calls (reported when `verbose=TRUE`); generally we recommend calling `difftime` directly instead. Thanks @oliver-oliver and @boethian for the reports. + +41. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), [#2036](https://github.com/Rdatatable/data.table/issues/2036), and [#2946](https://github.com/Rdatatable/data.table/issues/2946). Thanks @franknarf1, @MichaelChirico, @TonyBonen, and Steffen J. (StackOverflow) for the reports. + +42. `DT[...,by={...}]` now handles expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. + +43. `:=` could change a `data.table` creation statement in the body of the function calling it, or a variable in calling scope, [#3890](https://github.com/Rdatatable/data.table/issues/3890). Many thanks to @kirillmayantsev for the detailed reports. + +44. Grouping could create a `malformed factor` and/or segfault when the factors returned by each group did not have identical levels, [#2199](https://github.com/Rdatatable/data.table/issues/2199) and [#2522](https://github.com/Rdatatable/data.table/issues/2522). Thanks to Václav Hausenblas, @franknarf1, @ben519, and @Henrik-P for reporting. + +45. 
`rbindlist` (and printing a `data.table` with over 100 rows because that uses `rbindlist(head, tail)`) could error with `malformed factor` for unordered factor columns containing a used `NA_character_` level, [#3915](https://github.com/Rdatatable/data.table/issues/3915). This is an unusual input for unordered factors because NA_integer_ is recommended by default in R. Thanks to @sindribaldur for reporting. + +46. Adding a `list` column containing an item of type `list` to a one row `data.table` could fail, [#3626](https://github.com/Rdatatable/data.table/issues/3626). Thanks to Jakob Richter for reporting. + +## NOTES + +1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. + +2. Adding a new column by reference using `set()` on a `data.table` loaded from binary file now gives a more helpful error message, [#2996](https://github.com/Rdatatable/data.table/issues/2996). Thanks to Joseph Burling for reporting. + + ``` + This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed + manually (e.g. using structure()). Please run setDT() or alloc.col() on it first (to pre-allocate + space for new columns) before adding new columns by reference to it. + ``` + +3. `setorder` on a superset of a keyed `data.table`'s key now retains its key, [#3456](https://github.com/Rdatatable/data.table/issues/3456). For example, if `a` is the key of `DT`, `setorder(DT, a, -v)` will leave `DT` keyed by `a`. + +4. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`.
We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. + +5. When loading a `data.table` from disk (e.g. with `readRDS`), best practice is to run `setDT()` on the new object to assure it is correctly allocated memory for new column pointers. Barring this, unexpected behavior can follow; for example, if you assign a new column to `DT` from a function `f`, the new columns will only be assigned within `f` and `DT` will be unchanged. The `verbose` messaging in this situation is now more helpful, [#1729](https://github.com/Rdatatable/data.table/issues/1729). Thanks @vspinu for sharing his experience to spur this. + +6. New vignette _Using `.SD` for Data Analysis_, a deep dive into use cases for the `.SD` variable to help illuminate this topic which we've found to be a sticking point for beginning and intermediate `data.table` users, [#3412](https://github.com/Rdatatable/data.table/issues/3412). + +7. Added a note to `?frank` clarifying that ranking is being done according to C sorting (i.e., like `forder`), [#2328](https://github.com/Rdatatable/data.table/issues/2328). Thanks to @cguill95 for the request. + +8. Historically, `dcast` and `melt` were built as enhancements to `reshape2`'s own `dcast`/`melt`. We removed dependency on `reshape2` in v1.9.6 but maintained some backward compatibility. As that package has been superseded since December 2017, we will begin to formally complete the split from `reshape2` by removing some last vestiges. In particular we now warn when redirecting to `reshape2` methods and will later error before ultimately completing the split; see [#3549](https://github.com/Rdatatable/data.table/issues/3549) and [#3633](https://github.com/Rdatatable/data.table/issues/3633). 
We thank the `reshape2` authors for their original inspiration for these functions, and @ProfFancyPants for testing and reporting regressions in dev which have been fixed before release. + +9. `DT[col]` where `col` is a column containing row numbers of itself to select, now suggests the correct syntax (`DT[(col)]` or `DT[DT$col]`), [#697](https://github.com/Rdatatable/data.table/issues/697). This expands the message introduced in [#1884](https://github.com/Rdatatable/data.table/issues/1884) for the case where `col` is type `logical` and `DT[col==TRUE]` is suggested. + +10. The `datatable.old.unique.by.key` option has been warning for 1 year that it is deprecated: `... Please stop using it and pass by=key(DT) instead for clarity ...`. This warning is now upgraded to error as per the schedule in note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). In June 2020 the option will be removed. + +11. We intend to deprecate the `datatable.nomatch` option, [more info](https://github.com/Rdatatable/data.table/pull/3578/files). A message is now printed upon use of the option (once per session) as a first step. It asks you to please stop using the option and to pass `nomatch=NULL` explicitly if you require inner join. Outer join (`nomatch=NA`) has always been the default because it is safer; it does not drop missing data silently. The problem is that the option is global; i.e., if a user changes the default using this option for their own use, that can change the behavior of joins inside packages that use `data.table` too. This is the only `data.table` option with this concern. + +12. The test suite of 9k tests now runs with three R options on: `warnPartialMatchArgs`, `warnPartialMatchAttr`, and `warnPartialMatchDollar`. This ensures that we don't rely on partial argument matching in internal code, for robustness and efficiency, and so that users can turn these options on for their code in production, [#3664](https://github.com/Rdatatable/data.table/issues/3664). 
Thanks to Vijay Lulla for the suggestion, and Michael Chirico for fixing 48 internal calls to `attr()` which were missing `exact=TRUE`, for example. Thanks to R-core for adding these options to R 2.6.0 (Oct 2007). + +13. `test.data.table()` could fail if the `datatable.integer64` user option was set, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for reporting. + +14. The warning message when using `keyby=` together with `:=` is clearer, [#2763](https://github.com/Rdatatable/data.table/issues/2763). Thanks to @eliocamp. + +15. `first` and `last` gain an explicit `n=1L` argument so that it's clear the default is 1, and their almost identical manual pages have been merged into one. + +16. Rolling functions (`?froll`) coerce `logical` input to `numeric` (instead of failing) to mimic the behavior of `integer` input. + +17. The warning message when using `strptime` in `j` has been improved, [#2068](https://github.com/Rdatatable/data.table/issues/2068). Thanks to @tdhock for the report. + +18. Added a note to `?setkey` clarifying that `setkey` always uses C-locale sorting (as has been noted in `?setorder`). Thanks @JBreidaks for the report in [#2114](https://github.com/Rdatatable/data.table/issues/2114). + +19. `hour()`/`minute()`/`second()` are much faster for `ITime` input, [#3518](https://github.com/Rdatatable/data.table/issues/3158). + +20. New alias `setalloccol` for `alloc.col`, [#3475](https://github.com/Rdatatable/data.table/issues/3475). For consistency with `set*` prefixes for functions that operate in-place (like `setkey`, `setorder`, etc.). `alloc.col` is not going to be deprecated but we recommend using `setalloccol`. + +21. `dcast` no longer emits a message when `value.var` is missing but `fun.aggregate` is explicitly set to `length` (since `value.var` is arbitrary in this case), [#2980](https://github.com/Rdatatable/data.table/issues/2980). + +22. 
Optimized `mean` of `integer` columns no longer warns about a coercion to numeric, [#986](https://github.com/Rdatatable/data.table/issues/986). Thanks @dgrtwo for his [YouTube tutorial at 3:01](https://youtu.be/AmE4LXPQErM?t=175) where the warning occurs. + +23. Using `first` and `last` function on `POSIXct` object no longer loads `xts` namespace, [#3857](https://github.com/Rdatatable/data.table/issues/3857). `first` on empty `data.table` returns empty `data.table` now [#3858](https://github.com/Rdatatable/data.table/issues/3858). + +24. Added some clarifying details about what happens when a shell command is used in `fread`, [#3877](https://github.com/Rdatatable/data.table/issues/3877). Thanks Brian for the StackOverflow question which highlighted the lack of explanation here. + +25. We continue to encourage packages to `Import` rather than `Depend` on `data.table`, [#3076](https://github.com/Rdatatable/data.table/issues/3076). To prevent the growth rate in new packages using `Depend`, we have requested that CRAN apply a small patch we provided to prevent new submissions using `Depend`. If this is accepted, the error under `--as-cran` will be as follows. The existing 73 packages using `Depend` will continue to pass OK until they next update, at which point they will be required to change from `Depend` to `Import`. + + ``` + R CMD check --as-cran + ... + * checking package dependencies ... ERROR + + data.table should be in Imports not Depends. Please contact its + maintainer for more information. + ``` + + +# data.table [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) + +## NEW FEATURES + +1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. 
In rare cases where you need to recycle a length>1 vector, please use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 774 CRAN and Bioconductor packages using `data.table` were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead, [#3310](https://github.com/Rdatatable/data.table/pull/3310). Upon agreement we went ahead. Many thanks to all those maintainers for already updating on CRAN, [#3347](https://github.com/Rdatatable/data.table/pull/3347). + +2. `foverlaps` now supports `type="equal"`, [#3416](https://github.com/Rdatatable/data.table/issues/3416) and part of [#3002](https://github.com/Rdatatable/data.table/issues/3002). + +3. The number of logical CPUs used by default has been reduced from 100% to 50%. The previous 100% default was reported to cause significant slow downs when other non-trivial processes were also running, [#3395](https://github.com/Rdatatable/data.table/issues/3395) [#3298](https://github.com/Rdatatable/data.table/issues/3298). Two new optional environment variables (`R_DATATABLE_NUM_PROCS_PERCENT` & `R_DATATABLE_NUM_THREADS`) control this default. `setDTthreads()` gains `percent=` and `?setDTthreads` has been significantly revised. The output of `getDTthreads(verbose=TRUE)` has been expanded. The environment variable `OMP_THREAD_LIMIT` is now respected ([#3300](https://github.com/Rdatatable/data.table/issues/3300)) in addition to `OMP_NUM_THREADS` as before. + +4. `rbind` and `rbindlist` now retain the position of duplicate column names rather than grouping them together [#3373](https://github.com/Rdatatable/data.table/issues/3373), fill length 0 columns (including NULL) with NA with warning [#1871](https://github.com/Rdatatable/data.table/issues/1871), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524). 
Thanks to Kun Ren for the requests which arose when parsing JSON. + +5. `rbindlist`'s `use.names=` default has changed from `FALSE` to `"check"`. This emits a message if the column names of each item are not identical and then proceeds as if `use.names=FALSE` for backwards compatibility; i.e., bind by column position not by column name. The `rbind` method for `data.table` already sets `use.names=TRUE` so this change affects `rbindlist` only and not `rbind.data.table`. To stack differently named columns together silently (the previous default behavior of `rbindlist`), it is now necessary to specify `use.names=FALSE` for clarity to readers of your code. Thanks to Clayton Stanley who first raised the issue [here](https://lists.r-forge.r-project.org/pipermail/datatable-help/2014-April/002480.html). To aid pinpointing the calls to `rbindlist` that need attention, the message can be turned to error using `options(datatable.rbindlist.check="error")`. This option also accepts `"warning"`, `"message"` and `"none"`. In this release the message is suppressed for default column names (`"V[0-9]+"`); the next release will emit the message for those too. In 6 months the default will be upgraded from message to warning. There are two slightly different messages. They are helpful, include context and point to this news item : + + ``` + Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with + NA (NULL for list columns), or use.names=FALSE to ignore column names. + See news item 5 in v1.12.2 for options to control this message. + + Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE + to match by column name, or use.names=FALSE to ignore column names. + See news item 5 in v1.12.2 for options to control this message. + ``` + +6. `fread` gains `keepLeadingZeros`, [#2999](https://github.com/Rdatatable/data.table/issues/2999). 
By default `FALSE` so that, as before, a field containing `001` is interpreted as the integer 1, otherwise the character string `"001"`. The default may be changed using `options(datatable.keepLeadingZeros=TRUE)`. Many thanks to @marc-outins for the PR. + +## BUG FIXES + +1. `rbindlist()` of a malformed factor which is missing a levels attribute is now a helpful error rather than a cryptic error about `STRING_ELT`, [#3315](https://github.com/Rdatatable/data.table/issues/3315). Thanks to Michael Chirico for reporting. + +2. Forgetting `type=` in `shift(val, "lead")` would segfault, [#3354](https://github.com/Rdatatable/data.table/issues/3354). A helpful error is now produced to indicate `"lead"` is being passed to `n=` rather than the intended `type=` argument. Thanks to @SymbolixAU for reporting. + +3. The default print output (top 5 and bottom 5 rows) when ncol>255 could display the columns in the wrong order, [#3306](https://github.com/Rdatatable/data.table/issues/3306). Thanks to Kun Ren for reporting. + +4. Grouping by unusual column names such as `by='string_with_\\'` and `keyby="x y"` could fail, [#3319](https://github.com/Rdatatable/data.table/issues/3319) [#3378](https://github.com/Rdatatable/data.table/issues/3378). Thanks to @HughParsonage for reporting and @MichaelChirico for the fixes. + +5. `foverlaps()` could return incorrect results for `POSIXct <= 1970-01-01`, [#3349](https://github.com/Rdatatable/data.table/issues/3349). Thanks to @lux5 for reporting. + +6. `dcast.data.table` now handles functions passed to `fun.aggregate=` via a variable; e.g., `funs <- list(sum, mean); dcast(..., fun.aggregate=funs)`, [#1974](https://github.com/Rdatatable/data.table/issues/1974) [#1369](https://github.com/Rdatatable/data.table/issues/1369) [#2064](https://github.com/Rdatatable/data.table/issues/2064) [#2949](https://github.com/Rdatatable/data.table/issues/2949). Thanks to @sunbee, @Ping2016, @smidelius and @d0rg0ld for reporting. + +7. 
Some non-equijoin cases could segfault, [#3401](https://github.com/Rdatatable/data.table/issues/3401). Thanks to @Gayyam for reporting. + +8. `dcast.data.table` could sort rows containing `NA` incorrectly, [#2202](https://github.com/Rdatatable/data.table/issues/2202). Thanks to @Galileo-Galilei for the report. + +9. Sorting, grouping and finding unique values of a numeric column containing at most one finite value (such as `c(Inf,0,-Inf)`) could return incorrect results, [#3372](https://github.com/Rdatatable/data.table/issues/3372) [#3381](https://github.com/Rdatatable/data.table/issues/3381); e.g., `data.table(A=c(Inf,0,-Inf), V=1:3)[,sum(V),by=A]` would treat the 3 rows as one group. This was a regression in 1.12.0. Thanks to Nicolas Ampuero for reporting. + +10. `:=` with quoted expression and dot alias now works as expected, [#3425](https://github.com/Rdatatable/data.table/pull/3425). Thanks to @franknarf1 for raising and @jangorecki for the PR. + +11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause. + +12. `rbind` and `rbindlist(..., use.names=TRUE)` with over 255 columns could return the columns in a random order, [#3373](https://github.com/Rdatatable/data.table/issues/3373). The contents and name of each column were correct but the order in which the columns appeared in the result might not have matched the original input. + +13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), and support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819). + +14. 
`NULL` columns are caught and error appropriately rather than segfault in some cases, [#2303](https://github.com/Rdatatable/data.table/issues/2303) [#2305](https://github.com/Rdatatable/data.table/issues/2305). Thanks to Hugh Parsonage and @franknarf1 for reporting. + +15. `melt` would error with 'factor malformed' or segfault in the presence of duplicate column names, [#1754](https://github.com/Rdatatable/data.table/issues/1754). Many thanks to @franknarf1, William Marble, wligtenberg and Toby Dylan Hocking for reproducible examples. All examples have been added to the test suite. + +16. Removing a column from a null (0-column) data.table is now a (standard and simpler) warning rather than error, [#2335](https://github.com/Rdatatable/data.table/issues/2335). It is no longer an error to add a column to a null (0-column) data.table. + +17. Non-UTF8 strings were not always sorted correctly on Windows (a regression in v1.12.0), [#3397](https://github.com/Rdatatable/data.table/issues/3397) [#3451](https://github.com/Rdatatable/data.table/issues/3451). Many thanks to @shrektan for reporting and fixing. + +18. `cbind` with a null (0-column) `data.table` now works as expected, [#3445](https://github.com/Rdatatable/data.table/issues/3445). Thanks to @mb706 for reporting. + +19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. + +## NOTES + +1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. 
We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. + +2. v1.12.0 checked itself on loading using `tools::checkMD5sums("data.table")` but this check failed under the `packrat` package manager on Windows because `packrat` appears to modify the DESCRIPTION file of packages it has snapshot, [#3329](https://github.com/Rdatatable/data.table/issues/3329). This check is now removed. The `CdllVersion` check was introduced after the `checkMD5sums()` attempt and is better; e.g., reliable on all platforms. + +3. As promised in new feature 6 of v1.11.6 Sep 2018 (see below in this news file), the `datatable.CJ.names` option's default is now `TRUE`. In v1.13.0 it will be removed. + +4. Travis CI gains OSX using homebrew llvm for OpenMP support, [#3326](https://github.com/Rdatatable/data.table/issues/3326). Thanks @marcusklik for the PR. + +5. Calling `data.table:::print.data.table()` directly (i.e. bypassing method dispatch by using 3 colons) and passing it a 0-column `data.frame` (not `data.table`) now works, [#3363](https://github.com/Rdatatable/data.table/pull/3363). Thanks @heavywatal for the PR. + +6. v1.12.0 did not compile on Solaris 10 using Oracle Developer Studio 12.6, [#3285](https://github.com/Rdatatable/data.table/issues/3285). Many thanks to Prof Ripley for providing and testing a patch. For future reference and other package developers, a `const` variable should not be passed to OpenMP's `num_threads()` directive otherwise `left operand must be modifiable lvalue` occurs. This appears to be a compiler bug which is why the specific versions are mentioned in this note. + +7. `foverlaps` provides clearer error messages w.r.t. 
factor and POSIXct interval columns, [#2645](https://github.com/Rdatatable/data.table/issues/2645) [#3007](https://github.com/Rdatatable/data.table/issues/3007) [#1143](https://github.com/Rdatatable/data.table/issues/1143). Thanks to @sritchie73, @msummersgill and @DavidArenburg for the reports. + +8. `unique(DT)` checks up-front the types of all the columns and will fail if any column is type `list` even though those `list` columns may not be needed to establish uniqueness. Use `unique(DT, by=...)` to specify columns that are not type `list`. v1.11.8 and before would also correctly fail with the same error, but not when uniqueness had been established in prior columns: it would stop early, not look at the `list` column and return the correct result. Checking up-front was necessary for some internal optimizations and it's probably best to be explicit anyway. Thanks to James Lamb for reporting, [#3332](https://github.com/Rdatatable/data.table/issues/3332). The error message has been embellished : + + ``` + Column 2 of by= (2) is type 'list', not yet supported. Please use the by= argument to specify + columns with types that are supported. + ``` + +9. Reminder that note 11 in v1.11.0 (May 2018) warned that `set2key()` and `key2()` will be removed in May 2019. They have been warning since v1.9.8 (Nov 2016) and their warnings were upgraded to errors in v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental'. + +10. The `key(DT)<-` form of `setkey()` has been warning since at least 2012 to use `setkey()`. The warning is now stronger: `key(x)<-value is deprecated and not supported. Please change to use setkey().`. This warning will be upgraded to error in one year. + + +# data.table v1.12.0 (13 Jan 2019) + +## NEW FEATURES + +1. `setDTthreads()` gains `restore_after_fork=`, [#2885](https://github.com/Rdatatable/data.table/issues/2885). The default `NULL` leaves the internal option unchanged which by default is `TRUE`. 
`data.table` has always switched to single-threaded mode on fork. It used to restore multithreading after a fork too but problems were reported on Mac and Intel OpenMP library (see 1.10.4 notes below). We are now trying again thanks to suggestions and success reported by Kun Ren and Mark Klik in package `fst`. If you experience problems with multithreading after a fork, please restart R and call `setDTthreads(restore_after_fork=FALSE)`. + +2. Subsetting, ordering and grouping now use more parallelism. See benchmarks [here](https://h2oai.github.io/db-benchmark/) and Matt Dowle's presentation in October 2018 on YouTube [here](https://youtu.be/Ddr8N9STSuI). These internal changes gave rise to 4 regressions which were found before release thanks to Kun Ren, [#3211](https://github.com/Rdatatable/data.table/issues/3211). He kindly volunteers to 'go-first' and runs data.table through his production systems before release. We are looking for a 'go-second' volunteer please. A request to test before release was tweeted on 17 Dec [here](https://twitter.com/MattDowle/status/1074746218645938176). As usual, all CRAN and Bioconductor packages using data.table (currently 750) have been tested against this release, [#3233](https://github.com/Rdatatable/data.table/issues/3233). There are now 8,000 tests in 13,000 lines of test code; more lines of test code than there is code. Overall coverage has increased to 94% thanks to Michael Chirico. + +3. New `frollmean` has been added by Jan Gorecki to calculate _rolling mean_, see `?froll` for documentation. Function name and arguments are experimental. Related to [#2778](https://github.com/Rdatatable/data.table/issues/2778) (and [#624](https://github.com/Rdatatable/data.table/issues/624), [#626](https://github.com/Rdatatable/data.table/issues/626), [#1855](https://github.com/Rdatatable/data.table/issues/1855)). Other rolling statistics will follow. + +4. 
`fread()` can now read a remote compressed file in one step; `fread("https://domain.org/file.csv.bz2")`. The `file=` argument now supports `.gz` and `.bz2` too; i.e. `fread(file="file.csv.gz")` works now where only `fread("file.csv.gz")` worked in 1.11.8. + +5. `nomatch=NULL` now does the same as `nomatch=0L` in both `DT[...]` and `foverlaps()`; i.e. discards missing values silently (inner join). The default is still `nomatch=NA` (outer join) for statistical safety so that missing values are retained by default. After several years have elapsed, we will start to deprecate `0L`; please start using `NULL`. In future `nomatch=.(0)` (note that `.()` creates a `list` type and is different to `nomatch=0`) will fill with `0` to save replacing `NA` with `0` afterwards, [#857](https://github.com/Rdatatable/data.table/issues/857). + +6. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR. + +7. `NA` in `between()` and `%between%`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than `NA`. This is now documented. + +8. `shift()` now interprets negative values of `n` to mean the opposite `type=`, [#1708](https://github.com/Rdatatable/data.table/issues/1708). When `give.names=TRUE` the result is named using a positive `n` with the appropriate `type=`. Alternatively, a new `type="shift"` names the result using a signed `n` and constant type. + + ```R + shift(x, n=-5:5, give.names=TRUE) => "_lead_5" ... "_lag_5" + shift(x, n=-5:5, type="shift", give.names=TRUE) => "_shift_-5" ... "_shift_5" + ``` + +9. `fwrite()` now accepts `matrix`, [#2613](https://github.com/Rdatatable/data.table/issues/2613). Thanks to Michael Chirico for the suggestion and Felipe Parages for implementing. 
For now matrix input is converted to data.table (which can be costly) before writing. + +10. `fread()` and `fwrite()` can now handle file names in native and UTF-8 encoding, [#3078](https://github.com/Rdatatable/data.table/issues/3078). Thanks to Daniel Possenriede (@dpprdan) for reporting and fixing. + +11. `DT[i]` and `DT[i,cols]` now call internal parallel subsetting code, [#2951](https://github.com/Rdatatable/data.table/issues/2951). Subsetting is significantly faster (as are many other operations) with factor columns rather than character. + + ```R + N = 2e8 # 4GB data on 4-core CPU with 16GB RAM + DT = data.table(ID = sample(LETTERS,N,TRUE), + V1 = sample(5,N,TRUE), + V2 = runif(N)) + w = which(DT$V1 > 3) # select 40% of rows + # v1.12.0 v1.11.8 + system.time(DT[w]) # 0.8s 2.6s + DT[, ID := as.factor(ID)] + system.time(DT[w]) # 0.4s 2.3s + system.time(DT[w, c("ID","V2")]) # 0.3s 1.9s + ``` + +12. `DT[..., .SDcols=]` now accepts `patterns()`; e.g. `DT[..., .SDcols=patterns("^V")]`, for filtering columns according to a pattern (as in `melt.data.table`), [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples. + +13. `split` data.table method will now preserve attributes, closes [#2047](https://github.com/Rdatatable/data.table/issues/2047). Thanks to @caneff for reporting. + +14. `DT[i,j]` now retains user-defined and inherited attributes, [#995](https://github.com/Rdatatable/data.table/issues/995); e.g. + + ```R + attr(datasets::BOD,"reference") # "A1.4, p. 270" + attr(as.data.table(datasets::BOD)[2],"reference") # was NULL now "A1.4, p. 270" + ``` + + If a superclass defines attributes that may not be valid after a `[` subset then the superclass should implement its own `[` method to manage those after calling `NextMethod()`. + +## BUG FIXES + +1. 
Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. + +2. Column names that look like expressions (e.g. `"a<=colB"`) caused an error when used in `on=` even when wrapped with backticks, [#3092](https://github.com/Rdatatable/data.table/issues/3092). Additionally, `on=` now supports white spaces around operators; e.g. `on = "colA == colB"`. Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. + +3. Unmatched `patterns` in `measure.vars` fail early and with feedback, [#3106](https://github.com/Rdatatable/data.table/issues/3106). + +4. `fread(..., skip=)` now skips non-standard `\r` and `\n\r` line endings properly again, [#3006](https://github.com/Rdatatable/data.table/issues/3006). Standard line endings (`\n` Linux/Mac and `\r\n` Windows) were skipped ok. Thanks to @brattono and @tbrycekelly for providing reproducible examples, and @st-pasha for fixing. + +5. `fread(..., colClasses=)` could return a corrupted result when a lower type was requested for one or more columns (e.g. reading "3.14" as integer), [#2922](https://github.com/Rdatatable/data.table/issues/2922) [#2863](https://github.com/Rdatatable/data.table/issues/2863) [#3143](https://github.com/Rdatatable/data.table/issues/3143). It now ignores the request as documented and the helpful message in verbose mode is upgraded to warning. In future, coercing to a lower type might be supported (with warning if any accuracy is lost). `"NULL"` is recognized again in both vector and list mode; e.g. `colClasses=c("integer","NULL","integer")` and `colClasses=list(NULL=2, integer=10:40)`. Thanks to Arun Srinivasan, Kun Ren, Henri Ståhl and @kszela24 for reporting. + +6. 
`cube()` will now produce expected order of results, [#3179](https://github.com/Rdatatable/data.table/issues/3179). Thanks to @Henrik-P for reporting. + +7. `groupingsets()` groups by empty column set and constant value in `j`, [#3173](https://github.com/Rdatatable/data.table/issues/3173). + +8. `split.data.table()` failed if `DT` had a factor column named `"x"`, [#3151](https://github.com/Rdatatable/data.table/issues/3151). Thanks to @tdeenes for reporting and fixing. + +9. `fsetequal` now handles properly datasets having last column a character, closes [#2318](https://github.com/Rdatatable/data.table/issues/2318). Thanks to @pschil and @franknarf1 for reporting. + +10. `DT[..., .SDcols=integer(0L)]` could fail, [#3185](https://github.com/Rdatatable/data.table/issues/3185). An empty `data.table` is now returned correctly. + +11. `as.data.table.default` method will now always copy its input, closes [#3230](https://github.com/Rdatatable/data.table/issues/3230). Thanks to @NikdAK for reporting. + +12. `DT[..., .SDcols=integer()]` failed with `.SDcols is numeric but has both +ve and -ve indices`, [#1789](https://github.com/Rdatatable/data.table/issues/1789) and [#3185](https://github.com/Rdatatable/data.table/issues/3185). It now functions as `.SDcols=character()` has done and creates an empty `.SD`. Thanks to Gabor Grothendieck and Hugh Parsonage for reporting. A related issue with empty `.SDcols` was fixed in development before release thanks to Kun Ren's testing, [#3211](https://github.com/Rdatatable/data.table/issues/3211). + +13. Multithreaded stability should be much improved with R 3.5+. Many thanks to Luke Tierney for pinpointing a memory issue with package `constellation` caused by `data.table` and his advice, [#3165](https://github.com/Rdatatable/data.table/issues/3165). Luke also added an extra check to R-devel when compiled with `--enable-strict-barrier`. 
The test suite is run through latest daily R-devel after every commit as usual, but now with `--enable-strict-barrier` on too via GitLab CI ("Extra" badge on the `data.table` homepage) thanks to Jan Gorecki. + +14. Fixed an edge-case bug of platform-dependent output of `strtoi("", base = 2L)` on which `groupingsets` had relied, [#3267](https://github.com/Rdatatable/data.table/issues/3267). + +## NOTES + +1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary version which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. + +2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. + +3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR. + +4. `update.dev.pkg()` gains `type=` to specify if update should be made from binaries, sources or both. [#3148](https://github.com/Rdatatable/data.table/issues/3148). 
Thanks to Reino Bruner for the detailed suggestions. + +5. `setDT()` improves feedback when passed a ragged list (i.e. where all columns in the list are not the same length), [#3121](https://github.com/Rdatatable/data.table/issues/3121). Thanks @chuk-yong for highlighting. + +6. The one and only usage of `UNPROTECT_PTR()` has been removed, [#3232](https://github.com/Rdatatable/data.table/issues/3232). Thanks to Tomas Kalibera's investigation and advice here: https://developer.r-project.org/Blog/public/2018/12/10/unprotecting-by-value/index.html + + +# data.table v1.11.8 (30 Sep 2018) + +## NEW FEATURES + +1. `fread()` can now read `.gz` and `.bz2` files directly: `fread("file.csv.gz")`, [#717](https://github.com/Rdatatable/data.table/issues/717) [#3058](https://github.com/Rdatatable/data.table/issues/3058). It uses `R.utils::decompressFile` to decompress to a `tempfile()` which is then read by `fread()` in the usual way. For greater speed on large-RAM servers, it is recommended to use ramdisk for temporary files by setting `TMPDIR` to `/dev/shm` before starting R; see `?tempdir`. The decompressed temporary file is removed as soon as `fread` completes even if there is an error reading the file. Reading a remote compressed file in one step will be supported in the next version; e.g. `fread("https://domain.org/file.csv.bz2")`. + +## BUG FIXES + +1. Joining two keyed tables using `on=` to columns not forming a leading subset of `key(i)` could result in an invalidly keyed result, [#3061](https://github.com/Rdatatable/data.table/issues/3061). Subsequent queries on the result could then return incorrect results. A warning `longer object length is not a multiple of shorter object length` could also occur. Thanks to @renkun-ken for reporting and the PR. + +2. 
`keyby=` on columns for which an index exists now uses the index (new feature 7 in v1.11.6 below) but if an `i` subset is present in the same query then it could segfault, [#3062](https://github.com/Rdatatable/data.table/issues/3062). Again thanks to @renkun-ken for reporting. + +3. Assigning an out-of-range integer to an item in a factor column (a rare operation) correctly created an `NA` in that spot with warning, but now no longer also corrupts the variable being assigned, [#2984](https://github.com/Rdatatable/data.table/issues/2984). Thanks to @radfordneal for reporting and @MarkusBonsch for fixing. Assigning a string which is missing from the factor levels continues to automatically append the string to the factor levels. + +4. Assigning a sequence to a column using base R methods (e.g. `DT[["foo"]] = 1:10`) could cause subsetting to fail with `Internal error in subset.c: column is an ALTREP vector`, [#3051](https://github.com/Rdatatable/data.table/issues/3051). Thanks to Michel Lang for reporting. + +5. `as.data.table` `matrix` method now properly handles rownames for 0 column data.table output. Thanks @mllg for reporting. Closes [#3149](https://github.com/Rdatatable/data.table/issues/3149). + +## NOTES + +1. The test suite now turns on R's new _R_CHECK_LENGTH_1_LOGIC2_ to catch when internal use of `&&` or `||` encounter arguments of length more than one. Thanks to Hugh Parsonage for implementing and fixing the problems caught by this. + +2. Some namespace changes have been made with respect to melt, dcast and xts. No change is expected but if you do have any trouble, please file an issue. + +3. `split.data.table` was exported in v1.11.6 in addition to being registered using `S3method(split, data.table)`. The export has been removed again. It had been added because a user said they found it difficult to find, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But S3 methods are not normally exported explicitly by packages. 
The proper way to access the `split.data.table` method is to call `split(DT)` where `DT` is a `data.table`. The generic (`base::split` in this case) then dispatches to the `split.data.table` method. v1.11.6 was not on CRAN very long (1 week) so we think it's better to revert this change quickly. To know what methods exist, R provides the `methods()` function. + + ```R + methods(split) # all the methods for the split generic + methods(class="data.table") # all the generics that data.table has a method for (47 currently) + ``` + + +# data.table v1.11.6 (19 Sep 2018) + +## NEW FEATURES + +1. For convenience when some of the files in `fnams` are empty in `rbindlist(lapply(fnams,fread))`, `fread` now reads empty input as a null-data.table with warning rather than error, [#2898](https://github.com/Rdatatable/data.table/issues/2898). For consistency, `fwrite(data.table(NULL))` now creates an empty file and warns instead of error, too. + +2. `setcolorder(DT)` without further arguments now defaults to moving the key columns to be first, [#2895](https://github.com/Rdatatable/data.table/issues/2895). Thanks to @jsams for the PR. + +3. Attempting to subset on `col` when the column is actually called `Col` will still error, but the error message will helpfully suggest similarly-spelled columns, [#2887](https://github.com/Rdatatable/data.table/issues/2887). This is experimental, applies just to `i` currently, and we look forward to feedback. Thanks to Michael Chirico for the suggestion and PR. + +4. `fread()` has always accepted literal data; e.g. `fread("A,B\n1,2\n3,4")`. It now gains explicit `text=`; e.g. `fread(text="A,B\n1,2\n3,4")`. Unlike the first general purpose `input=` argument, the `text=` argument accepts multi-line input; e.g. `fread(text=c("A,B","1,2","3,4"))`, [#1423](https://github.com/Rdatatable/data.table/issues/1423). Thanks to Douglas Clark for the request and Hugh Parsonage for the PR. + +5. `fread()` has always accepted system commands; e.g. 
`fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app is not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvalds' advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://www.datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. 
Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. + +6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. Current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x` and is more akin to a bug fix due to the inconsistency with `data.table()`. + +7. If an appropriate index exists, `keyby=` will now use it. For example, given `setindex(DT,colA,colB)`, both `DT[,j,keyby=colA]` (a leading subset of the index columns) and `DT[,j,keyby=.(colA,colB)]` will use the index, but not `DT[,j,keyby=.(colB,colA)]`. The option `options(datatable.use.index=FALSE)` will turn this feature off. Please always use `keyby=` unless you wish to retain the order of groups by first-appearance order (in which case use `by=`). Also, both `keyby=` and `by=` already used the key where possible but are now faster when using just the first column of the key. As usual, setting `verbose=TRUE` either per-query or globally using `options(datatable.verbose=TRUE)` will report what's being done internally. + +## BUG FIXES + +1. `fread` now respects the order of columns passed to `select=` when column numbers are used, [#2986](https://github.com/Rdatatable/data.table/issues/2986). It already respected the order when column names are used. 
Thanks @privefl for raising the issue. + +2. `gmin` and `gmax` no longer fail on _ordered_ factors, [#1947](https://github.com/Rdatatable/data.table/issues/1947). Thanks to @mcieslik-mctp for identifying and @mbacou for the nudge. + +3. `as.ITime.character` now properly handles NA when attempting to detect the format of non-NA values in vector. Thanks @polyjian for reporting, closes [#2940](https://github.com/Rdatatable/data.table/issues/2940). + +4. `as.matrix(DT, rownames="id")` now works when `DT` has a single row, [#2930](https://github.com/Rdatatable/data.table/issues/2930). Thanks to @malcook for reporting and @sritchie73 for fixing. The root cause was the dual meaning of the `rownames=` argument: i) a single column name/number (most common), or ii) rowname values length 1 for the single row. For clarity and safety, `rownames.value=` has been added. Old usage (i.e. `length(rownames)>1`) continues to work for now but will issue a warning in a future release, and then error in a release after that. + +5. Fixed regression in v1.11.0 (May 2018) caused by PR [#2389](https://github.com/Rdatatable/data.table/pull/2389) which introduced partial key retainment on `:=` assigns. This broke the joining logic that assumed implicitly that assigning always drops keys completely. Consequently, join and subset results could be wrong when matching character to factor columns with existing keys, [#2881](https://github.com/Rdatatable/data.table/issues/2881). Thanks to @ddong63 for reporting and to @MarkusBonsch for fixing. Missing test added to ensure this doesn't arise again. + +6. `as.IDate.numeric` no longer ignores "origin", [#2880](https://github.com/Rdatatable/data.table/issues/2880). Thanks to David Arenburg for reporting and fixing. + +7. `as.ITime.times` was rounding fractional seconds while other methods were truncating, [#2870](https://github.com/Rdatatable/data.table/issues/2870). 
The `as.ITime` method gains `ms=` taking `"truncate"` (default), `"nearest"` and `"ceil"`. Thanks to @rossholmberg for reporting and Michael Chirico for fixing. + +8. `fwrite()` now writes POSIXct dates after 2038 correctly, [#2995](https://github.com/Rdatatable/data.table/issues/2995). Thanks to Manfred Zorn for reporting and Philippe Chataignon for the PR fixing it. + +9. `fsetequal` gains the `all` argument to make it consistent with the other set operator functions `funion`, `fsetdiff` and `fintersect` [#2968](https://github.com/Rdatatable/data.table/issues/2968). When `all = FALSE` `fsetequal` will treat rows as elements in a set when checking whether two `data.tables` are equal (i.e. duplicate rows will be ignored). For now the default value is `all = TRUE` for backwards compatibility, but this will be changed to `all = FALSE` in a future release to make it consistent with the other set operation functions. Thanks to @franknarf1 for reporting and @sritchie73 for fixing. + +10. `fintersect` failed on tables with a column called `y`, [#3034](https://github.com/Rdatatable/data.table/issues/3034). Thanks to Maxim Nazarov for reporting. + +11. Compilation failed on AIX because the definitions of the NAN and INFINITY macros on AIX make them not constant literals, [#3043](https://github.com/Rdatatable/data.table/pull/3043). Thanks to Ayappan for reporting and fixing. + +12. The introduction of altrep in R 3.5.0 caused some performance regressions of about 20% in some cases, [#2962](https://github.com/Rdatatable/data.table/issues/2962). Investigating this led to some improvements to grouping which are faster than before R 3.5.0 in some cases. Thanks to Nikolay S. for reporting. The work to accommodate altrep is not complete but it is better and it is highly recommended to upgrade to this update. + +13. Fixed 7 memory faults thanks to CRAN's [`rchk`](https://github.com/kalibera/rchk) tool by Tomas Kalibera, [#3033](https://github.com/Rdatatable/data.table/pull/3033). 
+ +## NOTES + +1. The type coercion warning message has been improved, [#2989](https://github.com/Rdatatable/data.table/pull/2989). Thanks to @sarahbeeysian on Twitter for highlighting. For example, given the following statements: + + ```R + DT = data.table(id=1:3) + DT[2, id:="foo"] + ``` + + the warning message has changed from : + + ``` + Coerced character RHS to integer to match the column's type. Either change the target column + ['id'] to character first (by creating a new character vector length 3 (nrows of entire table) and + assign that; i.e. 'replace' column), or coerce RHS to integer (e.g. 1L, NA_[real|integer]_, as.*, + etc) to make your intent clear and for speed. Or, set the column type correctly up front when you + create the table and stick to it, please. + ``` + + to : + + ``` + Coerced character RHS to integer to match the type of the target column (column 1 named 'id'). If + the target column's type integer is correct, it's best for efficiency to avoid the coercion and + create the RHS as type integer. To achieve that consider the L postfix: typeof(0L) vs typeof(0), + and typeof(NA) vs typeof(NA_integer_) vs typeof(NA_real_). Wrapping the RHS with as.integer() will + avoid this warning but still perform the coercion. If the target column's type is not correct, it + is best to revisit where the DT was created and fix the column type there; e.g., by using + colClasses= in fread(). Otherwise, you can change the column type now by plonking a new column (of + the desired type) over the top of it; e.g. DT[, `id`:=as.character(`id`)]. If the RHS of := has + nrow(DT) elements then the assignment is called a column plonk and is the way to change a column's + type. Column types can be observed with sapply(DT,typeof). + ``` + + Further, if a coercion from double to integer is performed, fractional data such as 3.14 is now detected and the truncation to 3 is warned about if and only if truncation has occurred. 
+ + ```R + DT = data.table(v=1:3) + DT[2, v:=3.14] + Warning message: + Coerced double RHS to integer to match the type of the target column (column 1 named 'v'). One + or more RHS values contain fractions which have been lost; e.g. item 1 with value 3.140000 has + been truncated to 3. + ``` + +2. `split.data.table` method is now properly exported, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But we don't recommend it because `split` copies all the pieces into new memory. + +3. Setting indices on columns which are part of the key will now create those indices. + +4. `hour`, `minute`, and `second` utility functions use integer arithmetic when the input is already (explicitly) UTC-based `POSIXct` for 4-10x speedup vs. using `as.POSIXlt`. + +5. Error added for incorrect usage of `%between%`, with some helpful diagnostic hints, [#3014](https://github.com/Rdatatable/data.table/issues/3014). Thanks @peterlittlejohn for offering his user experience and providing the impetus. + + +# data.table v1.11.4 (27 May 2018) + +1. Empty RHS of `:=` is no longer an error when the `i` clause returns no rows to assign to anyway, [#2829](https://github.com/Rdatatable/data.table/issues/2829). Thanks to @cguill95 for reporting and to @MarkusBonsch for fixing. + +2. Fixed runaway memory usage with R-devel (R > 3.5.0), [#2882](https://github.com/Rdatatable/data.table/pull/2882). Thanks to many people but in particular to Trang Nguyen for making the breakthrough reproducible example, Paul Bailey for liaising, and Luke Tierney for then pinpointing the issue. It was caused by an interaction of two or more data.table threads operating on new compact vectors in the ALTREP framework, such as the sequence `1:n`. This interaction could result in R's garbage collector turning off, and hence the memory explosion. Problems may occur in R 3.5.0 too but we were only able to reproduce in R > 3.5.0. 
The R code in data.table's implementation benefits from ALTREP (`for` loops in R no longer allocate their range vector input, for example) but are not so appropriate as data.table columns. Sequences such as `1:n` are common in test data but not very common in real-world datasets. Therefore, there is no need for data.table to support columns which are ALTREP compact sequences. The `data.table()` function already expanded compact vectors (by happy accident) but `setDT()` did not (it now does). If, somehow, a compact vector still reaches the internal parallel regions, a helpful error will now be generated. If this happens, please report it as a bug. + +3. Tests 1590.3 & 1590.4 now pass when users run `test.data.table()` on Windows, [#2856](https://github.com/Rdatatable/data.table/pull/2856). Thanks to Avraham Adler for reporting. Those tests were passing on AppVeyor, win-builder and CRAN's Windows because `R CMD check` sets `LC_COLLATE=C` as documented in R-exts$1.3.1, whereas by default on Windows `LC_COLLATE` is usually a regional Windows-1252 dialect such as `English_United States.1252`. + +4. Around 1 billion very small groups (of size 1 or 2 rows) could result in `"Failed to realloc working memory"` even when plenty of memory is available, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks once again to @jsams for the detailed report as a follow up to bug fix 40 in v1.11.0. + + +# data.table v1.11.2 (08 May 2018) + +1. `test.data.table()` created/overwrote variable `x` in `.GlobalEnv`, [#2828](https://github.com/Rdatatable/data.table/issues/2828); i.e. a modification of user's workspace which is not allowed. Thanks to @etienne-s for reporting. + +2. `as.chron` methods for `IDate` and `ITime` have been removed, [#2825](https://github.com/Rdatatable/data.table/issues/2825). `as.chron` still works since `IDate` inherits from `Date`. We are not sure why we had specific methods in the first place. 
It may have been from a time when `IDate` did not inherit from `Date`, perhaps. Note that we don't use `chron` ourselves in our own work. + +3. Fixed `SETLENGTH() cannot be applied to an ALTVEC object` starting in R-devel (R 3.6.0) on 1 May 2018, a few hours after 1.11.0 was accepted on CRAN, [#2820](https://github.com/Rdatatable/data.table/issues/2820). Many thanks to Luke Tierney for pinpointing the problem. + +4. Fixed some rare memory faults in `fread()` and `rbindlist()` found with `gctorture2()` and [`rchk`](https://github.com/kalibera/rchk), [#2841](https://github.com/Rdatatable/data.table/issues/2841). + + +# data.table v1.11.0 (01 May 2018) + +## NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES + +1. `fread()`'s `na.strings=` argument : + + ```R + "NA" # old default + getOption("datatable.na.strings", "NA") # this release; i.e. the same; no change yet + getOption("datatable.na.strings", "") # future release + ``` + + This option controls how `,,` is read in character columns. It does not affect numeric columns which read `,,` as `NA` regardless. We would like `,,`=>`NA` for consistency with numeric types, and `,"",`=>empty string to be the standard default for `fwrite/fread` character columns so that `fread(fwrite(DT))==DT` without needing any change to any parameters. `fwrite` has never written `NA` as `"NA"` in case `"NA"` is a valid string in the data; e.g., 2 character id columns sometimes do. Instead, `fwrite` has always written `,,` by default for an `NA` in a character column. The use of R's `getOption()` allows users to move forward now, using `options(datatable.na.strings="")`, or restore old behaviour when the default's default is changed in future, using `options(datatable.na.strings="NA")`. + +2. `fread()` and `fwrite()`'s `logical01=` argument : + + ```R + logical01 = FALSE # old default + getOption("datatable.logical01", FALSE) # this release; i.e. 
the same; no change yet + getOption("datatable.logical01", TRUE) # future release + ``` + + This option controls whether a column of all 0's and 1's is read as `integer`, or `logical` directly to avoid needing to change the type afterwards to `logical` or use `colClasses`. `0/1` is smaller and faster than `"TRUE"/"FALSE"`, which can make a significant difference to space and time the more `logical` columns there are. When the default's default changes to `TRUE` for `fread` we do not expect much impact since all arithmetic operators that are currently receiving 0's and 1's as type `integer` (think `sum()`) but instead could receive `logical`, would return exactly the same result on the 0's and 1's as `logical` type. However, code that is manipulating column types using `is.integer` or `is.logical` on `fread`'s result, could require change. It could be painful if `DT[(logical_column)]` (i.e. `DT[logical_column==TRUE]`) changed behaviour due to `logical_column` no longer being type `logical` but `integer`. But that is not the change proposed. The change is the other way around; i.e., a previously `integer` column holding only 0's and 1's would now be type `logical`. Since it's that way around, we believe the scope for breakage is limited. We think a lot of code is converting 0/1 integer columns to logical anyway, either using `colClasses=` or afterwards with an assign. For `fwrite`, the level of breakage depends on the consumer of the output file. We believe `0/1` is a better more standard default choice to move to. See notes below about improvements to `fread`'s sampling for type guessing, and automatic rereading in the rare cases of out-of-sample type surprises. + + +These options are meant for temporary use to aid your migration, [#2652](https://github.com/Rdatatable/data.table/pull/2652). You are not meant to set them to the old default and then not migrate your code that is dependent on the default. 
Either set the argument explicitly so your code is not dependent on the default, or change the code to cope with the new default. Over the next few years we will slowly start to remove these options, warning you if you are using them, and return to a simple default. See the history of NEWS and NEWS.0 for past migrations that have, generally speaking, been successfully managed in this way. For example, at the end of NOTES for this version (below in this file) is a note about the usage of `datatable.old.unique.by.key` now warning, as you were warned it would do over a year ago. When that change was introduced, the default was changed and that option provided an option to restore the old behaviour. These `fread`/`fwrite` changes are even more cautious and not even changing the default's default yet. Giving you extra warning by way of this notice to move forward. And giving you a chance to object. + +## NEW FEATURES + +1. `fread()`: + * Efficiency savings at C level including **parallelization** announced [here](https://github.com/Rdatatable/data.table/wiki/talks/BARUG_201704_ParallelFread.pdf); e.g. a 9GB 2 column integer csv input is **50s down to 12s** to cold load on a 4 core laptop with 16GB RAM and SSD. Run `echo 3 >/proc/sys/vm/drop_caches` first to measure cold load time. Subsequent load time (after file has been cached by OS on the first run) **40s down to 6s**. + * The [fread for small data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread) page has been revised. + * Memory maps lazily; e.g. reading just the first 10 rows with `nrow=10` is **12s down to 0.01s** from cold for the 9GB file. Large files close to your RAM limit may work more reliably too. The progress meter will commence sooner and more consistently. + * `fread` has always jumped to the middle and to the end of the file for a much improved column type guess. 
The sample size is increased from 100 rows at 10 jump points (1,000 rows) to 100 rows at 100 jump points (10,000 row sample). In the rare case of there still being out-of-sample type exceptions, those columns are now *automatically reread* so you don't have to use `colClasses` yourself. + * Large number of columns support; e.g. **12,000 columns** tested. + * **Quoting rules** are more robust and flexible. See point 10 on the wiki page [here](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread#10-automatic-quote-escape-method-detection-including-no-escape). + * Numeric data that has been quoted is now detected and read as numeric. + * The ability to position `autostart` anywhere inside one of multiple tables in a single file is removed with warning. It used to search upwards from that line to find the start of the table based on a consistent number of columns. People appear to be using `skip="string"` or `skip=nrow` to find the header row exactly, which is retained and simpler. It was too difficult to retain search-upwards-autostart together with skipping/filling blank lines, filling incomplete rows and parallelization too. If there is any header info above the column names, it is still auto detected and auto skipped (particularly useful when loading a set of files where the column names start on different lines due to a varying height messy header). + * `dec=','` is now implemented directly so there is no dependency on locale. The options `datatable.fread.dec.experiment` and `datatable.fread.dec.locale` have been removed. + * `\\r\\r\\n` line endings are now handled such as produced by `base::download.file()` when it doubles up `\\r`. Other rare line endings (`\\r` and `\\n\\r`) are now more robust. + * Mixed line endings are now handled; e.g. a file formed by concatenating a Unix file and a Windows file so that some lines end with `\\n` while others end with `\\r\\n`. 
+ * Improved automatic detection of whether the first row is column names by comparing the types of the fields on the first row against the column types ascertained by the 10,000 rows sample (or `colClasses` if provided). If a numeric column has a string value at the top, then column names are deemed present. + * Detects GB-18030 and UTF-16 encodings and in verbose mode prints a message about BOM detection. + * Detects and ignores trailing ^Z end-of-file control character sometimes created on MS DOS/Windows, [#1612](https://github.com/Rdatatable/data.table/issues/1612). Thanks to Gergely Daróczi for reporting and providing a file. + * Added ability to recognize and parse hexadecimal floating point numbers, as used for example in Java. Thanks to @scottstanfield [#2316](https://github.com/Rdatatable/data.table/issues/2316) for the report. + * Now handles floating-point NaN values in a wide variety of formats, including `NaN`, `sNaN`, `1.#QNAN`, `NaN1234`, `#NUM!` and others, [#1800](https://github.com/Rdatatable/data.table/issues/1800). Thanks to Jori Liesenborgs for highlighting and the PR. + * If negative numbers are passed to `select=` the out-of-range error now suggests `drop=` instead, [#2423](https://github.com/Rdatatable/data.table/issues/2423). Thanks to Michael Chirico for the suggestion. + * `sep=NULL` or `sep=""` (i.e., no column separator) can now be used to specify single column input reliably like `base::readLines`, [#1616](https://github.com/Rdatatable/data.table/issues/1616). `sep='\\n'` still works (even on Windows where line ending is actually `\\r\\n`) but `NULL` or `""` are now documented and recommended. Thanks to Dmitriy Selivanov for the pull request and many others for comments. As before, `sep=NA` is not valid; use the default `"auto"` for automatic separator detection. `sep='\\n'` is now deprecated and in future will start to warn when used. 
+ * Single-column input with blank lines is now valid and the blank lines are significant (representing `NA`). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing `NA` which are written as blank. There is no change when `ncol>1`; i.e., input stops with detailed warning at the first blank line, because a blank line when `ncol>1` is invalid input due to no separators being present. Thanks to @skanskan, Michael Chirico, @franknarf1 and Pasha for the testing and discussions, [#2106](https://github.com/Rdatatable/data.table/issues/2106). + * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. + * `skip=` and `nrow=` are more reliable and are no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). + * Ram disk (`/dev/shm`) is no longer used for the output of system command input. Although faster when it worked, it was causing too many device full errors; e.g., [#1139](https://github.com/Rdatatable/data.table/issues/1139) and [zUMIs/19](https://github.com/sdparekh/zUMIs/issues/19). Thanks to Kyle Chung for reporting. Standard `tempdir()` is now used. If you wish to use ram disk, set TEMPDIR to `/dev/shm`; see `?tempdir`. + * Detecting whether a very long input string is a file name or data is now much faster, [#2531](https://github.com/Rdatatable/data.table/issues/2531). Many thanks to @javrucebo for the detailed report, benchmarks and suggestions. 
+ * A column of `TRUE/FALSE`s is ok, as well as `True/False`s and `true/false`s, but mixing styles (e.g. `TRUE/false`) is not and will be read as type `character`. + * New argument `index` to complement the existing `key` argument for applying secondary orderings out of the box for convenience, [#2633](https://github.com/Rdatatable/data.table/issues/2633). + * A warning is now issued whenever incorrectly quoted fields have been detected and fixed using a non-standard quote rule. `fread` has always used these advanced rules but now it warns that it is using them. Most file writers correctly quote fields if the field contains the field separator, but a common error is not to also quote fields that contain a quote and then escape those quotes, particularly if that quote occurs at the start of the field. The ability to detect and fix such files is referred to as self-healing. Ambiguities are resolved using the knowledge that the number of columns is constant, and therefore this ability is not available when `fill=TRUE`. This feature can be improved in future by using column type consistency as well as the number of fields. 
For example: + + ```R + txt = 'A,B\n1,hello\n2,"howdy" said Joe\n3,bonjour\n' + cat(txt) + # A,B + # 1,hello + # 2,"howdy" said Joe + # 3,bonjour + fread(txt) + A B + + 1: 1 hello + 2: 2 "howdy" said Joe + 3: 3 bonjour + Warning message: + In fread(txt) : Found and resolved improper quoting + ``` + + * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney, Ananda Mahto, @memoryfull, @brandenkmurray for testing dev and reporting these regressions before release to CRAN: #1464, #1671, #1888, #1895, #2070, #2073, #2087, #2091, #2092, #2107, #2118, #2123, #2167, #2194, #2196, #2201, #2222, #2228, #2238, #2246, #2251, #2265, #2267, #2285, #2287, #2299, #2322, #2347, #2352, #2370, #2371, #2395, #2404, #2446, #2453, #2457, #2464, #2481, #2499, #2512, #2515, #2516, #2518, #2520, #2523, #2526, #2535, #2542, #2548, #2561, #2600, #2625, #2666, #2697, #2735, #2744. + +2. `fwrite()`: + * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). + * `logical01` has been added and the old name `logicalAsInt` retained. Please move to the new name when convenient for you. The old argument name (`logicalAsInt`) will slowly be deprecated over the next few years. The default is unchanged: `FALSE`, so `logical` is still written as `"TRUE"`/`"FALSE"` in full by default. 
We intend to change the default's default in future to `TRUE`; see the notice at the top of these release notes. + +3. Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR. + +4. `tables` gains `index` argument for supplementary metadata about `data.table`s in memory (or any optionally specified environment), part of [#1648](https://github.com/Rdatatable/data.table/issues/1648). Thanks due variously to @jangorecki, @rsaporta, @MichaelChirico for ideas and work towards PR. + +5. Improved auto-detection of `character` inputs' formats to `as.ITime` to mirror the logic in `as.POSIXlt.character`, [#1383](https://github.com/Rdatatable/data.table/issues/1383). Thanks @franknarf1 for identifying a discrepancy and @MichaelChirico for investigating. + +6. `setcolorder()` now accepts less than `ncol(DT)` columns to be moved to the front, [#592](https://github.com/Rdatatable/data.table/issues/592). Thanks @MichaelChirico for the PR. This also incidentally fixed [#2007](https://github.com/Rdatatable/data.table/issues/2007) whereby explicitly setting `select = NULL` in `fread` errored; thanks to @rcapell for reporting that and @dselivanov and @MichaelChirico for investigating and providing a new test. + +7. Three new *Grouping Sets* functions: `rollup`, `cube` and `groupingsets`, [#1377](https://github.com/Rdatatable/data.table/issues/1377). They allow aggregation on various grouping levels at once, producing sub-totals and a grand total. + +8. `as.data.table()` gains new method for `array`s to return a useful data.table, [#1418](https://github.com/Rdatatable/data.table/issues/1418). + +9. 
`print.data.table()` (all via master issue [#1523](https://github.com/Rdatatable/data.table/issues/1523)): + + * gains `print.keys` argument, `FALSE` by default, which displays the keys and/or indices (secondary keys) of a `data.table`. Thanks @MichaelChirico for the PR, Yike Lu for the suggestion and Arun for honing that idea to its present form. + + * gains `col.names` argument, `"auto"` by default, which toggles which registers of column names to include in printed output. `"top"` forces `data.frame`-like behavior where column names are only ever included at the top of the output, as opposed to the default behavior which appends the column names below the output as well for longer (>20 rows) tables. `"none"` shuts down column name printing altogether. Thanks @MichaelChirico for the PR, Oleg Bondar for the suggestion, and Arun for guiding commentary. + + * list columns would print the first 6 items in each cell followed by a comma if there are more than 6 in that cell. Now it ends ",..." to make it clearer, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). Thanks to @franknarf1 for drawing attention to an issue raised on Stack Overflow by @TMOTTM [here](https://stackoverflow.com/q/47679701). + +10. `setkeyv` accelerated if key already exists [#2331](https://github.com/Rdatatable/data.table/issues/2331). Thanks to @MarkusBonsch for the PR. + +11. Keys and indexes are now partially retained up to the key column assigned to with ':=' [#2372](https://github.com/Rdatatable/data.table/issues/2372). They used to be dropped completely if any one of the columns was affected by `:=`. Thanks to @MarkusBonsch for the PR. + +12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR. + +13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). 
Thanks to Michael Chirico for the PR, and thanks to @mgahan for pointing out a reversion in `na.omit.data.table` before release, [#2660](https://github.com/Rdatatable/data.table/issues/2660#issuecomment-371027948). + +14. `uniqueN()` is now faster on logical vectors. Thanks to Hugh Parsonage for [PR#2648](https://github.com/Rdatatable/data.table/pull/2648). + + ```R + N = 1e9 + # was now + x = c(TRUE,FALSE,NA,rep(TRUE,N)) # + uniqueN(x) == 3 # 5.4s 0.00s + x = c(TRUE,rep(FALSE,N), NA) # + uniqueN(x,na.rm=TRUE) == 2 # 5.4s 0.00s + x = c(rep(TRUE,N),FALSE,NA) # + uniqueN(x) == 3 # 6.7s 0.38s + ``` + +15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). +Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation. + +16. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. + +17. `update.dev.pkg` is new function to update package from development repository, it will download package sources only when newer commit is available in repository. `data.table::update.dev.pkg()` defaults updates `data.table`, but any package can be used. + +18. Item 1 in NEWS for [v1.10.2](https://github.com/Rdatatable/data.table/blob/master/NEWS.md#changes-in-v1102--on-cran-31-jan-2017) on CRAN in Jan 2017 included : + + > When j is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. + > When you see the `..` prefix think one-level-up, like the directory `..` in all operating systems means the parent directory. 
+ > In future the `..` prefix could be made to work on all symbols apearing anywhere inside `DT[...]`. + + The response has been positive ([this tweet](https://twitter.com/MattDowle/status/967290562725359617) and [FR#2655](https://github.com/Rdatatable/data.table/issues/2655)) and so this prefix is now expanded to all symbols appearing in `j=` as a first step; e.g. + + ```R + cols = "colB" + DT[, c(..cols, "colC")] # same as DT[, .(colB,colC)] + DT[, -..cols] # all columns other than colB + ``` + + Thus, `with=` should no longer be needed in any cases. Please change to using the `..` prefix and over the next few years we will start to formally deprecate and remove the `with=` parameter. If this is well received, the `..` prefix could be expanded to symbols appearing in `i=` and `by=`, too. Note that column names should not now start with `..`. If a symbol `..var` is used in `j=` but `..var` exists as a column name, the column still takes precedence, for backwards compatibility. Over the next few years, data.table will start issuing warnings/errors when it sees column names starting with `..`. This affects one CRAN package out of 475 using data.table, so we do not believe this restriction to be unreasonable. Our main focus here which we believe `..` achieves is to resolve the more common ambiguity when `var` is in calling scope and `var` is a column name too. Further, we have not forgotten that in the past we recommended prefixing the variable in calling scope with `..` yourself. If you did that and `..var` exists in calling scope, that still works, provided neither `var` exists in calling scope nor `..var` exists as a column name. Please now remove the `..` prefix on `..var` in calling scope to tidy this up. In future data.table will start to warn/error on such usage. + +19. `setindexv` can now assign multiple (separate) indices by accepting a `list` in the `cols` argument. + +20. 
`as.matrix.data.table` method now has an additional `rownames` argument allowing for a single column to be used as the `rownames` after conversion to a `matrix`. Thanks to @sritchie73 for the suggestion, use cases, [#2692](https://github.com/Rdatatable/data.table/issues/2692) and implementation [PR#2702](https://github.com/Rdatatable/data.table/pull/2702) and @MichaelChirico for additional use cases. + +## BUG FIXES + +1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding + Great Stocks\"", I discussed the value of stock screeners as a powerful tool"`, [#2051](https://github.com/Rdatatable/data.table/issues/2051). Thanks to @scarrascoso for reporting. Example file added to test suite. + +2. `fwrite()` creates a file with permissions that now play correctly with `Sys.umask()`, [#2049](https://github.com/Rdatatable/data.table/issues/2049). Thanks to @gnguy for reporting. + +3. `fread()` no longer holds an open lock on the file when a line outside the large sample has too many fields and generates an error, [#2044](https://github.com/Rdatatable/data.table/issues/2044). Thanks to Hugh Parsonage for reporting. + +4. Setting `j = {}` no longer results in an error, [#2142](https://github.com/Rdatatable/data.table/issues/2142). Thanks Michael Chirico for the pull request. + +5. Segfault in `rbindlist()` when one or more items are empty, [#2019](https://github.com/Rdatatable/data.table/issues/2019). Thanks Michael Lang for the pull request. Another segfault if the result would be more than 2bn rows, thanks to @jsams's comment in [#2340](https://github.com/Rdatatable/data.table/issues/2340#issuecomment-331505494). + +6. Error printing 0-length `ITime` and `NA` objects, [#2032](https://github.com/Rdatatable/data.table/issues/2032) and [#2171](https://github.com/Rdatatable/data.table/issues/2171). 
Thanks Michael Chirico for the pull requests and @franknarf1 for pointing out a shortcoming of the initial fix. + +7. `as.IDate.POSIXct` error with `NULL` timezone, [#1973](https://github.com/Rdatatable/data.table/issues/1973). Thanks @lbilli for reporting and Michael Chirico for the pull request. + +8. Printing a null `data.table` with `print` no longer visibly outputs `NULL`, [#1852](https://github.com/Rdatatable/data.table/issues/1852). Thanks @aaronmcdaid for spotting and @MichaelChirico for the PR. + +9. `data.table` now works with Shiny Reactivity / Flexdashboard. The error was typically something like `col not found` in `DT[col==val]`. Thanks to Dirk Eddelbuettel leading Matt through reproducible steps and @sergeganakou and Richard White for reporting. Closes [#2001](https://github.com/Rdatatable/data.table/issues/2001) and [shiny/#1696](https://github.com/rstudio/shiny/issues/1696). + +10. The `as.IDate.POSIXct` method passed `tzone` along but was not exported. So `tzone` is now taken into account by `as.IDate` too as well as `IDateTime`, [#977](https://github.com/Rdatatable/data.table/issues/977) and [#1498](https://github.com/Rdatatable/data.table/issues/1498). Tests added. + +11. Named logical vector now select rows as expected from single row data.table. Thanks to @skranz for reporting. Closes [#2152](https://github.com/Rdatatable/data.table/issues/2152). + +12. `fread()`'s rare `Internal error: Sampling jump point 10 is before the last jump ended` has been fixed, [#2157](https://github.com/Rdatatable/data.table/issues/2157). Thanks to Frank Erickson and Artem Klevtsov for reporting with example files which are now added to the test suite. + +13. `CJ()` no longer loses attribute information, [#2029](https://github.com/Rdatatable/data.table/issues/2029). Thanks to @MarkusBonsch and @royalts for the pull request. + +14. `split.data.table` respects `factor` ordering in `by` argument, [#2082](https://github.com/Rdatatable/data.table/issues/2082). 
Thanks to @MichaelChirico for identifying and fixing the issue. + +15. `.SD` would incorrectly include symbol on lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR, and @ProfFancyPants for reporting a regression introduced in the PR. Closes [#2326](https://github.com/Rdatatable/data.table/issues/2326) and [#2338](https://github.com/Rdatatable/data.table/issues/2338). + +16. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250). + +17. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). + +18. `isoweek` calculation is correct regardless of local timezone setting (`Sys.timezone()`), [#2407](https://github.com/Rdatatable/data.table/issues/2407). Thanks to @MoebiusAV and @SimonCoulombe for reporting and @MichaelChirico for fixing. + +19. Fixed `as.xts.data.table` to support all xts supported time based index classes [#2408](https://github.com/Rdatatable/data.table/issues/2408). Thanks to @ebs238 for reporting and for the PR. + +20. A memory leak when a very small number such as `0.58E-2141` is bumped to type `character` is resolved, [#918](https://github.com/Rdatatable/data.table/issues/918). + +21. The edge case `setnames(data.table(), character(0))` now works rather than error, [#2452](https://github.com/Rdatatable/data.table/issues/2452). + +22. Order of rows returned in non-equi joins were incorrect in certain scenarios as reported under [#1991](https://github.com/Rdatatable/data.table/issues/1991). This is now fixed. Thanks to @Henrik-P for reporting. + +23. 
Non-equi joins work as expected when `x` in `x[i, on=...]` is a 0-row data.table. Closes [#1986](https://github.com/Rdatatable/data.table/issues/1986). + +24. Non-equi joins along with `by=.EACHI` returned incorrect result in some rare cases as reported under [#2360](https://github.com/Rdatatable/data.table/issues/2360). This is fixed now. This fix also takes care of [#2275](https://github.com/Rdatatable/data.table/issues/2275). Thanks to @ebs238 for the nice minimal reproducible report, @Mihael for asking on SO and to @Frank for following up on SO and filing an issue. + +25. `by=.EACHI` works now when `list` columns are being returned and some join values are missing, [#2300](https://github.com/Rdatatable/data.table/issues/2300). Thanks to @jangorecki and @franknarf1 for the reproducible examples which have been added to the test suite. + +26. Indices are now retrieved by exact name, [#2465](https://github.com/Rdatatable/data.table/issues/2465). This prevents usage of wrong indices as well as unexpected row reordering in join results. Thanks to @pannnda for reporting and providing a reproducible example and to @MarkusBonsch for fixing. + +27. `setnames` of whole table when original table had `NA` names skipped replacing those, [#2475](https://github.com/Rdatatable/data.table/issues/2475). Thanks to @franknarf1 and [BenoitLondon on StackOverflow](https://stackoverflow.com/questions/47228836/) for the report and @MichaelChirico for fixing. + +28. `CJ()` works with multiple empty vectors now [#2511](https://github.com/Rdatatable/data.table/issues/2511). Thanks to @MarkusBonsch for fixing. + +29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). 
This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again. + +30. "Negative length vectors not allowed" error when grouping `median` and `var` fixed, [#2046](https://github.com/Rdatatable/data.table/issues/2046) and [#2111](https://github.com/Rdatatable/data.table/issues/2111). Thanks to @caneff and @osofr for reporting and to @kmillar for debugging and explaining the cause. + +31. Fixed a bug on Windows where `data.table`s containing non-UTF8 strings in `key`s were not properly sorted, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing. + +32. `x.` prefixes during joins sometimes resulted in a "column not found" error. This is now fixed. Closes [#2313](https://github.com/Rdatatable/data.table/issues/2313). Thanks to @franknarf1 for the MRE. + +33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing. + +34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`. +`merge.data.table()` gains the `no.dups` argument (default TRUE) to match the correpsonding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)` the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons. +In addition, where duplicate column names arise anyway (i.e. 
`suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. +Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653) + +35. `CJ()` now fails with proper error message when results would exceed max integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636). + +36. `NA` in character columns now display as `<NA>` just like base R to distinguish from `""` and `"NA"`. + +37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708). + +38. Fixed a bug on Windows that `data.table` may break if the garbage collecting was triggered when sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674). + +39. Internal aliasing of `.` to `list` was over-aggressive in applying `list` even when `.` was intended within `bquote`, [#1912](https://github.com/Rdatatable/data.table/issues/1912). Thanks @MichaelChirico for reporting/filing and @ecoRoland for suggesting and testing a fix. + +40. Attempt to allocate a wildly large amount of RAM (16EB) when grouping by key and there are close to 2 billion 1-row groups, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks to @jsams for the detailed report. + +41. Fix a bug that `print(dt, class=TRUE)` shows only `topn - 1` rows. Thanks to @heavywatal for reporting [#2803](https://github.com/Rdatatable/data.table/issues/2803) and filing [PR#2804](https://github.com/Rdatatable/data.table/pull/2804). + +## NOTES + +0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. 
[PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. + +1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing. + +2. Test 1675.1 updated to cope with a change in R-devel in June 2017 related to `factor()` and `NA` levels. + +3. Package `ezknitr` has been added to the whitelist of packages that run user code and should be considered data.table-aware, [#2266](https://github.com/Rdatatable/data.table/issues/2266). Thanks to Matt Mills for testing and reporting. + +4. Printing with `quote = TRUE` now quotes column names as well, [#1319](https://github.com/Rdatatable/data.table/issues/1319). Thanks @jan-glx for the suggestion and @MichaelChirico for the PR. + +5. Added a blurb to `?melt.data.table` explicating the subtle difference in behavior of the `id.vars` argument vis-a-vis its analog in `reshape2::melt`, [#1699](https://github.com/Rdatatable/data.table/issues/1699). Thanks @MichaelChirico for uncovering and filing. + +6. Added some clarification about the usage of `on` to `?data.table`, [#2383](https://github.com/Rdatatable/data.table/issues/2383). Thanks to @peterlittlejohn for volunteering his confusion and @MichaelChirico for brushing things up. + +7. Clarified that "data.table always sorts in `C-locale`" means that upper-case letters are sorted before lower-case letters by ordering in data.table (e.g. `setorder`, `setkey`, `DT[order(...)]`). Thanks to @hughparsonage for the pull request editing the documentation. Note this makes no difference in most cases of data; e.g. ids where only uppercase or lowercase letters are used (`"AB123"<"AC234"` is always true, regardless), or country names and words which are consistently capitalized. 
For example, `"America" < "Brazil"` is not affected (it's always true), and neither is `"america" < "brazil"` (always true too); since the first letter is consistently capitalized. But, whether `"america" < "Brazil"` (the words are not consistently capitalized) is true or false in base R depends on the locale of your R session. In America it is true by default and false if you i) type `Sys.setlocale(locale="C")`, ii) the R session has been started in a C locale for you which can happen on servers/services (the locale comes from the environment the R session is started in). However, `"america" < "Brazil"` is always, consistently false in data.table which can be a surprise because it differs to base R by default in most regions. It is false because `"B"<"a"` is true because all upper-case letters come first, followed by all lower case letters (the ascii number of each letter determines the order, which is what is meant by `C-locale`). + +8. `data.table`'s dependency has been moved forward from R 3.0.0 (Apr 2013) to R 3.1.0 (Apr 2014; i.e. 3.5 years old). We keep this dependency as old as possible for as long as possible as requested by users in managed environments. Thanks to Jan Gorecki, the test suite from latest dev now runs on R 3.1.0 continuously, as well as R-release (currently 3.4.2) and latest R-devel snapshot. The primary motivation for the bump to R 3.1.0 was allowing one new test which relies on better non-copying behaviour in that version, [#2484](https://github.com/Rdatatable/data.table/issues/2484). It also allows further internal simplifications. Thanks to @MichaelChirico for fixing another test that failed on R 3.1.0 due to slightly different behaviour of `base::read.csv` in R 3.1.0-only which the test was comparing to, [#2489](https://github.com/Rdatatable/data.table/pull/2489). + +9. New vignette added: _Importing data.table_ - focused on using data.table as a dependency in R packages. Answers most commonly asked questions and promotes good practices. 
+ +10. As warned in v1.9.8 release notes below in this file (25 Nov 2016) it has been 1 year since then and so use of `options(datatable.old.unique.by.key=TRUE)` to restore the old default is now deprecated with warning. The new warning states that this option still works and repeats the request to pass `by=key(DT)` explicitly to `unique()`, `duplicated()`, `uniqueN()` and `anyDuplicated()` and to stop using this option. In another year, this warning will become error. Another year after that the option will be removed. + +11. As `set2key()` and `key2()` have been warning since v1.9.8 (Nov 2016), their warnings have now been upgraded to errors. Note that when they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' in NEWS item 4. They will be removed in one year. + + ``` + Was warning: set2key() will be deprecated in the next relase. Please use setindex() instead. + Now error: set2key() is now deprecated. Please use setindex() instead. + ``` + +12. The option `datatable.showProgress` is no longer set to a default value when the package is loaded. Instead, the `default=` argument of `getOption` is used by both `fwrite` and `fread`. The default is the result of `interactive()` at the time of the call. Using `getOption` in this way is intended to be more helpful to users looking at `args(fread)` and `?fread`. + +13. `print.data.table()` invisibly returns its first argument instead of `NULL`. This behavior is compatible with the standard `print.data.frame()` and tibble's `print.tbl_df()`. Thanks to @heavywatal for [PR#2807](https://github.com/Rdatatable/data.table/pull/2807) + + +# data.table v1.10.4-3 (20 Oct 2017) + +1. Fixed crash/hang on MacOS when `parallel::mclapply` is used and data.table is merely loaded, [#2418](https://github.com/Rdatatable/data.table/issues/2418). Oddly, all tests including test 1705 (which tests `mclapply` with data.table) passed fine on CRAN. 
It appears to be some versions of MacOS or some versions of libraries on MacOS, perhaps. Many thanks to Martin Morgan for reporting and confirming this fix works. Thanks also to @asenabouth, Joe Thorley and Danton Noriega for testing, debugging and confirming that automatic parallelism inside data.table (such as `fwrite`) works well even on these MacOS installations. See also news items below for 1.10.4-1 and 1.10.4-2. + + +# data.table v1.10.4-2 (12 Oct 2017) + +1. OpenMP on MacOS is now supported by CRAN and included in CRAN's package binaries for Mac. But installing v1.10.4-1 from source on MacOS failed when OpenMP was not enabled at compile time, [#2409](https://github.com/Rdatatable/data.table/issues/2409). Thanks to Liz Macfie and @fupangpangpang for reporting. The startup message when OpenMP is not enabled has been updated. + +2. Two rare potential memory faults fixed, thanks to CRAN's automated use of latest compiler tools; e.g. clang-5 and gcc-7 + + +# data.table v1.10.4-1 (09 Oct 2017) + +1. The `nanotime` v0.2.0 update (June 2017) changed from `integer64` to `S4` and broke `fwrite` of `nanotime` columns. Fixed to work with `nanotime` both before and after v0.2.0. + +2. Pass R-devel changes related to `deparse(,backtick=)` and `factor()`. + +3. Internal `NAMED()==2` now `MAYBE_SHARED()`, [#2330](https://github.com/Rdatatable/data.table/issues/2330). Back-ported to pass under the stated dependency, R 3.0.0. + +4. Attempted improvement on Mac-only when the `parallel` package is used too (which forks), [#2137](https://github.com/Rdatatable/data.table/issues/2137). Intel's OpenMP implementation appears to leave threads running after the OpenMP parallel region (inside data.table) has finished unlike GNU libgomp. So, if and when `parallel`'s `fork` is invoked by the user after data.table has run in parallel already, instability occurs. The problem only occurs with Mac package binaries from CRAN because they are built by CRAN with Intel's OpenMP library. 
No known problems on Windows or Linux and no known problems on any platform when `parallel` is not used. If this Mac-only fix still doesn't work, call `setDTthreads(1)` immediately after `library(data.table)` which has been reported to fix the problem by putting `data.table` into single threaded mode earlier. + +5. When `fread()` and `print()` see `integer64` columns are present but package `bit64` is not installed, the warning is now displayed as intended. Thanks to a question by Santosh on r-help and forwarded by Bill Dunlap. + + +# data.table v1.10.4 (01 Feb 2017) + +## BUG FIXES + +1. The new specialized `nanotime` writer in `fwrite()` type punned using `*(long long *)&REAL(column)[i]` which, strictly, is undefined behavour under C standards. It passed a plethora of tests on linux (gcc 5.4 and clang 3.8), win-builder and 6 out 10 CRAN flavours using gcc. But failed (wrong data written) with the newest version of clang (3.9.1) as used by CRAN on the failing flavors, and solaris-sparc. Replaced with the union method and added a grep to CRAN_Release.cmd. + + +# data.table v1.10.2 (31 Jan 2017) + +## NEW FEATURES + +1. When `j` is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. + + ```R + myCols = c("colA","colB") + DT[, myCols, with=FALSE] + DT[, ..myCols] # same + ``` + + When you see the `..` prefix think _one-level-up_ like the directory `..` in all operating systems meaning the parent directory. In future the `..` prefix could be made to work on all symbols apearing anywhere inside `DT[...]`. It is intended to be a convenient way to protect your code from accidentally picking up a column name. Similar to how `x.` and `i.` prefixes (analogous to SQL table aliases) can already be used to disambiguate the same column name present in both `x` and `i`. 
A symbol prefix rather than a `..()` _function_ will be easier for us to optimize internally and more convenient if you have many variables in calling scope that you wish to use in your expressions safely. This feature was first raised in 2012 and long wished for, [#633](https://github.com/Rdatatable/data.table/issues/633). It is experimental. + +2. When `fread()` or `print()` see `integer64` columns are present, `bit64`'s namespace is now automatically loaded for convenience. + +3. `fwrite()` now supports the new [`nanotime`](https://cran.r-project.org/package=nanotime) type by Dirk Eddelbuettel, [#1982](https://github.com/Rdatatable/data.table/issues/1982). Aside: `data.table` already automatically supported `nanotime` in grouping and joining operations via longstanding support of its underlying `integer64` type. + +4. `indices()` gains a new argument `vectors`, default `FALSE`. This strsplits the index names by `__` for you, [#1589](https://github.com/Rdatatable/data.table/issues/1589). + + ```R + DT = data.table(A=1:3, B=6:4) + setindex(DT, B) + setindex(DT, B, A) + indices(DT) + [1] "B" "B__A" + indices(DT, vectors=TRUE) + [[1]] + [1] "B" + [[2]] + [1] "B" "A" + ``` + +## BUG FIXES + +1. Some long-standing potential instability has been discovered and resolved many thanks to a detailed report from Bill Dunlap and Michael Sannella. At C level any call of the form `setAttrib(x, install(), allocVector())` can be unstable in any R package. Despite `setAttrib()` PROTECTing its inputs, the 3rd argument (`allocVector`) can be executed first only for its result to to be released by `install()`'s potential GC before reaching `setAttrib`'s PROTECTion of its inputs. Fixed by either PROTECTing or pre-`install()`ing. Added to CRAN_Release.cmd procedures: i) `grep`s to prevent usage of this idiom in future and ii) running data.table's test suite with `gctorture(TRUE)`. + +2. 
A new potential instability introduced in the last release (v1.10.0) in GForce optimized grouping has been fixed by reverting one change from malloc to R_alloc. Thanks again to Michael Sannella for the detailed report. + +3. `fwrite()` could write floating point values incorrectly, [#1968](https://github.com/Rdatatable/data.table/issues/1968). A thread-local variable was incorrectly thread-global. This variable's usage lifetime is only a few clock cycles so it needed large data and many threads for several threads to overlap their usage of it and cause the problem. Many thanks to @mgahan and @jmosser for finding and reporting. + +## NOTES + +1. `fwrite()`'s `..turbo` option has been removed as the warning message warned. If you've found a problem, please [report it](https://github.com/Rdatatable/data.table/issues). + +2. No known issues have arisen due to `DT[,1]` and `DT[,c("colA","colB")]` now returning columns as introduced in v1.9.8. However, as we've moved forward by setting `options('datatable.WhenJisSymbolThenCallingScope'=TRUE)` introduced then too, it has become clear a better solution is needed. All 340 CRAN and Bioconductor packages that use data.table have been checked with this option on. 331 lines would need to be changed in 59 packages. Their usage is elegant, correct and recommended, though. Examples are `DT[1, encoding]` in quanteda and `DT[winner=="first", freq]` in xgboost. These are looking up the columns `encoding` and `freq` respectively and returning them as vectors. But if, for some reason, those columns are removed from `DT` and `encoding` or `freq` are still variables in calling scope, their values in calling scope would be returned. Which cannot be what was intended and could lead to silent bugs. That was the risk we were trying to avoid.
+`options('datatable.WhenJisSymbolThenCallingScope')` is now removed. A migration timeline is no longer needed. The new strategy needs no code changes and has no breakage. It was proposed and discussed in point 2 [here](https://github.com/Rdatatable/data.table/issues/1188#issuecomment-127824969), as follows.
+When `j` is a symbol (as in the quanteda and xgboost examples above) it will continue to be looked up as a column name and returned as a vector, as has always been the case. If it's not a column name however, it is now a helpful error explaining that data.table is different to data.frame and what to do instead (use `..` prefix or `with=FALSE`). The old behaviour of returning the symbol's value in calling scope can never have been useful to anybody and therefore not depended on. Just as the `DT[,1]` change could be made in v1.9.8, this change can be made now. This change increases robustness with no downside. Rerunning all 340 CRAN and Bioconductor package checks reveals 2 packages throwing the new error: partools and simcausal. Their maintainers have been informed that there is a likely bug on those lines due to data.table's (now remedied) weakness. This is exactly what we wanted to reveal and improve. + +3. As before, and as we can see is in common use in CRAN and Bioconductor packages using data.table, `DT[,myCols,with=FALSE]` continues to look up `myCols` in calling scope and take its value as column names or numbers. You can move to the new experimental convenience feature `DT[, ..myCols]` if you wish at leisure. + + +# data.table v1.10.0 (03 Dec 2016) + +## BUG FIXES + +1. `fwrite(..., quote='auto')` already quoted a field if it contained a `sep` or `\n`, or `sep2[2]` when `list` columns are present. Now it also quotes a field if it contains a double quote (`"`) as documented, [#1925](https://github.com/Rdatatable/data.table/issues/1925). Thanks to Aki Matsuo for reporting. Tests added. The `qmethod` tests did test escaping embedded double quotes, but only when `sep` or `\n` was present in the field as well to trigger the quoting of the field. + +2. Fixed 3 test failures on Solaris only, [#1934](https://github.com/Rdatatable/data.table/issues/1934). 
Two were on both sparc and x86 and related to a `tzone` attribute difference between `as.POSIXct` and `as.POSIXlt` even when passed the default `tz=""`. The third was on sparc only: a minor rounding issue in `fwrite()` of 1e-305. + +3. Regression crash fixed when 0's occur at the end of a non-empty subset of an empty table, [#1937](https://github.com/Rdatatable/data.table/issues/1937). Thanks Arun for tracking down. Tests added. For example, subsetting the empty `DT=data.table(a=character())` with `DT[c(1,0)]` should return a 1 row result with one `NA` since 1 is past the end of `nrow(DT)==0`, the same result as `DT[1]`. + +4. Fixed newly reported crash that also occurred in old v1.9.6 when `by=.EACHI`, `nomatch=0`, the first item in `i` has no match AND `j` has a function call that is passed a key column, [#1933](https://github.com/Rdatatable/data.table/issues/1933). Many thanks to Reino Bruner for finding and reporting with a reproducible example. Tests added. + +5. Fixed `fread()` error occurring for a subset of Windows users: `showProgress is not type integer but type 'logical'.`, [#1944](https://github.com/Rdatatable/data.table/issues/1944) and [#1111](https://github.com/Rdatatable/data.table/issues/1111). Our tests cover this usage (it is just default usage), pass on AppVeyor (Windows), win-builder (Windows) and CRAN's Windows so perhaps it only occurs on a specific and different version of Windows to all those. Thanks to @demydd for reporting. Fixed by using strictly `logical` type at R level and `Rboolean` at C level, consistently throughout. + +6. Combining `on=` (new in v1.9.6) with `by=` or `keyby=` gave incorrect results, [#1943](https://github.com/Rdatatable/data.table/issues/1943). Many thanks to Henrik-P for the detailed and reproducible report. Tests added. + +7. New function `rleidv` was ignoring its `cols` argument, [#1942](https://github.com/Rdatatable/data.table/issues/1942). Thanks Josh O'Brien for reporting. Tests added. + +## NOTES + +1. 
It seems OpenMP is not available on CRAN's Mac platform; NOTEs appeared in [CRAN checks](https://cran.r-project.org/web/checks/check_results_data.table.html) for v1.9.8. Moved `Rprintf` from `init.c` to `packageStartupMessage` to avoid the NOTE as requested urgently by Professor Ripley. Also fixed the bad grammar of the message: 'single threaded' now 'single-threaded'. If you have a Mac and run macOS or OS X on it (I run Ubuntu on mine) please contact CRAN maintainers and/or Apple if you'd like CRAN's Mac binary to support OpenMP. Otherwise, please follow [these instructions for OpenMP on Mac](https://github.com/Rdatatable/data.table/wiki/Installation) which people have reported success with. + +2. Just to state explicitly: data.table does not now depend on or require OpenMP. If you don't have it (as on CRAN's Mac it appears but not in general on Mac) then data.table should build, run and pass all tests just fine. + +3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. + +4. The FAQ vignette has been revised given the changes in v1.9.8. In particular, the very first FAQ. -15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). +5. 
With hindsight, the last release v1.9.8 should have been named v1.10.0 to convey it wasn't just a patch release from .6 to .8 owing to the 'potentially breaking changes' items. Thanks to @neomantic for correctly pointing out. The best we can do now is now bump to 1.10.0. -# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md) +# data.table v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) diff --git a/R/IDateTime.R b/R/IDateTime.R index 185952fe72..4e6adf55e3 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -315,8 +315,8 @@ clip_msec = function(secs, action) { # Adapted from Hadley Wickham's routines cited below to ensure # integer results. # http://gist.github.com/10238 -# See also Hadley et al's more advanced and complex lubridate package: -# https://github.com/tidyverse/lubridate +# See also Hadley's more advanced and complex lubridate package: +# http://github.com/hadley/lubridate # lubridate routines do not return integer values. ################################################################### diff --git a/R/data.table.R b/R/data.table.R index 801482147e..473cf6e766 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -882,7 +882,7 @@ replace_dot_alias = function(e) { bynames = allbyvars = NULL # the rest now fall through } else bynames = names(byval) - if (is.atomic(byval) || is.null(byval)) { + if (is.atomic(byval)) { if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) { stopf("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval%s should work. 
This is for efficiency so data.table can detect which columns are needed.", deparse(bysub)) } else { diff --git a/R/devel.R b/R/devel.R index 3aed1017f8..8bd7a1466a 100644 --- a/R/devel.R +++ b/R/devel.R @@ -17,8 +17,9 @@ dcf.repo = function(pkg, repo, field, type) { dcf[dcf[,"Package"]==pkg, field][[1L]] } -update_dev_pkg = function(pkg="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { +update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { # this works for any package, not just data.table + pkg = object # perform package upgrade when new Revision present stopifnot(is.character(pkg), length(pkg)==1L, !is.na(pkg), is.character(repo), length(repo)==1L, !is.na(repo), @@ -27,7 +28,7 @@ update_dev_pkg = function(pkg="data.table", repo="https://Rdatatable.gitlab.io/d # get Revision field from remote repository PACKAGES file una = is.na(ups<-dcf.repo(pkg, repo, field, type)) if (una) - catf("No revision information found in DESCRIPTION file for %s package. Make sure that '%s' is correct field in PACKAGES file in your package repository '%s'. Otherwise package will be re-installed every time, proceeding to installation.\n", + catf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. 
Otherwise package will be re-installed every time, proceeding to installation.\n", pkg, field, contrib.url(repo, type=type)) # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) @@ -43,7 +44,6 @@ update_dev_pkg = function(pkg="data.table", repo="https://Rdatatable.gitlab.io/d unname(read.dcf(system.file("DESCRIPTION", package=pkg, lib.loc=lib, mustWork=TRUE), fields=field)[, field]), utils::packageVersion(pkg, lib.loc=lib))) }) - invisible(upg) } # non-exported utility when using devel version #3272: data.table:::.git() diff --git a/R/frank.R b/R/frank.R index 419f5ea414..ba90a83b93 100644 --- a/R/frank.R +++ b/R/frank.R @@ -12,7 +12,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a .Call(Csetlistelt, xx, 1L, x) xx } - if (is.atomic(x) || is.null(x)) { + if (is.atomic(x)) { if (!missing(cols) && !is.null(cols)) stopf("x is a single vector, non-NULL 'cols' doesn't make sense") cols = 1L diff --git a/R/fread.R b/R/fread.R index 8e9a11b123..f8b025d9c3 100644 --- a/R/fread.R +++ b/R/fread.R @@ -76,13 +76,17 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 # nocov start tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - if (w<=2L && base::getRversion()<"3.2.2") { # https: or ftps: can be read by default by download.file() since 3.2.2 - stopf("URL requires download.file functionalities from R >=3.2.2. 
You can still manually download the file and fread the downloaded file.") + if (w<=2L) { # https: or ftps: + if (!requireNamespace("curl", quietly = TRUE)) + stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + + curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) + } else { + method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 + else getOption("download.file.method", default="auto") # http: or ftp: + download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" } - method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 - else getOption("download.file.method", default="auto") # http: or ftp: - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" - download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) file = tmpFile on.exit(unlink(tmpFile), add=TRUE) # nocov end @@ -131,7 +135,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!allNA(colClasses)) stopf("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. 
See ?fread.") colClasses = NULL } - if (!is.null(colClasses) && is.atomic(colClasses)) { ## future R can use if (is.atomic(.)) + if (!is.null(colClasses) && is.atomic(colClasses)) { if (!is.character(colClasses)) stopf("colClasses is not type list or character vector") if (!length(colClasses)) { colClasses=NULL; diff --git a/R/froll.R b/R/froll.R index df901f0b84..697051e1b8 100644 --- a/R/froll.R +++ b/R/froll.R @@ -1,21 +1,107 @@ -froll = function(fun, x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { - stopifnot(!missing(fun), is.character(fun), length(fun)==1L, !is.na(fun)) - algo = match.arg(algo) +partial2adaptive = function(x, n, align, adaptive) { + if (align=="center") + stopf("'partial' cannot be used together with align='center'") + if (is.list(x) && length(unique(vapply(x, length, 0L)))!=1L) + stopf("'partial' does not support variable length of columns in 'x'") + if (!(is.numeric(n) || (is.list(n) && all(vapply(n, is.numeric, FALSE))))) + stopf("n must be integer vector or list of integer vectors") + len = if (is.list(x)) length(x[[1L]]) else length(x) + verbose = getOption("datatable.verbose") + if (!adaptive) { + n = as.list(n) ## test 6006.032 + if (verbose) + cat("partial2adaptive: froll partial=TRUE trimming 'n' and redirecting to adaptive=TRUE\n") + trimn = function(n, len, align) { + n = min(n, len) + if (align=="right") + c(seq.int(n), rep(n, len-n)) + else + c(rep(n, len-n), rev(seq.int(n))) + } + sapply(n, len, align, FUN=trimn, simplify=FALSE) + } else { + if (!is.list(n)) n = list(n) + if (length(unique(vapply(n, length, 0L)))!=1L) + stopf("adaptive window provided in 'n' must not to have different lengths") + if (length(n[[1L]]) != len) + stopf("length of vectors in 'x' must match to length of adaptive window in 'n'") + if (verbose) + cat("partial2adaptive: froll adaptive=TRUE and partial=TRUE trimming 'n'\n") + triman = function(n, align) { + if (align=="right") + pmin(n, 
seq_along(n)) + else + pmin(n, rev(seq_along(n))) + } + sapply(n, align, FUN=triman, simplify=FALSE) + } +} + +froll = function(fun, x, n, fill=NA, algo, align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, FUN, rho, give.names=FALSE) { align = match.arg(align) - ans = .Call(CfrollfunR, fun, x, n, fill, algo, align, na.rm, hasNA, adaptive) + if (isTRUE(give.names)) + orig = list(n=n, adaptive=adaptive) + if (isTRUE(partial)) { + n = partial2adaptive(x, n, align, adaptive) + adaptive = TRUE + } ## support for partial added in #5441 + leftadaptive = isTRUE(adaptive) && align=="left" + if (leftadaptive) { + verbose = getOption("datatable.verbose") + rev2 = function(x) if (is.list(x)) sapply(x, rev, simplify=FALSE) else rev(x) + if (verbose) + cat("froll: adaptive=TRUE && align='left' pre-processing for align='right'\n") + x = rev2(x) + n = rev2(n) + align = "right" + } ## support for left adaptive added in #5441 + if (missing(FUN)) + ans = .Call(CfrollfunR, fun, x, n, fill, algo, align, na.rm, has.nf, adaptive) + else + ans = .Call(CfrollapplyR, FUN, x, n, fill, align, adaptive, rho) + if (leftadaptive) { + if (verbose) + cat("froll: adaptive=TRUE && align='left' post-processing from align='right'\n") + ans = rev2(ans) + } + if (isTRUE(give.names) && is.list(ans)) { + n = orig$n + adaptive = orig$adaptive + nx = names(x) + nn = names(n) + if (is.null(nx)) nx = paste0("V", if (is.atomic(x)) 1L else seq_along(x)) + if (is.null(nn)) nn = if (adaptive) paste0("N", if (is.atomic(n)) 1L else seq_along(n)) else paste("roll", as.character(n), sep="_") + setattr(ans, "names", paste(rep(nx, each=length(nn)), nn, sep="_")) + } ans } -frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { - froll(fun="mean", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, hasNA=hasNA, adaptive=adaptive) +frollfun = function(fun, x, n, fill=NA, algo=c("fast","exact"), 
align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) { + stopifnot(!missing(fun), is.character(fun), length(fun)==1L, !is.na(fun)) + if (!missing(hasNA)) { + if (!is.na(has.nf)) + stopf("hasNA is deprecated, use has.nf instead") + warning("hasNA is deprecated, use has.nf instead") + has.nf = hasNA + } # remove check on next major release + algo = match.arg(algo) + froll(fun=fun, x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, give.names=give.names) +} + +frollmean = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) { + frollfun(fun="mean", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names) +} +frollsum = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) { + frollfun(fun="sum", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names) } -frollsum = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { - froll(fun="sum", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, hasNA=hasNA, adaptive=adaptive) +frollmax = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) { + frollfun(fun="max", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names) } -frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center")) { + +frollapply = function(x, n, FUN, ..., fill=NA, 
align=c("right","left","center"), adaptive=FALSE, partial=FALSE, give.names=FALSE) { + if (isTRUE(adaptive) && base::getRversion() < "3.4.0") ## support SET_GROWABLE_BIT + stopf("frollapply adaptive=TRUE requires at least R 3.4.0"); # nocov FUN = match.fun(FUN) - align = match.arg(align) rho = new.env() - ans = .Call(CfrollapplyR, FUN, x, n, fill, align, rho) - ans + froll(FUN=FUN, rho=rho, x=x, n=n, fill=fill, align=align, adaptive=adaptive, partial=partial, give.names=give.names) } diff --git a/R/fwrite.R b/R/fwrite.R index e1484b9e3c..c822b05678 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -115,5 +115,3 @@ fwrite = function(x, file="", append=FALSE, quote="auto", invisible() } -haszlib = function() .Call(Cdt_has_zlib) - diff --git a/R/onAttach.R b/R/onAttach.R index 6ff17972b3..9b71a6615c 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -35,10 +35,6 @@ else packageStartupMessagef("This is %s. This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) } - if (.Call(CbeforeR340)) { - # not base::getRversion()<"3.4.0" in case the user upgrades R but does not reinstall data.table; a reasonable mistake since data.table would seem to be the latest version - packageStartupMessagef("**********\nThis data.table installation was compiled for R < 3.4.0 (Apr 2017) and is known to leak memory. Please upgrade R and reinstall data.table to fix the leak. Maintaining and testing code branches to support very old versions increases development time so please do upgrade R. 
We intend to bump data.table's dependency from 8 year old R 3.1.0 (Apr 2014) to 5 year old R 3.4.0 (Apr 2017).\n**********") - } } } diff --git a/R/print.data.table.R b/R/print.data.table.R index 7271ac458f..16950fd110 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -141,8 +141,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), } format.data.table = function (x, ..., justify="none") { - if (is.atomic(x) && !is.null(x)) { ## future R can use if (is.atomic(x)) - + if (is.atomic(x) && !is.null(x)) { stopf("Internal structure doesn't seem to be a list. Possibly corrupt data.table.") } do.call("cbind", lapply(x, format_col, ..., justify=justify)) diff --git a/R/setkey.R b/R/setkey.R index 5f3027a2d7..3bd3f782c4 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -169,7 +169,7 @@ is.sorted = function(x, by=NULL) { ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) { - if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), + if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL } else { diff --git a/R/tables.R b/R/tables.R index e47a1a42e8..5196935eda 100644 --- a/R/tables.R +++ b/R/tables.R @@ -1,62 +1,46 @@ # globals to pass NOTE from R CMD check, see http://stackoverflow.com/questions/9439256 -MB = NCOL = NROW = INDICES = NULL +MB = NCOL = NROW = NULL -type_size = function(DT) { - # for speed and ram efficiency, a lower bound by not descending into character string lengths or list items - # if a more accurate and higher estimate is needed then user can pass object.size or alternative to mb= - # in case number of columns is very large (e.g. 
1e6 columns) then we use a for() to avoid allocation of sapply() - ans = 0L - lookup = c("raw"=1L, "integer"=4L, "double"=8L, "complex"=16L) - for (i in seq_along(DT)) { - col = DT[[i]] - tt = lookup[storage.mode(col)] - if (is.na(tt)) tt = .Machine$sizeof.pointer - tt = tt*nrow(DT) - if (is.factor(col)) tt = tt + length(levels(col))*.Machine$sizeof.pointer - ans = ans + tt - } - ans + ncol(DT)*.Machine$sizeof.pointer # column name pointers -} - -tables = function(mb=type_size, order.col="NAME", width=80, +tables = function(mb=TRUE, order.col="NAME", width=80, env=parent.frame(), silent=FALSE, index=FALSE) { # Prints name, size and colnames of all data.tables in the calling environment by default - mb_name = as.character(substitute(mb)) - if (isTRUE(mb)) { mb=type_size; mb_name="type_size" } - names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .) - obj = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197 - w = which(vapply_1b(obj, is.data.table)) - if (!length(w)) { + # include "hidden" objects (starting with .) via all.names=TRUE, but exclude ... 
specifically, #5197 + all_obj = grep("...", ls(envir=env, all.names=TRUE), invert=TRUE, fixed=TRUE, value=TRUE) + if (order.col=="NAME") all_obj=sort(all_obj) # neither ls() nor objects() had sorted arg in R 3.1.0 + is_DT = vapply_1b(mget(all_obj, envir=env), is.data.table) + if (!any(is_DT)) { if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } - info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0, COLS=list(), KEY=list(), INDICES=list()) - for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables - DT = obj[[w[i]]] - set(info, i, "NROW", nrow(DT)) - set(info, i, "NCOL", ncol(DT)) - if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1024^2)) - if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done - if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt) - if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt) + DT_names = all_obj[is_DT] + info = rbindlist(lapply(DT_names, function(dt_n){ + DT = get(dt_n, envir=env) # doesn't copy + data.table( # data.table excludes any NULL items (MB and INDICES optional) unlike list() + NAME = dt_n, + NROW = nrow(DT), + NCOL = ncol(DT), + MB = if (mb) round(as.numeric(object.size(DT))/1024^2), # object.size() is slow hence optional; TODO revisit + COLS = list(names(DT)), + KEY = list(key(DT)), + INDICES = if (index) list(indices(DT))) + })) + if (order.col != "NAME") { + if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) + info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names } - if (!is.function(mb)) info[,MB:=NULL] - if (!index) info[,INDICES:=NULL] - if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) - info = info[base::order(info[[order.col]])] # base::order to maintain 
locale ordering of table names if (!silent) { - # add commas into NROW, NCOL and MB when displayed on console - # but this added all these numbers as strings to the character cache which causes the character cache to - # grow especially with a lot of tables, or changing tables over time. Stopped for now to avoid a tipping - # point in RSS in #5520 - # pretty_format = function(x, width) format(prettyNum(x, big.mark=","), width=width, justify="right") - # tt = shallow(info) - # tt[ , NROW := pretty_format(NROW, width=4L)] - # tt[ , NCOL := pretty_format(NCOL, width=4L)] - # if (is.function(mb)) tt[ , MB := pretty_format(MB, width=2L)] - print(info, class=FALSE, nrows=Inf) - if (is.function(mb)) catf("Total: %sMB using %s\n", prettyNum(sum(info$MB), big.mark=","), mb_name) + # prettier printing on console + pretty_format = function(x, width) { + format(prettyNum(x, big.mark=","), + width=width, justify="right") + } + tt = copy(info) + tt[ , NROW := pretty_format(NROW, width=4L)] + tt[ , NCOL := pretty_format(NCOL, width=4L)] + if (mb) tt[ , MB := pretty_format(MB, width=2L)] + print(tt, class=FALSE, nrows=Inf) + if (mb) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=",")) } invisible(info) } diff --git a/R/test.data.table.R b/R/test.data.table.R index 6428bcc72b..298fc34c13 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,19 +1,8 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, - memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) { +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) - memtest = as.integer(memtest) - stopifnot(length(memtest)==1L, memtest %in% 0:2) - memtest.id = as.integer(memtest.id) - if (length(memtest.id)) { - if (length(memtest.id)==1L) memtest.id = rep(memtest.id, 2L) # 
for convenience of supplying one id rather than always a range - stopifnot(length(memtest.id)<=2L, # conditions quoted to user when false so "<=2L" even though following conditions rely on ==2L - !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L]) - if (memtest==0L) memtest=1L # using memtest.id implies memtest - } if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # package developer # nocov start - dev = TRUE if ("package:data.table" %chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") subdir = file.path("inst","tests") @@ -21,7 +10,6 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov end } else { # i) R CMD check and ii) user running test.data.table() - dev = FALSE rootdir = getNamespaceInfo("data.table","path") subdir = "tests" env = new.env(parent=parent.env(.GlobalEnv)) # when user runs test.data.table() we don't want their variables in .GlobalEnv affecting tests, #3705 @@ -40,7 +28,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F return(sapply(scripts, function(fn) { err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) cat("\n"); - isTRUE(err) + identical(err, TRUE) })) # nocov end } @@ -124,19 +112,14 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("whichfail", NULL, envir=env) assign("started.at", proc.time(), envir=env) assign("lasttime", proc.time()[3L], envir=env) # used by test() to attribute time inbetween tests to the next test - assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L, RSS=0.0 ), envir=env) # test timings aggregated to integer id - assign("memtest", memtest, envir=env) - assign("memtest.id", memtest.id, envir=env) + assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L ), envir=env) # test 
timings aggregated to integer id + assign("memtest", as.logical(Sys.getenv("TEST_DATA_TABLE_MEMTEST", "FALSE")), envir=env) assign("filename", fn, envir=env) + assign("inittime", as.integer(Sys.time()), envir=env) # keep measures from various test.data.table runs assign("showProgress", showProgress, envir=env) owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190 on.exit(setwd(owd)) - - if (memtest) { - catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest) - if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.") - } err = try(sys.source(fn, envir=env), silent=silent) @@ -173,7 +156,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (inherits(err,"try-error")) { # nocov start if (silent) return(FALSE) - stopf("Failed in %s after test %s before the next test() call in %s", timetaken(env$started.at), env$prevtest, fn) + stopf("Failed after test %s before the next test() call in %s", env$prevtest, fn) # the try() above with silent=FALSE will have already printed the error itself # nocov end } @@ -183,40 +166,50 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (nfail > 0L) { # nocov start stopf( - "%d error(s) out of %d. Search %s for test number(s) %s. Duration: %s.", - nfail, ntest, names(fn), toString(env$whichfail), timetaken(env$started.at) + "%d error(s) out of %d. 
Search %s for test number(s) %s", + nfail, ntest, names(fn), toString(env$whichfail) ) # important to stopf() here, so that 'R CMD check' fails # nocov end } # There aren't any errors, so we can use up 11 lines for the timings table - nTest = RSS = NULL # to avoid 'no visible binding' note - timings = env$timings[nTest>0] - if (!memtest) { - ans = head(timings[if (dev) -1L else TRUE][order(-time)], 10L)[,RSS:=NULL] # exclude id 1 in dev as that includes JIT - if ((x<-sum(timings[["nTest"]])) != ntest) { - warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov - } - catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-ans[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) - print(ans, class=FALSE) - } else { - y = head(order(-diff(timings$RSS)), 10L) - ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here - catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") - print(ans, class=FALSE) - get("dev.new")(width=14, height=7) - get("par")(mfrow=c(1,2)) - get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS))) - get("mtext")(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2) - get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)") - get("mtext")(lastRSS, side=4, at=lastRSS, las=1, font=2) + timings = env$timings + DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT + if ((x<-sum(timings[["nTest"]])) != ntest) { + warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov } + catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) + print(DT, class=FALSE) catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, 
env$prevtest, names(fn), timetaken(env$started.at)) - ans = nfail==0L - attr(ans, "timings") = timings # as attr to not upset callers who expect a TRUE/FALSE result - invisible(ans) + + ## this chunk requires to include new suggested deps: graphics, grDevices + #memtest.plot = function(.inittime) { + # if (!all(requireNamespace(c("graphics","grDevices"), quietly=TRUE))) return(invisible()) + # inittime=PS_rss=GC_used=GC_max_used=NULL + # m = fread("memtest.csv")[inittime==.inittime] + # if (nrow(m)) { + # ps_na = allNA(m[["PS_rss"]]) # OS with no 'ps -o rss R' support + # grDevices::png("memtest.png") + # p = graphics::par(mfrow=c(if (ps_na) 2 else 3, 2)) + # if (!ps_na) { + # m[, graphics::plot(test, PS_rss, pch=18, xlab="test num", ylab="mem MB", main="ps -o rss R")] + # m[, graphics::plot(timestamp, PS_rss, type="l", xlab="timestamp", ylab="mem MB", main="ps -o rss R")] + # } + # m[, graphics::plot(test, GC_used, pch=18, xlab="test num", ylab="mem MB", main="gc used")] + # m[, graphics::plot(timestamp, GC_used, type="l", xlab="timestamp", ylab="mem MB", main="gc used")] + # m[, graphics::plot(test, GC_max_used, pch=18, xlab="test num", ylab="mem MB", main="gc max used")] + # m[, graphics::plot(timestamp, GC_max_used, type="l", xlab="timestamp", ylab="mem MB", main="gc max used")] + # graphics::par(p) + # grDevices::dev.off() + # } else { + # warningf("test.data.table runs with memory testing but did not collect any memory statistics.") + # } + #} + #if (memtest<-get("memtest", envir=env)) memtest.plot(get("inittime", envir=env)) + + invisible(nfail==0L) } # nocov start @@ -240,9 +233,19 @@ compactprint = function(DT, topn=2L) { INT = function(...) 
{ as.integer(c(...)) } # utility used in tests.Rraw +ps_mem = function() { + # nocov start + cmd = sprintf("ps -o rss %s | tail -1", Sys.getpid()) + ans = tryCatch(as.numeric(system(cmd, intern=TRUE, ignore.stderr=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) + stopifnot(length(ans)==1L) # extra check if other OSes would not handle 'tail -1' properly for some reason + # returns RSS memory occupied by current R process in MB rounded to 1 decimal places (as in gc), ps already returns KB + c("PS_rss"=round(ans / 1024, 1L)) + # nocov end +} + gc_mem = function() { # nocov start - # gc reports memory in MB + # gc reported memory in MB m = apply(gc()[, c(2L, 4L, 6L)], 2L, sum) names(m) = c("GC_used", "GC_gc_trigger", "GC_max_used") m @@ -275,22 +278,16 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) - memtest.id = get("memtest.id", parent.frame()) + inittime = get("inittime", parent.frame()) filename = get("filename", parent.frame()) foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) - time = nTest = RSS = NULL # to avoid 'no visible binding' note + time = nTest = NULL # to avoid 'no visible binding' note if (num>0) on.exit( { - took = proc.time()[3L]-lasttime # so that prep time between tests is attributed to the following test - timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] - if (memtest) { - if (memtest==1L) gc() # see #5515 for before/after - inum = as.integer(num) - timings[inum, RSS:=max(rss(),RSS), verbose=FALSE] # TODO prefix inum with .. for clarity when that works - if (length(memtest.id) && memtest.id[1L]<=inum && inum<=memtest.id[2L]) cat(rss(),"\n") # after 'testing id ...' 
output; not using between() as it has verbose output when getOption(datatable.verbose) - if (memtest==2L) gc() - } - assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest + now = proc.time()[3L] + took = now-lasttime # so that prep time between tests is attributed to the following test + assign("lasttime", now, parent.frame(), inherits=TRUE) + timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ] } ) if (showProgress) # \r can't be in gettextf msg @@ -303,7 +300,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # not be flushed to the output upon segfault, depending on OS). } else { # not `test.data.table` but developer running tests manually; i.e. `cc(F); test(...)` - memtest = 0L # nocov + memtest = FALSE # nocov filename = NA_character_ # nocov foreign = FALSE # nocov ; assumes users of 'cc(F); test(...)' has LANGUAGE=en showProgress = FALSE # nocov @@ -333,6 +330,9 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no actual$message <<- c(actual$message, conditionMessage(m)) m } + if (memtest) { + timestamp = as.numeric(Sys.time()) # nocov + } if (is.null(output) && is.null(notOutput)) { x = suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)) # save the overhead of capture.output() since there are a lot of tests, often called in loops @@ -340,6 +340,10 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } else { out = capture.output(print(x <- suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)))) } + if (memtest) { + mem = as.list(c(inittime=inittime, filename=basename(filename), timestamp=timestamp, test=num, ps_mem(), gc_mem())) # nocov + fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov + } fail = FALSE if (.test.data.table && num>0) { if (num
-[![CRAN status](https://badges.cranchecks.info/flavor/release/data.table.svg)](https://cran.r-project.org/web/checks/check_results_data.table.html) +[![CRAN status](https://cranchecks.info/badges/flavor/release/data.table)](https://cran.r-project.org/web/checks/check_results_data.table.html) [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/kayjdh5qtgymhoxr/branch/master?svg=true)](https://ci.appveyor.com/project/Rdatatable/data-table) -[![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://app.codecov.io/github/Rdatatable/data.table?branch=master) +[![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/-/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) [![CRAN usage](https://jangorecki.gitlab.io/rdeps/data.table/CRAN_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) @@ -33,9 +33,12 @@ * fast and feature rich joins: **ordered joins** (e.g. rolling forwards, backwards, nearest and limited staleness), **[overlapping range joins](https://github.com/Rdatatable/data.table/wiki/talks/EARL2014_OverlapRangeJoin_Arun.pdf)** (similar to `IRanges::findOverlaps`), **[non-equi joins](https://github.com/Rdatatable/data.table/wiki/talks/ArunSrinivasanUseR2016.pdf)** (i.e. 
joins using operators `>, >=, <, <=`), **aggregate on join** (`by=.EACHI`), **update on join** * fast add/update/delete columns **by reference** by group using no copies at all * fast and feature rich **reshaping** data: **[`?dcast`](https://rdatatable.gitlab.io/data.table/reference/dcast.data.table.html)** (_pivot/wider/spread_) and **[`?melt`](https://rdatatable.gitlab.io/data.table/reference/melt.data.table.html)** (_unpivot/longer/gather_) +* fast and feature rich various kinds of operations: [**rolling statistics**](https://rdatatable.gitlab.io/data.table/reference/froll.html) (_rolling mean, rolling max, rolling any R function_), [**grouping sets**](https://rdatatable.gitlab.io/data.table/reference/groupingsets.html) (_cube, rolllup_), [**set operations**](https://rdatatable.gitlab.io/data.table/reference/setops.html) (_union, intersect, setdiff_) +* faster implementations of many functions: `fsort`, `fifelse`, `fcase`, `fcoalesce`, `frank`, `between`, `nafill`, `rleid`, and more * **any R function from any R package** can be used in queries not just the subset of functions made available by a database backend, also columns of type `list` are supported +* [**meta-programming interface**](https://rdatatable.gitlab.io/data.table/articles/datatable-programming.html) for convenience of R developers importing data.table in their packages * has **[no dependencies](https://en.wikipedia.org/wiki/Dependency_hell)** at all other than base R itself, for simpler production/maintenance -* the R dependency is **as old as possible for as long as possible**, dated April 2014, and we continuously test against that version; e.g. 
v1.11.0 released on 5 May 2018 bumped the dependency up from 5 year old R 3.0.0 to 4 year old R 3.1.0 +* the R dependency is **as old as possible for as long as possible**, dated April 2014, and we continuously test against that version ## Installation @@ -81,7 +84,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species] ## Community -`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://medium.datadriveninvestor.com/most-starred-and-forked-github-repos-for-r-in-data-science-fb87a54d2a6a) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). +`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://www.r-pkg.org/starred) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). 
### Stay up-to-date diff --git a/_pkgdown.yml b/_pkgdown.yml index 66488b9281..4b02b39491 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,8 @@ -url: https://rdatatable.gitlab.io/data.table +url: https://Rdatatable.gitlab.io/data.table + +template: + params: + ganalytics: UA-129166154-2 development: version_tooltip: "Development version" @@ -7,8 +11,6 @@ home: links: - text: CRAN-like website href: web/packages/data.table/index.html - - text: CRAN-like checks - href: web/checks/check_results_data.table.html navbar: structure: diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 04c5c490b4..bf0bf77e9f 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -168,335 +168,3 @@ test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40")) # Add scaled-up non-ASCII forder test 1896 -# Before #5501 do.call(data.table,) fully deparsed large unnamed args, #5492. -DF = data.frame(a=runif(1e6), b=runif(1e6)) -t1 = system.time(DT1 <- data.table(DF)) # 0.02s before and after -t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s after -test(, identical(DT1, DT2)) -test(, t2["elapsed"]/t1["elapsed"]<2) - -########################################################### -# largest tests by ram usage moved out of tests.Rraw, #5517 -########################################################### - -# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) -# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which -# this tests. But it's well under 10 seconds. -DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) -test(301, nrow(DT[,sum(B),by=C])==100010) -DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) -test(301.1, nrow(DT[,sum(B),by=C])==100010) - -# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. 
-options(datatable.optimize=0L) -set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(637.2, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(637.3, key(DT[,a:=99L,by=a]), NULL) -options(datatable.optimize=2L) -set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(638.2, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(638.3, key(DT[,a:=99L,by=a]), NULL) - -# Test X[Y] slowdown, #2216 -# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes -# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw. -X = CJ(a=seq_len(1e3),b=seq_len(1e3)) -Y = copy(X) -X[4,b:=3L] # create a dup group, to force allLen1=FALSE -setkey(X) -test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case -test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case - -# test uniqlengths -set.seed(45) -x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) -ox <- forderv(x) -o1 <- uniqlist(list(x), ox) -test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -o1 <- uniqlist(list(x)) -test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -rm(list=c("x","ox","o1")) -gc() - -# Fix for (usually small) memory leak when grouping, #2648. -# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). 
-DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) -before = gc()["Vcells",2] -for (i in 1:50) DT[, sum(B), by=A] -after = gc()["Vcells",2] -test(1157, after < before+3) # +3 = 3MB -# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. - -# Similar for when dogroups writes less rows than allocated, #2648. -DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) -before = gc()["Vcells",2] -for (i in 1:50) DT[ , unlist(.SD), by = 'k'] -after = gc()["Vcells",2] -test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 - -# fix DT[TRUE, :=] using too much working memory for i, #1249 -if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled - f = tempfile() - N = 1000000 # or any large number of rows - DT = data.table(A=1:N, B=rnorm(N)) - DT[TRUE, B := B * 2] # stabilize with initial dummy update - Rprofmem(f) - DT[TRUE, B := B * 2] # or some in-place update - Rprofmem(NULL) - test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only - unlink(f) -} - -if (FALSE) { - # Full range takes too long for CRAN. 
- dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") - dtsCh = as.character(dts) # 36s - dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 - test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) -} else { - # test on CRAN a reduced but important range - dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") - dtsCh = as.character(dts) - test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) -} -DT = data.table(A=dts, B=as.IDate(dts)) -test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) -test(1739.4, typeof(dts), "double") -f = tempfile() -g = tempfile() # Full range -fwrite(DT,f) # 0.092s -write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s -test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) -test(1739.6, readLines(f), readLines(g)) -unlink(f) -unlink(g) -rm(list=c("dtsCh","dts")) -gc() - -# catch malformed factor in rbindlist, #3315 -set.seed(32940) -NN=7e5; KK=4e4; TT=25 -DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) -test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites - -# print.data.table row id in non-scientific notation, #1167 -DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) -test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) -rm(DT) - -# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because -# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. -# Would need a very complicated construction of embedded new lines in quoted fields, to test that. 
-# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. -DT = as.data.table(CO2) -f = tempfile() -for (i in 0:1000) { - start = nrow(CO2)*i - fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) - if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) -} -test(1835, fread(f, verbose=TRUE), - output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", - warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") -unlink(f) - -# test no memory leak, #2191 and #2284 -# These take a few seconds each, and it's important to run these on CRAN to check no leak -gc(); before = gc()["Vcells","(Mb)"] -for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB -gc(); after = gc()["Vcells","(Mb)"] -test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin -gc(); before = gc()["Vcells","(Mb)"] -DF = data.frame(x=1:20, y=runif(20)) -for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } -gc(); after = gc()["Vcells","(Mb)"] -test(862, after < before+0.5) -gc(); before = gc()["Vcells","(Mb)"] -DT = data.table(x=1:20, y=runif(20)) -for (i in 1:2000) { x <- DT[1:5,]; rm(x) } -gc(); after = gc()["Vcells","(Mb)"] -test(863, after < before+0.5) - -# fread should use multiple threads on single column input. -# tests 2 threads; the very reasonable limit on CRAN -# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) -if (getDTthreads() == 1L) { - cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. 
under UBSAN and ASAN)\n") -} else { - N = if (TRUE) 2e6 else 1e9 # offline speed check - fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) - test(1760.1, file.info(f)$size > 4*1024*1024) - test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") - unlink(f) -} - -# segfault of unprotected var caught with the help of address sanitizer; was test 1509 -# in #5517 I figured this test shouldn't be reduced in size due to its nature -set.seed(1) -val = sample(c(1:5, NA), 1e4L, TRUE) -dt <- setDT(replicate(100L, val, simplify=FALSE)) -## to ensure there's no segfault... -ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) -test(1035.21, ans, ans) - -# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 -# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. -# 2 threads are sufficient to fail before the fix. -N = 20 -DF = data.frame(a=rnorm(N), - b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), - c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) -DT = setDT(DF) # setDT required since data.table() already expanded altrep's -before = sum(gc()[, 2]) -fff = function(aref) { - ff = lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) - return(rbindlist(ff)) -} -for(i in 1:100) { - f = fff("a") - rm("f") -} -gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` - # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. -after = sum(gc()[, 2]) -test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). -# -before = sum(gc()[, 2]) -fff = function(aref) { - DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. 
- lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) -} -for(i in 1:100) { - fff("a") -} -gc() -after = sum(gc()[, 2]) -test(1912.2, after < before + 10) - -DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) -fwrite(DT, f<-tempfile()) -test(1815, fread(f, nrows=5), DT[1:5]) #2243: nrows small vs large nrow(DT) - -# Better jump sync and run-on in PR#2627 -# -# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627 -# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly -x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) -x[51094]="" -cat(x, file=f<-tempfile(), sep="\n") -test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], - data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), - output="jumps=[0..2)") # ensure jump 1 happened -# -# out-of-sample short lines in the first jump, not near the jump point -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), - warning="Stopped early on line 5021.*<>") -test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,5020L,NA,NA,5042L,102184L)), - output="jumps=[0..2)") -# -# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 -# confirmed fails in master with that error before PR#2627 -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", 
V4="PQRS", V5=1:51093), - warning="Stopped early on line 51094.*<>", - output="jumps=[0..2)") -test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,51093L,NA,NA,51151L,102184L)), - output="jumps=[0..2)") -# -# jump inside a quoted field containing many new lines, to simulate a dirty jump -# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. -# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" -cat(x, file=f, sep="\n") -test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n - data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), - V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), - output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") -# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's -# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) -unlink(f) - -# chmatchdup test from benchmark at the bottom of chmatch.c -set.seed(45L) -x = sample(letters, 1e5, TRUE) -y = sample(letters, 1e6, TRUE) -test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), 
INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) -rm(list=c("x","y")) - -# Add nq tests 1641-1652 here with larger sizes and calls that have been turned off in the past as took too long, and -# restore the exact parameters w.r.t. Jan's comment: https://github.com/Rdatatable/data.table/pull/5520#discussion_r1020180583 - -# issue 2351 -set.seed(1) -DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) -fwrite(DT, file=f<-tempfile(), eol="\r") -test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) -cat("id888,42", file=f, append=TRUE) # without final \r after last line -test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) -unlink(f) - -# segfault when rbindlist is asked to create a DT with more than 2bn rows -DT = data.table(1:1e6) -L = vector("list", 2148) -for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test -test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") -rm(L, DT) -gc() - -# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 -# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too -set.seed(1) -DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary -setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow -test(2201.1, nrow(DT[, .N, by=grp]), 255L) -test(2201.2, nrow(setkey(DT, grp)), 65536L) -set.seed(1) -DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun -test(2201.3, nrow(DT[, .N, by=grp]), 65536L) -test(2201.4, nrow(setkey(DT, grp)), 65536L) -setDTthreads() # restore default throttle - -# print of DT with many columns reordered them, #3306. 
-DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print -out = capture.output(print(DT)) -tt = out[grep("V",out)] -tt = unlist(strsplit(gsub(" ","",tt), "V")) -test(1982.1, tt[1L], "") -tt = as.integer(tt[tt!=""]) -test(1982.2, tt, seq_along(tt)) - -# fread leak, #3292 -dummy = rep("1\t2\t3\t4\t5", 10000000) -writeLines(dummy, "out.tsv") -start = gc()["Vcells",2] -for (i in 1:10) data.table::fread("out.tsv") -end = gc()["Vcells",2] -test(, end/start < 1.05) - - diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index f6a4f96a80..c5c1d7b151 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -7,6 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { require(data.table) test = data.table:::test froll = data.table:::froll + frollfun = data.table:::frollfun } exact_NaN = isTRUE(capabilities()["long.double"]) && identical(as.integer(.Machine$longdouble.digits), 64L) @@ -308,36 +309,36 @@ test(6000.0671, frollmean(c(1:2,NA,4:10), 4), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 4, hasna 0, narm 0", - "frollmeanFast: NA.*are present in input, skip non-NA attempt and run with extra care for NAs", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 4, hasnf 0, narm 0", + "frollmeanFast: non-finite values are present in input, skip non-finite inaware attempt and run with extra care for NFs straighaway", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*" )) -test(6000.0672, frollmean(c(1:2,NA,4:10), 4, hasNA=FALSE), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), output=c( +test(6000.0672, frollmean(c(1:2,NA,4:10), 4, has.nf=FALSE), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 
1:", - "frollmeanFast: running for input length 10, window 4, hasna -1, narm 0", - "frollmeanFast: NA.*are present in input, skip non-NA attempt and run with extra care for NAs", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 4, hasnf -1, narm 0", + "frollmeanFast: non-finite values are present in input, skip non-finite inaware attempt and run with extra care for NFs straighaway", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*" -), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.0673, frollmean(c(1:2,NA,4:10), 2, hasNA=FALSE), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), output=c( +), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.0673, frollmean(c(1:2,NA,4:10), 2, has.nf=FALSE), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 2, hasna -1, narm 0", - "frollmeanFast: NA.*are present in input, re-running with extra care for NAs", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 2, hasnf -1, narm 0", + "frollmeanFast: non-finite values are present in input, re-running with extra care for NFs", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*" -), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") +), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") test(6000.0674, frollmean(c(1:2,NA,4:10), 4, align="center"), c(rep(NA_real_, 4), 5.5, 6.5, 7.5, 8.5, NA, NA), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", - "frollmeanFast: running for input length 
10, window 4, hasna 0, narm 0", - "frollmeanFast: NA.*are present in input, skip non-NA attempt and run with extra care for NAs", - "frollmean: align 0, shift answer by -2", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 4, hasnf 0, narm 0", + "frollmeanFast: non-finite values are present in input, skip non-finite inaware attempt and run with extra care for NFs straighaway", + "frollfun: align 0, shift answer by -2", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*" )) options(datatable.verbose=FALSE) @@ -383,7 +384,7 @@ test(6000.093, frollmean(list(1:3, 4:6), 4), list(c(NA_real_, NA_real_, NA_real_ test(6000.0931, frollmean(list(1:3, 4:6), 4, align="center"), list(c(NA_real_, NA_real_, NA_real_), c(NA_real_, NA_real_, NA_real_))) test(6000.0932, frollmean(list(1:3, 4:6), 4, align="left"), list(c(NA_real_, NA_real_, NA_real_), c(NA_real_, NA_real_, NA_real_))) options(datatable.verbose=TRUE) -test(6000.0933, frollmean(list(1:3, 4:6), 4), list(c(NA_real_, NA_real_, NA_real_), c(NA_real_, NA_real_, NA_real_)), output="frollmean: window width longer than input vector, returning all NA vector") +test(6000.0933, frollmean(list(1:3, 4:6), 4), list(c(NA_real_, NA_real_, NA_real_), c(NA_real_, NA_real_, NA_real_)), output="frollfun: window width longer than input vector, returning all NA vector") options(datatable.verbose=FALSE) #### n==length(x) test(6000.094, frollmean(list(1:3, 4:6), 3), list(c(NA_real_, NA_real_, 2), c(NA_real_, NA_real_, 5))) @@ -426,19 +427,19 @@ test(6000.119, frollmean(1:5, list(1:5)), error="n must be integer, list is acce test(6000.1192, frollmean(1:5, 2, adaptive=NA), error="adaptive must be TRUE or FALSE") #### na.rm=NA test(6000.1193, frollmean(1:5, 2, na.rm=NA), error="na.rm must be TRUE or FALSE") -#### hasNA=1 -test(6000.1194, frollmean(1:5, 2, hasNA=1), error="hasNA must be TRUE, FALSE or NA") -#### hasNA=FALSE na.rm=TRUE -test(6000.1195, frollmean(1:5, 2, 
na.rm=TRUE, hasNA=FALSE), error="using hasNA FALSE and na.rm TRUE does not make sense, if you know there are NA values use hasNA TRUE, otherwise leave it as default NA") +#### has.nf=1 +test(6000.1194, frollmean(1:5, 2, has.nf=1), error="has.nf must be TRUE, FALSE or NA") +#### has.nf=FALSE na.rm=TRUE +test(6000.1195, frollmean(1:5, 2, na.rm=TRUE, has.nf=FALSE), error="using has.nf FALSE and na.rm TRUE does not make sense, if you know there are non-finite values then use has.nf TRUE, otherwise leave it as default NA") #### exact na.rm=TRUE adaptive=TRUE verbose=TRUE options(datatable.verbose=TRUE) test(6000.1196, frollmean(c(1:5,NA), 1:6, algo="exact", na.rm=TRUE, adaptive=TRUE), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*, not entering parallel execution here because algo='exact' will compute results in parallel", "frollfunR: 1:", - "fadaptiverollmeanExact: running in parallel for input length 6, hasna 0, narm 1", - "fadaptiverollmeanExact: NA.*are present in input, re-running with extra care for NAs", - "fadaptiverollmean: processing algo 1 took.*", + "frolladaptivemeanExact: running in parallel for input length 6, hasnf 0, narm 1", + "frolladaptivemeanExact: non-finite values are present in input, re-running with extra care for NFs", + "frolladaptivefun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*" )) #### exact na.rm=TRUE verbose=TRUE @@ -446,9 +447,9 @@ test(6000.1197, frollmean(c(1:5,NA), 2, algo="exact", na.rm=TRUE), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*, not entering parallel execution here because algo='exact' will compute results in parallel", "frollfunR: 1:", - "frollmeanExact: running in parallel for input length 6, window 2, hasna 0, narm 1", - "frollmeanExact: NA.*are present in input, re-running with extra care for NAs", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 6, window 2, 
hasnf 0, narm 1", + "frollmeanExact: non-finite values are present in input, re-running with extra care for NFs", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*" )) options(datatable.verbose=FALSE) @@ -468,25 +469,31 @@ ma = function(x, n, na.rm=FALSE, nf.rm=FALSE) { n = 4 x = 1:16 x[5] = NaN -test(6000.120, frollmean(x, n), ma(x, n, nf.rm=TRUE)) +test(6000.120, frollmean(x, n), ma(x, n)) test(6000.121, frollmean(x, n, algo="exact"), ma(x, n)) x[6] = NA -test(6000.122, frollmean(x, n), ma(x, n, nf.rm=TRUE)) +test(6000.122, frollmean(x, n), ma(x, n)) test(6000.123, frollmean(x, n, algo="exact"), ma(x, n)) # use do not use identical as NaN-NA behaviour is platform/compiler specific #3353 #### test inconsistency of NaN-NA order is consistent to https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17441 x[5] = NA x[6] = NaN -test(6000.124, frollmean(x, n), ma(x, n, nf.rm=TRUE)) +test(6000.124, frollmean(x, n), ma(x, n)) test(6000.125, frollmean(x, n, algo="exact"), ma(x, n)) x[5] = Inf -test(6000.126, frollmean(x, n), ma(x, n, nf.rm=TRUE)) +test(6000.126, frollmean(x, n), ma(x, n)) test(6000.127, frollmean(x, n, algo="exact"), ma(x, n)) x[6] = -Inf -test(6000.128, frollmean(x, n), ma(x, n, nf.rm=TRUE)) +test(6000.128, frollmean(x, n), ma(x, n)) test(6000.129, frollmean(x, n, algo="exact"), ma(x, n)) x[5:7] = c(NA, Inf, -Inf) -test(6000.130, frollmean(x, n), ma(x, n, nf.rm=TRUE)) +test(6000.130, frollmean(x, n), ma(x, n)) test(6000.131, frollmean(x, n, algo="exact"), ma(x, n)) +x = c(Inf,-Inf,-Inf,Inf,Inf) +n = 2 +test(6000.1311, frollmean(x, n), ma(x, n)) +test(6000.1312, frollmean(x, n, algo="exact"), ma(x, n)) +test(6000.1313, frollsum(x, n), c(NA,NaN,-Inf,NA,Inf)) +test(6000.1314, frollsum(x, n, algo="exact"), c(NA,NaN,-Inf,NA,Inf)) #### adaptive window ama = function(x, n, na.rm=FALSE, fill=NA, nf.rm=FALSE) { @@ -563,9 +570,16 @@ if (FALSE) { #### adaptive limitations test(6000.145, frollmean(1:2, 1:2, adaptive=TRUE, align="right"), 
c(1, 1.5)) -test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align argument different than 'right' is not implemented") -test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented") -test(6000.148, frollmean(list(1:2, 1:3), list(1:2), adaptive=TRUE), error="adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame. If you want to call rolling function on list having variable length of elements call it for each field separately") +test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align 'center' is not implemented") +test(6000.147, frollmean(list(1:2, 1:3), list(1:2), adaptive=TRUE), error="adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame. If you want to call rolling function on list having variable length of elements call it for each field separately") + +#### adaptive align - added in #5441 +options(datatable.verbose=TRUE) +test(6000.148, frollsum(c(1,3,4,2,0), c(3,2,2,3,2), adaptive=TRUE, align="left"), c(8,7,6,NA,NA), output=c("processing from align='right'")) +options(datatable.verbose=FALSE) +test(6000.1481, frollsum(c(1,3,4,2,0), list(c(3,2,2,3,2), c(3,3,3,3,3)), adaptive=TRUE, align="left"), list(c(8,7,6,NA,NA), c(8,9,6,NA,NA))) +test(6000.1482, frollsum(list(c(1,3,4,2,0), c(3,1,4,2,0)), c(3,2,2,3,2), adaptive=TRUE, align="left"), list(c(8,7,6,NA,NA), c(8,5,6,NA,NA))) +test(6000.1483, frollsum(list(c(1,3,4,2,0), c(3,1,4,2,0)), list(c(3,2,2,3,2), c(3,3,3,3,3)), adaptive=TRUE, align="left"), list(c(8,7,6,NA,NA),c(8,9,6,NA,NA),c(8,5,6,NA,NA),c(8,7,6,NA,NA))) #### adaptive exact fastama = function(x, n, na.rm, fill=NA) { @@ -635,24 +649,24 @@ test(6000.158, frollmean(1:10, list(1:5), adaptive=TRUE), error="length of integ n = c(4,1,4,5,5,4,6,5,4,4,2,3,4,3,2,4) x = 
1:16 x[5] = NaN -test(6000.159, frollmean(x, n, adaptive=TRUE), ama(x, n, nf.rm=TRUE)) +test(6000.159, frollmean(x, n, adaptive=TRUE), ama(x, n)) test(6000.160, frollmean(x, n, algo="exact", adaptive=TRUE), ama(x, n)) x[6] = NA -test(6000.161, frollmean(x, n, adaptive=TRUE), ama(x, n, nf.rm=TRUE)) +test(6000.161, frollmean(x, n, adaptive=TRUE), ama(x, n)) test(6000.162, frollmean(x, n, algo="exact", adaptive=TRUE), ama(x, n)) # use do not use identical as NaN-NA behaviour is platform/compiler specific #3353 #### test inconsistency of NaN-NA order is consistent to https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17441 x[5] = NA x[6] = NaN -test(6000.163, frollmean(x, n, adaptive=TRUE), ama(x, n, nf.rm=TRUE)) +test(6000.163, frollmean(x, n, adaptive=TRUE), ama(x, n)) test(6000.164, frollmean(x, n, algo="exact", adaptive=TRUE), ama(x, n)) x[5] = Inf -test(6000.165, frollmean(x, n, adaptive=TRUE), ama(x, n, nf.rm=TRUE)) +test(6000.165, frollmean(x, n, adaptive=TRUE), ama(x, n)) test(6000.166, frollmean(x, n, algo="exact", adaptive=TRUE), ama(x, n)) x[6] = -Inf -test(6000.167, frollmean(x, n, adaptive=TRUE), ama(x, n, nf.rm=TRUE)) +test(6000.167, frollmean(x, n, adaptive=TRUE), ama(x, n)) test(6000.168, frollmean(x, n, algo="exact", adaptive=TRUE), ama(x, n)) x[5:7] = c(NA, Inf, -Inf) -test(6000.169, frollmean(x, n, adaptive=TRUE), ama(x, n, nf.rm=TRUE)) +test(6000.169, frollmean(x, n, adaptive=TRUE), ama(x, n)) test(6000.170, frollmean(x, n, algo="exact", adaptive=TRUE), ama(x, n)) ## test verbose messages @@ -663,82 +677,82 @@ test(6000.171, frollmean(x, n), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.172, frollmean(list(x, 
x+1), n), output=c( "frollfunR: allocating memory for results 2x1", "frollfunR: 2 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: 2:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.173, frollmean(x, c(n, n+1)), output=c( "frollfunR: allocating memory for results 1x2", "frollfunR: 1 column.*2 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: 2:", - "frollmeanFast: running for input length 10, window 4, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 4, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.174, frollmean(list(x, x+1), c(n, n+1)), output=c( "frollfunR: allocating memory for results 2x2", "frollfunR: 2 column.*2 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: 2:", - "frollmeanFast: running for input length 10, window 4, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 4, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: 3:", - 
"frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: 4:", - "frollmeanFast: running for input length 10, window 4, hasna 0, narm 0", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 4, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.175, frollmean(x, n, algo="exact"), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanExact: running in parallel for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*")) test(6000.176, frollmean(x, n, align="center"), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: align 0, shift answer by -1", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: align 0, shift answer by -1", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.177, frollmean(x, n, align="left"), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmean: align -1, shift answer by -2", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollfun: align -1, shift answer by -2", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: 
processing.*took.*")) nn = c(1:4,2:3,1:4) test(6000.178, frollmean(x, nn, adaptive=TRUE), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "fadaptiverollmeanFast: running for input length 10, hasna 0, narm 0", - "fadaptiverollmean: processing algo 0 took.*", + "frolladaptivemeanFast: running for input length 10, hasnf 0, narm 0", + "frolladaptivefun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.179, frollmean(x, nn, algo="exact", adaptive=TRUE), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "fadaptiverollmeanExact: running in parallel for input length 10, hasna 0, narm 0", - "fadaptiverollmean: processing algo 1 took.*", + "frolladaptivemeanExact: running in parallel for input length 10, hasnf 0, narm 0", + "frolladaptivefun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*")) x[8] = NA @@ -746,33 +760,33 @@ test(6000.180, frollmean(x, n), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanFast: running for input length 10, window 3, hasna 0, narm 0", - "frollmeanFast: NA.*are present in input, re-running with extra care for NAs", - "frollmean: processing algo 0 took.*", + "frollmeanFast: running for input length 10, window 3, hasnf 0, narm 0", + "frollmeanFast: non-finite values are present in input, re-running with extra care for NFs", + "frollfun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.181, frollmean(x, n, algo="exact"), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanExact: running in parallel for input length 10, window 3, hasna 0, narm 0", - "frollmeanExact: NA.*are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run", - "frollmean: processing algo 1 
took.*", + "frollmeanExact: running in parallel for input length 10, window 3, hasnf 0, narm 0", + "frollmeanExact: non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*")) test(6000.182, frollmean(x, nn, adaptive=TRUE), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "fadaptiverollmeanFast: running for input length 10, hasna 0, narm 0", - "fadaptiverollmeanFast: NA.*are present in input, re-running with extra care for NAs", - "fadaptiverollmean: processing algo 0 took.*", + "frolladaptivemeanFast: running for input length 10, hasnf 0, narm 0", + "frolladaptivemeanFast: non-finite values are present in input, re-running with extra care for NFs", + "frolladaptivefun: processing fun 0 algo 0 took.*", "frollfunR: processing.*took.*")) test(6000.183, frollmean(x, nn, algo="exact", adaptive=TRUE), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "fadaptiverollmeanExact: running in parallel for input length 10, hasna 0, narm 0", - "fadaptiverollmeanExact: NA.*are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run", - "fadaptiverollmean: processing algo 1 took.*", + "frolladaptivemeanExact: running in parallel for input length 10, hasnf 0, narm 0", + "frolladaptivemeanExact: non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run", + "frolladaptivefun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*")) d = as.data.table(list(1:10/2, 10:1/4)) @@ -780,38 +794,40 @@ test(6000.184, frollmean(d[,1], 3, algo="exact"), output=c( "frollfunR: allocating memory for results 1x1", "frollfunR: 1 column.*1 window.*", "frollfunR: 1:", - "frollmeanExact: running in parallel for input length 10, window 
3, hasna 0, narm 0", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*" )) test(6000.185, frollmean(d, 3:4, algo="exact"), output=c( "frollfunR: allocating memory for results 2x2", "frollfunR: 2 column.*2 window.*", "frollfunR: 1:", - "frollmeanExact: running in parallel for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: 2:", - "frollmeanExact: running in parallel for input length 10, window 4, hasna 0, narm 0", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 10, window 4, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: 3:", - "frollmeanExact: running in parallel for input length 10, window 3, hasna 0, narm 0", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 10, window 3, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: 4:", - "frollmeanExact: running in parallel for input length 10, window 4, hasna 0, narm 0", - "frollmean: processing algo 1 took.*", + "frollmeanExact: running in parallel for input length 10, window 4, hasnf 0, narm 0", + "frollfun: processing fun 0 algo 1 took.*", "frollfunR: processing.*took.*" )) options(datatable.verbose=FALSE) ## test warnings -test(6000.186, frollmean(c(1:2,NA,4:10), 4, hasNA=FALSE), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.187, frollmean(c(1:2,NA,4:10), 2, hasNA=FALSE), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") 
-test(6000.188, frollmean(c(1:2,NA,4:10), 4, hasNA=FALSE, algo="exact"), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.189, frollmean(c(1:2,NA,4:10), 2, hasNA=FALSE, algo="exact"), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.190, frollmean(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, hasNA=FALSE), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="*hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.191, frollmean(c(1:2,NA,4:10), rep(2L,10), adaptive=TRUE, hasNA=FALSE), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="*hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.192, frollmean(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, hasNA=FALSE, algo="exact"), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="*hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.193, frollmean(c(1:2,NA,4:10), rep(2L,10), adaptive=TRUE, hasNA=FALSE, algo="exact"), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="*hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") +test(6000.186, frollmean(c(1:2,NA,4:10), 4, has.nf=FALSE), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.187, frollmean(c(1:2,NA,4:10), 2, has.nf=FALSE), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.188, frollmean(c(1:2,NA,4:10), 4, has.nf=FALSE, algo="exact"), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="has.nf=FALSE used but non-finite values 
are present in input, use default has.nf=NA to avoid this warning") +test(6000.189, frollmean(c(1:2,NA,4:10), 2, has.nf=FALSE, algo="exact"), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.190, frollmean(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, has.nf=FALSE), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.191, frollmean(c(1:2,NA,4:10), rep(2L,10), adaptive=TRUE, has.nf=FALSE), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.192, frollmean(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, has.nf=FALSE, algo="exact"), c(rep(NA_real_, 6), 5.5, 6.5, 7.5, 8.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.193, frollmean(c(1:2,NA,4:10), rep(2L,10), adaptive=TRUE, has.nf=FALSE, algo="exact"), c(NA, 1.5, NA, NA, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.199, frollmean(1:2, 1, hasNA=TRUE), c(1,2), warning="hasNA is deprecated, use has.nf instead") +test(6000.1991, frollmean(1:2, 1, has.nf=FALSE, hasNA=TRUE), error="hasNA is deprecated, use has.nf instead") ## frollsum x = 1:6/2 @@ -826,25 +842,236 @@ test(6000.202, ans1, expected) options(datatable.verbose=TRUE) test(6000.211, frollsum(1:5, 6), rep(NA_real_, 5L), output="window width longer than input vector") options(datatable.verbose=FALSE) -test(6000.212, frollsum(c(1:2,NA,4:10), 4, hasNA=FALSE), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.213, 
frollsum(c(1:2,NA,4:10), 2, hasNA=FALSE), c(NA, 3, NA, NA, 9, 11, 13, 15, 17, 19), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.214, frollsum(c(1:2,NA,4:10), 4, hasNA=FALSE, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") +test(6000.212, frollsum(c(1:2,NA,4:10), 4, has.nf=FALSE), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.213, frollsum(c(1:2,NA,4:10), 2, has.nf=FALSE), c(NA, 3, NA, NA, 9, 11, 13, 15, 17, 19), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.214, frollsum(c(1:2,NA,4:10), 4, has.nf=FALSE, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") options(datatable.verbose=TRUE) -test(6000.215, frollsum(c(1:2,NA,4:10), 4, algo="exact", na.rm=TRUE), c(rep(NA_real_, 3L), 7, 11, 15, 22, 26, 30, 34), output="re-running with extra care for NAs") -test(6000.216, frollsum(c(1:2,NA,4:10), 4, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), output="NAs were handled already, no need to re-run") +test(6000.215, frollsum(c(1:2,NA,4:10), 4, algo="exact", na.rm=TRUE), c(rep(NA_real_, 3L), 7, 11, 15, 22, 26, 30, 34), output="non-finite values are present in input, re-running with extra care for NFs") +test(6000.216, frollsum(c(1:2,NA,4:10), 4, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), output="non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run") options(datatable.verbose=FALSE) -test(6000.217, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, hasNA=FALSE), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="*hasNA=FALSE 
used but NA.*are present in input, use default hasNA=NA to avoid this warning") -test(6000.218, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, hasNA=FALSE, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="hasNA=FALSE used but NA.*are present in input, use default hasNA=NA to avoid this warning") +test(6000.217, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, has.nf=FALSE), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") +test(6000.218, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, has.nf=FALSE, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), warning="has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning") options(datatable.verbose=TRUE) -test(6000.219, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, algo="exact", na.rm=TRUE), c(rep(NA_real_, 3L), 7, 11, 15, 22, 26, 30, 34), output="re-running with extra care for NAs") -test(6000.220, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), output="NAs were handled already, no need to re-run") +test(6000.219, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, algo="exact", na.rm=TRUE), c(rep(NA_real_, 3L), 7, 11, 15, 22, 26, 30, 34), output="non-finite values are present in input, re-running with extra care for NFs") +test(6000.220, frollsum(c(1:2,NA,4:10), rep(4L,10), adaptive=TRUE, algo="exact"), c(rep(NA_real_, 6), 22, 26, 30, 34), output="non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run") test(6000.221, frollsum(1:3, 2), c(NA, 3, 5), output="frollsumFast: running for input length") -test(6000.222, frollsum(1:3, 2, align="left"), c(3, 5, NA), output="frollsum: align") -test(6000.223, frollsum(c(1,2,NA), 2), c(NA, 3, NA), output="re-running with extra care for NAs") -test(6000.224, frollsum(c(NA,2,3), 2), c(NA, NA, 
5), output="skip non-NA attempt and run with extra care for NAs") -test(6000.225, frollsum(1:3, c(2,2,2), adaptive=TRUE), c(NA, 3, 5), output="fadaptiverollsumFast: running for input length") -test(6000.226, frollsum(c(NA,2,3), c(2,2,2), adaptive=TRUE), c(NA, NA, 5), output="re-running with extra care for NAs") +test(6000.222, frollsum(1:3, 2, align="left"), c(3, 5, NA), output="frollfun: align") +test(6000.223, frollsum(c(1,2,NA), 2), c(NA, 3, NA), output="non-finite values are present in input, re-running with extra care for NFs") +test(6000.224, frollsum(c(NA,2,3), 2), c(NA, NA, 5), output="non-finite values are present in input, skip non-finite inaware attempt and run with extra care for NFs straighaway") +test(6000.225, frollsum(1:3, c(2,2,2), adaptive=TRUE), c(NA, 3, 5), output="frolladaptivesumFast: running for input length") +test(6000.226, frollsum(c(NA,2,3), c(2,2,2), adaptive=TRUE), c(NA, NA, 5), output="non-finite values are present in input, re-running with extra care for NFs") +options(datatable.verbose=FALSE) + +## frollmax adaptive +options(datatable.verbose=TRUE) ## adaptive frollmax no fast algo +test(6000.3, frollmax(1:4, c(2,2,2,2), adaptive=TRUE), output="frolladaptivefun: algo 0 not implemented, fall back to 1") +test(6000.3001, frollmax(1:4, c(2,2,2,2), algo="fast", adaptive=TRUE), output="frolladaptivefun: algo 0 not implemented, fall back to 1") +test(6000.3002, frollmax(1:4, c(2,2,2,2), algo="exact", adaptive=TRUE), notOutput="frolladaptivefun: algo 0 not implemented, fall back to 1") +options(datatable.verbose=FALSE) +n = c(3,2,2,4,2,1,4,8) +x = c(7,2,3,6,3,2,6,6) # no NA +test(6000.3111, frollmax(x, n, adaptive=TRUE), c(NA,7,3,7,6,2,6,7)) # has.nf=NA # narm=F +test(6000.3112, frollmax(x, n, na.rm=TRUE, adaptive=TRUE), c(NA,7,3,7,6,2,6,7)) # narm=T +test(6000.3121, frollmax(x, n, has.nf=FALSE, adaptive=TRUE), c(NA,7,3,7,6,2,6,7)) # has.nf=F +test(6000.3122, frollmax(x, n, has.nf=FALSE, na.rm=TRUE, adaptive=TRUE), error="does not make 
sense") +test(6000.3131, frollmax(x, n, has.nf=TRUE, adaptive=TRUE), c(NA,7,3,7,6,2,6,7)) # has.nf=T +test(6000.3132, frollmax(x, n, has.nf=TRUE, na.rm=TRUE, adaptive=TRUE), c(NA,7,3,7,6,2,6,7)) +x = c(7,2,NA,6,3,NA,6,6) # NA +test(6000.3211, frollmax(x, n, adaptive=TRUE), c(NA,7,NA,NA,6,NA,NA,NA)) +test(6000.3212, frollmax(x, n, na.rm=TRUE, adaptive=TRUE), c(NA,7,2,7,6,-Inf,6,7)) +test(6000.3221, frollmax(x, n, has.nf=FALSE, adaptive=TRUE), c(NA,7,2,7,6,-Inf,6,7)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.3222, frollmax(x, n, has.nf=FALSE, na.rm=TRUE, adaptive=TRUE), error="does not make sense") +test(6000.3231, frollmax(x, n, has.nf=TRUE, adaptive=TRUE), c(NA,7,NA,NA,6,NA,NA,NA)) +test(6000.3232, frollmax(x, n, has.nf=TRUE, na.rm=TRUE, adaptive=TRUE), c(NA,7,2,7,6,-Inf,6,7)) +x = rep(NA_real_, 8) # all NA +test(6000.3241, frollmax(x, n, adaptive=TRUE), rep(NA_real_, 8)) +test(6000.3242, frollmax(x, n, na.rm=TRUE, adaptive=TRUE), c(NA, rep(-Inf, 7))) +test(6000.3251, frollmax(x, n, has.nf=FALSE, adaptive=TRUE), c(NA, rep(-Inf, 7))) +test(6000.3252, frollmax(x, n, has.nf=FALSE, na.rm=TRUE, adaptive=TRUE), error="does not make sense") +test(6000.3261, frollmax(x, n, has.nf=TRUE, adaptive=TRUE), rep(NA_real_, 8)) +test(6000.3262, frollmax(x, n, has.nf=TRUE, na.rm=TRUE, adaptive=TRUE), c(NA, rep(-Inf, 7))) +x = c(NA,NaN,NA,NaN,NaN,NaN,NA,NA) # all NaN/NA +test(6000.3271, frollmax(x, n, adaptive=TRUE), c(NA,NA,NA,NA,NaN,NaN,NA,NA)) +test(6000.3272, frollmax(x, n, na.rm=TRUE, adaptive=TRUE), c(NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) +test(6000.3281, frollmax(x, n, has.nf=FALSE, adaptive=TRUE), c(NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.3282, frollmax(x, n, has.nf=FALSE, na.rm=TRUE, adaptive=TRUE), error="does not make sense") +test(6000.3291, frollmax(x, n, has.nf=TRUE, adaptive=TRUE), c(NA,NA,NA,NA,NaN,NaN,NA,NA)) 
+test(6000.3292, frollmax(x, n, has.nf=TRUE, na.rm=TRUE, adaptive=TRUE), c(NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) +x = c(7,2,NA,6,3,Inf,6,6) # Inf +test(6000.3311, frollmax(x, n, adaptive=TRUE), c(NA,7,NA,NA,6,Inf,Inf,NA)) +test(6000.3312, frollmax(x, n, na.rm=TRUE, adaptive=TRUE), c(NA,7,2,7,6,Inf,Inf,Inf)) +test(6000.3321, frollmax(x, n, has.nf=FALSE, adaptive=TRUE), c(NA,7,2,7,6,Inf,Inf,Inf)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.3322, frollmax(x, n, has.nf=FALSE, na.rm=TRUE, adaptive=TRUE), error="does not make sense") +test(6000.3331, frollmax(x, n, has.nf=TRUE, adaptive=TRUE), c(NA,7,NA,NA,6,Inf,Inf,NA)) +test(6000.3332, frollmax(x, n, has.nf=TRUE, na.rm=TRUE, adaptive=TRUE), c(NA,7,2,7,6,Inf,Inf,Inf)) +x = c(7,2,-Inf,6,3,NA,6,6) # -Inf +test(6000.3341, frollmax(x, n, adaptive=TRUE), c(NA,7,2,7,6,NA,NA,NA)) +test(6000.3342, frollmax(x, n, na.rm=TRUE, adaptive=TRUE), c(NA,7,2,7,6,-Inf,6,7)) +test(6000.3351, frollmax(x, n, has.nf=FALSE, adaptive=TRUE), c(NA,7,2,7,6,-Inf,6,7)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.3352, frollmax(x, n, has.nf=FALSE, na.rm=TRUE, adaptive=TRUE), error="does not make sense") +test(6000.3361, frollmax(x, n, has.nf=TRUE, adaptive=TRUE), c(NA,7,2,7,6,NA,NA,NA)) +test(6000.3362, frollmax(x, n, has.nf=TRUE, na.rm=TRUE, adaptive=TRUE), c(NA,7,2,7,6,-Inf,6,7)) + +## frollmax non-adaptive +options(datatable.verbose=TRUE) +test(6000.4001, frollmax(1:3, 2), c(NA, 2, 3), output="frollmaxFast: running for input length") +test(6000.4002, frollmax(1:10, 5), c(NA,NA,NA,NA,5,6,7,8,9,10), output="frollmaxFast: nested window max calculation called 0 times") +test(6000.4003, frollmax(10:1, 5), c(NA,NA,NA,NA,10,9,8,7,6,5), output="frollmaxFast: nested window max calculation called 5 times") +test(6000.4004, frollmax(1:3, 2, algo="exact"), c(NA, 2, 3), output="frollmaxExact: running in parallel for input length") +test(6000.4005, 
frollmax(c(1,2,3,NA,5), 2), c(NA, 2, 3, NA, NA), output="continue with extra care for NFs") +options(datatable.verbose=FALSE) +n = 3 +x = c(7,2,3,6,3,2,4,5) # no NA +ans = c(NA,NA,7,6,6,6,4,5) +test(6000.4111, frollmax(x, n), ans) # has.nf=NA # narm=F +test(6000.4112, frollmax(x, n, na.rm=TRUE), ans) # narm=T +test(6000.4113, frollmax(x, n, algo="exact"), ans) # has.nf=NA # narm=F +test(6000.4114, frollmax(x, n, algo="exact", na.rm=TRUE), ans) # narm=T +test(6000.4121, frollmax(x, n, has.nf=FALSE), ans) # has.nf=F +test(6000.4122, frollmax(x, n, has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4123, frollmax(x, n, algo="exact", has.nf=FALSE), ans) # has.nf=F +test(6000.4124, frollmax(x, n, algo="exact", has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4131, frollmax(x, n, has.nf=TRUE), ans) # has.nf=T +test(6000.4132, frollmax(x, n, has.nf=TRUE, na.rm=TRUE), ans) +test(6000.4133, frollmax(x, n, algo="exact", has.nf=TRUE), ans) # has.nf=T +test(6000.4134, frollmax(x, n, algo="exact", has.nf=TRUE, na.rm=TRUE), ans) +x = c(7,2,3,NA,3,2,4,NA) # NA +test(6000.4211, frollmax(x, n), c(NA,NA,7,NA,NA,NA,4,NA)) +test(6000.4212, frollmax(x, n, na.rm=TRUE), c(NA,NA,7,3,3,3,4,4)) +test(6000.4213, frollmax(x, n, algo="exact"), c(NA,NA,7,NA,NA,NA,4,NA)) +test(6000.4214, frollmax(x, n, algo="exact", na.rm=TRUE), c(NA,NA,7,3,3,3,4,4)) +test(6000.4221, frollmax(x, n, has.nf=FALSE), c(NA,NA,7,3,3,3,4,4)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4222, frollmax(x, n, has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4223, frollmax(x, n, algo="exact", has.nf=FALSE), c(NA,NA,7,3,3,3,4,4)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4224, frollmax(x, n, algo="exact", has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4231, frollmax(x, n, has.nf=TRUE), c(NA,NA,7,NA,NA,NA,4,NA)) +test(6000.4232, frollmax(x, n, 
has.nf=TRUE, na.rm=TRUE), c(NA,NA,7,3,3,3,4,4)) +test(6000.4233, frollmax(x, n, algo="exact", has.nf=TRUE), c(NA,NA,7,NA,NA,NA,4,NA)) +test(6000.4234, frollmax(x, n, algo="exact", has.nf=TRUE, na.rm=TRUE), c(NA,NA,7,3,3,3,4,4)) +x = rep(NA_real_, 8) # all NA +test(6000.4241, frollmax(x, n), rep(NA_real_, 8)) +test(6000.4242, frollmax(x, n, na.rm=TRUE), c(NA,NA, rep(-Inf, 6))) +test(6000.4243, frollmax(x, n, algo="exact"), rep(NA_real_, 8)) +test(6000.4244, frollmax(x, n, algo="exact", na.rm=TRUE), c(NA,NA, rep(-Inf, 6))) +test(6000.4251, frollmax(x, n, has.nf=FALSE), c(NA,NA, rep(-Inf, 6))) +test(6000.4252, frollmax(x, n, has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4253, frollmax(x, n, algo="exact", has.nf=FALSE), c(NA,NA, rep(-Inf, 6))) +test(6000.4254, frollmax(x, n, algo="exact", has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4261, frollmax(x, n, has.nf=TRUE), rep(NA_real_, 8)) +test(6000.4262, frollmax(x, n, has.nf=TRUE, na.rm=TRUE), c(NA,NA, rep(-Inf, 6))) +test(6000.4263, frollmax(x, n, algo="exact", has.nf=TRUE), rep(NA_real_, 8)) +test(6000.4264, frollmax(x, n, algo="exact", has.nf=TRUE, na.rm=TRUE), c(NA,NA, rep(-Inf, 6))) +x = c(NA,NaN,NA,NaN,NaN,NaN,NA,NA) # all NaN/NA +test(6000.4271, frollmax(x, n), c(NA,NA,NA,NA,NA,NaN,NA,NA)) +test(6000.4272, frollmax(x, n, na.rm=TRUE), c(NA,NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) +test(6000.4273, frollmax(x, n, algo="exact"), c(NA,NA,NA,NA,NA,NaN,NA,NA)) +test(6000.4274, frollmax(x, n, algo="exact", na.rm=TRUE), c(NA,NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) +test(6000.4281, frollmax(x, n, has.nf=FALSE), c(NA,NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4282, frollmax(x, n, has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4283, frollmax(x, n, algo="exact", has.nf=FALSE), c(NA,NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) ## expected incorrect results, see manual has.nf section for details, 
added in #5441 +test(6000.4284, frollmax(x, n, algo="exact", has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4291, frollmax(x, n, has.nf=TRUE), c(NA,NA,NA,NA,NA,NaN,NA,NA)) +test(6000.4292, frollmax(x, n, has.nf=TRUE, na.rm=TRUE), c(NA,NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) +test(6000.4293, frollmax(x, n, algo="exact", has.nf=TRUE), c(NA,NA,NA,NA,NA,NaN,NA,NA)) +test(6000.4294, frollmax(x, n, algo="exact", has.nf=TRUE, na.rm=TRUE), c(NA,NA,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf)) +x = c(NA,2,6,3,Inf,2,4,5) # Inf +test(6000.4311, frollmax(x, n), c(NA,NA,NA,6,Inf,Inf,Inf,5)) +test(6000.4312, frollmax(x, n, na.rm=TRUE), c(NA,NA,6,6,Inf,Inf,Inf,5)) +test(6000.4313, frollmax(x, n, algo="exact"), c(NA,NA,NA,6,Inf,Inf,Inf,5)) +test(6000.4314, frollmax(x, n, algo="exact", na.rm=TRUE), c(NA,NA,6,6,Inf,Inf,Inf,5)) +test(6000.4321, frollmax(x, n, has.nf=FALSE), c(NA,NA,6,6,Inf,Inf,Inf,5)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4322, frollmax(x, n, has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4323, frollmax(x, n, algo="exact", has.nf=FALSE), c(NA,NA,6,6,Inf,Inf,Inf,5)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4324, frollmax(x, n, algo="exact", has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4331, frollmax(x, n, has.nf=TRUE), c(NA,NA,NA,6,Inf,Inf,Inf,5)) +test(6000.4332, frollmax(x, n, has.nf=TRUE, na.rm=TRUE), c(NA,NA,6,6,Inf,Inf,Inf,5)) +test(6000.4333, frollmax(x, n, algo="exact", has.nf=TRUE), c(NA,NA,NA,6,Inf,Inf,Inf,5)) +test(6000.4334, frollmax(x, n, algo="exact", has.nf=TRUE, na.rm=TRUE), c(NA,NA,6,6,Inf,Inf,Inf,5)) +x = c(NA,2,-Inf,3,Inf,2,4,5) # -Inf +test(6000.4341, frollmax(x, n), c(NA,NA,NA,3,Inf,Inf,Inf,5)) +test(6000.4342, frollmax(x, n, na.rm=TRUE), c(NA,NA,2,3,Inf,Inf,Inf,5)) +test(6000.4343, frollmax(x, n, algo="exact"), c(NA,NA,NA,3,Inf,Inf,Inf,5)) +test(6000.4344, frollmax(x, n, algo="exact", 
na.rm=TRUE), c(NA,NA,2,3,Inf,Inf,Inf,5)) +test(6000.4351, frollmax(x, n, has.nf=FALSE), c(NA,NA,2,3,Inf,Inf,Inf,5)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4352, frollmax(x, n, has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4353, frollmax(x, n, algo="exact", has.nf=FALSE), c(NA,NA,2,3,Inf,Inf,Inf,5)) ## expected incorrect results, see manual has.nf section for details, added in #5441 +test(6000.4354, frollmax(x, n, algo="exact", has.nf=FALSE, na.rm=TRUE), error="does not make sense") +test(6000.4361, frollmax(x, n, has.nf=TRUE), c(NA,NA,NA,3,Inf,Inf,Inf,5)) +test(6000.4362, frollmax(x, n, has.nf=TRUE, na.rm=TRUE), c(NA,NA,2,3,Inf,Inf,Inf,5)) +test(6000.4363, frollmax(x, n, algo="exact", has.nf=TRUE), c(NA,NA,NA,3,Inf,Inf,Inf,5)) +test(6000.4364, frollmax(x, n, algo="exact", has.nf=TRUE, na.rm=TRUE), c(NA,NA,2,3,Inf,Inf,Inf,5)) +# edge cases +test(6000.501, frollmax(c(5,NA,1), 1L), c(5,NA,1)) ## na.rm=FALSE window recalc and NA happens to be the first element in a nested loop ## didn't help for codecov, adding internal error to wmax till we have a data that can reach there +test(6000.502, frollmax(c(5,NaN,1), 1L), c(5,NaN,1)) +test(6000.503, frollmax(c(5,1,1,NaN,1,1,1), 2L), c(NA,5,1,NaN,NaN,1,1)) +test(6000.504, frollmax(c(5,1,NA,NaN,1,1,1), 2L), c(NA,5,NA,NA,NaN,1,1)) + +## partial +x = 1:6/2 +n = 3 +an = function(n, len) c(seq.int(n), rep(n, len-n)) +test(6006.011, frollmean(x, an(n, length(x)), adaptive=TRUE), c(0.5,0.75,1,1.5,2,2.5)) +test(6006.012, frollmean(x, n, partial=TRUE), c(0.5,0.75,1,1.5,2,2.5)) +ans = frollmean(x, n) +ans[seq.int(n-1L)] = frollmean(x[seq.int(n-1L)], n, partial=TRUE) +test(6006.013, ans, c(0.5,0.75,1,1.5,2,2.5)) +test(6006.021, frollmean(x, rev(an(rev(n), length(x))), adaptive=TRUE, align="left"), c(1,1.5,2,2.5,2.75,3)) +test(6006.022, frollmean(x, n, partial=TRUE, align="left"), c(1,1.5,2,2.5,2.75,3)) +ans = frollmean(x, n, align="left") 
+ans[(length(x)-n-1L):length(x)] = frollmean(x[(length(x)-n-1L):length(x)], n, partial=TRUE, align="left") +test(6006.023, ans, c(1,1.5,2,2.5,2.75,3)) +ans = list(c(0.50,0.75,1.00,1.50,2.00,2.50), c(0.50,0.75,1.00,1.25,1.75,2.25)) +test(6006.031, frollmean(1:6/2, list(3L,4L), partial=TRUE), ans) +test(6006.032, frollmean(1:6/2, 3:4, partial=TRUE), ans) +options(datatable.verbose=TRUE) +test(6006.901, frollmean(x, n, partial=TRUE), c(0.5,0.75,1,1.5,2,2.5), output="froll partial=TRUE trimming 'n' and redirecting to adaptive=TRUE") +test(6006.902, frollmean(x, rep(n, length(x)), adaptive=TRUE, partial=TRUE), c(0.5,0.75,1,1.5,2,2.5), output="trimming", notOutput="redirecting") options(datatable.verbose=FALSE) +test(6006.903, frollmean(1:4, 2L, align="center", partial=TRUE), error="'partial' cannot be used together with align='center'") +test(6006.904, frollmean(list(1:4, 2:4), n, partial=TRUE), error="'partial' does not support variable length of columns in 'x'") +test(6006.905, frollmean(x, TRUE, partial=TRUE), error="n must be integer vector or list of integer vectors") +test(6006.906, frollmean(x, list(TRUE), partial=TRUE), error="n must be integer vector or list of integer vectors") +## partial adaptive +test(6006.930, frollmean(1:4, rep(2L,4L), adaptive=TRUE, partial=TRUE), c(1,1.5,2.5,3.5)) +test(6006.9301, frollmean(1:4, list(1:4, 1:3), adaptive=TRUE, partial=TRUE), error="adaptive window provided in 'n' must not to have different lengths") +test(6006.9302, frollmean(1:4, list(1:3), adaptive=TRUE, partial=TRUE), error="length of vectors in 'x' must match to length of adaptive window in 'n'") +test(6006.9303, frollmean(1:4, list(rep(2L,4L)), adaptive=TRUE, partial=TRUE), c(1,1.5,2.5,3.5)) +test(6006.9311, frollsum(1:4, 1:4, adaptive=TRUE, partial=TRUE), c(1,3,6,10)) ## all same as index +test(6006.9312, frollsum(1:4, 1:4, align="left", adaptive=TRUE, partial=TRUE), c(1,5,7,4)) +test(6006.9321, frollsum(1:4, c(2,3,1,1), adaptive=TRUE, partial=TRUE), c(1,3,3,4)) ## 
leading two bigger than index +test(6006.9322, frollsum(1:4, c(2,3,1,1), align="left", adaptive=TRUE, partial=TRUE), c(3,9,3,4)) +test(6006.9323, frollsum(1:4, c(6,5,4,2), adaptive=TRUE, partial=TRUE), c(1,3,6,7)) ## leading two bigger than rev index +test(6006.9324, frollsum(1:4, c(6,5,4,2), align="left", adaptive=TRUE, partial=TRUE), c(10,9,7,4)) +test(6006.9331, frollsum(1:4, c(2,4,5,6), adaptive=TRUE, partial=TRUE), c(1,3,6,10)) ## trailing two bigger than index +test(6006.9332, frollsum(1:4, c(2,4,5,6), align="left", adaptive=TRUE, partial=TRUE), c(3,9,7,4)) +test(6006.9333, frollsum(1:4, c(1,1,3,2), adaptive=TRUE, partial=TRUE), c(1,2,6,7)) ## trailing two bigger than rev index +test(6006.9334, frollsum(1:4, c(1,1,3,2), align="left", adaptive=TRUE, partial=TRUE), c(1,2,7,4)) + +## give.names +test(6006.951, frollsum(1:3, 2, give.names=TRUE), c(NA,3,5)) +test(6006.952, frollsum(1:3, c(b=2), give.names=TRUE), c(NA,3,5)) +test(6006.953, frollsum(c(a1=1,a2=2,a3=3), c(b=2), give.names=TRUE), c(NA,3,5)) +test(6006.954, frollsum(list(a=1:3), c(b=2), give.names=TRUE), list(a_b=c(NA,3,5))) +test(6006.955, frollsum(list(a=1:3), c(2), give.names=TRUE), list(a_roll_2=c(NA,3,5))) +test(6006.956, frollsum(list(a=1:3, b=3:1), c(2), give.names=TRUE), list(a_roll_2=c(NA,3,5), b_roll_2=c(NA,5,3))) +test(6006.957, frollsum(list(a=1:3, b=3:1), c(small=2, big=3), give.names=TRUE), list(a_small=c(NA,3,5), a_big=c(NA,NA,6), b_small=c(NA,5,3), b_big=c(NA,NA,6))) +test(6006.958, frollapply(FUN=sum, list(a=1:3, b=3:1), c(small=2, big=3), give.names=TRUE), list(a_small=c(NA,3,5), a_big=c(NA,NA,6), b_small=c(NA,5,3), b_big=c(NA,NA,6))) +test(6006.959, frollsum(list(1:3, 3:1), c(small=2, big=3), give.names=TRUE), list(V1_small=c(NA,3,5), V1_big=c(NA,NA,6), V2_small=c(NA,5,3), V2_big=c(NA,NA,6))) +test(6006.960, frollsum(list(1:3, 3:1), c(2, 3), give.names=TRUE), list(V1_roll_2=c(NA,3,5), V1_roll_3=c(NA,NA,6), V2_roll_2=c(NA,5,3), V2_roll_3=c(NA,NA,6))) +test(6006.961, frollsum(list(1:3, 
3:1), list(c(2,2,2), c(3,3,3)), adaptive=TRUE, give.names=TRUE), list(V1_N1=c(NA,3,5), V1_N2=c(NA,NA,6), V2_N1=c(NA,5,3), V2_N2=c(NA,NA,6))) +test(6006.962, frollsum(list(a=1:3, b=3:1), list(small=c(2,2,2), big=c(3,3,3)), adaptive=TRUE, give.names=TRUE), list(a_small=c(NA,3,5), a_big=c(NA,NA,6), b_small=c(NA,5,3), b_big=c(NA,NA,6))) +test(6006.963, frollsum(list(a=1:3, b=3:1), list(small=c(2,2,2)), adaptive=TRUE, give.names=TRUE), list(a_small=c(NA,3,5), b_small=c(NA,5,3))) +test(6006.964, frollsum(list(a=1:3, b=3:1), c(2,2,2), adaptive=TRUE, give.names=TRUE), list(a_N1=c(NA,3,5), b_N1=c(NA,5,3))) +test(6006.965, frollsum(list(a=1:3), c(2,2,2), adaptive=TRUE, give.names=TRUE), list(a_N1=c(NA,3,5))) +test(6006.966, frollsum(list(1:3), c(2,2,2), adaptive=TRUE, give.names=TRUE), list(V1_N1=c(NA,3,5))) +test(6006.967, frollsum(1:3, c(2,2,2), adaptive=TRUE, give.names=TRUE), c(NA,3,5)) +test(6006.968, frollsum(list(a=1:3), c(b=2), partial=TRUE, give.names=TRUE), list(a_b=c(1,3,5))) +test(6006.969, frollsum(list(a=1:3, b=3:1), c(small=2, big=3), partial=TRUE, give.names=TRUE), list(a_small=c(1,3,5), a_big=c(1,3,6), b_small=c(3,5,3), b_big=c(3,5,6))) +test(6006.970, frollsum(list(a=1:3), 2, partial=TRUE, give.names=TRUE), list(a_roll_2=c(1,3,5))) +test(6006.971, frollsum(list(1:3), 2, partial=TRUE, give.names=TRUE), list(V1_roll_2=c(1,3,5))) +test(6006.972, frollsum(list(1:3), c(b=2), partial=TRUE, give.names=TRUE), list(V1_b=c(1,3,5))) +test(6006.973, frollsum(list(1:3), 2, partial=TRUE, give.names=TRUE), list(V1_roll_2=c(1,3,5))) +test(6006.974, frollsum(list(1:3, 3:1), c(2, 3), partial=TRUE, give.names=TRUE), list(V1_roll_2=c(1,3,5), V1_roll_3=c(1,3,6), V2_roll_2=c(3,5,3), V2_roll_3=c(3,5,6))) ## validation @@ -862,27 +1089,51 @@ makeNA = function(x, ratio=0.1, nf=FALSE) { } x } -num = 6001.0 +num = 6007.0 ## against base to verify exactness of non-finite values, not handled in zoo -rollfun = function(x, n, FUN, fill=NA_real_, na.rm=FALSE, nf.rm=FALSE) { +rollfun = 
function(x, n, FUN, fill=NA_real_, na.rm=FALSE, nf.rm=FALSE, partial=FALSE) { ans = rep(fill, nx<-length(x)) f = match.fun(FUN) if (nf.rm) x[is.infinite(x)] = NA_real_ - for (i in n:nx) ans[i] = f(x[(i-n+1):i], na.rm=na.rm) + for (i in seq_along(x)) { + ans[i] = if (i >= n) + f(x[(i-n+1):i], na.rm=na.rm) + else if (partial) + f(x[max((i-n+1), 1L):i], na.rm=na.rm) + else + as.double(fill) + } ans } -base_compare = function(x, n, funs=c("mean","sum"), algos=c("fast","exact")) { +base_compare = function(x, n, funs=c("mean","sum","max"), algos=c("fast","exact")) { num.step = 0.001 for (fun in funs) { for (na.rm in c(FALSE, TRUE)) { for (fill in c(NA_real_, 0)) { - for (algo in algos) { + for (partial in c(FALSE,TRUE)) { + for (has.nf in c(NA,TRUE,FALSE)) { + if (identical(has.nf, FALSE)) { + if (na.rm) + next ## errors "not make sense" + if (any(!is.finite(x))) + next ## do not test warnings (mean, sum) or incorrect expect results (max) + } + for (algo in algos) { + num <<- num + num.step + eval(substitute( # so we can have values displayed in output/log rather than variables + test(.num, ignore.warning="no non-missing arguments", + rollfun(x, n, FUN=.fun, fill=.fill, na.rm=.na.rm, partial=.partial), + frollfun(.fun, x, n, fill=.fill, na.rm=.na.rm, algo=.algo, partial=.partial, has.nf=.has.nf)), + list(.num=num, .fun=fun, .fill=fill, .na.rm=na.rm, .algo=algo, .partial=partial, .has.nf=has.nf) + )) + } + } num <<- num + num.step eval(substitute( # so we can have values displayed in output/log rather than variables - test(.num, - froll(.fun, x, n, fill=.fill, na.rm=.na.rm, algo=.algo), - rollfun(x, n, FUN=.fun, fill=.fill, na.rm=.na.rm, nf.rm=.nf.rm)), - list(.num=num, .fun=fun, .fill=fill, .na.rm=na.rm, .algo=algo, .nf.rm=algo!="exact") + test(.num, ignore.warning="no non-missing arguments", + frollapply(x, n, FUN=match.fun(.fun), fill=.fill, na.rm=.na.rm, partial=.partial), + frollfun(.fun, x, n, fill=.fill, na.rm=.na.rm, partial=.partial)), + list(.num=num, .fun=fun, 
.fill=fill, .na.rm=na.rm, .partial=partial) )) } } @@ -898,24 +1149,43 @@ x = makeNA(rnorm(1e3), nf=TRUE); n = 51 base_compare(x, n) x = makeNA(rnorm(1e3+1), nf=TRUE); n = 51 base_compare(x, n) -num = 6002.0 +num = 6008.0 #### against zoo if (requireNamespace("zoo", quietly=TRUE)) { drollapply = function(...) as.double(zoo::rollapply(...)) # rollapply is not consistent in data type of answer, force to double - zoo_compare = function(x, n, funs=c("mean","sum"), algos=c("fast","exact")) { + zoo_compare = function(x, n, funs=c("mean","sum","max"), algos=c("fast","exact")) { num.step = 0.0001 - #### fun, align, na.rm, fill, algo + #### fun, align, na.rm, fill, algo, partial for (fun in funs) { for (align in c("right","center","left")) { for (na.rm in c(FALSE, TRUE)) { for (fill in c(NA_real_, 0)) { - for (algo in algos) { + for (partial in c(FALSE,TRUE)) { + if (partial && align=="center") + next ## not implemented + for (has.nf in c(NA,TRUE,FALSE)) { + if (identical(has.nf, FALSE)) { + if (na.rm) + next ## errors "not make sense" + if (any(!is.finite(x))) + next ## do not test warnings (mean, sum) or incorrect expect results (max) + } + for (algo in algos) { + num <<- num + num.step + eval(substitute( # so we can have values displayed in output/log rather than variables + test(.num, ignore.warning="no non-missing arguments", + drollapply(x, n, FUN=.fun, fill=.fill, align=.align, na.rm=.na.rm, partial=.partial), + frollfun(.fun, x, n, align=.align, fill=.fill, na.rm=.na.rm, algo=.algo, partial=.partial, has.nf=.has.nf)), + list(.num=num, .fun=fun, .align=align, .fill=fill, .na.rm=na.rm, .algo=algo, .partial=partial, .has.nf=has.nf) + )) + } + } num <<- num + num.step eval(substitute( # so we can have values displayed in output/log rather than variables - test(.num, - froll(.fun, x, n, align=.align, fill=.fill, na.rm=.na.rm, algo=.algo), - drollapply(x, n, FUN=.fun, fill=.fill, align=.align, na.rm=.na.rm)), - list(.num=num, .fun=fun, .align=align, .fill=fill, 
.na.rm=na.rm, .algo=algo) + test(.num, ignore.warning="no non-missing arguments", + frollapply(x, n, FUN=.fun, fill=.fill, align=.align, na.rm=.na.rm, partial=.partial), + frollfun(.fun, x, n, align=.align, fill=.fill, na.rm=.na.rm, partial=.partial)), + list(.num=num, .fun=fun, .align=align, .fill=fill, .na.rm=na.rm, .partial=partial) )) } } @@ -952,34 +1222,66 @@ if (requireNamespace("zoo", quietly=TRUE)) { zoo_compare(x, n) } #### adaptive moving average compare -num = 6003.0 -arollfun = function(fun, x, n, na.rm=FALSE, fill=NA, nf.rm=FALSE) { +num = 6009.0 +arollfun = function(FUN, x, n, na.rm=FALSE, align=c("right","left"), fill=NA, nf.rm=FALSE, partial=FALSE) { # adaptive moving average in R stopifnot((nx<-length(x))==length(n)) - ans = rep(NA_real_, nx) + align = match.arg(align) + ans = rep(fill, nx) if (nf.rm) x[is.infinite(x)] = NA_real_ - FUN = match.fun(fun) - for (i in seq_along(x)) { - ans[i] = if (i >= n[i]) - FUN(x[(i-n[i]+1):i], na.rm=na.rm) - else as.double(fill) + f = match.fun(FUN) + if (align=="right") { + for (i in seq_along(x)) { + if (i >= n[i]) + ans[i] = f(x[(i-n[i]+1):i], na.rm=na.rm) + else if (partial) + ans[i] = f(x[1L:i], na.rm=na.rm) + } + } else { + for (i in seq_along(x)) { + if (i <= nx-n[i]+1) + ans[i] = f(x[i:(i+n[i]-1)], na.rm=na.rm) + else if (partial) + ans[i] = f(x[i:length(x)], na.rm=na.rm) + } } ans } -afun_compare = function(x, n, funs=c("mean","sum"), algos=c("fast","exact")) { +afun_compare = function(x, n, funs=c("mean","sum","max"), algos=c("fast","exact")) { num.step = 0.0001 - #### fun, na.rm, fill, algo + #### fun, align, na.rm, fill, algo for (fun in funs) { - for (na.rm in c(FALSE, TRUE)) { - for (fill in c(NA_real_, 0)) { - for (algo in algos) { - num <<- num + num.step - eval(substitute( - test(.num, - froll(.fun, x, n, fill=.fill, na.rm=.na.rm, algo=.algo, adaptive=TRUE), - arollfun(.fun, x, n, fill=.fill, na.rm=.na.rm, nf.rm=.nf.rm)), - list(.num=num, .fun=fun, .fill=fill, .na.rm=na.rm, .algo=algo, 
.nf.rm=algo!="exact") - )) + for (align in c("right","left")) { + for (na.rm in c(FALSE, TRUE)) { + for (fill in c(NA_real_, 0)) { + for (partial in c(FALSE,TRUE)) { + for (has.nf in c(NA,TRUE,FALSE)) { + if (identical(has.nf, FALSE)) { + if (na.rm) + next ## errors "not make sense" + if (any(!is.finite(x))) + next ## do not test warnings (mean, sum) or incorrect expect results (max) + } + for (algo in algos) { + num <<- num + num.step + eval(substitute( + test(.num, ignore.warning="no non-missing arguments", + arollfun(.fun, x, n, fill=.fill, na.rm=.na.rm, align=.align, partial=.partial), + frollfun(.fun, x, n, fill=.fill, na.rm=.na.rm, algo=.algo, adaptive=TRUE, align=.align, has.nf=.has.nf, partial=.partial)), + list(.num=num, .fun=fun, .fill=fill, .na.rm=na.rm, .algo=algo, .align=align, .has.nf=has.nf, .partial=partial) + )) + } + } + if (base::getRversion() >= "3.4.0") { ## SET_GROWABLE_BIT + num <<- num + num.step + eval(substitute( + test(.num, ignore.warning="no non-missing arguments", + frollapply(x, n, FUN=match.fun(.fun), fill=.fill, na.rm=.na.rm, adaptive=TRUE, align=.align, partial=.partial), + frollfun(.fun, x, n, fill=.fill, na.rm=.na.rm, adaptive=TRUE, align=.align, partial=.partial)), + list(.num=num, .fun=fun, .fill=fill, .na.rm=na.rm, .align=align, .partial=partial) + )) + } + } } } } @@ -1062,9 +1364,37 @@ f = function(x) { } #test(6010.106, head(frollapply(1:5, 3, f), 3L), c(NA_real_,NA_real_,1), output=c("frollapplyR: allocating memory.*","frollapply: took.*","frollapplyR: processing.*took.*")) # only head 3 is valid, rest is undefined as REAL is applied on logical type, can return garbage or fail with REAL error options(datatable.verbose=FALSE) + +# frollapply adaptive +r340 = base::getRversion() >= "3.4.0" ## support SET_GROWABLE_BIT +if (!r340) { + test(6010.2, frollapply(1:3, c(3,3,3), sum, adaptive=TRUE), error="frollapply adaptive=TRUE requires at least R 3.4.0") +} else { + test(6010.2011, frollapply(1:3, c(3,3,3), sum, adaptive=TRUE), 
c(NA,NA,6)) + test(6010.2012, frollapply(1:3, c(4,4,4), sum, adaptive=TRUE), rep(NA_real_,3)) # none of the windows in k was small enough to cover length of x + test(6010.2013, frollapply(1:5, rep(2, 5), mean, adaptive=NA), error="adaptive must be TRUE or FALSE") + test(6010.2014, frollapply(1:5, rep(3, 5), toString, adaptive=TRUE), error="frolladaptiveapply: results from provided FUN are not of type double") + test(6010.2015, frollapply(1:2, 1:2, mean, adaptive=TRUE, align="right"), c(1, 1.5)) + test(6010.2016, frollapply(1:2, 1:2, mean, adaptive=TRUE, align="center"), error="using adaptive TRUE and align 'center' is not implemented") + test(6010.2017, frollapply(list(1:2, 1:3), list(1:2), mean, adaptive=TRUE), error="adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame. If you want to call rolling function on list having variable length of elements call it for each field separately") + test(6010.2018, frollapply(1:5, rep(3, 5), function(x) head(x, 2), adaptive=TRUE), error="frolladaptiveapply: results from provided FUN are not length 1") + test(6010.2019, frollapply(1:10, list(1:5), mean, adaptive=TRUE), error="length of integer vector(s) provided as list to 'n' argument must be equal to number of observations provided in 'x'") + test(6010.2020, frollapply(1:10, 1:5, mean, adaptive=TRUE), error="length of integer vector(s) provided as list to 'n' argument must be equal to number of observations provided in 'x'") + test(6010.2021, frollapply(1:4, rep(2L,4L), mean, adaptive=TRUE, partial=TRUE), c(1,1.5,2.5,3.5)) + options(datatable.verbose=TRUE) + test(6010.2029, frollapply(c(1,3,4,2,0), c(3,2,2,3,2), sum, adaptive=TRUE, align="left"), c(8,7,6,NA,NA), output=c("processing from align='right'")) + x = c(1,2,1,1,1,2,3,2) + ans = c(NA,NA,2,2,1,2,3,2) + numUniqueN = function(x) as.numeric(uniqueN(x)) + test(6010.203, frollapply(x, rep(3, length(x)), uniqueN, adaptive=TRUE), ans, output=c("frollapplyR: allocating 
memory.*","frolladaptiveapply: results from provided FUN are not of type double, coercion from integer or logical will be applied on each iteration.*","frolladaptiveapply: took.*","frollapplyR: processing.*took.*")) + test(6010.204, frollapply(x, rep(3, length(x)), numUniqueN, adaptive=TRUE), ans, output=c("frollapplyR: allocating memory.*","frolladaptiveapply: took.*","frollapplyR: processing.*took.*")) + test(6010.205, as.logical(frollapply(c(1,2,1,1,NA,2,NA,2), rep(3, length(x)), anyNA, adaptive=TRUE)), c(NA,NA,FALSE,FALSE,TRUE,TRUE,TRUE,TRUE), output=c("frollapplyR: allocating memory.*","frolladaptiveapply: results from provided FUN are not of type double, coercion from integer or logical will be applied on each iteration","frolladaptiveapply: took.*","frollapplyR: processing.*took.*")) + options(datatable.verbose=FALSE) + test(6010.206, frollapply(c(2,2,2,3,4), c(1,3,3,2,3), uniqueN, adaptive=TRUE), c(1,NA,1,2,3)) ## window width bigger than location +} + #### test coverage test(6010.501, frollapply(1:3, "b", sum), error="n must be integer") -test(6010.502, frollapply(1:3, 2.5, sum), error="n must be integer") test(6010.503, frollapply(1:3, integer(), sum), error="n must be non 0 length") test(6010.504, frollapply(1:3, 2L, sum, fill=1:2), error="fill must be a vector of length 1") test(6010.505, frollapply(1:3, 2L, sum, fill=NA_integer_), c(NA,3,5)) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 807a67c19e..11b00cc546 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,16 +1,10 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. 
# So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by # users running test.data.table("other.Rraw"). # zoo needs to be before xts for #5101 otherwise xts's dependency zoo gets attached at position 2 if xts is loaded first -# Optional Suggest-ed package tests moved from tests.Rraw to here in #5516. Retaining their comments: -# "xts", # we have xts methods in R/xts.R -# "nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though) -# "yaml" # for fread's yaml argument (csvy capability) -# # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though - if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || !"package:data.table" %in% search()) { stop("Usage: R CMD INSTALL; require(data.table); test.data.table('other.Rraw')") @@ -213,509 +207,3 @@ if (loaded[["sf"]]) { #2273 test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4") } -if (loaded[["yaml"]]) { # csvy; #1701. 
Was 2032-2033 in tests.Rraw, #5516 - f = testDir("csvy/test.csvy") - DT = data.table(var1 = c("A", "B"), - var2 = c(1L, 3L), - var3 = c(2.5, 4.3)) - DT_yaml = copy(DT) - setattr(DT_yaml, 'yaml_metadata', - list(name = "my-dataset", - source = "https://github.com/leeper/csvy/tree/master/inst/examples", - schema = list(fields = list( - list(name = "var1", title = "variable 1", type = "string", - description = "explaining var1", - constraints = list(list(required = TRUE))), - list(name = "var2", title = "variable 2", type = "integer"), - list(name = "var3", title = "variable 3", type = "number") - )))) - ## with skip = '__auto__', fread can figure out - ## how to start after the metadata (just ignoring it) - test(16.01, fread(f), DT) - ## should be the same, but with yaml_metadata attribute - test(16.02, fread(f, yaml = TRUE), DT_yaml) - ## testing verbose messaging - test(16.03, fread(f, yaml = TRUE, verbose = TRUE), - DT_yaml, output = 'Processed.*YAML metadata.*') - ## this file is identical, except the body of the - ## YAML header is commented out with # (should read identically) - test(16.04, - fread(testDir('csvy/test_comment.csvy'), yaml = TRUE), - DT_yaml) - ## user input is taken as most intentional & overrides YAML - DT_yaml[ , var2 := as.numeric(var2)] - test(16.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')), - DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2') - ## extraneous/unused fields shouldn't throw off reading - DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE) - test(16.06, names(DT), c('Date', 'WTI')) - test(16.07, attr(DT, 'yaml_metadata'), - list(names = c("Date", "WTI"), class = "data.frame", - title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv", - fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv", - sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D", - source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples", - item 
= "PET", sourcekey = "RWTC", freq = "Daily", - rate = "MID", type = "price", units = "Dollars per Barrel", - latestdate = "2015-08-31", releasedate = "2015-09-02", - nextreleasedate = "2015-09-10", source = "Thomson Reuters", - contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800")) - ## yaml can also handle sep, dec, quote, and na.strings - DT_out = data.table(var1 = c("A", "B"), - var2 = c(1L, NA), - var3 = c(2.5, 4.3)) - meta = - list(name = NULL, - schema = list(fields = list( - list(name = "var1", title = "variable 1", type = "string", - description = "a single-quoted character variable"), - list(name = "var2", title = "variable 2", type = "integer"), - list(name = "var3", title = "variable 3", type = "number", - description = "European-style numeric") - )), - header = TRUE, sep = "|", dec = ",", - quote = "'", na.strings = "@") - attr(DT_out, 'yaml_metadata') = meta - test(16.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out) - ## user-specified attributes can override data from YAML - meta$sep = "-" - setattr(DT_out, 'yaml_metadata', meta) - test(16.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out, - message = 'User-supplied.*sep.*override') - - meta$sep = "|" - setattr(DT_out, 'yaml_metadata', meta) - test(16.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE), - DT_out, message = 'User-supplied.*header.*override') - col.names = c('x', 'y', 'z') - setnames(DT_out, col.names) - test(16.11, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out, - message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override')) - - test(16.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names), - DT_out, message = 'User-supplied.*col.names') - - setnames(DT_out, c('var1', 'var2', 'var3')) - meta$quote = "^" - setattr(DT_out, 'yaml_metadata', meta) - test(16.13, 
fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"), - DT_out, message = 'User-supplied.*quote') - - meta$quote = "'" - meta$dec = "." - setattr(DT_out, 'yaml_metadata', meta) - test(16.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','), - DT_out, message = 'User-supplied.*dec') - - meta$dec = ',' - meta$na.strings = 'NA' - setattr(DT_out, 'yaml_metadata', meta) - test(16.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'), - DT_out, message = 'User-supplied.*na.strings') - - ## error if YAML malformed - test(16.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE), - error = 'Reached the end.*YAML.*valid csvy') - ## use any other CSV in test directory which doesn't have YAML - if (loaded[["R.utils"]]) test(16.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE), - error = 'Encountered.*unskipped.*constitute.*valid YAML') - ## no problem if some fields are missing a type (just - ## resort to standard auto-inferral, i.e., identical to - ## the case of partially-specified colClasses) - DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L), - var3 = c(2.5, 4.3)) - setattr(DT, 'yaml_metadata', - list(name = "my-dataset", source = "https://github.com/leeper/csvy/tree/master/inst/examples", - schema = list(fields = list( - list(name = "var1"), list(name = "var2", type = "integer"), - list(name = "var3", type = "number") - )))) - test(16.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT) - ## skip applies starting after the YAML header - setattr(DT, 'yaml_metadata', - list(schema = list(fields = list( - list(name = "var1", type = "string"), - list(name = "var2", type = "integer"), - list(name = "var3", type = "number") - )))) - test(16.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT) - ## user-supplied col.names override metadata (as for colClasses) - cn = paste0('V', 1:3) - setnames(DT, cn) - test(16.20, fread(testDir('csvy/test_skip.csvy'), - yaml 
= TRUE, skip = 2L, col.names = cn), - DT, message = 'User-supplied column names.*override.*YAML') - ## invalid value fails - test(16.21, fread(f, yaml = 'gobble'), - error = 'isTRUEorFALSE\\(yaml\\) is not TRUE') - - ## warning that skip-as-search doesn't work with yaml - DT_yaml[ , var2 := as.integer(var2)] - test(16.22, fread(f, skip = 'var1,', yaml = TRUE), - DT_yaml, warning = 'Combining a search.*YAML.*') - - # fwrite csvy: #3534 - tmp = tempfile() - DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) - # force eol for platform independence - fwrite(DT, tmp, yaml = TRUE, eol = '\n') - as_read = readLines(tmp) - test(17.01, as_read[c(1L, 24L)], c('---', '---')) - test(17.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) - test(17.03, grepl('creation_time_utc', as_read[3L])) - test(17.04, as_read[4:23], - c("schema:", " fields:", " - name: a", " type: integer", - " - name: b", " type: numeric", " - name: c", " type: character", - "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", - # NB: apparently \n is encoded like this in YAML - "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", - "logical01: no")) - tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") - test(17.05, as_read[25:30], tbl_body) - - # windows eol - fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') - test(17.06, readLines(tmp)[18L], 'eol: "\\r\\n"') - - # multi-class columns - DT[ , t := .POSIXct(1:5, tz = 'UTC')] - fwrite(DT, tmp, yaml = TRUE) - as_read = readLines(tmp) - test(17.07, as_read[13L], " type: POSIXct") - - # ~invertibility~ - # fread side needs to be improved for Hugh's colClasses update - DT[ , t := NULL] - fwrite(DT, tmp, yaml = TRUE) - DT2 = fread(tmp, yaml = TRUE) - # remove metadata to compare - attr(DT2, 'yaml_metadata') = NULL - test(17.08, all.equal(DT, DT2)) - - test(17.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), - output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', 
tbl_body[-1L]), collapse=".*")) - - # TODO: test gzip'd yaml which is now supported - - # yaml + bom arguments - DT = data.table(l=letters, n=1:26) - fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) - fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 in tests.Rraw - lines = readLines(fcon) - lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows - # remove the blank here so we don't need to change this test if/when that changes in yaml package - test(17.11, length(lines), 48L) - close(fcon) - test(17.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) - # re-write should have same output (not appended) - fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) - fcon = file(f, encoding="UTF-8") - lines = readLines(fcon) - lines = lines[lines!=""] - test(17.13, length(lines), 48L) - close(fcon) - test(17.14, fread(f), DT) - unlink(f) -} - -if (loaded[["xts"]]) { # was 1465 in tests.Rraw, #5516 - # data.table-xts conversion #882 - # Date index - dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - dt_xt = as.data.table(xt) - xt_dt = as.xts.data.table(dt) - test(18.01, all.equal(dt, dt_xt, check.attributes = FALSE)) - test(18.02, xt, xt_dt) - # POSIXct index - dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - dt_xt = as.data.table(xt) - xt_dt = as.xts.data.table(dt) - test(18.03, all.equal(dt, dt_xt, check.attributes = FALSE)) - test(18.04, xt, xt_dt) - # index 
types returned from to.period - dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index) - xt_w = xts::to.weekly(xt) - xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w)) - xt_m = xts::to.monthly(xt) - xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m)) - xt_q = xts::to.quarterly(xt) - xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q)) - xt_y = xts::to.yearly(xt) - xt_dt_xt_y = as.xts.data.table(as.data.table(xt_y)) - test(18.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE)) - test(18.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE)) - test(18.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE)) - test(18.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE)) - - test(18.09, xts::last(1:5), 5L) # was test 1531 - - # xts issue from Joshua, #1347 - x = as.Date(1:5, origin="2015-01-01") - test(18.10, last(x), tail(x, 1L)) # was test 1559 - - x = xts(1:100, Sys.Date()+1:100) - test(18.11, last(x,10), x[91:100,]) # was test 841 - # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. - # But that isn't tested by R CMD check because xts is loaded above data.table, there. 
- # So to make this test is relevant, run it in fresh R session directly, after: "require(xts);require(data.table)" - # rather than: "require(data.table);require(xts)" - # Which was the main thrust of bug#2312 fixed in v1.8.3 - - # fix for #1484; was test 1589 - x = xts::as.xts(8, order.by = as.Date("2016-01-03")) - test(18.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE)) - - # IDate support in as.xts.data.table #1499; was test 1663 - dt <- data.table(date = c(as.IDate("2014-12-31"), - as.IDate("2015-12-31"), - as.IDate("2016-12-31")), - nav = c(100,101,99), - key = "date") - dt.xts <- as.xts.data.table(dt) - test(18.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) - - # additional coverage missing uncovered in #3117 - dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - test(18.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) - names(xt)[1L] = 'index' - test(18.15, as.data.table(xt), error = 'Input xts object should not') - names(xt)[1L] = 'quantity' - setcolorder(dt, c(3, 1, 2)) - if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286 - test(18.16, as.xts(dt), error = 'data.table must have a time based') - setcolorder(dt, c(2, 3, 1)) - dt[ , char_col := 'a'] - test(18.17, as.xts(dt), xt, warning = 'columns are not numeric') - if (base::getRversion() < "3.6.0") rm(as.xts) - - # 890 -- key argument for as.data.table.xts - x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) - old = options(datatable.verbose=FALSE) - test(18.18, capture.output(as.data.table(x, key="index")), - c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", - " 3: 
1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", - " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", - " 9: 1970-01-10 9", "10: 1970-01-11 10")) - options(old) - - # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 - M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above - test(18.19, inherits(as.data.table(M)$index,"POSIXct")) - - # non-numeric xts coredata, #5268 - x = xts::xts(x=c(TRUE,FALSE), order.by=Sys.Date()+(1:2)) - colnames(x) = "value" # perhaps relates to #4897 - test(18.20, identical(x, as.xts(as.data.table(x), numeric.only=FALSE))) -} - -# was 2108 in tests.Rraw, #5516 -# first and last should no longer load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached -# stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces()) -x = as.POSIXct("2019-09-09")+0:1 -old = options(datatable.verbose=TRUE) -test(19.01, last(x), x[length(x)], output="!is.xts(x)") -test(19.02, first(x), x[1L], output="!is.xts(x)") -if (loaded[["xts"]]) { - xt = xts(1:2, x) - test(19.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)") - test(19.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)") - xt = xts(matrix(1:4, 2L, 2L), x) - test(19.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)") - test(19.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)") -} -# first on empty df now match head(df, n=1L), #3858 -df = data.frame(a=integer(), b=integer()) -test(19.11, first(df), df, output="!is.xts(x)") -test(19.12, last(df), df, output="!is.xts(x)") -options(datatable.verbose=FALSE) # so the as.data.table() doesn't pollute output -# xts last-first dispatch fix #4053 -x = 1:3 -y = as.POSIXct(x, origin="1970-01-01") -df = data.frame(a=1:2, b=3:2) -dt = 
as.data.table(df) -mx = matrix(1:9, 3, 3) -ar = array(1:27, c(3,3,3)) -xt = structure( - c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994, - 140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993, - 94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"), - .Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) -) -options(datatable.verbose=TRUE) -if (loaded[["xts"]]) { - test(19.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(19.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(19.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(19.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - xt_last = structure( - c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(1167955200, tzone = "UTC", tclass = "Date"), - .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - xt_last2 = structure( - c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005, - 141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"), - .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - 
test(19.25, last(xt), xt_last, output="using xts::last: is.xts(x)") - test(19.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)") - test(19.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(19.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(19.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(19.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - xt_first = structure( - c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(1167782400, tzone = "UTC", tclass = "Date"), - .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - xt_first2 = structure( - c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"), - .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - test(19.35, first(xt), xt_first, output="using xts::first: is.xts(x)") - test(19.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)") -} else { - test(19.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.24, last(y, n=1L), y[3L], 
output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.25, last(xt), error="you should have 'xts' installed already") - test(19.26, last(xt, n=2L), error="you should have 'xts' installed already") - test(19.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(19.35, first(xt), error="you should have 'xts' installed already") - test(19.36, first(xt, n=2L), error="you should have 'xts' installed already") -} -test(19.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(19.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(19.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(19.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(19.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(19.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(19.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(19.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -# matrix/array utils::tail behavior is likely to change in future R, Michael is more in the topic -test(19.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using 
utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127 -test(19.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -test(19.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127 -test(19.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -options(old) - -if (loaded[["xts"]]) { # was 2133 in tests.Rraw, #5516 - # keep.rownames in as.data.table.xts() supports a string, #4232 - xts = xts::xts(1:10, structure(1:10, class = "Date")) - colnames(xts) = "VALUE" - DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE") - test(20.1, colnames(DT), c("DATE", "VALUE")) - test(20.2, key(DT), "DATE") - test(20.3, as.data.table(xts, keep.rownames = "VALUE"), - error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. 
Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") - test(20.4, as.data.table(xts, keep.rownames = character()), - error = "keep.rownames must be length 1") - test(20.5, as.data.table(xts, keep.rownames = NA_character_), - error = "keep.rownames must not be NA") -} - -if (loaded[["nanotime"]]) { - - # was 1463.62-65 in tests.Rraw, #5516 - x=nanotime(1:4) - test(21.1, shift(x ), c(nanotime::nanotime(NA), x[1:3])) - test(21.2, shift(x, fill=0L), c(nanotime::nanotime(0L), x[1:3])) - test(21.3, shift(x, 1, type="cyclic"), c(x[4L], x[-4L])) - test(21.4, shift(x, -1, type="cyclic"), c(x[-1L], x[1L])) - - # was 1752 in tests.Rraw, #5516 - DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", - "2016-09-29T23:59:00.000000001Z", - "2016-09-29T23:59:00.000000999Z", - "1970-01-01T00:01:01.000001000Z", - "1970-01-01T00:00:00.000000000Z", - "1969-12-31T23:59:59.999999999Z", - "1969-12-31T23:59:59.000000089Z", - "1969-12-31T12:13:14.000000000Z", - "1969-12-31T12:13:14.999999999Z", - "1969-12-31T12:13:14.000000001Z", - "1967-03-15T00:00:00.300000002Z", - "1967-03-15T23:59:59.300000002Z"))) - test(22, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) - - # was 2060.401-405 in tests.Rraw, #5516 - nt = nanotime(c(1L, 2L, NA_integer_, 4L)) - nt_val = nanotime(1:4) - test(23.1, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46 - test(23.2, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val)) - test(23.3, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val)) - test(23.4, fcoalesce(nt, 1), error='Item 2 has a different class than item 1') - test(23.5, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double') - - # was 2080.01-05 in tests.Rraw, #5516 - n = nanotime(1:4) - n[2L] = NA - opt = options(datatable.verbose=TRUE) - test(24.1, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), 
output="between parallel processing of integer64") - test(24.2, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") - test(24.3, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") - options(opt) - test(24.4, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") - test(24.5, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") - - # was 2085.11 in tests.Rraw, #5516 - n = nanotime(1:4) - test(25, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4])) - - # was 2127.27 in tests.Rraw, #5516 - n = nanotime(1:12) - test(26, fcase(c(-5L:5L<0L,NA), n, c(-5L:5L>0L,NA), n+100), c(n[1L:5L], nanotime(NA), n[7L:11L]+100, as.integer64(NA))) - - # na.omit works for nanotime, #4744. Was 2205 in tests.Rraw, #5516 - DT = data.table(time=nanotime(c(1,NA,3))) - test(27, na.omit(DT), DT[c(1,3)]) - -} - -# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 -DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 ) -test(28.1, DT[,plot(b,f)], NULL) -test(28.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 -test(28.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) -try(graphics.off(),silent=TRUE) - -# test DT$.<- in a data.table-unaware package -# moved from tests.Rraw 1890 to here to save ram of loading stats package and plot, #5517 -DT = data.table(A=1:5) -test(29.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") -# Inside ts.plot is a gpars$ylab<- which happens before its error. 
That dispatches to our $<- which does the alloc.col() -test(29.2, DT, data.table(A=1:5)) - -if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure we need to still test reshape2 - # test dispatch for non-data.table objects, #4864. - if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { - test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - error="The melt generic in data.table has been passed a data.frame") - } else { - # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 - # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) - # 3) in dev locally I have reshape2 installed to run caret in other.Rraw - test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), - warning="The melt generic in data.table has been passed a data.frame") - } -} - diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8eeb8f7ee7..e05f522814 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7,8 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") - rm_all = function() {} - DTfun = DT ## otherwise DT would be re-defined by many tests + DTfun = DT # just in dev-mode, DT() gets overwritten in .GlobalEnv by DT objects here in tests.Rraw; we restore DT() in test 2212 } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -33,7 +32,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = data.table:::dcast.data.table - DTfun = data.table:::DT endsWith = data.table:::endsWith endsWithAny = 
data.table:::endsWithAny forder = data.table:::forder @@ -55,7 +53,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { print.data.table = data.table:::print.data.table replace_dot_alias = data.table:::replace_dot_alias rollup.data.table = data.table:::rollup.data.table - rss = data.table:::rss selfrefok = data.table:::selfrefok setcoalesce = data.table:::setcoalesce setdiff_ = data.table:::setdiff_ @@ -71,7 +68,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { which.first = data.table:::which.first which.last = data.table:::which.last `-.IDate` = data.table:::`-.IDate` - haszlib = data.table:::haszlib # Also, for functions that are masked by other packages, we need to map the data.table one. Or else, # the other package's function would be picked up. As above, we only need to do this because we desire @@ -103,21 +99,17 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { year = data.table::year # lubridate yearmon = data.table::yearmon # zoo yearqtr = data.table::yearqtr # zoo - - rm_all = function(env=parent.frame()) { - tt = setdiff(ls(envir=env), .do_not_rm) - rm(list=tt, envir=env) - gc() - invisible() - } } -# Optional suggests are now tested in other.Rraw, #5516. No calls to require() or library() should occur -# in this file other than for methods and data.table above, and these here. -# These are included in code coverage, and on CRAN. The reason for inclusion is stated next to each package. 
+# Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN +# The reason for inclusion here is stated next to each package sugg = c( "bit64", # if big integers are detected in file, fread reads them as bit64::integer64 if installed (warning if not) - "R.utils" # many fread test input files are compressed to save space; fundamental to test environment + "xts", # we have xts methods in R/xts.R + "nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though) + "R.utils", # for fread to accept .gz and .bz2 files directly + "yaml" # for fread's yaml argument (csvy capability) + # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though ) for (s in sugg) { assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( @@ -168,13 +160,10 @@ base_messages = list( mixed_subscripts = get_msg(letters[-1:1]) ) -########################## -.do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. 
test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") -test(1.2, tables(silent=TRUE)[,.(NAME,NROW,MB)], # memtest=TRUE adds some columns so exclude NCOL and COLS here - data.table(NAME="timings", NROW=9999L, MB=0)) +test(1.2, tables(silent=TRUE), data.table(NAME="timings", NROW=9999L, NCOL=3L, MB=0, COLS=list(c("ID","time","nTest")), KEY=list(NULL))) TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7) setkey(TESTDT,a,b) @@ -351,7 +340,7 @@ test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) -# test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated ## this is very old DT() functionality, completely different than DT() discussed in 2023 +# test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default test(87, TESTDT[,list(MySum=sum(v)),by="b"], data.table(b=c("e","f","i","b"),MySum=INT(3,7,11,7))) @@ -534,7 +523,11 @@ test(164, foo(f), DT[,mean(b),by=d]) test(165, subset(DT,a>2), DT[a>2]) test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2]) -# 167 tested graphics::plot, moved to other.Rraw 28 to save ram, #5517 +# and that plotting works +test(167.1, DT[,plot(b,f)], NULL) +test(167.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 +test(167.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) 
+try(graphics.off(),silent=TRUE) # IDateTime conversion methods that ggplot2 uses (it calls as.data.frame method) # Since %b is e.g. "nov." in LC_TIME=fr_FR.UTF-8 locale, we need to @@ -971,7 +964,13 @@ DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) -# 301 moved to benchmark.Rraw, #5517 +# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) +# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which +# this tests. But it's well under 10 seconds. +DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) +test(301, nrow(DT[,sum(B),by=C])==100010) +DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) +test(301.1, nrow(DT[,sum(B),by=C])==100010) # Test fast assign DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a") @@ -1934,7 +1933,21 @@ DT = data.table(x=1:3,y=1:3) test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) -# 637-638 moved to benchmark.Rraw, #5517 +# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. 
+options(datatable.optimize=0L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(637.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(637.3, key(DT[,a:=99L,by=a]), NULL) +options(datatable.optimize=2L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(638.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(638.3, key(DT[,a:=99L,by=a]), NULL) # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) @@ -1954,9 +1967,9 @@ test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of # Test faster mean with a lot of very small groups. Example from (now not needed as much) data.table wiki point 3. # benchmarks.Rraw contains the same, to be scaled up. set.seed(9) -n=1e3 # very small n (1e4) so as not to overload daily CRAN checks. Then reduced even further to just 1e3, #5517 -DT=data.table(grp1=sample.int(150L, n, replace=TRUE), - grp2=sample.int(150L, n, replace=TRUE), +n=1e4 # very small n so as not to overload daily CRAN checks. +DT=data.table(grp1=sample(1:150, n, replace=TRUE), + grp2=sample(1:150, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect. @@ -2413,7 +2426,16 @@ mycols = 2 test(814.12, DT[,!..mycols], ans) test(814.13, DT[,-..mycols], ans) -# 819-820 moved to benchmark.Rraw, #5517 + +# Test X[Y] slowdown, #2216 +# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes +# in R or elsewhere cause the 2 minute (!) bug to return. 
Hence not moving out to benmark.Rraw. +X = CJ(a=seq_len(1e3),b=seq_len(1e3)) +Y = copy(X) +X[4,b:=3L] # create a dup group, to force allLen1=FALSE +setkey(X) +test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case +test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case # Optimization of lapply(,"+"), #2212 DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12) @@ -2515,7 +2537,24 @@ i = data.frame(foo=1) test(859, DT[i], DT[J(i)]) test(860, DT[i], DT[data.table(i)]) -# 861-863 moved to benchmark.Rraw, #5517 +# test no memory leak, #2191 and #2284 +# These take a few seconds each, and it's important to run these on CRAN to check no leak +gc(); before = gc()["Vcells","(Mb)"] +for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB +gc(); after = gc()["Vcells","(Mb)"] +test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin + +gc(); before = gc()["Vcells","(Mb)"] +DF = data.frame(x=1:20, y=runif(20)) +for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } +gc(); after = gc()["Vcells","(Mb)"] +test(862, after < before+0.5) + +gc(); before = gc()["Vcells","(Mb)"] +DT = data.table(x=1:20, y=runif(20)) +for (i in 1:2000) { x <- DT[1:5,]; rm(x) } +gc(); after = gc()["Vcells","(Mb)"] +test(863, after < before+0.5) # rbindlist should look for the first non-empty data.table - New changes (from Arun). Explanation below: # Even if data.table is empty, as long as there are column names, they should be considered. 
@@ -3215,7 +3254,13 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.20, melt(DT, id.vars=1:2), data.table(A=1:2, B=3:4, variable=factor(rep(1L, 4L), labels="D"), value=5:8)) - # 1035.21 moved to benchmark.Rraw, #5517 + # segfault of unprotected var caught with the help of address sanitizer; was test 1509 + set.seed(1) + val = sample(c(1:5, NA), 1e4L, TRUE) + dt <- setDT(replicate(100L, val, simplify=FALSE)) + ## to ensure there's no segfault... + ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) + test(1035.21, ans, ans) # improper levels fix, #1359; was test 1563 dt = data.table(id=1:3, x=NA_character_, y=c('a', NA_character_, 'c')) @@ -3318,8 +3363,18 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") - # 1038 moved to other.Rraw, #5517 - + # test dispatch for non-data.table objects, #4864. + if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { + test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + error="The melt generic in data.table has been passed a data.frame") + } else { + # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 + # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) + # 3) in dev locally I have reshape2 installed to run caret in other.Rraw + test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), + warning="The melt generic in data.table has been passed a data.frame") + } } # sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105 @@ -4018,8 +4073,7 @@ if (test_longdouble) { old = getNumericRounding() set.seed(6) - x = rnorm(1e4)*1e4 # first 1e4 reduced from 1e6 to save ram, #5517 - x = c(x, 11969.235757385, 11969.235757322) # add back 2 numbers from the 1e6 sample whose order is changed in 
test 1147.3 + x = rnorm(1e6)*1e4 ans = base::sort.list(x, method="shell") setNumericRounding(0) test(1147.1, ans, forderv(x)) @@ -4053,7 +4107,16 @@ if (test_longdouble) { test(1149.1, forderv(integer(0)), integer(0)) test(1149.2, forderv(numeric(0)), integer(0)) -# 1151 moved to benchmark.Rraw, #5517 +# test uniqlengths +set.seed(45) +x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) +ox <- forderv(x) +o1 <- uniqlist(list(x), ox) +test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +o1 <- uniqlist(list(x)) +test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +rm(list=c("x","ox","o1")) +gc() # #67 fix - grouping with .SDcols gave "symbol not subsettable error" - consequence of FR #355 implementation dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20)) @@ -4093,7 +4156,21 @@ setkey(dt, x) test(1155.4, dt[J(NaN)], dt[is.nan(x)]) test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)]) -# 1157-1158 moved to benchmark.Rraw, #5517 +# Fix for (usually small) memory leak when grouping, #2648. +# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). +DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) +before = gc()["Vcells",2] +for (i in 1:50) DT[, sum(B), by=A] +after = gc()["Vcells",2] +test(1157, after < before+3) # +3 = 3MB +# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. + +# Similar for when dogroups writes less rows than allocated, #2648. +DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) +before = gc()["Vcells",2] +for (i in 1:50) DT[ , unlist(.SD), by = 'k'] +after = gc()["Vcells",2] +test(1158, after < before+3) # 177.6MB => 179.2MB. 
Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 # tests for 'setDT' - convert list, DF to DT without copy x <- data.frame(a=1:4, b=5:8) @@ -4405,46 +4482,48 @@ seed = as.integer(Sys.time()) # sample(9999L, 1L) temporary fix, because all the seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") # no NaN (because it's hard to match with base::order); tested below in 1988.4-8 set.seed(seed) -foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, collapse="") +foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, sep="") i1 = as.integer(sample(c(-100:100), 1e3, TRUE)) i2 = as.integer(sample(c(-100:100, -1e6, 1e6), 1e3, TRUE)) d1 = as.numeric(sample(c(-100:100,Inf,-Inf), 1e3, TRUE)) d2 = as.numeric(rnorm(1e3)) -c1 = sample(letters, 1e3, TRUE) -c2 = sample(foo(50), 1e3, TRUE) +c1 = sample(c(letters), 1e3, TRUE) +c2 = sample(foo(200), 1e3, TRUE) DT = data.table(i1, i2, d1, d2, c1, c2) # randomise col order as well colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") +ans = vector("list", length(names(DT))) test_no = 1223.0 oldnfail = nfail -for (nvars in seq_along(names(DT))) { - signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) - combn(names(DT), nvars, simplify=FALSE, function(x) { # simplify=FALSE needed for R 3.1.0 - for (i in seq_len(nrow(signs))) { +for (i in seq_along(names(DT))) { + cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) + ans[[i]] = combn(names(DT), i, function(x) { + tmp = apply(cj, 1, function(y) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { - if (signs[i,j] == 1L) + if (y[j] == 1L) as.name(x[j]) else { - if (is.character(DT[[x[j]]])) + if (class(DT[[x[j]]]) =="character") as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j]))))) else as.call(list(as.name("-"), 
as.name(x[j]))) } }) )) - test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) - } - integer() + test(test_no, forderv(DT, by=x, order=y), with(DT, eval(ll))) + }) + dim(tmp)=NULL + list(tmp) }) } +ans = NULL if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce -rm_all() # fix for bug #44 - unique on null data.table should return null data.table test(1224, unique(data.table(NULL)), data.table(NULL)) @@ -4540,7 +4619,7 @@ if (base::getRversion() < "3.3.0") { # Test for optimisation of 'order' to 'forder'. Copied to benchmarks.Rraw too. set.seed(45L) -DT = data.table(x=sample.int(1e2, 1e3, TRUE), y=sample.int(1e2, 1e3, TRUE)) # 1e5 reduced again to 1e3, #5517 +DT = data.table(x=sample(1e2, 1e5, TRUE), y=sample(1e2, 1e5, TRUE)) test(1241, DT[order(x,-y)], # optimized to forder() DT[base_order(x,-y)]) # not optimized @@ -4814,7 +4893,7 @@ test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], # Wide range numeric and integer64, to test all bits old_rounding = getNumericRounding() -x = sample( c(seq(-1e100, 1e100, length.out=1e3), c(seq(-1e-100,1e-100,length.out=1e3))) ) # 1e5 reduced to 1e3, #5517 +x = sample( c(seq(-1e100, 1e100, length.out=1e5), c(seq(-1e-100,1e-100,length.out=1e5))) ) setNumericRounding(0) test(1269, forderv(x), base::order(x)) setNumericRounding(2) # not affected by rounding @@ -5136,8 +5215,8 @@ DT = DT[1L] set(DT,1L,"b",FALSE) # passing 1L as i here is needed to avoid column plonk, so changes the logical singleton in place test(1297, as.integer(TRUE[1]), 1L) # In R 3.1, TRUE[1] returns the global TRUE but TRUE doesn't yet (parses as new vector) test(1298, as.integer(TRUE), 1L) -# orignal example, verbatim from James Sams; sizes reduced to save ram in #5517 -upc_table = data.table(upc=1:1000, upc_ver_uc=rep(c(1,2), times=500), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=250), product_module_code=rep(1:4, times=250), ignore.column=2:1001) +# orignal example, verbatim from James Sams : +upc_table = data.table(upc=1:100000, 
upc_ver_uc=rep(c(1,2), times=50000), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=25000), product_module_code=rep(1:4, times=25000), ignore.column=2:100001) test(1299, upc_table[, .N, by=list(upc, upc_ver_uc)][,max(N)], 1L) # all size 1 groups test(1300, upc_table[, list(is_PL, product_module_code), keyby=list(upc, upc_ver_uc)][,upc[1:3]], 1:3L) # was warning "internal TRUE value has been modified" rm(list="upc_table") @@ -6719,7 +6798,13 @@ ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8))) setattr(ans, 'names', nm) test(1463.61, shift(x, 1:2, give.names=TRUE), ans) -# 1463.62-65 tested nanotime moved to other.Rraw 21, #5516 +if (test_nanotime) { + x=nanotime(1:4) + test(1463.62, shift(x ), c(nanotime::nanotime(NA), x[1:3])); + test(1463.63, shift(x, fill=0L), c(nanotime::nanotime(0L), x[1:3])); + test(1463.64, shift(x, 1, type="cyclic"), c(x[4L], x[-4L])); + test(1463.65, shift(x, -1, type="cyclic"), c(x[-1L], x[1L])); +} # shift circular x = 1:5 @@ -6752,7 +6837,106 @@ test(1464.12, rleidv(DT, 1:2), ans<-INT(1,2,3,4,5,6,6,6,7,8,8,9,10,11,12,13,14,1 test(1464.13, rleidv(DT, 2:1), ans) test(1464.14, rleidv(DT, c(3,1)), INT(1,1,2,2,3,4,5,5,6,7,8,9,10,11,12,13,14,15,16,17)) -# 1465 tested xts moved to other.Rraw 18, #5516 +if (test_xts) { + + Sys.unsetenv("_R_CHECK_LENGTH_1_LOGIC2_") + # package xts has an issue with an && clause (https://github.com/joshuaulrich/xts/pull/269). When that is fixed in xts and released to CRAN, we can remove this Sys.unsetenv + # Sys.setenv is called again at the end of this xts branch. The original env variable value was stored at the top of this file and restored at the end. 
+ + # data.table-xts conversion #882 + # Date index + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + dt_xt = as.data.table(xt) + xt_dt = as.xts.data.table(dt) + test(1465.01, all.equal(dt, dt_xt, check.attributes = FALSE)) + test(1465.02, xt, xt_dt) + # POSIXct index + dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + dt_xt = as.data.table(xt) + xt_dt = as.xts.data.table(dt) + test(1465.03, all.equal(dt, dt_xt, check.attributes = FALSE)) + test(1465.04, xt, xt_dt) + # index types returned from to.period + dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index) + xt_w = xts::to.weekly(xt) + xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w)) + xt_m = xts::to.monthly(xt) + xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m)) + xt_q = xts::to.quarterly(xt) + xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q)) + xt_y = xts::to.yearly(xt) + xt_dt_xt_y = as.xts.data.table(as.data.table(xt_y)) + test(1465.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE)) + test(1465.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE)) + test(1465.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE)) + test(1465.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE)) + + test(1465.09, 
xts::last(1:5), 5L) # was test 1531 + + # xts issue from Joshua, #1347 + x = as.Date(1:5, origin="2015-01-01") + test(1465.10, last(x), tail(x, 1L)) # was test 1559 + + x = xts(1:100, Sys.Date()+1:100) + test(1465.11, last(x,10), x[91:100,]) # was test 841 + # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. + # But that isn't tested by R CMD check because xts is loaded above data.table, there. + # So to make this test relevant, run it in a fresh R session directly, after: "require(xts);require(data.table)" + # rather than: "require(data.table);require(xts)" + # Which was the main thrust of bug#2312 fixed in v1.8.3 + + # fix for #1484; was test 1589 + x = xts::as.xts(8, order.by = as.Date("2016-01-03")) + test(1465.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE)) + + # IDate support in as.xts.data.table #1499; was test 1663 + dt <- data.table(date = c(as.IDate("2014-12-31"), + as.IDate("2015-12-31"), + as.IDate("2016-12-31")), + nav = c(100,101,99), + key = "date") + dt.xts <- as.xts.data.table(dt) + test(1465.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) + + # additional coverage missing uncovered in #3117 + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + test(1465.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) + names(xt)[1L] = 'index' + test(1465.15, as.data.table(xt), error = 'Input xts object should not') + names(xt)[1L] = 'quantity' + setcolorder(dt, c(3, 1, 2)) + if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286 + test(1465.16, as.xts(dt), error =
'data.table must have a time based') + setcolorder(dt, c(2, 3, 1)) + dt[ , char_col := 'a'] + test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') + if (base::getRversion() < "3.6.0") rm(as.xts) + + # 890 -- key argument for as.data.table.xts + x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) + old = options(datatable.verbose=FALSE) + test(1465.18, capture.output(as.data.table(x, key="index")), + c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", + " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", + " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", + " 9: 1970-01-10 9", "10: 1970-01-11 10")) + options(old) + + # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 + M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above + test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) + + # non-numeric xts coredata, #5268 + x = xts::xts(x=c(TRUE,FALSE), order.by=Sys.Date()+(1:2)) + colnames(x) = "value" # perhaps relates to #4897 + test(1465.20, identical(x, as.xts(as.data.table(x), numeric.only=FALSE))) + + Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) +} # as.data.table.default #969 ar <- array(NA, dim=c(10,4),dimnames = list(NULL,paste("col",1:4,sep=""))) @@ -7553,8 +7737,18 @@ dtab <- data.table(pid = factor(c("i", "nouana")), c("pid", "year")) test(1541, key(dtp[dtab]), c("pid", "year")) -# 1542.0 moved to benchmark.Rraw, #5517 - +# fix DT[TRUE, :=] using too much working memory for i, #1249 +if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled + f = tempfile() + N = 1000000 # or any large number of rows + DT = data.table(A=1:N, B=rnorm(N)) + DT[TRUE, B := B * 2] # stabilize with initial dummy update + Rprofmem(f) + DT[TRUE, B := B * 2] # or some in-place update + Rprofmem(NULL) + test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one 
allocation for the RHS only + unlink(f) +} # DT[TRUE] should shallow copy as v1.11.8 and earlier did (#3214); in future more will shallow copy too DT = data.table(id = 1:5, key="id") DT1 = DT[TRUE] @@ -7703,7 +7897,10 @@ ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") -# 1549 moved to benchmark.Rraw, #5517 +# #1167 print.data.table row id in non-scientific notation +DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) +test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) +rm(DT) # PR by @dselivanov # fixes #504 - handle nastring while reading (without coercion to character) @@ -8690,7 +8887,7 @@ test(1613.601, all.equal(data.table(a=1), data.frame(a=1)), "target is data.tabl test(1613.602, all.equal(data.table(a=1), data.frame(a=1), check.attributes = FALSE)) test(1613.603, all.equal(data.table(a=1), list(a=1), check.attributes = FALSE)) test(1613.604, all.equal(data.table(a=1), 1, check.attributes = FALSE)) -test(1613.605, !isTRUE(all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE))) +test(1613.605, all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE), "target is data.table but current is not and failed to be coerced to it") L1 = list(a = data.table(1), b = setattr("foo1613", "tbl", data.table(1))) L2 = list(a = 1, b = setattr("foo1613", "tbl", 1)) test(1613.606, all(grepl("target is data.table, current is numeric", all.equal(L1, L2)))) @@ -9104,8 +9301,6 @@ dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") options(datatable.optimize=Inf) -rm_all() - #1389 - split.data.table - big chunk of unit tests set.seed(123) 
dt = data.table(x1 = rep(letters[1:2], 6), x2 = rep(letters[3:5], 4), x3 = rep(letters[5:8], 3), y = rnorm(12)) @@ -9197,14 +9392,14 @@ test(1639.056, TRUE, all( sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset are expanded -# memtest tracing in #5520 showed this split() and the one before 1639.188 (both by 3 columns) account for the RAM usage in 1639. But they should be gc()'d eventually after rm_all(). -test(1639.0571, is.list(l)) -test(1639.0572, names(l), c("b","a")) -test(1639.0573, all(sapply(l, function(x) !is.data.table(x) && is.list(x)))) -test(1639.0574, all(sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)))) -test(1639.0575, lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))) -test(1639.0576, all(sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6))) -test(1639.0577, all(sapply(l, sapply, sapply, ncol) == rep(4L, 24))) +test(1639.057, TRUE, all( + is.list(l), identical(names(l), c("b","a")), + sapply(l, function(x) !is.data.table(x) && is.list(x)), + sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), + identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))), + sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6), + sapply(l, sapply, sapply, ncol) == rep(4L, 24) +)) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # multi col rev test(1639.058, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), @@ -9569,7 +9764,6 @@ test(1639.141, all(sapply(dtL, truelength) > 1000)) dt <- data.table(x = factor("a"), y = 1) test(1639.142, x = split(dt, by = "x"), y = list(a = dt)) test(1639.143, x = split(dt, by = "y"), y = list(`1` = dt)) -rm_all() # allow x's cols 
(specifically x's join cols) to be referred to using 'x.' syntax # patch for #1615. Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] @@ -9582,10 +9776,10 @@ test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c( # tests for non-equi joins # function to create a random data.table with all necessary columns nq_fun = function(n=100L) { - i1 = sample(sample.int(n, 10L), n, TRUE) - i2 = sample.int(n, n, TRUE) - as.integer(n/2) # this used to be type numeric before #5517 which didn't seem intentional - i3 = sample.int(2e6, n, TRUE) - as.integer(1e6) # used to sample from -1e6:1e6 which if allocated would be 8MB, #5517 - i4 = sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE) + i1 = sample(sample(n, 10L), n, TRUE) + i2 = sample(-n/2:n/2, n, TRUE) + i3 = sample(-1e6:1e6, n, TRUE) + i4 = sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE) d1 = sample(rnorm(10L), n, TRUE) d2 = sample(rnorm(50), n, TRUE) @@ -9597,55 +9791,15 @@ nq_fun = function(n=100L) { dt = data.table(i1,i2,i3,i4, d1,d2,d3,d4, c1,c2) if (test_bit64) { - I1 = as.integer64(sample(sample.int(n, 10L), n, TRUE)) - I2 = as.integer64(sample.int(n, n, TRUE) - as.integer(n/2)) - I3 = as.integer64(sample.int(2e6, n, TRUE) - as.integer(1e6)) # there used to be another -1e6:1e6 here whose altrep likely allocated when sample accessed it, #5517 - I4 = as.integer64(sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE)) + I1 = as.integer64(sample(sample(n, 10L), n, TRUE)) + I2 = as.integer64(sample(-n/2:n/2, n, TRUE)) + I3 = as.integer64(sample(-1e6:1e6, n, TRUE)) + I4 = as.integer64(sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE)) dt = cbind(dt, data.table(I1,I2,I3,I4)) } dt } -construct <- function(cols, vals, ops, x, y) { - expr = lapply(seq_along(cols), function(i) { - GT_or_LT = ops[i]==">" || ops[i]=="<" - if (inherits(vals[[i]], "integer64")) { - if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else 
as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) - # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN - } else { - if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) - else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) - } - }) - Reduce(function(x,y)call("&",x,y), expr) -} - -check <- function(x, y, cols, ops, mult="all") { - # gather just row numbers here and then select all rows once afterwards, rather than rbindlist - rowNums = unlist(lapply(1:nrow(y), function(i) { - e = construct(cols, y[i, ..cols], ops, x, y) - rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization - if (!length(rowNums) || mult=="all") - rowNums - else if (mult=="first") - rowNums[1L] - else # mult=="last" - rowNums[length(rowNums)] - })) - x[rowNums] -} - -nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { - sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) - ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] - setnames(ans, gsub("^x[.]", "", names(ans))) - setcolorder(ans, names(x))[] -} - -is_only_na <- function(x) is.na(x) & !is.nan(x) - nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { ops = c("==", ">=", "<=", ">", "<") xclass = sapply(x, class) @@ -9656,6 +9810,42 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { thisops[startsWith(cols, "c")] = "==" thisops }) + is_only_na <- function(x) is.na(x) & !is.nan(x) + construct <- function(cols, vals, ops) { + expr = lapply(seq_along(cols), function(i) { + GT_or_LT = ops[i]==">" || ops[i]=="<" + if (inherits(vals[[i]], "integer64")) { + if 
(is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) + # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN + } else { + if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) + else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) + } + }) + Reduce(function(x,y)call("&",x,y), expr) + } + check <- function(x, y, cols, ops, mult="all") { + # gather just row numbers here and then select all rows once afterwards, rather than rbindlist + rowNums = unlist(lapply(1:nrow(y), function(i) { + e = construct(cols, y[i, ..cols], ops) + rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization + if (!length(rowNums) || mult=="all") + rowNums + else if (mult=="first") + rowNums[1L] + else # mult=="last" + rowNums[length(rowNums)] + })) + x[rowNums] + } + nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { + sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) + ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] + setnames(ans, gsub("^x[.]", "", names(ans))) + setcolorder(ans, names(x))[] + } for (i in seq_along(runcmb)) { thiscols = runcmb[[i]] thisops = runops[[i]] @@ -9668,7 +9858,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { gc() # no longer needed but left in place just in case, no harm } -dt1 = nq_fun(100L) # 400 reduced to 100, #5517 +dt1 = nq_fun(400L) dt2 = nq_fun(50L) x = na.omit(dt1) y = na.omit(dt2) @@ -9883,20 +10073,16 @@ test(1658.39, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\ test(1658.40, fwrite(matrix(1:4, 
nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") # fwrite compress -if (!haszlib()) { - test(1658.409, fwrite(data.table(a=1), file=tempfile(), compress="gzip"), error="header files were not found at the time data.table was compiled") -} else { - test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console - DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25)) - test(1658.421, fwrite(DT, file=f1<-tempfile(fileext=".gz"), verbose=TRUE), NULL, - output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1") # [12] for Windows where eolLen==2 - test(1658.422, fwrite(DT, file=f2<-tempfile()), NULL) - test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0) - if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform) - fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz - test(1658.441, file.info(f3)$size, file.info(f1)$size) - unlink(c(f1,f2,f3)) -} +test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console +DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25)) +test(1658.421, fwrite(DT, file=f1<-tempfile(fileext=".gz"), verbose=TRUE), NULL, + output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1") # [12] for Windows where eolLen==2 +test(1658.422, fwrite(DT, file=f2<-tempfile()), NULL) +test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0) +if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform) +fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz +test(1658.44, 
file.info(f3)$size, file.info(f1)$size) +unlink(c(f1,f2,f3)) DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c("foo", "bar", "baz"))) test(1658.45, fwrite(DT), output=c("a,b","1,1|2|3|4","2,3.14|1e+12","3,foo|bar|baz")) DT[3,b:=as.raw(0:2)] @@ -9923,12 +10109,10 @@ test(1658.52, file.info(f1)$size, file.info(f2)$size) unlink(c(f1, f2)) # compression error -5 due to only 3 bytes (bom) in first block; #3599 -if (haszlib()) { - DT = data.table(l=letters, n=1:26) - test(1658.53, fwrite(DT, file=f<-tempfile(fileext=".gz"), bom=TRUE, col.names=FALSE), NULL) - if (test_R.utils) test(1658.54, fread(f), setnames(DT,c("V1","V2"))) - unlink(f) -} +DT = data.table(l=letters, n=1:26) +test(1658.53, fwrite(DT, file=f<-tempfile(fileext=".gz"), bom=TRUE, col.names=FALSE), NULL) +if (test_R.utils) test(1658.54, fread(f), setnames(DT,c("V1","V2"))) +unlink(f) # complex column support for fwrite, part of #3690 DT = data.table(a=1:3, z=0:2 - (2:0)*1i) @@ -9937,7 +10121,7 @@ test(1658.56, fwrite(data.table(exp(1) - pi*1i)), output='2.718[0-9]*-3.141[0-9] ## formerly 1658.46 DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c(3i,4i,5i))) test(1658.57, fwrite(DT), output='0+3i|0+4i|0+5i') -DT[ , b := c(1i, -1-1i, NA_complex_)] +DT[ , b := c(1i, -1-1i, NA)] test(1658.58, fwrite(DT), output='a,b\n1,0\\+1i\n2,-1-1i\n3,$') # more coverage @@ -10406,7 +10590,7 @@ test(1702.2, isoweek(as.Date(test_cases)), test_values) test(1702.3, isoweek(as.POSIXct(test_cases)), test_values) # 1% sample of a 400-year cycle of dates for extra robustness -if (test_R.utils) test(1702.4, isoweek((DT<-fread(testDir('isoweek_test.csv.bz2')))$input_date), DT$expected_output, ignore.warning="datetimes before") ## ignore.warning due to #5785 +if (test_R.utils) test(1702.4, isoweek((DT<-fread(testDir('isoweek_test.csv.bz2')))$input_date), DT$expected_output) # fread, ensure no shell commands #1702 if (.Platform$OS.type=="unix") { @@ -10805,7 +10989,31 @@ test(1738.3, sapply(DT,typeof), c(A="double",B="integer")) 
test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L)) -# 1739 moved to benchmark.Rraw, #5517 +if (FALSE) { + # Full range takes too long for CRAN. + dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") + dtsCh = as.character(dts) # 36s + dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 + test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) +} else { + # test on CRAN a reduced but important range + dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") + dtsCh = as.character(dts) + test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) +} +DT = data.table(A=dts, B=as.IDate(dts)) +test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) +test(1739.4, typeof(dts), "double") +f = tempfile() +g = tempfile() # Full range +fwrite(DT,f) # 0.092s +write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s +test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) +test(1739.6, readLines(f), readLines(g)) +unlink(f) +unlink(g) +rm(list=c("dtsCh","dts")) +gc() # dateTimeAs DT = data.table( @@ -10865,34 +11073,15 @@ setattr(DT[[4]], "tzone", NULL) setattr(DT[[5]], "tzone", NULL) # format() now supports digits = 0, to display nsmall decimal places. -# Oct 2022: R-devel changed write.csv behavior to no longer respect digits.secs, #5478. -# For now we'll get out of the way while R-devel discussion is ongoing so that 1.14.4 can -# be submitted to CRAN. -# These tests test fwrite(, dateTimeAs="write.csv") whose -# very point is to match write.csv. Rather than turn off these tests, we'll for now -# continue to test that at least fwrite continues to work as intended. Otherwise -# coverage will drop and we could miss a plain old crash or error bug. 
-# Note that tzone has been removed above so these tests output the POSIXct in the -# R session's timezone because here dateTimeAs="write.csv" and that's what write.csv does. -# This is the reason `y` can't be fixed strings because depending on the timezone of the -# session which is running test.data.table, the results will be different. -# data.table's fwrite achieves local timezone writing (when dateTimeAs="write.csv") via -# an R call to format.POSIXct in fwriteR.c. By default fwrite writes datetime in UTC for -# consistent and reproducible research, which is different to write.csv. -# TODO: revisit when R-devel has settled w.r.t. write.csv behavior. -format_rows_as_csv = function(DT, digits) apply(sapply(DT, format, digits=digits), 1L, paste0, collapse=",") old=options(digits.secs=0) test(1741.3, x1<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - c("A,B,C,D,E", format_rows_as_csv(DT, digits=0L))) - # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=3) test(1741.4, x2<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - c("A,B,C,D,E", format_rows_as_csv(DT, digits=3L))) - # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=6) test(1741.5, x3<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - c("A,B,C,D,E", format_rows_as_csv(DT, digits=6L))) - # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) # check that extra digits made it into output test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3))) options(old) @@ -10966,7 +11155,7 @@ test(1743.217, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor test(1743.218, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 4, 2, 3)), class), y = c(e = "character", d = 
"factor", b = "factor", c = "factor")) test(1743.22, fread("a,b,c\n1999/01/01,2,f", colClasses=list(Date=1L), drop="a"), data.table(b=2L, c="f")) -test(1743.231, fread("a,b,c\n2,1,4j", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4j"), +test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), warning=paste0(base_messages$coerce_na, ".*left as type 'character'")) test(1743.232, fread("a,b,c\n2,1,3+4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c=3+4i)) test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b"), drop="a"), data.table(b=2L, c="f")) @@ -11123,13 +11312,12 @@ test(1750.07, # 0 length `by`, must also use `sets=list()`, so 0L rows result nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = character(), .SDcols=c("amount","value"), sets=list(), id=TRUE)), 0L ) -# for any single value from dataset there should be always be the same aggregate result on any level of grouping -# changed from all(sapply()) to for() to save ram, #5517 -for (i in seq_len(nrow(dt))) { - test(1750.08+i/10000, uniqueN( +test(1750.08, all( # for any single value from dataset there should be always same aggregate result on any level of grouping + sapply(seq_len(nrow(dt)), function(i) uniqueN( groupingsets(dt[i], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character())), - by=c("amount","value")) == 1L) -} + by=c("amount","value") + )) == 1L +), TRUE) # all grouping id matches in all totals r = groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character()), id=TRUE) test(1750.09, uniqueN( @@ -11296,7 +11484,23 @@ test(1751.3, capture.output(fwrite(DT,na="NA",verbose=FALSE)), c("\"x\"","NA")) test(1751.4, fread({fwrite(DT, f<-tempfile());f}), DT) # the important 
thing unlink(f) -# 1752 tested nanotime moved to other.Rraw 22, #5516 +if (test_nanotime) { + old = options(warnPartialMatchArgs=FALSE) # option off temporarily pending https://github.com/eddelbuettel/nanotime/pull/49 + DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", + "2016-09-29T23:59:00.000000001Z", + "2016-09-29T23:59:00.000000999Z", + "1970-01-01T00:01:01.000001000Z", + "1970-01-01T00:00:00.000000000Z", + "1969-12-31T23:59:59.999999999Z", + "1969-12-31T23:59:59.000000089Z", + "1969-12-31T12:13:14.000000000Z", + "1969-12-31T12:13:14.999999999Z", + "1969-12-31T12:13:14.000000001Z", + "1967-03-15T00:00:00.300000002Z", + "1967-03-15T23:59:59.300000002Z"))) + options(old) + test(1752, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) +} # check too many fields error from ,\n line ending highlighted in #2044 test(1753.1, fread("X,Y\n1,2\n3,4\n5,6"), data.table(X=INT(1,3,5),Y=INT(2,4,6))) @@ -11358,7 +11562,18 @@ if (test_R.utils) test(1759, fread(testDir("alluniquechar.csv.gz"))[c(1,2,499,50 H=c("tokakysooopwtmlkeimzbgpein","hguwmynjhecsxpxldyzlemavmw", "lyclruzkazfqhyxnppaafwcveo","myfqhltlwzwwxyvshwrzrdmfyq"))) -# 1760 moved to benchmark.Rraw, #5517 +# fread should use multiple threads on single column input. +# tests 2 threads; the very reasonable limit on CRAN +# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) +if (getDTthreads() == 1L) { + cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. 
under UBSAN and ASAN)\n") +} else { + N = if (TRUE) 2e6 else 1e9 # offline speed check + fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) + test(1760.1, file.info(f)$size > 4*1024*1024) + test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") + unlink(f) +} # fread single column with superfluous fill=TRUE, #2118 test(1761.1, fread("1\n2\n3", fill=TRUE), data.table(V1=1:3)) @@ -11703,10 +11918,10 @@ ld = sapply(same, as.IDate) test(1779.01, uniqueN(ld)==1L) lt = sapply(same[1:2], as.ITime) # exclude date test(1779.02, uniqueN(lt)==1L) -# some random timestamps old defaults vs new methods UTC +# some random 1e6 timestamps old defaults vs new methods UTC intpx = function(x) as.integer(as.POSIXct(x, origin = "1970-01-01", tz = "UTC")) set.seed(1) -i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e3, TRUE) + intpx("2014-10-12") # 1e5 reduced to 1e3, #5517 +i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e5, TRUE) + intpx("2014-10-12") p = as.POSIXct(i, origin = "1970-01-01", tz = "UTC") test(1779.03, identical(as.ITime.default(p), as.ITime(p))) test(1779.04, identical(as.IDate.default(p), as.IDate(p))) @@ -11778,7 +11993,9 @@ test(1812, fread("A,B\n1,2\n3,4\n", skip="4", verbose=TRUE), data.table(V1=3L, V test(1813, fread("A,B\n1,2\n3,4", skip=10L), error="skip=10 but the input only has 3 lines") test(1814, fread("A,B\n1,2\n3,4\n \n\t", skip=3L), error="skip has been set after the last non-whitespace") -# 1815 moved to benchmark.Rraw, #5517 +DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) +fwrite(DT, f<-tempfile()) +test(1815, fread(f, nrows=5), DT[1:5]) #2243 test(1816.1, fread("A,E\n1,2\n5,7\n4,6\n\x1A\x1A", verbose=TRUE), data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L)), @@ -11895,7 +12112,14 @@ fwrite(DT, f) test(1825.22, fread(f, colClasses = c(a = "numeric", b = "integer")), DT, warning="Attempt to override column 2.*ignored") unlink(f) -# 1826 moved to benchmark.Rraw, #5517 +# issue 2351 +set.seed(1) +DT = 
data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) +fwrite(DT, file=f<-tempfile(), eol="\r") +test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) +cat("id888,42", file=f, append=TRUE) # without final \r after last line +test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) +unlink(f) # Issue 2222 test(1827.1, fread("A,B\n1987,1\n1987,3\n", na.strings=c("1987", "NA")), data.table(A=c(NA,NA),B=c(1L,3L))) @@ -11983,7 +12207,21 @@ if (test_R.utils) { V12=c("AAAAAAAAAAAAA","","AAAAAAA","AAA"))) } -# 1835 moved to benchmark.Rraw, #5517 +# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because +# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. +# Would need a very complicated construction of embedded new lines in quoted fields, to test that. +# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. +DT = as.data.table(CO2) +f = tempfile() +for (i in 0:1000) { + start = nrow(CO2)*i + fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) + if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) +} +test(1835, fread(f, verbose=TRUE), + output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", + warning = "Stopped.*line 42253. 
Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") +unlink(f) test(1836, fread('1,2,"3,a"\n4,5,"6,b"'), data.table(V1=c(1L,4L), V2=c(2L,5L), V3=c("3,a","6,b"))) # 2196 @@ -12088,7 +12326,7 @@ rand_strings = function(n) { apply(M, 1, function(x) paste0(letters[x], collapse="")) } set.seed(123) # the random data here doesn't match the data in issue 2275 because they used stringi::stri_rand_strings which has a different RNG -n = 1000 # reduced from 100000 to 1000 for #5517 +n = 100000 DT1 = data.table(RANDOM_STRING = rand_strings(n), DATE = sample(seq(as.Date('2016-01-01'), as.Date('2016-12-31'), by="day"), n, replace=TRUE)) DT2 = data.table(RANDOM_STRING = rand_strings(n), @@ -12133,7 +12371,13 @@ test(1849.9, fread(f, select=c("Date", "Description", "Balance")), data.table(Date=20150725L,Description="abcd",Balance="$5,006")) unlink(f) -# 1850 moved to benchmark.Rraw, #5517 +# segfault when rbindlist is asked to create a DT with more than 2bn rows +DT = data.table(1:1e6) +L = vector("list", 2148) +for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test +test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") +rm(list=c("L","DT")) +gc() # by=.EACHI missings to list columns, #2300 dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5])) @@ -12438,7 +12682,60 @@ fwrite(DT,f<-tempfile()) test(1873, fread(f), DT) unlink(f) -# 1874-1875 moved to benchmark.Rraw, #5517 +# Better jump sync and run-on in PR#2627 +# +# Reproduces error 'did not finish exactly where jump 1 found ...' 
in #2561 in master before PR #2627 +# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly +x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) +x[51094]="" +cat(x, file=f<-tempfile(), sep="\n") +test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], + data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), + output="jumps=[0..2)") # ensure jump 1 happened +# +# out-of-sample short lines in the first jump, not near the jump point +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), + warning="Stopped early on line 5021.*<>") +test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,5020L,NA,NA,5042L,102184L)), + output="jumps=[0..2)") +# +# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 +# confirmed fails in master with that error before PR#2627 +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093), + warning="Stopped early on line 51094.*<>", + output="jumps=[0..2)") +test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,51093L,NA,NA,51151L,102184L)), + output="jumps=[0..2)") +# 
+# jump inside a quoted field containing many new lines, to simulate a dirty jump +# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. +# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" +cat(x, file=f, sep="\n") +test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n + data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), + V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), + output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") +# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's +# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) +unlink(f) test(1876, fread("http://hkhfsk\nhttp://fhdkf\nhttp://kjfhskd\nhttp://hfkjf", header=FALSE), # data not a download, #2531 data.table(V1=c("http://hkhfsk","http://fhdkf","http://kjfhskd","http://hfkjf"))) @@ -12532,7 +12829,7 @@ DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784 test(1887.3, na.omit(DT), DT[0L]) test(1887.4, na.omit(DT, invert=TRUE), DT) -x = runif(1e3) # 1e4 reduced to 1e3 in #5517 but really it was the 1e6 just after 1888.5 below which is now 1e3 too +x = runif(1e4) test(1888, fsort(x), base::sort(x)) test(1888.1, fsort(x, decreasing = TRUE), base::sort(x, decreasing = TRUE), warning = "New parallel sort has not been implemented 
for decreasing=TRUE.*one thread") @@ -12546,7 +12843,7 @@ test(1888.4, fsort(x, decreasing = TRUE, na.last = TRUE), base::sort(x, decreasi x <- as.integer(x) test(1888.5, fsort(x), base::sort(x, na.last = FALSE), warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread") -x = runif(1e3) +x = runif(1e6) test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") @@ -12559,7 +12856,11 @@ test(1889, chmatch(x,x), 1:1000) rm(list=x) gc() -# 1890 used stats::ts.plot, moved to other.Rraw 29 to save ram, #5517 +# test DT$.<- in a data.table-unaware package +DT = data.table(A=1:5) +test(1890.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") +# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col() +test(1890.2, DT, data.table(A=1:5)) # na="" default, #2524 test(1891.1, fread('A,B,C\n1,foo,4\n2,,5\n3,bar,6\n', na.strings=""), data.table(A=1:3, B=c("foo",NA,"bar"), C=4:6)) @@ -12775,7 +13076,43 @@ test(1911.2, DT[, COL_INT := integer(0)], error = "RHS of assignment to existing column 'COL_INT' is zero length but not NULL.*") -# 1912 moved to benchmark.Rraw, #5517 +# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 +# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. +# 2 threads are sufficient to fail before the fix. 
+N = 20 +DF = data.frame(a=rnorm(N), + b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), + c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) +DT = setDT(DF) # setDT required since data.table() already expanded altrep's +before = sum(gc()[, 2]) +fff = function(aref) { + ff = lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) + return(rbindlist(ff)) +} +for(i in 1:100) { + f = fff("a") + rm("f") +} +gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` + # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. +after = sum(gc()[, 2]) +test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). +# +before = sum(gc()[, 2]) +fff = function(aref) { + DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. + lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) +} +for(i in 1:100) { + fff("a") +} +gc() +after = sum(gc()[, 2]) +test(1912.2, after < before + 10) # BEGIN port of old testthat tests, #2740. Issue numbers may be from R-forge. 
# @@ -13579,7 +13916,10 @@ y = as.ITime('543210', format = '%S%M%H') test(1962.095, y, structure(37974L, class = "ITime")) test(1962.096, capture.output(print(y)), '[1] "10:32:54"') test(1962.097, rep(y, 2L), structure(c(37974L, 37974L), class = "ITime")) -test(1962.098, format(as.POSIXlt(y, date='2018-12-01', tz='UTC'), usetz=TRUE), "2018-12-01 10:32:54 UTC") +test(1962.098, as.POSIXlt(y, date = '2018-12-01', tz = 'UTC'), + structure(list(sec = 54, min = 32L, hour = 10L, mday = 1L, mon = 11L, + year = 118L, wday = 6L, yday = 334L, isdst = 0L), + class = c("POSIXlt", "POSIXt"), tzone = "UTC")) test(1962.099, as.POSIXct(x, y), structure(1533119574, tzone = "UTC", class = c("POSIXct", "POSIXt"))) @@ -13980,7 +14320,11 @@ test(1977.4, DT["D", -"GRP"], data.table(ID="D", X=NA_real_, key="ID")) test(1977.5, DT["D", c("ID","GRP")], data.table(ID="D", GRP=NA_integer_, key="ID")) test(1977.6, DT[c("A","D"), c("ID","GRP")], data.table(ID=c("A","A","D"), GRP=INT(1,1,NA))) -# 1978 moved to benchmark.Rraw, #5517 +# catch malformed factor in rbindlist, #3315 +set.seed(32940) +NN=7e5; KK=4e4; TT=25 +DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) +test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites # Drop Null Values from `j` list elements #1406 DT = data.table(a = 1:3,b = letters[1:3],c = LETTERS[1:3]) @@ -14000,7 +14344,14 @@ DT = data.table( id = 1:5 , val = letters[1:5] ) test(1981.3, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -# 1982 moved to benchmark.Rraw, #5517 +# print of DT with many columns reordered them, #3306. +DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... 
bottom 5' to print +out = capture.output(print(DT)) +tt = out[grep("V",out)] +tt = unlist(strsplit(gsub(" ","",tt), "V")) +test(1982.1, tt[1L], "") +tt = as.integer(tt[tt!=""]) +test(1982.2, tt, seq_along(tt)) # parse(text = 'list(`\\phantom{.}`)') fails, #3319 DT <- data.table(x=1, y=1:5) @@ -14289,7 +14640,12 @@ dx = data.table(id = 1L, key = "id") di = list(z=c(2L, 1L)) test(1999.2, key(dx[di]), NULL) -# 2000 moved to benchmark.Rraw, #5517 +# chmatchdup test from benchmark at the bottom of chmatch.c +set.seed(45L) +x = sample(letters, 1e5, TRUE) +y = sample(letters, 1e6, TRUE) +test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) +rm(list=c("x","y")) # rbindlist use.names=TRUE returned random column order when ncol>255; #3373 DT = setDT(replicate(300, rnorm(3L), simplify = FALSE)) @@ -14338,10 +14694,7 @@ test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, us test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4), c=INT(5,6,NA,NA))) test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE), - data.table(a=c(1:4), c=INT(NA,NA,5,6))) -# rbindlist segfault with fill=TRUE and usenames=FALSE #5444 -test(2003.6, rbindlist(list(list(1), list(2,3)), fill=TRUE, use.names=FALSE), data.table(c(1,2), c(NA, 3))) -test(2003.7, rbindlist(list(list(1), list(2,factor(3))), fill=TRUE, use.names=FALSE), data.table(c(1,2), factor(c(NA, 3)))) + data.table(a=c(1:4), V1=INT(NA,NA,5,6))) # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" @@ -14485,8 +14838,8 @@ options(datatable.rbindlist.check=NULL) # this option is set to NULL at the top if (.Platform$OS.type == 'windows') local({ lc_collate <- Sys.getlocale(c('LC_COLLATE')) lc_ctype <- Sys.getlocale(c('LC_CTYPE')) - 
suppressWarnings(Sys.setlocale('LC_COLLATE', "Chinese (Simplified)_China.936")) ## fix CRAN warning #5696 - suppressWarnings(Sys.setlocale('LC_CTYPE', "Chinese (Simplified)_China.936")) + Sys.setlocale('LC_COLLATE', "Chinese (Simplified)_China.936") + Sys.setlocale('LC_CTYPE', "Chinese (Simplified)_China.936") on.exit({ Sys.setlocale('LC_COLLATE', lc_collate) Sys.setlocale('LC_CTYPE', lc_ctype) @@ -14737,7 +15090,210 @@ test(2030.18, .Last.updated, 0L) # zero match test(2031.01, rbind(data.table(A=1:3, B=7:9), data.table(A=4:6, B=as.list(10:12))), ans<-data.table(A=1:6, B=as.list(7:12))) test(2031.02, rbind(data.table(A=1:3, B=as.list(7:9)), data.table(A=4:6, B=10:12)), ans) -# 2032-2033 tested yaml moved to other.Rraw 16-17, #5516 +if (test_yaml) { # csvy; #1701 + f = testDir("csvy/test.csvy") + DT = data.table(var1 = c("A", "B"), + var2 = c(1L, 3L), + var3 = c(2.5, 4.3)) + DT_yaml = copy(DT) + setattr(DT_yaml, 'yaml_metadata', + list(name = "my-dataset", + source = "https://github.com/leeper/csvy/tree/master/inst/examples", + schema = list(fields = list( + list(name = "var1", title = "variable 1", type = "string", + description = "explaining var1", + constraints = list(list(required = TRUE))), + list(name = "var2", title = "variable 2", type = "integer"), + list(name = "var3", title = "variable 3", type = "number") + )))) + ## with skip = '__auto__', fread can figure out + ## how to start after the metadata (just ignoring it) + test(2032.01, fread(f), DT) + ## should be the same, but with yaml_metadata attribute + test(2032.02, fread(f, yaml = TRUE), DT_yaml) + ## testing verbose messaging + test(2032.03, fread(f, yaml = TRUE, verbose = TRUE), + DT_yaml, output = 'Processed.*YAML metadata.*') + ## this file is identical, except the body of the + ## YAML header is commented out with # (should read identically) + test(2032.04, + fread(testDir('csvy/test_comment.csvy'), yaml = TRUE), + DT_yaml) + ## user input is taken as most intentional & overrides YAML + DT_yaml[ 
, var2 := as.numeric(var2)] + test(2032.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')), + DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2') + ## extraneous/unused fields shouldn't throw off reading + DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE) + test(2032.06, names(DT), c('Date', 'WTI')) + test(2032.07, attr(DT, 'yaml_metadata'), + list(names = c("Date", "WTI"), class = "data.frame", + title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv", + fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv", + sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D", + source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples", + item = "PET", sourcekey = "RWTC", freq = "Daily", + rate = "MID", type = "price", units = "Dollars per Barrel", + latestdate = "2015-08-31", releasedate = "2015-09-02", + nextreleasedate = "2015-09-10", source = "Thomson Reuters", + contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800")) + ## yaml can also handle sep, dec, quote, and na.strings + DT_out = data.table(var1 = c("A", "B"), + var2 = c(1L, NA), + var3 = c(2.5, 4.3)) + meta = + list(name = NULL, + schema = list(fields = list( + list(name = "var1", title = "variable 1", type = "string", + description = "a single-quoted character variable"), + list(name = "var2", title = "variable 2", type = "integer"), + list(name = "var3", title = "variable 3", type = "number", + description = "European-style numeric") + )), + header = TRUE, sep = "|", dec = ",", + quote = "'", na.strings = "@") + attr(DT_out, 'yaml_metadata') = meta + test(2032.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out) + ## user-specified attributes can override data from YAML + meta$sep = "-" + setattr(DT_out, 'yaml_metadata', meta) + test(2032.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out, + message = 'User-supplied.*sep.*override') + + 
meta$sep = "|" + setattr(DT_out, 'yaml_metadata', meta) + test(2032.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE), + DT_out, message = 'User-supplied.*header.*override') + col.names = c('x', 'y', 'z') + setnames(DT_out, col.names) + test(2032.11, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out, + message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override')) + + test(2032.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names), + DT_out, message = 'User-supplied.*col.names') + + setnames(DT_out, c('var1', 'var2', 'var3')) + meta$quote = "^" + setattr(DT_out, 'yaml_metadata', meta) + test(2032.13, fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"), + DT_out, message = 'User-supplied.*quote') + + meta$quote = "'" + meta$dec = "." + setattr(DT_out, 'yaml_metadata', meta) + test(2032.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','), + DT_out, message = 'User-supplied.*dec') + + meta$dec = ',' + meta$na.strings = 'NA' + setattr(DT_out, 'yaml_metadata', meta) + test(2032.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'), + DT_out, message = 'User-supplied.*na.strings') + + ## error if YAML malformed + test(2032.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE), + error = 'Reached the end.*YAML.*valid csvy') + ## use any other CSV in test directory which doesn't have YAML + if (test_R.utils) test(2032.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE), + error = 'Encountered.*unskipped.*constitute.*valid YAML') + ## no problem if some fields are missing a type (just + ## resort to standard auto-inferral, i.e., identical to + ## the case of partially-specified colClasses) + DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L), + var3 = c(2.5, 4.3)) + setattr(DT, 'yaml_metadata', + list(name = "my-dataset", source = 
"https://github.com/leeper/csvy/tree/master/inst/examples", + schema = list(fields = list( + list(name = "var1"), list(name = "var2", type = "integer"), + list(name = "var3", type = "number") + )))) + test(2032.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT) + ## skip applies starting after the YAML header + setattr(DT, 'yaml_metadata', + list(schema = list(fields = list( + list(name = "var1", type = "string"), + list(name = "var2", type = "integer"), + list(name = "var3", type = "number") + )))) + test(2032.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT) + ## user-supplied col.names override metadata (as for colClasses) + cn = paste0('V', 1:3) + setnames(DT, cn) + test(2032.20, fread(testDir('csvy/test_skip.csvy'), + yaml = TRUE, skip = 2L, col.names = cn), + DT, message = 'User-supplied column names.*override.*YAML') + ## invalid value fails + test(2032.21, fread(f, yaml = 'gobble'), + error = 'isTRUEorFALSE\\(yaml\\) is not TRUE') + + ## warning that skip-as-search doesn't work with yaml + DT_yaml[ , var2 := as.integer(var2)] + test(2032.22, fread(f, skip = 'var1,', yaml = TRUE), + DT_yaml, warning = 'Combining a search.*YAML.*') + + # fwrite csvy: #3534 + tmp = tempfile() + DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) + # force eol for platform independence + fwrite(DT, tmp, yaml = TRUE, eol = '\n') + as_read = readLines(tmp) + test(2033.01, as_read[c(1L, 24L)], c('---', '---')) + test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) + test(2033.03, grepl('creation_time_utc', as_read[3L])) + test(2033.04, as_read[4:23], + c("schema:", " fields:", " - name: a", " type: integer", + " - name: b", " type: numeric", " - name: c", " type: character", + "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", + # NB: apparently \n is encoded like this in YAML + "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", + "logical01: no")) + tbl_body = c("a,b,c", "1,3.14159265358979,a", 
"2,1,b", "3,2,c", "4,3,d", "5,4,e") + test(2033.05, as_read[25:30], tbl_body) + + # windows eol + fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') + test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') + + # multi-class columns + DT[ , t := .POSIXct(1:5, tz = 'UTC')] + fwrite(DT, tmp, yaml = TRUE) + as_read = readLines(tmp) + test(2033.07, as_read[13L], " type: POSIXct") + + # ~invertibility~ + # fread side needs to be improved for Hugh's colClasses update + DT[ , t := NULL] + fwrite(DT, tmp, yaml = TRUE) + DT2 = fread(tmp, yaml = TRUE) + # remove metadata to compare + attr(DT2, 'yaml_metadata') = NULL + test(2033.08, all.equal(DT, DT2)) + + test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), + output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*")) + + # TODO: test gzip'd yaml which is now supported + + # yaml + bom arguments + DT = data.table(l=letters, n=1:26) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 + lines = readLines(fcon) + lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows + # remove the blank here so we don't need to change this test if/when that changes in yaml package + test(2033.11, length(lines), 48L) + close(fcon) + test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + # re-write should have same output (not appended) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") + lines = readLines(fcon) + lines = lines[lines!=""] + test(2033.13, length(lines), 48L) + close(fcon) + test(2033.14, fread(f), DT) + unlink(f) +} # fcast coverage DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8) @@ -14753,10 +15309,10 @@ test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans) # source() printing edge case; #2369 setup = c('DT = data.table(a = 1)') writeLines(c(setup, 'DT[ , a 
:= 1]'), tmp<-tempfile()) -test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo=TRUE, local=TRUE)), fixed=TRUE))) # local= #5514 +test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE))) ## test force-printing still works writeLines(c(setup, 'DT[ , a := 1][]'), tmp) -test(2036.2, source(tmp, echo=TRUE, local=TRUE), output="1:\\s+1") +test(2036.2, source(tmp, echo = TRUE), output = "1:\\s+1") # more helpful guidance when assigning before setDT() after readRDS(); #1729 DT = data.table(a = 1:3) @@ -15394,7 +15950,16 @@ if (test_bit64) { test(2060.304, fcoalesce(int64, 1), error='Item 2 has a different class than item 1') test(2060.305, fcoalesce(int64, 1L), error = 'Item 2 is type integer but the first item is type double') } -# 2060.401-405 tested nanotime moved to other.Rraw 23, #5516 +# nanotime tests +if (test_nanotime) { + nt = nanotime(int) + nt_val = nanotime(1:4) + test(2060.401, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46 + test(2060.402, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val)) + test(2060.403, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val)) + test(2060.404, fcoalesce(nt, 1), error='Item 2 has a different class than item 1') + test(2060.405, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double') +} # setcoalesce x = c(11L, NA, 13L, NA, 15L, NA) y = c(NA, 12L, 5L, NA, NA, NA) @@ -15411,9 +15976,9 @@ test(2060.503, xx_addr, address(xx)) test(2060.504, xx, x) test(2060.505, address(setcoalesce(xx)), xx_addr) # complex support for fcoalesce -z1 = c(1i, NA_complex_, 1-1i, NA_complex_, 0+3i, NA_complex_) -z2 = c(NA_complex_, 4-2i, 0+0i, NA_complex_, NA_complex_, NA_complex_) -z3 = c(2, NA_complex_, 3+6i, 5-1i, NA_complex_, NA_complex_) +z1 = c(1i, NA, 1-1i, NA, 0+3i, NA) +z2 = c(NA, 4-2i, 0+0i, NA, NA, NA) +z3 = c(2, NA, 3+6i, 5-1i, NA, NA) na_idx = c(2L, 4L, 
6L) test(2060.600, fcoalesce(z1, 0+0i), `[<-`(z1, na_idx, 0+0i)) test(2060.601, fcoalesce(z1, z2), `[<-`(z1, na_idx, c(4-2i, NA, NA))) @@ -15514,7 +16079,7 @@ z = c(1:3) + c(3:1)*1i test(2067.1, shift(z), c(NA, z[1:2])) test(2067.2, shift(z, type = 'lead'), c(z[2:3], NA)) test(2067.3, shift(z, fill = 1i), c(1i, z[1:2])) -test(2067.4, shift(list(z, 1:3)), list(c(NA_complex_, z[1:2]), c(NA, 1:2))) +test(2067.4, shift(list(z, 1:3)), list(c(NA, z[1:2]), c(NA, 1:2))) test(2067.5, shift(z, n=1, type = 'cyclic'), c(z[3], z[1:2])) test(2067.6, shift(z, n=-1, type = 'cyclic'), c(z[2:3], z[1])) test(2067.7, shift(list(z, 1L:3L), n=1, type = 'cyclic'), list(c(z[3], z[1:2]), c(3L, 1:2))) @@ -15885,7 +16450,18 @@ test(2078.32, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE, NAboun test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA)) test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE)) test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA") -# 2080.01-05 tested nanotime moved to other.Rraw 24, #5516 +# nanotime support +if (test_nanotime) { + n=nanotime(1:4) + n[2L]=NA + op = options(datatable.verbose=TRUE) + test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") + test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") + test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") + options(op) + test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") + test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") +} # use raw type to cover fallback to R in between.R old = 
options(datatable.verbose=TRUE) test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FALSE, TRUE, TRUE, TRUE, FALSE), output="fallback to slow R") @@ -15929,7 +16505,10 @@ if (test_bit64) { i = as.integer64(1:4)+3e9 test(2085.01, fifelse(c(TRUE,FALSE,NA,TRUE), i, i+100), c(i[1L], i[2L]+100, as.integer64(NA), i[4])) } -# 2085.11 tested nanotime moved to other.Rraw 25, #5516 +if (test_nanotime) { + n = nanotime(1:4) + test(2085.11, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4])) +} test(2085.21, fifelse(c(TRUE,FALSE,NA), 1:3, c(1,2,3)), c(1,2,NA)) test(2085.22, fifelse(c(TRUE,FALSE,NA), c(1,2,3), 1:3), c(1,2,NA)) test(2085.31, fifelse(c(a=TRUE,b=FALSE), list(m=1,n=2), list(x=11,y=12)), list(a=1, b=12)) @@ -16161,7 +16740,109 @@ test(2107.3, names(DT), c('A','b','c')) setnames(DT, -(1:2), toupper) test(2107.4, names(DT), c('A','b','C')) -# 2108 tested xts moved to other.Rraw 19, #5516 +# first and last should no longer load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached +#stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces()) +x = as.POSIXct("2019-09-09")+0:1 +old = options(datatable.verbose=TRUE) +test(2108.01, last(x), x[length(x)], output="!is.xts(x)") +test(2108.02, first(x), x[1L], output="!is.xts(x)") +if (test_xts) { + xt = xts(1:2, x) + test(2108.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)") + test(2108.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)") + xt = xts(matrix(1:4, 2L, 2L), x) + test(2108.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)") + test(2108.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)") +} +# first on empty df now match head(df, n=1L), #3858 +df = data.frame(a=integer(), b=integer()) +test(2108.11, first(df), df, output="!is.xts(x)") +test(2108.12, last(df), df, 
output="!is.xts(x)") +options(old) +# xts last-first dispatch fix #4053 +x = 1:3 +y = as.POSIXct(x, origin="1970-01-01") +df = data.frame(a=1:2, b=3:2) +dt = as.data.table(df) +mx = matrix(1:9, 3, 3) +ar = array(1:27, c(3,3,3)) +xt = structure( + c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994, + 140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993, + 94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"), + .Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) +) +old = options(datatable.verbose=TRUE) +if (test_xts) { + test(2108.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(2108.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(2108.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(2108.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + xt_last = structure( + c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(1167955200, tzone = "UTC", tclass = "Date"), + .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + xt_last2 = structure( + c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005, + 141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167868800, 1167955200), tzone 
= "UTC", tclass = "Date"), + .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + test(2108.25, last(xt), xt_last, output="using xts::last: is.xts(x)") + test(2108.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)") + test(2108.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(2108.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(2108.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(2108.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + xt_first = structure( + c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(1167782400, tzone = "UTC", tclass = "Date"), + .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + xt_first2 = structure( + c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"), + .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + test(2108.35, first(xt), xt_first, output="using xts::first: is.xts(x)") + test(2108.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)") +} else { + test(2108.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & 
!'package:xts'%in%search()") + test(2108.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.24, last(y, n=1L), y[3L], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.25, last(xt), error="you should have 'xts' installed already") + test(2108.26, last(xt, n=2L), error="you should have 'xts' installed already") + test(2108.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(2108.35, first(xt), error="you should have 'xts' installed already") + test(2108.36, first(xt, n=2L), error="you should have 'xts' installed already") +} +test(2108.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(2108.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(2108.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(2108.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(2108.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(2108.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(2108.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(2108.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & 
is.data.frame(x)") +# matrix/array utils::tail behavior is likely to change in future R, Michael is more in the topic +test(2108.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127 +test(2108.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +test(2108.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127 +test(2108.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +options(old) # error in autonaming by={...}, #3156 DT = data.table(State=c("ERROR", "COMPLETED", "ERROR"), ExitCode=c(1, 0, 2)) @@ -16238,8 +16919,8 @@ g = function(x) { if (x==1L) factor(c("a","b")) else factor(c("a","b","c")) } test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a","b","a","b","c")))) # original test verbatim from the same issue #2199 set.seed(2) -ids = sample(letters, 10) # reduced from 20 to 10 -dates = 1:10 # and 40 to 10 to save ram, #5517 +ids = sample(letters, 20) +dates = 1:40 dt = data.table(CJ(dates, ids, ids)) setnames(dt, c("date", "id1", "id2")) dt[, value := rnorm(length(date))] @@ -16250,8 +16931,8 @@ f1 = function(sdt) { melt.data.table(dt1, id.vars = "id1") } res = dt[, f1(.SD), by=date] -test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)], dt[c(1,.N)]) -test(2114.4, print(res), output="date.*-0.830") +test(2114.3, 
setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)][], dt[c(1,.N)]) +test(2114.4, print(res), output="date.*0.433") # and from #2522 DT = data.table(id=1:9, grp=rep(1:3,each=3), val=c("a","b","c", "a","b","c", "a","b","c")) test(2114.5, as.character(DT[, valfactor1 := factor(val), by = grp]$valfactor1), ans<-rep(c("a","b","c"),3)) @@ -16511,7 +17192,10 @@ if(test_bit64) { i=as.integer64(1:12)+3e9 test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:11L]+100, as.integer64(NA))) } -# 2127.27 tested nanotime moved to other.Rraw 26, #5516 +if(test_nanotime) { + n=nanotime(1:12) + test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:11L]+100, as.integer64(NA))) +} test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec)) test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec) test(2127.30, fcase(test_vec1, rep("1",11L), test_vec2, rep("0",11L)), as.character(out_vec)) @@ -16676,8 +17360,20 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") rm(s1, s2, class2132) - -# 2133 tested xts moved to other.Rraw 20, #5516 +if (test_xts) { + # keep.rownames in as.data.table.xts() supports a string, #4232 + xts = xts::xts(1:10, structure(1:10, class = "Date")) + colnames(xts) = "VALUE" + DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE") + test(2133.1, colnames(DT), c("DATE", "VALUE")) + test(2133.2, key(DT), "DATE") + test(2133.3, as.data.table(xts, keep.rownames = "VALUE"), + error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. 
Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") + test(2133.4, as.data.table(xts, keep.rownames = character()), + error = "keep.rownames must be length 1") + test(2133.5, as.data.table(xts, keep.rownames = NA_character_), + error = "keep.rownames must not be NA") +} # friendlier error for common mistake of using := in i instead of j, #4227 DT = data.table(a = 1) @@ -17450,7 +18146,18 @@ d[1:50, "a"] = d[51:100, "a"] setDT(d) test(2200, nrow(d[a==99]), 2L) -# 2201 moved to benchmark.Rraw, #5517 +# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 +# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too +set.seed(1) +DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary +setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow +test(2201.1, nrow(DT[, .N, by=grp]), 255L) +test(2201.2, nrow(setkey(DT, grp)), 65536L) +set.seed(1) +DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun +test(2201.3, nrow(DT[, .N, by=grp]), 65536L) +test(2201.4, nrow(setkey(DT, grp)), 65536L) +setDTthreads() # restore default throttle # fwrite now allows sep="", #4817 test(2202.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), @@ -17490,7 +18197,11 @@ test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty l test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), setnames(setkey(as.data.table(mtcars, keep.rownames = TRUE), rn), 'rn', 'model')) -# 2205 tested nanotime moved to other.Rraw 27, #5516 +# na.omit works for nanotime, #4744 +if (test_nanotime) { + DT = data.table(time=nanotime(c(1,NA,3))) + test(2205, na.omit(DT), DT[c(1,3)]) +} # isRealReallyInt, #3966 test(2206.01, isRealReallyInt(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE) @@ -17589,8 +18300,8 @@ for (col in c("a","b","c")) { # DT() functional form, #4872 #5106 
#5107 #5129 if (base::getRversion() >= "4.1.0") { - DT = DTfun # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 + if (exists("DTfun")) DT=DTfun # just in dev-mode restore DT() in .GlobalEnv as DT object overwrote it in tests above droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below test(2212.011, EVAL("mtcars |> DT(mpg>20, .(mean_hp=round(mean(hp),2)), by=cyl)"), data.frame(cyl=c(6,4), mean_hp=c(110.0, 82.64))) @@ -17641,7 +18352,6 @@ if (base::getRversion() >= "4.1.0") { test(2212.52, EVAL("D |> DT(D[, .I[which.max(mpg)], by=cyl]$V1)"), ans) test(2212.53, EVAL("filter |> DT(filter[, .I[which.max(mpg)], by=cyl]$V1)"), error="unused.*argument.*by.*cyl") # R's [.data.frame error on filter[...] test(2212.54, EVAL("filter |> DT((filter |> DT(, .I[which.max(mpg)], by=cyl))$V1)"), as.data.frame(ans)) - rm(DT) } # precision powers of 10^(-n), #4461 @@ -17767,30 +18477,26 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE), l = sample(c(TRUE, FALSE, NA), 20, TRUE), r = as.raw(sample(1:5, 20, TRUE))) -load(testDir("test2224.Rdata")) # 47KB array 24x8 where each cell contains a length-20 result +load(testDir("test2224.Rdata")) # ans array if (test_bit64) { DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))] } else { ans = ans[, -match("i64",colnames(ans))] } -i = 1L -for (col in names(DT)[-1]) { - for (n in list(1, 5, -1, -5, c(1,2), c(-1,1))) { - for (type in c('lag','lead','shift','cyclic')) { - # fill is tested by group in tests 2218.*; see comments in #5205 - # sapply(sapply()) changed to for(for(for())) to save 29MB, #5517 - test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE - EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), - ans[[i]]) - i = i+1L - } - } -} +test(2224.01, sapply(names(DT)[-1], function(col) { + sapply(list(1, 5, -1, -5, c(1,2), c(-1,1)), function(n) list( + 
# fill is tested by group in tests 2218.*; see comments in #5205 + EVAL(sprintf("DT[, shift(%s, %d, type='lag'), by=x]$V1", col, n)), + EVAL(sprintf("DT[, shift(%s, %d, type='lead'), by=x]$V1", col, n)), + EVAL(sprintf("DT[, shift(%s, %d, type='shift'), by=x]$V1", col, n)), + EVAL(sprintf("DT[, shift(%s, %d, type='cyclic'), by=x]$V1", col, n)) + )) +}), ans) a = 1:2 # fill argument with length > 1 which is not a call -test(2224.2, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") +test(2224.02, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") DT = data.table(x=pairlist(1), g=1) # unsupported type as argument -test(2224.3, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") +test(2224.03, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") # groupingsets by named by argument test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), @@ -18080,16 +18786,16 @@ test(2235.1, copy(DT)[, c("z", "x") := {x = NULL; list(2, NULL)}], data.table(z test(2235.2, copy(DT)[, c("z", "x") := {list(2, NULL)}], data.table(z = 2)) # move IDate from POSIXlt to C, add yearquarter; #649 -x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01", NA) -test(2236.1, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L, NA)) -test(2236.2, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L, NA)) -test(2236.3, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L, NA)) -test(2236.4, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L, NA)) -test(2236.5, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L, NA)) -test(2236.6, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L, NA)) -test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L, NA)) -test(2236.8, yearmon(x), 
c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12, NA)) -test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100, NA)) +x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01") +test(2236.1, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L)) +test(2236.2, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L)) +test(2236.3, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L)) +test(2236.4, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L)) +test(2236.5, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L)) +test(2236.6, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L)) +test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L)) +test(2236.8, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12)) +test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100)) # as.data.table() no longer ignores row.names=, #5319 dt = data.table(a=1:2, b=3:4) @@ -18108,6 +18814,3 @@ test(2238.6, "a" %notin% integer(), TRUE) test(2238.7, "a" %notin% NULL, TRUE) test(2238.8, NA %notin% 1:5, TRUE) test(2238.9, NA %notin% c(1:5, NA), FALSE) - -# shift actionable error on matrix input #5287 -test(2239.1, shift(matrix(1:10, ncol = 1)), error="consider wrapping") diff --git a/man/assign.Rd b/man/assign.Rd index df255d395a..bb87a5221b 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -66,7 +66,7 @@ All of the following result in a friendly error (by design) : DT[, {col1 := 1L; col2 := 2L}] # Use the functional form, `:=`(), instead (see above). } -For additional resources, please read \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}. 
Also have a look at StackOverflow's \href{https://stackoverflow.com/questions/tagged/data.table/}{data.table tag}. +For additional resources, please read \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}. Also have a look at StackOverflow's \href{https://stackoverflow.com/search?q=\%5Bdata.table\%5D+reference}{data.table tag}. \code{:=} in \code{j} can be combined with all types of \code{i} (such as binary search), and all types of \code{by}. This a one reason why \code{:=} has been implemented in \code{j}. Please see \href{../doc/datatable-reference-semantics}{\code{vignette("datatable-reference-semantics")}} and also \code{FAQ 2.16} for analogies to SQL. diff --git a/man/data.table.Rd b/man/data.table.Rd index b8011b422a..ecc79e2a54 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -5,6 +5,7 @@ \alias{Ops.data.table} \alias{is.na.data.table} \alias{[.data.table} +\alias{DT} \alias{.} \alias{.(} \alias{.()} @@ -61,13 +62,13 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac If \code{i} is a \code{data.table}, the columns in \code{i} to be matched against \code{x} can be specified using one of these ways: \itemize{ - \item \code{on} argument (see below). It allows for both \code{equi-} and the newly implemented \code{non-equi} joins. + \item{\code{on} argument (see below). It allows for both \code{equi-} and the newly implemented \code{non-equi} joins.} - \item If not, \code{x} \emph{must be keyed}. Key can be set using \code{\link{setkey}}. If \code{i} is also keyed, then first \emph{key} column of \code{i} is matched against first \emph{key} column of \code{x}, second against second, etc.. + \item{If not, \code{x} \emph{must be keyed}. Key can be set using \code{\link{setkey}}. If \code{i} is also keyed, then first \emph{key} column of \code{i} is matched against first \emph{key} column of \code{x}, second against second, etc.. 
If \code{i} is not keyed, then first column of \code{i} is matched against first \emph{key} column of \code{x}, second column of \code{i} against second \emph{key} column of \code{x}, etc\ldots - This is summarised in code as \code{min(length(key(x)), if (haskey(i)) length(key(i)) else ncol(i))}. + This is summarised in code as \code{min(length(key(x)), if (haskey(i)) length(key(i)) else ncol(i))}.} } Using \code{on=} is recommended (even during keyed joins) as it helps understand the code better and also allows for \emph{non-equi} joins. @@ -99,15 +100,15 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts: \itemize{ - \item A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]} + \item{A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]}} - \item a \code{list()} of expressions of column names: e.g., \code{DT[, .(sa=sum(a)), by=.(x=x>0, y)]} + \item{a \code{list()} of expressions of column names: e.g., \code{DT[, .(sa=sum(a)), by=.(x=x>0, y)]}} - \item a single character string containing comma separated column names (where spaces are significant since column names may contain spaces even at the start or end): e.g., \code{DT[, sum(a), by="x,y,z"]} + \item{a single character string containing comma separated column names (where spaces are significant since column names may contain spaces even at the start or end): e.g., \code{DT[, sum(a), by="x,y,z"]}} - \item a character vector of column names: e.g., \code{DT[, sum(a), by=c("x", "y")]} + \item{a character vector of column names: e.g., \code{DT[, sum(a), by=c("x", "y")]}} - \item or of the form \code{startcol:endcol}: e.g., \code{DT[, sum(a), by=x:z]} + 
\item{or of the form \code{startcol:endcol}: e.g., \code{DT[, sum(a), by=x:z]}} } \emph{Advanced:} When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}), \code{DT[i, j, by=.EACHI]} evaluates \code{j} for the groups in `DT` that each row in \code{i} joins to. That is, you can join (in \code{i}) and aggregate (in \code{j}) simultaneously. We call this \emph{grouping by each i}. See \href{https://stackoverflow.com/a/27004566/559784}{this StackOverflow answer} for a more detailed explanation until we \href{https://github.com/Rdatatable/data.table/issues/944}{roll out vignettes}. @@ -127,10 +128,10 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{roll}{ When \code{i} is a \code{data.table} and its row matches to all but the last \code{x} join column, and its value in the last \code{i} join column falls in a gap (including after the last observation in \code{x} for that group), then: \itemize{ - \item \code{+Inf} (or \code{TRUE}) rolls the \emph{prevailing} value in \code{x} forward. It is also known as last observation carried forward (LOCF). - \item \code{-Inf} rolls backwards instead; i.e., next observation carried backward (NOCB). - \item finite positive or negative number limits how far values are carried forward or backward. - \item "nearest" rolls the nearest value instead. + \item{\code{+Inf} (or \code{TRUE}) rolls the \emph{prevailing} value in \code{x} forward. It is also known as last observation carried forward (LOCF).} + \item{\code{-Inf} rolls backwards instead; i.e., next observation carried backward (NOCB).} + \item{finite positive or negative number limits how far values are carried forward or backward.} + \item{"nearest" rolls the nearest value instead.} } Rolling joins apply to the last join column, generally a date but can be any variable. It is particularly fast using a modified binary search. 
@@ -138,8 +139,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{rollends}{ A logical vector length 2 (a single logical is recycled) indicating whether values falling before the first value or after the last value for a group should be rolled as well. \itemize{ - \item If \code{rollends[2]=TRUE}, it will roll the last value forward. \code{TRUE} by default for LOCF and \code{FALSE} for NOCB rolls. - \item If \code{rollends[1]=TRUE}, it will roll the first value backward. \code{TRUE} by default for NOCB and \code{FALSE} for LOCF rolls. + \item{If \code{rollends[2]=TRUE}, it will roll the last value forward. \code{TRUE} by default for LOCF and \code{FALSE} for NOCB rolls.} + \item{If \code{rollends[1]=TRUE}, it will roll the first value backward. \code{TRUE} by default for NOCB and \code{FALSE} for LOCF rolls.} } When \code{roll} is a finite number, that limit is also applied when rolling the ends.} @@ -162,28 +163,27 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. When \code{.NATURAL} keyword provided then \emph{natural join} is made (join on common columns). There are multiple ways of specifying the \code{on} argument: \itemize{ - \item As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}. - \item \emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}. 
+ \item{As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}.} + \item{\emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}. For example, \code{X[Y, on=c(x1="y1", x2="y2")]} joins \code{X} and \code{Y} by matching columns \code{x1} and \code{x2} in \code{X} with columns \code{y1} and \code{y2} in \code{Y}, respectively. From v1.9.8, you can also express foreign key joins using the binary operator \code{==}, e.g. \code{X[Y, on=c("x1==y1", "x2==y2")]}. - NB: shorthand like \code{X[Y, on=c("a", V2="b")]} is also possible if, e.g., column \code{"a"} is common between the two tables. - - \item For convenience during interactive scenarios, it is also possible to use \code{.()} syntax as \code{X[Y, on=.(a, b)]}. - \item From v1.9.8, (non-equi) joins using binary operators \code{>=, >, <=, <} are also possible, e.g., \code{X[Y, on=c("x>=a", "y<=b")]}, or for interactive use as \code{X[Y, on=.(x>=a, y<=b)]}. + NB: shorthand like \code{X[Y, on=c("a", V2="b")]} is also possible if, e.g., column \code{"a"} is common between the two tables.} + \item{For convenience during interactive scenarios, it is also possible to use \code{.()} syntax as \code{X[Y, on=.(a, b)]}.} + \item{From v1.9.8, (non-equi) joins using binary operators \code{>=, >, <=, <} are also possible, e.g., \code{X[Y, on=c("x>=a", "y<=b")]}, or for interactive use as \code{X[Y, on=.(x>=a, y<=b)]}.} } See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } - \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. 
For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } + \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr \enumerate{ - \item programming time (easier to write, read, debug and maintain), and - \item compute time (fast and memory efficient). + \item{programming time (easier to write, read, debug and maintain), and} + \item{compute time (fast and memory efficient).} } The general form of data.table syntax is:\cr @@ -212,7 +212,7 @@ The way to read this out loud is: "Take \code{DT}, subset rows by \code{i}, \emp X[c>1, sum(a), by=c] # get rows where c>1 is TRUE, and on those rows, get sum(a) grouped by 'c' X[Y, .(a, b), on="c"] # get rows where Y$c == X$c, and select columns 'X$a' and 'X$b' for those rows X[Y, .(a, i.a), on="c"] # get rows where Y$c == X$c, and then select 'X$a' and 'Y$a' (=i.a) - X[Y, sum(a*i.a), on="c", by=.EACHI] # for *each* 'Y$c', get sum(a*i.a) on matching rows in 'X$c' + X[Y, sum(a*i.a), on="c", by=.EACHI] # for *each* 'Y$c', get sum(a*i.a) on matching rows in 'X$c' X[, plot(a, b), by=c] # j accepts any expression, generates plot for each group and returns no data # see ?assign to add/update/delete columns by reference using the same consistent interface @@ -434,6 +434,13 @@ dev.off() # using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v' DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b] +# functional query DT(...)
+\dontshow{ #dontrun to pass R CMD check prior to R 4.1.0 when |> was added + # an if getRVersion()>"4.1.0" still has its code parsed } +\dontrun{ +mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) +} + # Support guide and links: # https://github.com/Rdatatable/data.table/wiki/Support diff --git a/man/fread.Rd b/man/fread.Rd index 4456e11d10..cc96062dec 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -88,15 +88,15 @@ On Windows, "French_France.1252" is tried which should be available as standard When \code{quote} is a single character, \itemize{ - \item Spaces and other whitespace (other than \code{sep} and \code{\\n}) may appear in unquoted character fields, e.g., \code{\dots,2,Joe Bloggs,3.14,\dots}. + \item{Spaces and other whitespace (other than \code{sep} and \code{\\n}) may appear in unquoted character fields, e.g., \code{\dots,2,Joe Bloggs,3.14,\dots}.} - \item When \code{character} columns are \emph{quoted}, they must start and end with that quoting character immediately followed by \code{sep} or \code{\\n}, e.g., \code{\dots,2,"Joe Bloggs",3.14,\dots}. + \item{When \code{character} columns are \emph{quoted}, they must start and end with that quoting character immediately followed by \code{sep} or \code{\\n}, e.g., \code{\dots,2,"Joe Bloggs",3.14,\dots}. In essence quoting character fields are \emph{required} only if \code{sep} or \code{\\n} appears in the string value. Quoting may be used to signify that numeric data should be read as text. Unescaped quotes may be present in a quoted field, e.g., \code{\dots,2,"Joe, "Bloggs"",3.14,\dots}, as well as escaped quotes, e.g., \code{\dots,2,"Joe \",Bloggs\"",3.14,\dots}. If an embedded quote is followed by the separator inside a quoted field, the embedded quotes up to that point in that field must be balanced; e.g. \code{\dots,2,"www.blah?x="one",y="two"",3.14,\dots}. 
- On those fields that do not satisfy these conditions, e.g., fields with unbalanced quotes, \code{fread} re-attempts that field as if it isn't quoted. This is quite useful in reading files that contains fields with unbalanced quotes as well, automatically. + On those fields that do not satisfy these conditions, e.g., fields with unbalanced quotes, \code{fread} re-attempts that field as if it isn't quoted. This is quite useful in reading files that contains fields with unbalanced quotes as well, automatically.} } To read fields \emph{as is} instead, use \code{quote = ""}. @@ -106,16 +106,16 @@ To read fields \emph{as is} instead, use \code{quote = ""}. Currently, the \code{yaml} setting is somewhat inflexible with respect to incorporating metadata to facilitate file reading. Information on column classes should be stored at the top level under the heading \code{schema} and subheading \code{fields}; those with both a \code{type} and a \code{name} sub-heading will be merged into \code{colClasses}. Other supported elements are as follows: \itemize{ - \item \code{sep} (or alias \code{delimiter}) - \item \code{header} - \item \code{quote} (or aliases \code{quoteChar}, \code{quote_char}) - \item \code{dec} (or alias \code{decimal}) - \item \code{na.strings} + \item{ \code{sep} (or alias \code{delimiter}) } + \item{ \code{header} } + \item{ \code{quote} (or aliases \code{quoteChar}, \code{quote_char}) } + \item{ \code{dec} (or alias \code{decimal}) } + \item{ \code{na.strings} } } \bold{File Download:} -When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \code{fread} detects this and \emph{downloads} the target to a temporary file (at \code{tempfile()}) before proceeding to read the file as usual. 
URLS (ftps:// and https:// as well as ftp:// and http://) paths are downloaded with \code{download.file} and \code{method} set to \code{getOption("download.file.method")}, defaulting to \code{"auto"}; and file:// is downloaded with \code{download.file} with \code{method="internal"}. NB: this implies that for file://, even files found on the current machine will be "downloaded" (i.e., hard-copied) to a temporary file. See \code{\link{download.file}} for more details. +When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \code{fread} detects this and \emph{downloads} the target to a temporary file (at \code{tempfile()}) before proceeding to read the file as usual. Secure URLS (ftps:// and https://) are downloaded with \code{curl::curl_download}; ftp:// and http:// paths are downloaded with \code{download.file} and \code{method} set to \code{getOption("download.file.method")}, defaulting to \code{"auto"}; and file:// is downloaded with \code{download.file} with \code{method="internal"}. NB: this implies that for file://, even files found on the current machine will be "downloaded" (i.e., hard-copied) to a temporary file. See \code{\link{download.file}} for more details. \bold{Shell commands:} diff --git a/man/froll.Rd b/man/froll.Rd index d6cb75067f..5f86d791a0 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -7,115 +7,115 @@ \alias{rollmean} \alias{frollmean} \alias{rollsum} +\alias{rollmax} \alias{frollsum} +\alias{frollmax} \alias{rollapply} \alias{frollapply} \title{Rolling functions} \description{ - Fast rolling functions to calculate aggregates on sliding windows. Function name and arguments are experimental. + Fast rolling functions to calculate aggregates on sliding windows. 
} \usage{ -frollmean(x, n, fill=NA, algo=c("fast", "exact"), - align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollsum(x, n, fill=NA, algo=c("fast","exact"), - align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) + frollmean(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), + na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) + frollsum(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), + na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) + frollmax(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), + na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, hasNA, give.names=FALSE) + frollapply(x, n, FUN, \dots, fill=NA, align=c("right","left","center"), + adaptive=FALSE, partial=FALSE, give.names=FALSE) } \arguments{ \item{x}{ Vector, \code{data.frame} or \code{data.table} of integer, numeric or logical columns over which to calculate the windowed aggregations. May also be a list, in which case the rolling function is applied to each of its elements. } - \item{n}{ Integer vector giving rolling window size(s). This is the \emph{total} number of included values. Adaptive rolling functions also accept a list of integer vectors. } + \item{n}{ Integer vector giving rolling window size(s). This is the \emph{total} number of included values in aggregate function. Adaptive rolling functions also accept a list of integer vectors when applying multiple window sizes. } \item{fill}{ Numeric; value to pad by. Defaults to \code{NA}. } - \item{algo}{ Character, default \code{"fast"}. When set to \code{"exact"}, a slower (but more accurate) algorithm is used. It - suffers less from floating point rounding errors by performing an extra pass, and carefully handles all non-finite values. - It will use mutiple cores where available. 
See Details for more information. } + \item{algo}{ Character, default \code{"fast"}. When set to \code{"exact"}, a slower (in some cases more accurate) algorithm is used. See \emph{Implementation} section below for details. } \item{align}{ Character, specifying the "alignment" of the rolling window, defaulting to \code{"right"}. \code{"right"} covers preceding rows (the window \emph{ends} on the current value); \code{"left"} covers following rows (the window \emph{starts} on the current value); \code{"center"} is halfway in between (the window is \emph{centered} on the current value, biased towards \code{"left"} when \code{n} is even). } - \item{na.rm}{ Logical, default \code{FALSE}. Should missing values be removed when - calculating window? For details on handling other non-finite values, see Details. } - \item{hasNA}{ Logical. If it is known that \code{x} contains \code{NA} - then setting this to \code{TRUE} will speed up calculation. Defaults to \code{NA}. } - \item{adaptive}{ Logical, default \code{FALSE}. Should the rolling function be calculated adaptively? See Details below. } - \item{FUN}{ The function to be applied to the rolling window; see Details for restrictions. } + \item{na.rm}{ Logical, default \code{FALSE}. Should missing values be removed when calculating window? } + \item{has.nf}{ Logical. If it is known that \code{x} contains (or not) non-finite values (\code{NA, NaN, Inf, -Inf}) then setting this to \code{TRUE}/\code{FALSE} may speed up computation. Defaults to \code{NA}. See \emph{has.nf argument} section below for details. } + \item{adaptive}{ Logical, default \code{FALSE}. Should the rolling function be calculated adaptively? See \emph{Adaptive rolling functions} section below for details. } + \item{partial}{ Logical, default \code{FALSE}. Should the rolling window size(s) provided in \code{n} be trimmed to available observations. See \emph{\code{partial} argument} section below for details. 
} + \item{FUN}{ The function to be applied to the rolling window in \code{frollapply}; see \emph{frollapply} section below for details. } \item{\dots}{ Extra arguments passed to \code{FUN} in \code{frollapply}. } + \item{hasNA}{ Logical. Deprecated, use \code{has.nf} argument instead. } + \item{give.names}{ Logical, default \code{FALSE}. When \code{TRUE}, names are automatically generated corresponding to names of \code{x} and names of \code{n}. If answer is an atomic vector, then the argument is ignored, see examples. } } \details{ - \code{froll*} functions accept vectors, lists, \code{data.frame}s or - \code{data.table}s. They always return a list except when the input is a - \code{vector} and \code{length(n)==1}, in which case a \code{vector} - is returned, for convenience. Thus, rolling functions can be used - conveniently within \code{data.table} syntax. + \code{froll*} functions accept vector, list, \code{data.frame} or \code{data.table}. Functions operate on a single vector; when a non-atomic input is passed, the function is applied column-by-column, not to the complete set of columns at once. - Argument \code{n} allows multiple values to apply rolling functions on - multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list. - Each list element must be integer vector of window sizes corresponding - to every single observation in each column; see Examples. + Argument \code{n} allows multiple values to apply rolling function on multiple window sizes. If \code{adaptive=TRUE}, then \code{n} can be a list to specify multiple window sizes for adaptive rolling computation. See \emph{Adaptive rolling functions} section below for details. - When \code{algo="fast"} an \emph{"on-line"} algorithm is used, and - all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. 
- Setting \code{algo="exact"} will make rolling functions to use a more - computationally-intensive algorithm that suffers less from floating point - rounding error (the same consideration applies to \code{\link[base]{mean}}). - \code{algo="exact"} also handles \code{NaN, +Inf, -Inf} consistently to - base R. In case of some functions (like \emph{mean}), it will additionally - make extra pass to perform floating point error correction. Error - corrections might not be truly exact on some platforms (like Windows) - when using multiple threads. + When multiple columns and/or multiple window widths are provided, then computation runs in parallel (except for \code{frollapply}). The exception is for \code{algo="exact"}, which runs in parallel even for single column and single window width. By default data.table uses only half of available CPUs, see \code{\link{setDTthreads}} for details on how to tune CPU usage. - Adaptive rolling functions are a special case where each - observation has its own corresponding rolling window width. Due to the logic - of adaptive rolling functions, the following restrictions apply: - \itemize{ - \item \code{align} only \code{"right"}. - \item if list of vectors is passed to \code{x}, then all - vectors within it must have equal length. - } - - When multiple columns or multiple windows width are provided, then they - are run in parallel. The exception is for \code{algo="exact"}, which runs in - parallel already. - - \code{frollapply} computes rolling aggregate on arbitrary R functions. - The input \code{x} (first argument) to the function \code{FUN} - is coerced to \emph{numeric} beforehand and \code{FUN} - has to return a scalar \emph{numeric} value. Checks for that are made only - during the first iteration when \code{FUN} is evaluated. Edge cases can be - found in examples below. 
Any R function is supported, but it is not optimized - using our own C implementation -- hence, for example, using \code{frollapply} - to compute a rolling average is inefficient. It is also always single-threaded - because there is no thread-safe API to R's C \code{eval}. Nevertheless we've - seen the computation speed up vis-a-vis versions implemented in base R. + Setting \code{options(datatable.verbose=TRUE)} will display various information about how the rolling function was processed. It will not print information in real time but only at the end of the processing. } \value{ - A list except when the input is a \code{vector} and - \code{length(n)==1} in which case a \code{vector} is returned. + A list except when the input is a \code{vector} and \code{length(n)==1}, in which case a \code{vector} is returned, for convenience. Thus, rolling functions can be used conveniently within \code{data.table} syntax. } \note{ - Users coming from most popular package for rolling functions - \code{zoo} might expect following differences in \code{data.table} - implementation. + Be aware that rolling functions operate on the physical order of input. If the intent is to roll values in a vector by a logical window, for example an hour, or a day, then one has to use an adaptive rolling function or has to ensure that there are no gaps in input. For details see \href{https://github.com/Rdatatable/data.table/issues/3241}{issue #3241}. +} +\section{\code{has.nf} argument}{ + \code{has.nf} can be used to speed up processing in cases when it is known if \code{x} contains (or not) non-finite values (\code{NA, NaN, Inf, -Inf}). \itemize{ - \item rolling function will always return result of the same length as input. - \item \code{fill} defaults to \code{NA}. - \item \code{fill} accepts only constant values. It does not support - for \emph{na.locf} or other functions. - \item \code{align} defaults to \code{"right"}. 
- \item \code{na.rm} is respected, and other functions are not needed - when input contains \code{NA}. - \item integers and logical are always coerced to double. - \item when \code{adaptive=FALSE} (default), then \code{n} must be a - numeric vector. List is not accepted. - \item when \code{adaptive=TRUE}, then \code{n} must be vector of - length equal to \code{nrow(x)}, or list of such vectors. - \item \code{partial} window feature is not supported, although it can - be accomplished by using \code{adaptive=TRUE}, see - examples. \code{NA} is always returned for incomplete windows. + \item{ Default \code{has.nf=NA} uses faster implementation that does not support non-finite values, but when non-finite values are detected it will re-run non-finite supported implementation. } + \item{ \code{has.nf=TRUE} uses non-finite aware implementation straightaway. } + \item{ \code{has.nf=FALSE} uses faster implementation that does not support non-finite values. Then depending on the rolling function it will either: + \itemize{ + \item{ (\emph{mean, sum}) detect non-finite, re-run non-finite aware. } + \item{ (\emph{max}) not detect NFs and may silently give incorrect answer. } + } + In general \code{has.nf=FALSE && any(!is.finite(x))} should be considered as undefined behavior. Therefore \code{has.nf=FALSE} should be used with care. } + } +} +\section{Implementation}{ + Each rolling function has 4 different implementations. First factor that decides which implementation is being used is \code{adaptive} argument, see section below for details. Then for each of those two algorithms (adaptive \code{TRUE/FALSE}) there are two \code{algo} argument values. + \itemize{ + \item{ \code{algo="fast"} uses \emph{"on-line"}, single pass, algorithm. + \itemize{ + \item{ \emph{max} rolling function will not do only a single pass but, on average \code{length(x)/n}, nested loops will be computed. 
The bigger the window the bigger advantage over algo \emph{exact} which computes \code{length(x)} nested loops. Note that \emph{exact} uses multiple CPUs so for a small window size and many CPUs it is possible it will be actually faster than \emph{fast} but in those cases elapsed timings will likely be far below a single second. } + \item{ Not all functions have \emph{fast} implementation available. As of now \emph{max} and \code{adaptive=TRUE} does not have, therefore it will automatically fall back to \emph{exact} implementation. \code{datatable.verbose} option can be used to check that. } + }} + \item{ \code{algo="exact"} will make rolling functions to use a more computationally-intensive algorithm. For each observation from input vector it will compute a function on a window from scratch (complexity \eqn{O(n^2)}). + \itemize{ + \item{ Depending on the function, this algorithm may suffer less from floating point rounding error (the same consideration applies to base \code{\link[base]{mean}}). } + \item{ In case of \emph{mean} (and possibly other functions in future), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) when using multiple threads. } + }} + } +} +\section{Adaptive rolling functions}{ + Adaptive rolling functions are a special case where each observation has its own corresponding rolling window width. Therefore values passed to \code{n} argument must be series corresponding to observations in \code{x}. If multiple windows are meant to be computed then a list of integer vectors is expected; each list element must be an integer vector of window size corresponding to observations in \code{x}; see Examples. Due to the logic or implementation of adaptive rolling functions, the following restrictions apply: + \itemize{ + \item{ \code{align} does not support \code{"center"}. 
} + \item{ if list of vectors is passed to \code{x}, then all vectors within it must have equal length due to the fact that length of adaptive window widths must match the length of vectors in \code{x}. } + } +} +\section{\code{partial} argument}{ + \code{partial=TRUE} will turn a function into adaptive function and trim window size in \code{n} argument using \code{n = c(seq.int(n), rep(n, len-n))} to available observations. It inherits limitations of adaptive rolling functions, see above. Adaptive functions uses more complex algorithms, therefore if performance is important then \code{partial=TRUE} should be avoided in favour of computing only missing observations separately after the rolling function; see examples. +} +\section{\code{frollapply}}{ + \code{frollapply} computes rolling aggregate on arbitrary R functions. The input \code{x} (first argument) to the function \code{FUN} is coerced to \emph{numeric} beforehand and \code{FUN} has to return a scalar \emph{numeric} value. Checks for that are made only during the first iteration when \code{FUN} is evaluated. Edge cases can be found in examples below. Any R function is supported, but it is not optimized using our own C implementation -- hence, for example, using \code{frollapply} to compute a rolling average is inefficient. It is also always single-threaded because there is no thread-safe API to R's C \code{eval}. Nevertheless we've seen the computation speed up vis-a-vis versions implemented in base R, especially when combined with \code{adaptive=TRUE}. + Support for \code{adaptive=TRUE} in \code{frollapply} requires R 3.4.0 or higher. +} +\section{\code{zoo} package users notice}{ + Users coming from most popular package for rolling functions \code{zoo} might expect following differences in \code{data.table} implementation + \itemize{ + \item{ rolling function will always return result of the same length as input. } + \item{ \code{fill} defaults to \code{NA}. 
} + \item{ \code{fill} accepts only constant values. No support for \emph{na.locf} or other functions. } + \item{ \code{align} defaults to \code{"right"}. } + \item{ \code{na.rm} is respected, and other functions are not needed when input contains \code{NA}. } + \item{ integers and logical are always coerced to double. } + \item{ when \code{adaptive=FALSE} (default), then \code{n} must be a numeric vector. List is not accepted. } + \item{ when \code{adaptive=TRUE}, then \code{n} must be vector of length equal to \code{nrow(x)}, or list of such vectors. } + \item{ \code{by.column} argument is not yet supported in \code{frollapply}. For details/upvote see \href{https://github.com/Rdatatable/data.table/issues/4887}{issue #4887}. } } - - Be aware that rolling functions operates on the physical order of input. - If the intent is to roll values in a vector by a logical window, for - example an hour, or a day, one has to ensure that there are no gaps in - input. For details see \href{https://github.com/Rdatatable/data.table/issues/3241}{issue #3241}. 
} \examples{ +# single vector and single window +frollmean(1:6, 3) + d = as.data.table(list(1:6/2, 3:8/4)) # rollmean of single vector and single window frollmean(d[, V1], 3) @@ -127,14 +127,32 @@ frollmean(d[, .(V1)], c(3, 4)) frollmean(d, c(3, 4)) ## three calls above will use multiple cores when available -# partial window using adaptive rolling function -an = function(n, len) c(seq.int(n), rep(n, len-n)) -n = an(3, nrow(d)) -frollmean(d, n, adaptive=TRUE) - # frollsum frollsum(d, 3:4) +# frollmax +frollmax(d, 3:4) + +# partial=TRUE +x = 1:6/2 +n = 3 +ans1 = frollmean(x, n, partial=TRUE) +# same using adaptive=TRUE +an = function(n, len) c(seq.int(n), rep(n, len-n)) +ans2 = frollmean(x, an(n, length(x)), adaptive=TRUE) +all.equal(ans1, ans2) +# much faster by using partial only for incomplete observations +ans3 = frollmean(x, n) +ans3[seq.int(n-1L)] = frollmean(x[seq.int(n-1L)], n, partial=TRUE) +all.equal(ans1, ans3) + +# give.names +frollsum(list(x=1:5, y=5:1), c(tiny=2, big=4), give.names=TRUE) + +# has.nf=FALSE should be used with care +frollmax(c(1,2,NA,4,5), 2) +frollmax(c(1,2,NA,4,5), 2, has.nf=FALSE) + # frollapply frollapply(d, 3:4, sum) f = function(x, ...) if (sum(x, ...)>5) min(x, ...) else max(x, ...) @@ -200,7 +218,7 @@ f = function(x) { ## FUN is not type-stable try(frollapply(1:5, 3, f)) } \seealso{ - \code{\link{shift}}, \code{\link{data.table}} + \code{\link{shift}}, \code{\link{data.table}}, \code{\link{setDTthreads}} } \references{ \href{https://en.wikipedia.org/wiki/Round-off_error}{Round-off error} diff --git a/man/fsort.Rd b/man/fsort.Rd index 0eba047a16..6c11022d2c 100644 --- a/man/fsort.Rd +++ b/man/fsort.Rd @@ -20,9 +20,9 @@ fsort(x, decreasing = FALSE, na.last = FALSE, internal=FALSE, verbose=FALSE, \do Process will raise error if \code{x} contains negative values. 
Unless \code{x} is already sorted \code{fsort} will redirect processing to slower single threaded \emph{order} followed by \emph{subset} in following cases: \itemize{ - \item data type other than \emph{double} (\emph{numeric}) - \item data having \code{NA}s - \item \code{decreasing==FALSE} + \item{data type other than \emph{double} (\emph{numeric})} + \item{data having \code{NA}s} + \item{\code{decreasing==FALSE}} } } \value{ diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 42ae44a29a..870acaac75 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -37,18 +37,18 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{col.names}{Should the column names (header row) be written? The default is \code{TRUE} for new files and when overwriting existing files (\code{append=FALSE}). Otherwise, the default is \code{FALSE} to prevent column names appearing again mid-file when stacking a set of \code{data.table}s or appending rows to the end of a file.} \item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. \itemize{ - \item "escape" - the quote character (as well as the backslash character) is escaped in C style by a backslash, or - \item "double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one. + \item{"escape" - the quote character (as well as the backslash character) is escaped in C style by a backslash, or} + \item{"double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one.} }} \item{logical01}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?} \item{logicalAsInt}{Deprecated. Old name for `logical01`. Name change for consistency with `fread` for which `logicalAsInt` would not make sense.} \item{scipen}{ \code{integer} In terms of printing width, how much of a bias should there be towards printing whole numbers rather than scientific notation? See Details. 
} \item{dateTimeAs}{ How \code{Date}/\code{IDate}, \code{ITime} and \code{POSIXct} items are written. \itemize{ - \item "ISO" (default) - \code{2016-09-12}, \code{18:12:16} and \code{2016-09-12T18:12:16.999999Z}. 0, 3 or 6 digits of fractional seconds are printed if and when present for convenience, regardless of any R options such as \code{digits.secs}. The idea being that if milli and microseconds are present then you most likely want to retain them. R's internal UTC representation is written faithfully to encourage ISO standards, stymie timezone ambiguity and for speed. An option to consider is to start R in the UTC timezone simply with \code{"$ TZ='UTC' R"} at the shell (NB: it must be one or more spaces between \code{TZ='UTC'} and \code{R}, anything else will be silently ignored; this TZ setting applies just to that R process) or \code{Sys.setenv(TZ='UTC')} at the R prompt and then continue as if UTC were local time. - \item "squash" - \code{20160912}, \code{181216} and \code{20160912181216999}. This option allows fast and simple extraction of \code{yyyy}, \code{mm}, \code{dd} and (most commonly to group by) \code{yyyymm} parts using integer div and mod operations. In R for example, one line helper functions could use \code{\%/\%10000}, \code{\%/\%100\%\%100}, \code{\%\%100} and \code{\%/\%100} respectively. POSIXct UTC is squashed to 17 digits (including 3 digits of milliseconds always, even if \code{000}) which may be read comfortably as \code{integer64} (automatically by \code{fread()}). - \item "epoch" - \code{17056}, \code{65536} and \code{1473703936.999999}. The underlying number of days or seconds since the relevant epoch (1970-01-01, 00:00:00 and 1970-01-01T00:00:00Z respectively), negative before that (see \code{?Date}). 0, 3 or 6 digits of fractional seconds are printed if and when present. - \item "write.csv" - this currently affects \code{POSIXct} only. 
It is written as \code{write.csv} does by using the \code{as.character} method which heeds \code{digits.secs} and converts from R's internal UTC representation back to local time (or the \code{"tzone"} attribute) as of that historical date. Accordingly this can be slow. All other column types (including \code{Date}, \code{IDate} and \code{ITime} which are independent of timezone) are written as the "ISO" option using fast C code which is already consistent with \code{write.csv}. + \item{"ISO" (default) - \code{2016-09-12}, \code{18:12:16} and \code{2016-09-12T18:12:16.999999Z}. 0, 3 or 6 digits of fractional seconds are printed if and when present for convenience, regardless of any R options such as \code{digits.secs}. The idea being that if milli and microseconds are present then you most likely want to retain them. R's internal UTC representation is written faithfully to encourage ISO standards, stymie timezone ambiguity and for speed. An option to consider is to start R in the UTC timezone simply with \code{"$ TZ='UTC' R"} at the shell (NB: it must be one or more spaces between \code{TZ='UTC'} and \code{R}, anything else will be silently ignored; this TZ setting applies just to that R process) or \code{Sys.setenv(TZ='UTC')} at the R prompt and then continue as if UTC were local time.} + \item{"squash" - \code{20160912}, \code{181216} and \code{20160912181216999}. This option allows fast and simple extraction of \code{yyyy}, \code{mm}, \code{dd} and (most commonly to group by) \code{yyyymm} parts using integer div and mod operations. In R for example, one line helper functions could use \code{\%/\%10000}, \code{\%/\%100\%\%100}, \code{\%\%100} and \code{\%/\%100} respectively. POSIXct UTC is squashed to 17 digits (including 3 digits of milliseconds always, even if \code{000}) which may be read comfortably as \code{integer64} (automatically by \code{fread()}).} + \item{"epoch" - \code{17056}, \code{65536} and \code{1473703936.999999}. 
The underlying number of days or seconds since the relevant epoch (1970-01-01, 00:00:00 and 1970-01-01T00:00:00Z respectively), negative before that (see \code{?Date}). 0, 3 or 6 digits of fractional seconds are printed if and when present.} + \item{"write.csv" - this currently affects \code{POSIXct} only. It is written as \code{write.csv} does by using the \code{as.character} method which heeds \code{digits.secs} and converts from R's internal UTC representation back to local time (or the \code{"tzone"} attribute) as of that historical date. Accordingly this can be slow. All other column types (including \code{Date}, \code{IDate} and \code{ITime} which are independent of timezone) are written as the "ISO" option using fast C code which is already consistent with \code{write.csv}.} } The first three options are fast due to new specialized C code. The epoch to date-part conversion uses a fast approach by Howard Hinnant (see references) using a day-of-year starting on 1 March. You should not be able to notice any difference in write speed between those three options. The date range supported for \code{Date} and \code{IDate} is [0000-03-01, 9999-12-31]. Every one of these 3,652,365 dates have been tested and compared to base R including all 2,790 leap days in this range. \cr \cr This option applies to vectors of date/time in list column cells, too. \cr \cr @@ -64,7 +64,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writting raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } } \details{ -\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}. 
Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. +\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://www.h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. To save space, \code{fwrite} prefers to write wide numeric values in scientific notation -- e.g. \code{10000000000} takes up much more space than \code{1e+10}. Most file readers (e.g. \code{\link{fread}}) understand scientific notation, so there's no fidelity loss. Like in base R, users can control this by specifying the \code{scipen} argument, which follows the same rules as \code{\link[base]{options}('scipen')}. \code{fwrite} will see how much space a value will take to write in scientific vs. decimal notation, and will only write in scientific notation if the latter is more than \code{scipen} characters wider. For \code{10000000000}, then, \code{1e+10} will be written whenever \code{scipen<6}. 
@@ -73,17 +73,17 @@ To save space, \code{fwrite} prefers to write wide numeric values in scientific The following fields will be written to the header of the file and surrounded by \code{---} on top and bottom: \itemize{ - \item \code{source} - Contains the R version and \code{data.table} version used to write the file - \item \code{creation_time_utc} - Current timestamp in UTC time just before the header is written - \item \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. - \item \code{header} - same as \code{col.names} (which is \code{header} on input) - \item \code{sep} - \item \code{sep2} - \item \code{eol} - \item \code{na.strings} - same as \code{na} - \item \code{dec} - \item \code{qmethod} - \item \code{logical01} + \item{ \code{source} - Contains the R version and \code{data.table} version used to write the file } + \item{ \code{creation_time_utc} - Current timestamp in UTC time just before the header is written } + \item{ \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. } + \item{ \code{header} - same as \code{col.names} (which is \code{header} on input) } + \item{ \code{sep} } + \item{ \code{sep2} } + \item{ \code{eol} } + \item{ \code{na.strings} - same as \code{na} } + \item{ \code{dec} } + \item{ \code{qmethod} } + \item{ \code{logical01} } } } diff --git a/man/notin.Rd b/man/notin.Rd index e041ff5cbd..d84bb2024d 100644 --- a/man/notin.Rd +++ b/man/notin.Rd @@ -1,30 +1,33 @@ \name{notin} \alias{\%notin\%} + \title{ Convenience operator for checking if an example is not in a set of elements } + \description{ -Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[=base]{in}}. 
See examples on how missing values are being handled. +Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[=base]{in}}. } + \usage{ x \%notin\% table } + \arguments{ \item{x}{ Vector or \code{NULL}: the values to be matched. } \item{table}{ Vector or \code{NULL}: the values to be matched against. } } + + \value{ Logical vector, \code{TRUE} for each element of \code{x} \emph{absent} from \code{table}, and \code{FALSE} for each element of \code{x} \emph{present} in \code{table}. } + \seealso{ \code{\link[base]{match}}, \code{\link[data.table]{chmatch}} } + + \examples{ 11 \%notin\% 1:10 # TRUE "a" \%notin\% c("a", "b") # FALSE - - ## NAs on the LHS - NA \%in\% 1:2 - NA \%notin\% 1:2 - ## NAs on the RHS - NA \%in\% c(1:2,NA) - NA \%notin\% c(1:2,NA) } + diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index df942009c6..71e469ed72 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -37,18 +37,18 @@ Internally parallelized code is used in the following places: \itemize{ - \item\file{between.c} - \code{\link{between}()} - \item\file{cj.c} - \code{\link{CJ}()} - \item\file{coalesce.c} - \code{\link{fcoalesce}()} - \item\file{fifelse.c} - \code{\link{fifelse}()} - \item\file{fread.c} - \code{\link{fread}()} - \item\file{forder.c}, \file{fsort.c}, and \file{reorder.c} - \code{\link{forder}()} and related - \item\file{froll.c}, \file{frolladaptive.c}, and \file{frollR.c} - \code{\link{froll}()} and family - \item\file{fwrite.c} - \code{\link{fwrite}()} - \item\file{gsumm.c} - GForce in various places, see \link{GForce} - \item\file{nafill.c} - \code{\link{nafill}()} - \item\file{subset.c} - Used in \code{\link[=data.table]{[.data.table}} subsetting - \item\file{types.c} - Internal testing usage + \item{\file{between.c} - \code{\link{between}()}} + \item{\file{cj.c} - \code{\link{CJ}()}} + \item{\file{coalesce.c} - \code{\link{fcoalesce}()}} + \item{\file{fifelse.c} - \code{\link{fifelse}()}} + \item{\file{fread.c} - 
\code{\link{fread}()}} + \item{\file{forder.c}, \file{fsort.c}, and \file{reorder.c} - \code{\link{forder}()} and related} + \item{\file{froll.c}, \file{frolladaptive.c}, and \file{frollR.c} - \code{\link{froll}()} and family} + \item{\file{fwrite.c} - \code{\link{fwrite}()}} + \item{\file{gsumm.c} - GForce in various places, see \link{GForce}} + \item{\file{nafill.c} - \code{\link{nafill}()}} + \item{\file{subset.c} - Used in \code{\link[=data.table]{[.data.table}} subsetting} + \item{\file{types.c} - Internal testing usage} } } \examples{ diff --git a/man/setops.Rd b/man/setops.Rd index dfa2572c74..395cdab339 100644 --- a/man/setops.Rd +++ b/man/setops.Rd @@ -23,12 +23,16 @@ fsetequal(x, y, all = TRUE) \arguments{ \item{x, y}{\code{data.table}s.} \item{all}{Logical. Default is \code{FALSE} and removes duplicate rows on the result. When \code{TRUE}, if there are \code{xn} copies of a particular row in \code{x} and \code{yn} copies of the same row in \code{y}, then: - \itemize{ - \item\code{fintersect} will return \code{min(xn, yn)} copies of that row. - \item\code{fsetdiff} will return \code{max(0, xn-yn)} copies of that row. - \item\code{funion} will return \code{xn+yn} copies of that row. - \item\code{fsetequal} will return \code{FALSE} unless \code{xn == yn}. - } + \itemize{ + + \item{\code{fintersect} will return \code{min(xn, yn)} copies of that row.} + + \item{\code{fsetdiff} will return \code{max(0, xn-yn)} copies of that row.} + + \item{\code{funion} will return \code{xn+yn} copies of that row.} + + \item{\code{fsetequal} will return \code{FALSE} unless \code{xn == yn}.} + } } } \details{ diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 9fb3cb45a4..c96cbef5c4 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -19,12 +19,12 @@ These symbols used in \code{j} are defined as follows. 
\itemize{ - \item \code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}). - \item \code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable. - \item \code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}. - \item \code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. If used in \code{by} it corresponds to applying a function rowwise. - \item \code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc. - \item \code{.NGRP} is an integer, length 1, containing the number of groups. + \item{\code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}).} + \item{\code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. 
The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable.} + \item{\code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}.} + \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. If used in \code{by} it corresponds to applying a function rowwise. } + \item{\code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc.} + \item{\code{.NGRP} is an integer, length 1, containing the number of groups. } } \code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. diff --git a/man/tables.Rd b/man/tables.Rd index a8a74b0a7d..5b95edffa2 100644 --- a/man/tables.Rd +++ b/man/tables.Rd @@ -5,11 +5,11 @@ Convenience function for concisely summarizing some metadata of all \code{data.table}s in memory (or an optionally specified environment). 
} \usage{ -tables(mb=type_size, order.col="NAME", width=80, +tables(mb=TRUE, order.col="NAME", width=80, env=parent.frame(), silent=FALSE, index=FALSE) } \arguments{ - \item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). A column \code{"MB"} is included in the output unless \code{FALSE} or \code{NULL}. } + \item{mb}{ \code{logical}; \code{TRUE} adds the rough size of each \code{data.table} in megabytes to the output under column \code{MB}. } \item{order.col}{ Column name (\code{character}) by which to sort the output. } \item{width}{ \code{integer}; number of characters beyond which the output for each of the columns \code{COLS}, \code{KEY}, and \code{INDICES} are truncated. } \item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. } @@ -19,9 +19,9 @@ tables(mb=type_size, order.col="NAME", width=80, \details{ Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt. -`mb = utils::object.size` provides a higher and more accurate estimate of size, but may take longer. Its default `units="b"` is appropriate. +Note that on older versions of \R, \code{object.size} may be slow, so setting \code{mb=FALSE} may speed up execution of \code{tables} significantly. -Setting \code{silent=TRUE} prints nothing; the metadata is returned as a \code{data.table} invisibly whether \code{silent} is \code{TRUE} or \code{FALSE}. 
+Setting \code{silent=TRUE} prints nothing; the metadata are returned as a \code{data.table}, invisibly, whether silent is \code{TRUE} or \code{FALSE}. } \value{ A \code{data.table} containing the information printed. diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index c36e5f9d40..ba0fe25f9c 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -7,9 +7,7 @@ \usage{ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, - showProgress = interactive() && !silent, - memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), - memtest.id = NULL) + showProgress = interactive() && !silent) } \arguments{ \item{script}{ Run arbitrary R test script. } @@ -17,8 +15,6 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } -\item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } -\item{memtest.id}{ An id for which to print memory usage for every sub id. May be a range of ids. } } \details{ Runs a series of tests. These can be used to see features and examples of usage, too. Running test.data.table will tell you the full location of the test file(s) to open. 
diff --git a/man/update_dev_pkg.Rd b/man/update_dev_pkg.Rd index 66fff0422d..3db5b98316 100644 --- a/man/update_dev_pkg.Rd +++ b/man/update_dev_pkg.Rd @@ -2,14 +2,14 @@ \alias{update_dev_pkg} \title{Perform update of development version of a package} \description{ - Downloads and installs latest development version, only when a new commit is available. Defaults are set to update \code{data.table}, other packages can be used as well. Repository of a package has to include git commit SHA information in PACKAGES file. + Downloads and installs latest development version only when a new commit is available which has also passed all tests. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. } -\usage{update_dev_pkg(pkg="data.table", +\usage{update_dev_pkg(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } \arguments{ - \item{pkg}{ character scalar, package name. } + \item{object}{ character scalar, package name. } \item{repo}{ character scalar, url of package devel repository. } \item{field}{ character scalar, metadata field to use in PACKAGES file and DESCRIPTION file, default \code{"Revision"}. } @@ -20,18 +20,13 @@ \item{\dots}{ passed to \code{\link[utils]{install.packages}}. } } \details{ - In case if a devel repository does not provide binaries user will need development tools installed for package compilation, like \emph{Rtools} on Windows, or alternatively eventually set \code{type="source"}. -} -\section{data.table repositories}{ - By default the function uses our GitLab-hosted R repository at \code{https://Rdatatable.gitlab.io/data.table}. This repository is updated nightly. It runs multiple test jobs (on top of GitHub tests jobs run upstream) and publish the package (sources and binaries), even if GitLab test jobs are failing. 
Status of GitLab test jobs can be checked at \href{https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html}{Package Check Results}.\cr - We also publish bleeding edge version of the package on GitHub-hosted R repository at \code{https://Rdatatable.gitlab.io/data.table} (just minor change in url from \emph{lab} to \emph{hub}). GitHub version should be considered less stable than GitLab one. It publishes only package sources.\cr - There are also other repositories maintained by R community, for example \code{https://rdatatable.r-universe.dev}. Those can be used as well, but as they are unlikely to provide git commit SHA, the function will install the package even if latest version is already installed. + In case if a devel repository does not provide binaries user will need development tools installed for package compilation, like \emph{Rtools} on Windows, and eventually set \code{type="source"}. } \note{ Package namespace is unloaded before attempting to install newer version. } \value{ - Invisibly \code{TRUE} if package was updated, otherwise \code{FALSE}. + NULL. } \examples{ \dontshow{ # using if(FALSE) because \dontrun could still be run by --run-dontrun; #5421 } diff --git a/src/assign.c b/src/assign.c index ce2c707dfd..7fb09fa71e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -470,8 +470,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as // strong error message for now. 
else if (TRUELENGTH(names) != oldtncol) - // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didnt work - error(_("Internal error: selfrefnames is ok but tl names [%lld] != tl [%d]"), (long long)TRUELENGTH(names), oldtncol); // # nocov + error(_("Internal error: selfrefnames is ok but tl names [%d] != tl [%d]"), TRUELENGTH(names), oldtncol); // # nocov SETLENGTH(dt, oldncol+LENGTH(newcolnames)); SETLENGTH(names, oldncol+LENGTH(newcolnames)); for (int i=0; i=tt[i+1]) - error(_("Internal error: %d column numbers to delete not now in strictly increasing order. No-dups were checked earlier."), i); // # nocov + error(_("Internal error: %d column numbers to delete not now in strictly increasing order. No-dups were checked earlier.")); // # nocov } for (int i=tt[0], j=1, k=tt[0]+1; inlevel) { - error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc(colnum, colname), val, nlevel); + error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc, val, nlevel); } } } else { @@ -743,7 +738,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel)) { - error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc(colnum, colname), val, nlevel); + error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc, val, nlevel); } } } @@ -809,7 +804,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // # nocov start for (int k=0; k=3) { @@ -855,27 +850,27 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con Rprintf(_("Zero-copy coerce when assigning '%s' to '%s' %s.\n"), sourceIsI64 ? "integer64" : type2char(TYPEOF(source)), targetIsI64 ? 
"integer64" : type2char(TYPEOF(target)), - targetDesc(colnum, colname)); + targetDesc); } // The following checks are up front here, otherwise we'd need them twice in the two branches // inside BODY that cater for 'where' or not. Maybe there's a way to merge the two macros in future. // The idea is to do these range checks without calling coerceVector() (which allocates) - #define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO, FMTVAL) {{ \ - const STYPE *sd = (const STYPE *)RFUN(source); \ - for (int i=0; i0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls) @@ -1195,9 +1190,9 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { static SEXP *saveds=NULL; static R_len_t *savedtl=NULL, nalloc=0, nsaved=0; -void savetl_init(void) { +void savetl_init() { if (nsaved || nalloc || saveds || savedtl) { - error(_("Internal error: savetl_init checks failed (%d %d %p %p). please report to data.table issue tracker."), nsaved, nalloc, (void *)saveds, (void *)savedtl); // # nocov + error(_("Internal error: savetl_init checks failed (%d %d %p %p). please report to data.table issue tracker."), nsaved, nalloc, saveds, savedtl); // # nocov } nsaved = 0; nalloc = 100; @@ -1236,7 +1231,7 @@ void savetl(SEXP s) nsaved++; } -void savetl_end(void) { +void savetl_end() { // Can get called if nothing has been saved yet (nsaved==0), or even if _init() hasn't been called yet (pointers NULL). Such // as to clear up before error. Also, it might be that nothing needed to be saved anyway. 
for (int i=0; i A(TL=1),B(2),C(3),D(4),E(5) => dupMap 1 2 3 5 6 | 8 7 4 // dupLink 7 8 | 6 (blank=0) int *counts = (int *)calloc(nuniq, sizeof(int)); - unsigned int mapsize = tablelen+nuniq; // lto compilation warning #5760 // +nuniq to store a 0 at the end of each group - int *map = (int *)calloc(mapsize, sizeof(int)); + int *map = (int *)calloc(tablelen+nuniq, sizeof(int)); // +nuniq to store a 0 at the end of each group if (!counts || !map) { // # nocov start for (int i=0; i #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT @@ -115,7 +115,7 @@ extern size_t __typeorder[100]; // __ prefix otherwise if we use these names dir long long DtoLL(double x); double LLtoD(long long x); -int GetVerbose(void); +int GetVerbose(); // cj.c SEXP cj(SEXP base_list); @@ -128,14 +128,14 @@ SEXP growVector(SEXP x, R_len_t newlen); SEXP allocNAVector(SEXPTYPE type, R_len_t n); SEXP allocNAVectorLike(SEXP x, R_len_t n); void writeNA(SEXP v, const int from, const int n, const bool listNA); -void savetl_init(void), savetl(SEXP s), savetl_end(void); +void savetl_init(), savetl(SEXP s), savetl_end(); int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); -int getNumericRounding_C(void); +int getNumericRounding_C(); // reorder.c SEXP reorder(SEXP x, SEXP order); @@ -192,33 +192,41 @@ double iquickselect(int *x, int n); double i64quickselect(int64_t *x, int n); // fread.c -double wallclock(void); +double wallclock(); // openmp-utils.c -void initDTthreads(void); +void initDTthreads(); int getDTthreads(const int64_t n, const bool throttle); -void avoid_openmp_hang_within_fork(void); +void avoid_openmp_hang_within_fork(); +typedef enum { // adding rolling functions here and in frollfunR in frollR.c + MEAN = 0, + SUM = 1, + MAX = 2 +} rollfun_t; // froll.c -void frollmean(unsigned int algo, double *x, 
uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); -void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); -void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); -void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); -void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); -void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); +void frollfun(rollfun_t rfun, unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasnf, bool verbose); +void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose); +void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose); +void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose); +void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose); +void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose); +void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose); void frollapply(double *x, int64_t nx, double *w, int k, ans_t *ans, int align, double fill, SEXP call, SEXP rho, bool verbose); // frolladaptive.c -void fadaptiverollmean(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); -void fadaptiverollmeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); -void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, 
double fill, bool narm, int hasna, bool verbose); -void fadaptiverollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); -void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); -void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); +void frolladaptivefun(rollfun_t rfun, unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); +void frolladaptivemeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); +void frolladaptivemeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); +void frolladaptivesumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); +void frolladaptivesumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); +//void frolladaptivemaxFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); // does not exists as of now +void frolladaptivemaxExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose); +void frolladaptiveapply(double *x, int64_t nx, SEXP pw, int *k, ans_t *ans, double fill, SEXP call, SEXP rho, bool verbose); // frollR.c -SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasNA, SEXP adaptive); -SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho); +SEXP frollfunR(SEXP fun, SEXP xobj, SEXP kobj, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasnf, SEXP adaptive); +SEXP frollapplyR(SEXP fun, SEXP xobj, SEXP kobj, SEXP fill, SEXP align, SEXP adaptive, SEXP rho); // nafill.c void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool 
nan_is_na, ans_t *ans, bool verbose); @@ -250,7 +258,8 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg); // types.c char *end(char *start); -void ansMsg(ans_t *ans, int n, bool verbose, const char *func); +void ansSetMsg(ans_t *ans, uint8_t status, const char *msg, const char *func); +void ansGetMsgs(ans_t *ans, int n, bool verbose, const char *func); SEXP testMsgR(SEXP status, SEXP x, SEXP k); //fifelse.c @@ -265,76 +274,3 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); - -// functions called from R level .Call/.External and registered in init.c -// these now live here to pass -Wstrict-prototypes, #5477 -// all arguments must be SEXP since they are called from R level -// where there are no arguments, it must be (void) not () to be a strict prototype -SEXP setattrib(SEXP, SEXP, SEXP); -SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP copy(SEXP); -SEXP alloccolwrapper(SEXP, SEXP, SEXP); -SEXP selfrefokwrapper(SEXP, SEXP); -SEXP truelength(SEXP); -SEXP setcharvec(SEXP, SEXP, SEXP); -SEXP chmatch_R(SEXP, SEXP, SEXP); -SEXP chmatchdup_R(SEXP, SEXP, SEXP); -SEXP chin_R(SEXP, SEXP); -SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP rbindlist(SEXP, SEXP, SEXP, SEXP); -SEXP setlistelt(SEXP, SEXP, SEXP); -SEXP address(SEXP); -SEXP expandAltRep(SEXP); -SEXP fmelt(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP issorted(SEXP, SEXP); -SEXP gforce(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP gsum(SEXP, SEXP); -SEXP gmean(SEXP, SEXP); -SEXP gmin(SEXP, SEXP); -SEXP gmax(SEXP, SEXP); -SEXP setNumericRounding(SEXP); -SEXP getNumericRounding(void); -SEXP binary(SEXP); -SEXP subsetDT(SEXP, SEXP, 
SEXP); -SEXP convertNegAndZeroIdx(SEXP, SEXP, SEXP, SEXP); -SEXP frank(SEXP, SEXP, SEXP, SEXP); -SEXP lookup(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP overlaps(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP whichwrapper(SEXP, SEXP); -SEXP shift(SEXP, SEXP, SEXP, SEXP); -SEXP transpose(SEXP, SEXP, SEXP, SEXP); -SEXP anyNA(SEXP, SEXP); -SEXP setlevels(SEXP, SEXP, SEXP); -SEXP rleid(SEXP, SEXP); -SEXP gmedian(SEXP, SEXP); -SEXP gtail(SEXP, SEXP); -SEXP ghead(SEXP, SEXP); -SEXP glast(SEXP); -SEXP gfirst(SEXP); -SEXP gnthvalue(SEXP, SEXP); -SEXP dim(SEXP); -SEXP gvar(SEXP, SEXP); -SEXP gsd(SEXP, SEXP); -SEXP gprod(SEXP, SEXP); -SEXP gshift(SEXP, SEXP, SEXP, SEXP); -SEXP nestedid(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP setDTthreads(SEXP, SEXP, SEXP, SEXP); -SEXP getDTthreads_R(SEXP); -SEXP nqRecreateIndices(SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP fsort(SEXP, SEXP); -SEXP inrange(SEXP, SEXP, SEXP, SEXP); -SEXP hasOpenMP(void); -SEXP beforeR340(void); -SEXP uniqueNlogical(SEXP, SEXP); -SEXP dllVersion(void); -SEXP initLastUpdated(SEXP); -SEXP allNAR(SEXP); -SEXP test_dt_win_snprintf(void); -SEXP dt_zlib_version(void); -SEXP dt_has_zlib(void); -SEXP startsWithAny(SEXP, SEXP, SEXP); -SEXP convertDate(SEXP, SEXP); -SEXP fastmean(SEXP); - diff --git a/src/forder.c b/src/forder.c index c9063782bf..6e8a77ecf5 100644 --- a/src/forder.c +++ b/src/forder.c @@ -10,9 +10,9 @@ http://stereopsis.com/radix.html Previous version of this file was promoted into base R, see ?base::sort. - Denmark useR! presentation https://github.com/Rdatatable/data.table/wiki/talks/useR2015_Matt.pdf - Stanford DSC presentation https://github.com/Rdatatable/data.table/wiki/talks/DSC2016_ParallelSort.pdf - JSM presentation https://github.com/Rdatatable/data.table/wiki/talks/JSM2018_Matt.pdf + Denmark useR! 
presentation + Stanford DSC presentation + JSM presentation Techniques used : skewed groups are split in parallel finds unique bytes to save 256 sweeping @@ -56,7 +56,7 @@ static int *anso = NULL; static bool notFirst=false; static char msg[1001]; -#define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error("%s", msg);} while(0) // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon +#define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error(msg);} while(0) // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon // use STOP in this file (not error()) to ensure cleanup() is called first // snprintf to msg first in case nrow (just as an example) is provided in the message because cleanup() sets nrow to 0 #undef warning @@ -68,14 +68,14 @@ static char msg[1001]; * Therefore, using <> approach to cleanup() on error. */ -static void free_ustr(void) { +static void free_ustr() { for(int i=0; i> has whitespace at the beginning or end"), ch); - if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || - strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || - strcmp(ch,"True")==0 || strcmp(ch,"False")==0) - STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); - if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) - STOP(_("freadMain: NAstring <<%s>> and logical01=TRUE, this is not permitted."), ch); - char *end; - errno = 0; - (void)strtod(ch, &end); // careful not to let "" get to here as strtod considers "" numeric - if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; + // if blank is the only one, as is the default, clear NAstrings so that doesn't have to be checked + if (nastr==NAstrings && nastr+1==NULL) NAstrings=NULL; + nastr++; + continue; } + const char *ch = *nastr; + size_t nchar = strlen(ch); + if (isspace(ch[0]) || isspace(ch[nchar-1])) + STOP(_("freadMain: NAstring <<%s>> has whitespace at the 
beginning or end"), ch); + if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || + strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || + strcmp(ch,"True")==0 || strcmp(ch,"False")==0) + STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); + if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) + STOP(_("freadMain: NAstring <<%s>> and logical01=%s, this is not permitted."), ch, args.logical01 ? "TRUE" : "FALSE"); + char *end; + errno = 0; + (void)strtod(ch, &end); // careful not to let "" get to here (see continue above) as strtod considers "" numeric + if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; nastr++; } disabled_parsers[CT_BOOL8_N] = !args.logical01; @@ -1323,10 +1325,6 @@ int freadMain(freadMainArgs _args) { DTPRINT(_(" show progress = %d\n"), args.showProgress); DTPRINT(_(" 0/1 column will be read as %s\n"), args.logical01? "boolean" : "integer"); } - if (*NAstrings==NULL || // user sets na.strings=NULL - (**NAstrings=='\0' && *(NAstrings+1)==NULL)) { // user sets na.strings="" - NAstrings=NULL; // clear NAstrings to save end_NA_string() dealing with these cases (blank_is_a_NAstring was set to true above) - } stripWhite = args.stripWhite; skipEmptyLines = args.skipEmptyLines; @@ -1398,14 +1396,14 @@ int freadMain(freadMainArgs _args) { attempts++; // Looped retry to avoid ephemeral locks by system utilities as recommended here : http://support.microsoft.com/kb/316609 } - if (hFile==INVALID_HANDLE_VALUE) STOP(_("Unable to open file after %d attempts (error %lu): %s"), attempts, GetLastError(), fnam); + if (hFile==INVALID_HANDLE_VALUE) STOP(_("Unable to open file after %d attempts (error %d): %s"), attempts, GetLastError(), fnam); LARGE_INTEGER liFileSize; if (GetFileSizeEx(hFile,&liFileSize)==0) { CloseHandle(hFile); STOP(_("GetFileSizeEx failed (returned 0) on file: %s"), fnam); } fileSize = (size_t)liFileSize.QuadPart; if (fileSize<=0) { CloseHandle(hFile); STOP(_("File is empty: 
%s"), fnam); } if (verbose) DTPRINT(_(" File opened, size = %s.\n"), filesize_to_str(fileSize)); HANDLE hMap=CreateFileMapping(hFile, NULL, PAGE_WRITECOPY, 0, 0, NULL); - if (hMap==NULL) { CloseHandle(hFile); STOP(_("This is Windows, CreateFileMapping returned error %lu for file %s"), GetLastError(), fnam); } + if (hMap==NULL) { CloseHandle(hFile); STOP(_("This is Windows, CreateFileMapping returned error %d for file %s"), GetLastError(), fnam); } mmp = MapViewOfFile(hMap,FILE_MAP_COPY,0,0,fileSize); // fileSize must be <= hilo passed to CreateFileMapping above. CloseHandle(hMap); // we don't need to keep the file open; the MapView keeps an internal reference; CloseHandle(hFile); // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa366537(v=vs.85).aspx @@ -1895,7 +1893,7 @@ int freadMain(freadMainArgs _args) { if (sampleLines>0) for (int j=0; jCT_EMPTY) { args.header=true; - if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %"PRId64" sample rows\n"), + if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"), j+1, typeName[type[j]], sampleLines); break; } @@ -2537,8 +2535,9 @@ int freadMain(freadMainArgs _args) { rowSize1 = rowSize4 = rowSize8 = 0; nStringCols = 0; nNonStringCols = 0; - for (int j=0; jmessage[0]), 500, _("%s: window width longer than input vector, returning all NA vector\n"), __func__); // implicit n_message limit discussed here: https://github.com/Rdatatable/data.table/issues/3423#issuecomment-487722586 - for (int i=0; idbl_v[i] = fill; } return; } - double tic = 0; - if (verbose) - tic = omp_get_wtime(); - if (algo==0) { - frollmeanFast(x, nx, ans, k, fill, narm, hasna, verbose); - } else if (algo==1) { - frollmeanExact(x, nx, ans, k, fill, narm, hasna, verbose); + switch (rfun) { + case MEAN : + if (algo==0) { + frollmeanFast(x, nx, ans, 
k, fill, narm, hasnf, verbose); + } else if (algo==1) { + frollmeanExact(x, nx, ans, k, fill, narm, hasnf, verbose); + } + break; + case SUM : + if (algo==0) { + frollsumFast(x, nx, ans, k, fill, narm, hasnf, verbose); + } else if (algo==1) { + frollsumExact(x, nx, ans, k, fill, narm, hasnf, verbose); + } + break; + case MAX : + if (algo==0) { + frollmaxFast(x, nx, ans, k, fill, narm, hasnf, verbose); + } else if (algo==1) { + frollmaxExact(x, nx, ans, k, fill, narm, hasnf, verbose); + } + break; + default: + error(_("Internal error: Unknown rfun value in froll: %d"), rfun); // #nocov } - if (ans->status < 3 && align < 1) { // align center or left, only when no errors occurred - int k_ = align==-1 ? k-1 : floor(k/2); // offset to shift + if (align < 1 && ans->status < 3) { + int k_ = align==-1 ? k-1 : floor(k/2); // offset to shift if (verbose) snprintf(end(ans->message[0]), 500, _("%s: align %d, shift answer by %d\n"), __func__, align, -k_); memmove((char *)ans->dbl_v, (char *)ans->dbl_v + (k_*sizeof(double)), (nx-k_)*sizeof(double)); // apply shift to achieve expected align - for (uint64_t i=nx-k_; idbl_v[i] = fill; } } if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); + snprintf(end(ans->message[0]), 500, _("%s: processing fun %d algo %u took %.3fs\n"), __func__, rfun, algo, omp_get_wtime()-tic); } + /* fast rolling mean - fast - * when no info on NA (hasNA argument) then assume no NAs run faster version + * when no info on NF (has.nf argument) then assume no NFs run faster version * rollmean implemented as single pass sliding window for align="right" - * if NAs detected re-run rollmean implemented as single pass sliding window with NA support + * if non-finite detected re-run rollmean implemented as single pass sliding window with NA support */ -void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { +void 
frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollmeanFast", (uint64_t)nx, k, hasna, (int)narm); + snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasnf %d, narm %d\n"), "frollmeanFast", (uint64_t)nx, k, hasnf, (int)narm); long double w = 0.0; // sliding window aggregate - bool truehasna = hasna>0; // flag to re-run with NA support if NAs detected - if (!truehasna) { + bool truehasnf = hasnf>0; // flag to re-run with NA support if NAs detected + if (!truehasnf) { int i; // iterator declared here because it is being used after for loop for (i=0; idbl_v[i] = (double) (w / k); // rollfun to answer vector } if (!R_FINITE((double) w)) { // mark to re-run with NA care - if (hasna==-1) { // raise warning - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - w = 0.0; - truehasna = true; + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + w = 0.0; truehasnf = true; } } else { // early stopping branch when NAs detected in first k obs - if (hasna==-1) { // raise warning - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: 
has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, skip non-NA attempt and run with extra care for NAs\n"), __func__); - w = 0.0; - truehasna = true; + ansSetMsg(ans, 0, "%s: non-finite values are present in input, skip non-finite inaware attempt and run with extra care for NFs straighaway\n", __func__); + w = 0.0; truehasnf = true; } } - if (truehasna) { - int nc = 0; // NA counter within sliding window + if (truehasnf) { + int nc = 0, pinf = 0, ninf = 0; // NA counter within sliding window int i; // iterator declared here because it is being used after for loop + +#undef MEAN_WINDOW_STEP_VALUE +#define MEAN_WINDOW_STEP_VALUE \ + if (nc == 0) { \ + if (pinf == 0) { \ + if (ninf == 0) { \ + ans->dbl_v[i] = (double) (w / k); \ + } else { \ + ans->dbl_v[i] = R_NegInf; \ + } \ + } else if (ninf == 0) { \ + ans->dbl_v[i] = R_PosInf; \ + } else { \ + ans->dbl_v[i] = R_NaN; \ + } \ + } else if (nc == k) { \ + ans->dbl_v[i] = narm ? R_NaN : NA_REAL; \ + } else { \ + if (narm) { \ + if (pinf == 0) { \ + if (ninf == 0) { \ + ans->dbl_v[i] = (double) (w / (k - nc)); \ + } else { \ + ans->dbl_v[i] = R_NegInf; \ + } \ + } else if (ninf == 0) { \ + ans->dbl_v[i] = R_PosInf; \ + } else { \ + ans->dbl_v[i] = R_NaN; \ + } \ + } else { \ + ans->dbl_v[i] = NA_REAL; \ + } \ + } + for (i=0; idbl_v[i] = fill; // partial window fill all } - if (R_FINITE(x[i])) { - w += x[i]; // i==k-1 - } else { - nc++; - } - if (nc == 0) { - ans->dbl_v[i] = (double) (w / k); // no NAs in first full window - } else if (nc == k) { - ans->dbl_v[i] = narm ? R_NaN : NA_REAL; // all values in sliding window are NA, expected output for fun(NA, na.rm=T/F) - } else { - ans->dbl_v[i] = narm ? 
(double) (w / (k - nc)) : NA_REAL; // some values in window are NA - } + SUM_WINDOW_STEP_FRONT // i==k-1 + MEAN_WINDOW_STEP_VALUE for (uint64_t i=k; idbl_v[i] = (double) (w / k); // no NAs in sliding window for present observation - } else if (nc == k) { - ans->dbl_v[i] = narm ? R_NaN : NA_REAL; // all values in window are NA, expected output for fun(NA, na.rm=T/F) - } else { - ans->dbl_v[i] = narm ? (double) (w / (k - nc)) : NA_REAL; // some values in window are NA - } + SUM_WINDOW_STEP_FRONT + SUM_WINDOW_STEP_BACK + MEAN_WINDOW_STEP_VALUE } } } /* fast rolling mean - exact - * when no info on NA (hasNA argument) then assume no NAs run faster version, also when na.rm=FALSE faster version can proceed + * when no info on NF (has.nf argument) then assume no NFs run faster version, also when na.rm=FALSE faster version can proceed * rollmean implemented as mean of k obs for each observation for align="right" - * if NAs detected and na.rm=TRUE then re-run rollmean implemented as mean of k bos for each observation with NA support + * if non-finite detected and na.rm=TRUE then re-run NF aware rollmean */ -void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { +void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollmeanExact", (uint64_t)nx, k, hasna, (int)narm); + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasnf %d, narm %d\n"), "frollmeanExact", (uint64_t)nx, k, hasnf, (int)narm); for (int i=0; idbl_v[i] = fill; } - bool truehasna = hasna>0; // flag to re-run with NA support if NAs detected - if (!truehasna || !narm) { + bool truehasnf = hasnf>0; // flag to re-run with NA support if NAs detected + if (!truehasnf || !narm) { #pragma omp parallel for 
num_threads(getDTthreads(nx, true)) for (uint64_t i=k-1; idbl_v[i] = (double) (res + (err / k)); // adjust calculated rollfun with roundoff correction - } else { + } else if (ISNAN((double) w)) { if (!narm) { ans->dbl_v[i] = (double) (w / k); // NAs should be propagated } - truehasna = true; // NAs detected for this window, set flag so rest of windows will not be re-run + truehasnf = true; // NAs detected for this window, set flag so rest of windows will not be re-run + } else { + ans->dbl_v[i] = (double) w; // Inf and -Inf } } - if (truehasna) { - if (hasna==-1) { // raise warning - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (truehasnf) { + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) { - if (narm) { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - } else { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n"), __func__); - } + if (narm) + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + else + ansSetMsg(ans, 0, "%s: non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run\n", __func__); } } } - if (truehasna && narm) { + if (truehasnf && narm) { #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=k-1; i DBL_MAX) { - ans->dbl_v[i] = R_PosInf; // handle Inf for na.rm=TRUE consistently to base R - } else if (w < -DBL_MAX) { - ans->dbl_v[i] = R_NegInf; - } else { + if (R_FINITE((double) w)) { long 
double res = w / k; // keep results as long double for intermediate processing long double err = 0.0; // roundoff corrector if (nc == 0) { // no NAs in current window @@ -211,47 +252,22 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } else { // nc == k ans->dbl_v[i] = R_NaN; // all values NAs and narm so produce expected values } + } else { + ans->dbl_v[i] = (double) w; // Inf and -Inf } } } } -/* fast rolling sum */ -void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose) { - if (nx < k) { - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: window width longer than input vector, returning all NA vector\n"), __func__); - for (int i=0; idbl_v[i] = fill; - } - return; - } - double tic = 0; - if (verbose) - tic = omp_get_wtime(); - if (algo==0) { - frollsumFast(x, nx, ans, k, fill, narm, hasna, verbose); - } else if (algo==1) { - frollsumExact(x, nx, ans, k, fill, narm, hasna, verbose); - } - if (ans->status < 3 && align < 1) { - int k_ = align==-1 ? 
k-1 : floor(k/2); - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: align %d, shift answer by %d\n"), __func__, align, -k_); - memmove((char *)ans->dbl_v, (char *)ans->dbl_v + (k_*sizeof(double)), (nx-k_)*sizeof(double)); - for (uint64_t i=nx-k_; idbl_v[i] = fill; - } - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); -} -void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { +/* fast rolling sum - fast + * same as mean fast + */ +void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollsumFast", (uint64_t)nx, k, hasna, (int)narm); + snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasnf %d, narm %d\n"), "frollsumFast", (uint64_t)nx, k, hasnf, (int)narm); long double w = 0.0; - bool truehasna = hasna>0; - if (!truehasna) { + bool truehasnf = hasnf>0; + if (!truehasnf) { int i; for (i=0; idbl_v[i] = (double) w; } if (!R_FINITE((double) w)) { - if (hasna==-1) { - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - w = 0.0; - truehasna = true; + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + w = 0.0; truehasnf = true; } } else { - if (hasna==-1) { - 
ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, skip non-NA attempt and run with extra care for NAs\n"), __func__); - w = 0.0; - truehasna = true; + ansSetMsg(ans, 0, "%s: non-finite values are present in input, skip non-finite inaware attempt and run with extra care for NFs straighaway\n", __func__); + w = 0.0; truehasnf = true; } } - if (truehasna) { - int nc = 0; - int i; - for (i=0; idbl_v[i] = fill; - } - if (R_FINITE(x[i])) { - w += x[i]; - } else { - nc++; - } - if (nc == 0) { - ans->dbl_v[i] = (double) w; - } else if (nc == k) { - ans->dbl_v[i] = narm ? 0.0 : NA_REAL; - } else { - ans->dbl_v[i] = narm ? (double) w : NA_REAL; + if (truehasnf) { + int nc = 0, pinf = 0, ninf = 0; // NA counter within sliding window + int i; // iterator declared here because it is being used after for loop + +#undef SUM_WINDOW_STEP_VALUE +#define SUM_WINDOW_STEP_VALUE \ +if (nc == 0) { \ + if (pinf == 0) { \ + if (ninf == 0) { \ + ans->dbl_v[i] = (double) w; \ + } else { \ + ans->dbl_v[i] = R_NegInf; \ + } \ + } else if (ninf == 0) { \ + ans->dbl_v[i] = R_PosInf; \ + } else { \ + ans->dbl_v[i] = R_NaN; \ + } \ +} else if (nc == k) { \ + ans->dbl_v[i] = narm ? 
0.0 : NA_REAL; \ +} else { \ + if (narm) { \ + if (pinf == 0) { \ + if (ninf == 0) { \ + ans->dbl_v[i] = (double) w; \ + } else { \ + ans->dbl_v[i] = R_NegInf; \ + } \ + } else if (ninf == 0) { \ + ans->dbl_v[i] = R_PosInf; \ + } else { \ + ans->dbl_v[i] = R_NaN; \ + } \ + } else { \ + ans->dbl_v[i] = NA_REAL; \ + } \ +} + + for (i=0; idbl_v[i] = fill; // partial window fill all } - for (uint64_t i=k; idbl_v[i] = (double) w; - } else if (nc == k) { - ans->dbl_v[i] = narm ? 0.0 : NA_REAL; - } else { - ans->dbl_v[i] = narm ? (double) w : NA_REAL; - } + SUM_WINDOW_STEP_FRONT // i==k-1 + SUM_WINDOW_STEP_VALUE + for (uint64_t i=k; imessage[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollsumExact", (uint64_t)nx, k, hasna, (int)narm); + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasnf %d, narm %d\n"), "frollsumExact", (uint64_t)nx, k, hasnf, (int)narm); for (int i=0; idbl_v[i] = fill; } - bool truehasna = hasna>0; - if (!truehasna || !narm) { + bool truehasnf = hasnf>0; + if (!truehasnf || !narm) { #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=k-1; idbl_v[i] = (double) w; - } else { + } else if (ISNAN((double) w)) { if (!narm) { ans->dbl_v[i] = (double) w; } - truehasna = true; + truehasnf = true; + } else { + ans->dbl_v[i] = (double) w; } } - if (truehasna) { - if (hasna==-1) { - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (truehasnf) { + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) { - if (narm) { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), 
__func__); - } else { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n"), __func__); - } + if (narm) + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + else + ansSetMsg(ans, 0, "%s: non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run\n", __func__); } } } - if (truehasna && narm) { + if (truehasnf && narm) { #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=k-1; i DBL_MAX) { + if (w > DBL_MAX) { // in contrast to mean, here we can overflow long double more than DBL_MAX ans->dbl_v[i] = R_PosInf; } else if (w < -DBL_MAX) { ans->dbl_v[i] = R_NegInf; @@ -397,11 +416,207 @@ void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } } +static inline void wmax(double *x, uint64_t o, int k, double *w, uint64_t *iw, bool narm) { + if (narm) { + for (int i=0; i= w[0]) { // this never true if all x NAs and narm=TRUE + iw[0] = o+i-k+1; + w[0] = x[iw[0]]; + } + } + } else { + double ww = R_NegInf; + uint64_t iww = 0; + for (int i=0; i x[i]: NA > NaN + } else { // no NA in window so NaN >= than any non-NA + iww = ii; ww = R_NaN; + } + } else if (ISNAN(ww)) { + // w still within the window and is NA or NaN, x[i] is not NA - already checked above, therefore to nothing + } else if (x[ii] >= ww) { + iww = ii; ww = x[iww]; + } + } + iw[0] = iww; + w[0] = ww; + } +} +/* fast rolling max - fast + * fast online algorithm do single pass over elements keeping track of recent max and its index + * if index of max is within progressing window then it keeps running single pass + * whenever max is leaving the window (index of max is outside of iterator minus window size) then new maximum is computed via nested loop on current location + * new max is used to continue outer single 
pass as long as new max index is not leaving the running window + * should scale well for bigger window size, may carry overhead for small window, needs benchmarking + */ +void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose) { + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasnf %d, narm %d\n"), "frollmaxFast", (uint64_t)nx, k, hasnf, (int)narm); + double w = R_NegInf; // window max + uint64_t cmax = 0; // counter of nested loops for verbose + uint64_t iw = 0; // index of window max + uint64_t i; + if (narm || hasnf==-1) { + for (i=0; i= w) { // >= rather than > because we track most recent maximum using iw + iw = i; w = x[iw]; + } + ans->dbl_v[i] = fill; + } + for (i=k-1; i= w) { + iw = i; w = x[iw]; + } + ans->dbl_v[i] = w; + } + } else { + bool truehasnf = hasnf>0; + for (i=0; i x[i]: NA > NaN + } else { + iw = i; w = R_NaN; + } + } else if (x[i] >= w) { + iw = i; w = x[iw]; + } + ans->dbl_v[i] = fill; + } + if (!truehasnf) { // maybe no NAs + for (; i= w) { + iw = i; w = x[iw]; + } + ans->dbl_v[i] = w; + } + } + if (truehasnf) { + for (; i x[i]: NA > NaN + } else { // no NA in window so NaN >= than any non-NA + iw = i; w = R_NaN; + } + } else if (iw+k <= i) { // max left current window + iw = i-k; w = R_NegInf; + wmax(x, i, k, &w, &iw, false); cmax++; + } else if (ISNAN(w)) { + // w still within the window and is NA or NaN, x[i] is not NA - already checked above, therefore do nothing + } else if (x[i] >= w) { + iw = i; w = x[iw]; + } + ans->dbl_v[i] = w; + } + } + } + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: nested window max calculation called %"PRIu64" times\n"), __func__, cmax); +} +/* fast rolling max - exact + * for each observation in x compute max in window from scratch + * faster version ignores NAs (narm=T or has.nf=F), as they are not propagated by `>` operator + * otherwise we scan for NaN/NA and run either of two 
loops + * has.nf=FALSE can give incorrect results if NAs provided, documented to be used with care + */ +void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose) { + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasnf %d, narm %d\n"), "frollmaxExact", (uint64_t)nx, k, hasnf, (int)narm); + for (int i=0; idbl_v[i] = fill; + } + if (narm || hasnf==-1) { // ignore NAs as > does not propagate + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=k-1; i w) + w = x[i+j]; + } + ans->dbl_v[i] = w; + } + } else { + bool *isnan = malloc(nx*sizeof(bool)); // isnan lookup - we use it to reduce ISNAN calls in nested loop + if (!isnan) { // # nocov start + ansSetMsg(ans, 3, "%s: Unable to allocate memory for isnan", __func__); // raise error + free(isnan); + return; + } // # nocov end + bool truehasnf = hasnf>0; + for (uint64_t i=0; i w) + w = x[i+j]; + } + ans->dbl_v[i] = w; + } + } else { // there are some NAs + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=k-1; i NaN + } else { + w = R_NaN; // continue nested loop in case there is NA there + } + } else if (x[i+j] > w) + w = x[i+j]; + } + } + ans->dbl_v[i] = w; + } + } + } +} + /* fast rolling any R function * not plain C, not thread safe * R eval() allocates */ void frollapply(double *x, int64_t nx, double *w, int k, ans_t *ans, int align, double fill, SEXP call, SEXP rho, bool verbose) { + // early stopping for window bigger than input if (nx < k) { if (verbose) Rprintf(_("%s: window width longer than input vector, returning all NA vector\n"), __func__); @@ -450,6 +665,7 @@ void frollapply(double *x, int64_t nx, double *w, int k, ans_t *ans, int align, UNPROTECT(1); // evali } } + // align if (ans->status < 3 && align < 1) { int k_ = align==-1 ? 
k-1 : floor(k/2); if (verbose) diff --git a/src/frollR.c b/src/frollR.c index 74cc7dd4ef..57eb692872 100644 --- a/src/frollR.c +++ b/src/frollR.c @@ -1,7 +1,8 @@ #include "data.table.h" // first (before Rdefines.h) for clang-13-omp, #5122 #include -SEXP coerceToRealListR(SEXP obj) { +// validate and coerce to list of real +SEXP coerceX(SEXP obj) { // accept atomic/list of integer/logical/real returns list of real int protecti = 0; if (isVectorAtomic(obj)) { @@ -20,83 +21,91 @@ SEXP coerceToRealListR(SEXP obj) { UNPROTECT(protecti); return x; } - -SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasna, SEXP adaptive) { +// validate and coerce to integer or list of integer +SEXP coerceK(SEXP obj, bool adaptive) { int protecti = 0; - const bool verbose = GetVerbose(); - - if (!xlength(obj)) - return(obj); // empty input: NULL, list() - double tic = 0; - if (verbose) - tic = omp_get_wtime(); - SEXP x = PROTECT(coerceToRealListR(obj)); protecti++; - R_len_t nx=length(x); // number of columns to roll on - - if (xlength(k) == 0) // check that window is non zero length - error(_("n must be non 0 length")); - - if (!IS_TRUE_OR_FALSE(adaptive)) - error(_("%s must be TRUE or FALSE"), "adaptive"); - bool badaptive = LOGICAL(adaptive)[0]; - - R_len_t nk = 0; // number of rolling windows, for adaptive might be atomic to be wrapped into list, 0 for clang -Wall - SEXP ik = R_NilValue; // holds integer window width, if doing non-adaptive roll fun - SEXP kl = R_NilValue; // holds adaptive window width, if doing adaptive roll fun - if (!badaptive) { // validating n input for adaptive=FALSE - if (isNewList(k)) + SEXP ans = R_NilValue; + if (!adaptive) { + if (isNewList(obj)) error(_("n must be integer, list is accepted for adaptive TRUE")); - - if (isInteger(k)) { // check that k is integer vector - ik = k; - } else if (isReal(k)) { // if n is double then convert to integer - ik = PROTECT(coerceVector(k, INTSXP)); protecti++; + if 
(isInteger(obj)) { + ans = obj; + } else if (isReal(obj)) { + ans = PROTECT(coerceVector(obj, INTSXP)); protecti++; } else { error(_("n must be integer")); } - - nk = length(k); - R_len_t i=0; // check that all window values positive - while (i < nk && INTEGER(ik)[i] > 0) i++; + int nk = length(obj); + R_len_t i = 0; + int *iik = INTEGER(ans); + while (i < nk && iik[i] > 0) i++; if (i != nk) error(_("n must be positive integer values (> 0)")); - } else { // validating n input for adaptive=TRUE - if (isVectorAtomic(k)) { // if not-list then wrap into list - kl = PROTECT(allocVector(VECSXP, 1)); protecti++; - if (isInteger(k)) { // check that k is integer vector - SET_VECTOR_ELT(kl, 0, k); - } else if (isReal(k)) { // if n is double then convert to integer - SET_VECTOR_ELT(kl, 0, coerceVector(k, INTSXP)); + } else { + if (isVectorAtomic(obj)) { + ans = PROTECT(allocVector(VECSXP, 1)); protecti++; + if (isInteger(obj)) { + SET_VECTOR_ELT(ans, 0, obj); + } else if (isReal(obj)) { + SET_VECTOR_ELT(ans, 0, coerceVector(obj, INTSXP)); } else { error(_("n must be integer vector or list of integer vectors")); } - nk = 1; } else { - nk = length(k); - kl = PROTECT(allocVector(VECSXP, nk)); protecti++; - for (R_len_t i=0; i 0 && (inx[i]!=inx[i-1])) // variable length list input not allowed for adaptive roll error(_("adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame; If you want to call rolling function on list having variable length of elements call it for each field separately")); - if (xlength(VECTOR_ELT(kl, j))!=inx[0]) // check that length of integer vectors in n list match to xrows[0] ([0] and not [i] because there is above check for equal xrows) + if (xlength(VECTOR_ELT(k, j))!=inx[0]) // check that length of integer vectors in n list match to xrows[0] ([0] and not [i] because there is above check for equal xrows) error(_("length of integer vector(s) provided as list to 'n' argument must be equal to number of 
observations provided in 'x'")); } SET_VECTOR_ELT(ans, i*nk+j, allocVector(REALSXP, inx[i]));// allocate answer vector for this column-window @@ -132,11 +141,13 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX dx[i] = REAL(VECTOR_ELT(x, i)); // assign source columns to C pointers } - enum {MEAN, SUM} sfun; + rollfun_t rfun; // adding fun needs to be here and data.table.h if (!strcmp(CHAR(STRING_ELT(fun, 0)), "mean")) { - sfun = MEAN; + rfun = MEAN; } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "sum")) { - sfun = SUM; + rfun = SUM; + } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "max")) { + rfun = MAX; } else { error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "fun", "rolling"); // # nocov } @@ -149,10 +160,10 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX bool bnarm = LOGICAL(narm)[0]; - int ihasna = // plain C tri-state boolean as integer - LOGICAL(hasna)[0]==NA_LOGICAL ? 0 : // hasna NA, default, no info about NA - LOGICAL(hasna)[0]==TRUE ? 1 : // hasna TRUE, might be some NAs - -1; // hasna FALSE, there should be no NAs + int ihasnf = // plain C tri-state boolean as integer + LOGICAL(hasnf)[0]==NA_LOGICAL ? 0 : // hasnf NA, default, no info about NA + LOGICAL(hasnf)[0]==TRUE ? 1 : // hasnf TRUE, might be some NAs + -1; // hasnf FALSE, there should be no NAs // or there must be no NAs for rollmax #5441 unsigned int ialgo; // decode algo to integer if (!strcmp(CHAR(STRING_ELT(algo, 0)), "fast")) @@ -162,15 +173,6 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else error(_("Internal error: invalid %s argument in %s function should have been caught earlier. 
Please report to the data.table issue tracker."), "algo", "rolling"); // # nocov - int* iik = NULL; - if (!badaptive) { - if (!isInteger(ik)) - error(_("Internal error: badaptive=%d but ik is not integer"), badaptive); // # nocov - iik = INTEGER(ik); // pointer to non-adaptive window width, still can be vector when doing multiple windows - } else { - // ik is still R_NilValue from initialization. But that's ok as it's only needed below when !badaptive. - } - if (verbose) { if (ialgo==0) Rprintf(_("%s: %d column(s) and %d window(s), if product > 1 then entering parallel execution\n"), __func__, nx, nk); @@ -180,35 +182,32 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX #pragma omp parallel for if (ialgo==0) schedule(dynamic) collapse(2) num_threads(getDTthreads(nx*nk, false)) for (R_len_t i=0; i mk) + mk = k[i]; + return mk; +} +SEXP frollapplyR(SEXP fun, SEXP xobj, SEXP kobj, SEXP fill, SEXP align, SEXP adaptive, SEXP rho) { int protecti = 0; const bool verbose = GetVerbose(); @@ -217,30 +216,31 @@ SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho) { if (!isEnvironment(rho)) error(_("internal error: 'rho' should be an environment")); // # nocov - if (!xlength(obj)) - return(obj); + if (!xlength(xobj)) + return(xobj); double tic = 0; if (verbose) tic = omp_get_wtime(); - SEXP x = PROTECT(coerceToRealListR(obj)); protecti++; + SEXP x = PROTECT(coerceX(xobj)); protecti++; R_len_t nx = length(x); - if (!isInteger(k)) { - if (isReal(k)) { - if (isRealReallyInt(k)) { - SEXP ik = PROTECT(coerceVector(k, INTSXP)); protecti++; - k = ik; - } else { - error(_("n must be integer")); - } - } else { - error(_("n must be integer")); - } - } - R_len_t nk = length(k); - if (nk == 0) + if (xlength(kobj) == 0) error(_("n must be non 0 length")); - int *ik = INTEGER(k); + + if (!IS_TRUE_OR_FALSE(adaptive)) + error(_("%s must be TRUE or FALSE"), "adaptive"); + bool badaptive = LOGICAL(adaptive)[0]; + + SEXP k = 
PROTECT(coerceK(kobj, badaptive)); protecti++; + int nk = length(k); + int *ik = NULL; int **lk = NULL; + if (!badaptive) { + ik = INTEGER(k); + } else { + lk = (int**)R_alloc(nk, sizeof(int*)); + for (int j=0; j 0 && (inx[i]!=inx[i-1])) + error(_("adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame; If you want to call rolling function on list having variable length of elements call it for each field separately")); + if (xlength(VECTOR_ELT(k, j))!=inx[0]) + error(_("length of integer vector(s) provided as list to 'n' argument must be equal to number of observations provided in 'x'")); + } SET_VECTOR_ELT(ans, i*nk+j, allocVector(REALSXP, inx[i])); dans[i*nk+j] = ((ans_t) { .dbl_v=REAL(VECTOR_ELT(ans, i*nk+j)), .status=0, .message={"\0","\0","\0","\0"} }); } dx[i] = REAL(VECTOR_ELT(x, i)); } - double* dw; SEXP pw, pc; // in the outer loop we handle vectorized k argument // for each k we need to allocate a width window object: pw // we also need to construct distinct R call pointing to that window - for (R_len_t j=0; jmessage[0]), 500, _("%s: algo %u not implemented, fall back to %u\n"), __func__, algo, (unsigned int) 1); + } + frolladaptivemaxExact(x, nx, ans, k, fill, narm, hasnf, verbose); + break; + default: + error(_("Internal error: Unknown rfun value in froll: %d"), rfun); // #nocov } if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); - // implicit n_message limit discussed here: https://github.com/Rdatatable/data.table/issues/3423#issuecomment-487722586 + snprintf(end(ans->message[0]), 500, _("%s: processing fun %d algo %u took %.3fs\n"), __func__, rfun, algo, omp_get_wtime()-tic); } -/* fast adaptive rolling mean - fast - * when no info on NA (hasNA argument) then assume no NAs run faster version + +/* fast rolling adaptive mean - fast + * when no info on NF (has.nf argument) then assume no NFs run faster version * 
adaptive rollmean implemented as cumsum first pass, then diff cumsum by indexes `i` to `i-k[i]` - * if NAs detected re-run rollmean implemented as cumsum with NA support + * if NFs detected re-run rollmean implemented as cumsum with NF support */ -void fadaptiverollmeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { +void frolladaptivemeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollmeanFast", (uint64_t)nx, hasna, (int) narm); - bool truehasna = hasna>0; // flag to re-run if NAs detected + snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", hasnf %d, narm %d\n"), "frolladaptivemeanFast", (uint64_t)nx, hasnf, (int) narm); + bool truehasnf = hasnf>0; // flag to re-run if NAs detected long double w = 0.0; double *cs = malloc(nx*sizeof(double)); // cumsum vector, same as double cs[nx] but no segfault if (!cs) { // # nocov start - ans->status = 3; // raise error - snprintf(ans->message[3], 500, _("%s: Unable to allocate memory for cumsum"), __func__); + ansSetMsg(ans, 3, "%s: Unable to allocate memory for cumsum", __func__); // raise error free(cs); return; } // # nocov end - if (!truehasna) { + if (!truehasnf) { for (uint64_t i=0; idbl_v[i] = fill; // position in a vector smaller than obs window width - partial window } } - } else { // update truehasna flag if NAs detected - if (hasna==-1) { // raise warning - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + } else { // update truehasnf flag if NAs detected + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", 
__func__); if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - w = 0.0; - truehasna = true; + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + w = 0.0; truehasnf = true; } } - if (truehasna) { - uint64_t nc = 0; // running NA counter + if (truehasnf) { + uint64_t nc = 0, pinf = 0, ninf = 0; // running NA counter uint64_t *cn = malloc(nx*sizeof(uint64_t)); // cumulative NA counter, used the same way as cumsum, same as uint64_t cn[nx] but no segfault if (!cn) { // # nocov start - ans->status = 3; // raise error - snprintf(ans->message[3], 500, _("%s: Unable to allocate memory for cum NA counter"), __func__); - free(cs); - free(cn); + ansSetMsg(ans, 3, "%s: Unable to allocate memory for cum NA counter", __func__); // raise error + free(cs); free(cn); + return; + } // # nocov end + uint64_t *cpinf = malloc(nx*sizeof(uint64_t)); + if (!cpinf) { // # nocov start + ansSetMsg(ans, 3, "%s: Unable to allocate memory for cum Inf counter", __func__); // raise error + free(cs); free(cn); free(cpinf); + return; + } // # nocov end + uint64_t *cninf = malloc(nx*sizeof(uint64_t)); + if (!cninf) { // # nocov start + ansSetMsg(ans, 3, "%s: Unable to allocate memory for cum -Inf counter", __func__); // raise error + free(cs); free(cn); free(cpinf); free(cninf); return; } // # nocov end for (uint64_t i=0; i0) { \ + if (narm) { \ + if (wpinf > 0) { \ + if (wninf > 0) { \ + ans->dbl_v[i] = R_NaN; \ + } else { \ + ans->dbl_v[i] = R_PosInf; \ + } \ + } else if (wninf > 0) { \ + ans->dbl_v[i] = R_NegInf; \ + } else { \ + int thisk = k[i] - ((int) wn); \ + ans->dbl_v[i] = thisk==0 ? 
R_NaN : ws/thisk; \ + } \ + } else { \ + ans->dbl_v[i] = NA_REAL; \ + } \ + } else { \ + if (wpinf > 0) { \ + if (wninf > 0) { \ + ans->dbl_v[i] = R_NaN; \ + } else { \ + ans->dbl_v[i] = R_PosInf; \ + } \ + } else if (wninf > 0) { \ + ans->dbl_v[i] = R_NegInf; \ + } else { \ + ans->dbl_v[i] = ws/k[i]; \ + } \ } + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = fill; - } else if (!narm) { // this branch reduce number of branching in narm=1 below - if (i+1 == k[i]) { - ans->dbl_v[i] = cn[i]>0 ? NA_REAL : cs[i]/k[i]; - } else if (i+1 > k[i]) { - ans->dbl_v[i] = (cn[i] - cn[i-k[i]])>0 ? NA_REAL : (cs[i]-cs[i-k[i]])/k[i]; - } - } else if (i+1 == k[i]) { // window width equal to observation position in vector - int thisk = k[i] - ((int) cn[i]); // window width taking NAs into account, we assume single window width is int32, cum NA counter can be int64 - ans->dbl_v[i] = thisk==0 ? R_NaN : cs[i]/thisk; // handle all obs NAs and na.rm=TRUE - } else if (i+1 > k[i]) { // window width smaller than observation position in vector - int thisk = k[i] - ((int) (cn[i] - cn[i-k[i]])); // window width taking NAs into account, we assume single window width is int32, cum NA counter can be int64 - ans->dbl_v[i] = thisk==0 ? 
R_NaN : (cs[i]-cs[i-k[i]])/thisk; // handle all obs NAs and na.rm=TRUE + } else if (i+1 == k[i]) { // first full window + wn = cn[i]; + wpinf = cpinf[i]; + wninf = cninf[i]; + ws = cs[i]; + MEAN_WINDOW_STEP_VALUE + } else { // all the remaining full windows + wn = cn[i] - cn[i-k[i]]; // NAs in current window + wpinf = cpinf[i] - cpinf[i-k[i]]; // Inf in current window + wninf = cninf[i] - cninf[i-k[i]]; // -Inf in current window + ws = cs[i] - cs[i-k[i]]; // cumsum in current window + MEAN_WINDOW_STEP_VALUE } } - free(cn); - } // end of truehasna + free(cninf); free(cpinf); free(cn); + } // end of truehasnf free(cs); } -/* fast adaptive rolling mean exact +/* fast rolling adaptive mean exact * extra nested loop to calculate mean of each obs and error correction * requires much more cpu * uses multiple cores */ -void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { +void frolladaptivemeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollmeanExact", (uint64_t)nx, hasna, (int) narm); - bool truehasna = hasna>0; // flag to re-run if NAs detected - if (!truehasna || !narm) { // narm=FALSE handled here as NAs properly propagated in exact algo + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasnf %d, narm %d\n"), "frolladaptivemeanExact", (uint64_t)nx, hasnf, (int) narm); + bool truehasnf = hasnf>0; // flag to re-run if NAs detected + if (!truehasnf || !narm) { // narm=FALSE handled here as NAs properly propagated in exact algo #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = (double) (res + (err / k[i])); // adjust calculated fun with roundoff correction - } else { + } else if (ISNAN((double) w)) { if (!narm) { - ans->dbl_v[i] 
= (double) (w / k[i]); // NAs should be propagated + ans->dbl_v[i] = (double) w; } - truehasna = true; // NAs detected for this window, set flag so rest of windows will not be re-run + truehasnf = true; // NAs detected for this window, set flag so rest of windows will not be re-run + } else { + ans->dbl_v[i] = (double) w; // Inf and -Inf } } } - if (truehasna) { - if (hasna==-1) { // raise warning - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (truehasnf) { + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) { - if (narm) { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - } else { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n"), __func__); - } + if (narm) + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + else + ansSetMsg(ans, 0, "%s: non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run\n", __func__); } } } - if (truehasna && narm) { + if (truehasnf && narm) { #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; i DBL_MAX) { - ans->dbl_v[i] = R_PosInf; // handle Inf for na.rm=TRUE consistently to base R - } else if (w < -DBL_MAX) { - ans->dbl_v[i] = R_NegInf; - } else { + if (R_FINITE((double) w)) { if (nc == 0) { // no NAs in current window res = w / k[i]; for (int j=-k[i]+1; j<=0; j++) { // sub-loop on window width to accumulate roundoff error @@ -194,38 +258,29 @@ void 
fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f } else { // nc == k[i] ans->dbl_v[i] = R_NaN; // this branch assume narm so R_NaN always here } + } else { + ans->dbl_v[i] = (double) w; } } } - } // end of truehasna + } // end of truehasnf } -/* fast adaptive rolling sum */ -void fadaptiverollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { - double tic = 0; - if (verbose) - tic = omp_get_wtime(); - if (algo==0) { - fadaptiverollsumFast(x, nx, ans, k, fill, narm, hasna, verbose); - } else if (algo==1) { - fadaptiverollsumExact(x, nx, ans, k, fill, narm, hasna, verbose); - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); -} -void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { +/* fast rolling adaptive sum - fast + * same as adaptive mean fast + */ +void frolladaptivesumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollsumFast", (uint64_t)nx, hasna, (int) narm); - bool truehasna = hasna>0; + snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", hasnf %d, narm %d\n"), "frolladaptivesumFast", (uint64_t)nx, hasnf, (int) narm); + bool truehasnf = hasnf>0; long double w = 0.0; double *cs = malloc(nx*sizeof(double)); if (!cs) { // # nocov start - ans->status = 3; - snprintf(ans->message[3], 500, _("%s: Unable to allocate memory for cumsum"), __func__); + ansSetMsg(ans, 3, "%s: Unable to allocate memory for cumsum", __func__); // raise error free(cs); return; } // # nocov end - if (!truehasna) { + if (!truehasnf) { for (uint64_t i=0; istatus = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other 
non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - w = 0.0; - truehasna = true; + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + w = 0.0; truehasnf = true; } } - if (truehasna) { - uint64_t nc = 0; - uint64_t *cn = malloc(nx*sizeof(uint64_t)); + if (truehasnf) { + uint64_t nc = 0, pinf = 0, ninf = 0; // running NA counter + uint64_t *cn = malloc(nx*sizeof(uint64_t)); // cumulative NA counter, used the same way as cumsum, same as uint64_t cn[nx] but no segfault if (!cn) { // # nocov start - ans->status = 3; - snprintf(ans->message[3], 500, _("%s: Unable to allocate memory for cum NA counter"), __func__); - free(cs); - free(cn); + ansSetMsg(ans, 3, "%s: Unable to allocate memory for cum NA counter", __func__); // raise error + free(cs); free(cn); return; } // # nocov end - for (uint64_t i=0; i0) { \ + if (narm) { \ + if (wpinf > 0) { \ + if (wninf > 0) { \ + ans->dbl_v[i] = R_NaN; \ + } else { \ + ans->dbl_v[i] = R_PosInf; \ + } \ + } else if (wninf > 0) { \ + ans->dbl_v[i] = R_NegInf; \ + } else { \ + int thisk = k[i] - ((int) wn); \ + ans->dbl_v[i] = thisk==0 ? 0.0 : ws; \ + } \ + } else { \ + ans->dbl_v[i] = NA_REAL; \ + } \ + } else { \ + if (wpinf > 0) { \ + if (wninf > 0) { \ + ans->dbl_v[i] = R_NaN; \ + } else { \ + ans->dbl_v[i] = R_PosInf; \ + } \ + } else if (wninf > 0) { \ + ans->dbl_v[i] = R_NegInf; \ + } else { \ + ans->dbl_v[i] = ws; \ + } \ } + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = fill; - } else if (!narm) { - if (i+1 == k[i]) { - ans->dbl_v[i] = cn[i]>0 ? 
NA_REAL : cs[i]; - } else if (i+1 > k[i]) { - ans->dbl_v[i] = (cn[i] - cn[i-k[i]])>0 ? NA_REAL : cs[i]-cs[i-k[i]]; - } - } else if (i+1 == k[i]) { - int thisk = k[i] - ((int) cn[i]); - ans->dbl_v[i] = thisk==0 ? 0.0 : cs[i]; - } else if (i+1 > k[i]) { - int thisk = k[i] - ((int) (cn[i] - cn[i-k[i]])); - ans->dbl_v[i] = thisk==0 ? 0.0 : cs[i]-cs[i-k[i]]; + } else if (i+1 == k[i]) { // first full window + wn = cn[i]; + wpinf = cpinf[i]; + wninf = cninf[i]; + ws = cs[i]; + SUM_WINDOW_STEP_VALUE + } else { // all the remaining full windows + wn = cn[i] - cn[i-k[i]]; // NAs in current window + wpinf = cpinf[i] - cpinf[i-k[i]]; // Inf in current window + wninf = cninf[i] - cninf[i-k[i]]; // -Inf in current window + ws = cs[i] - cs[i-k[i]]; // cumsum in current window + SUM_WINDOW_STEP_VALUE } } - free(cn); + free(cninf); free(cpinf); free(cn); } free(cs); } -void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { +/* fast rolling adaptive sum - exact + * same as adaptive mean exact + */ +void frolladaptivesumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasnf, bool verbose) { if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollsumExact", (uint64_t)nx, hasna, (int) narm); - bool truehasna = hasna>0; - if (!truehasna || !narm) { + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasnf %d, narm %d\n"), "frolladaptivesumExact", (uint64_t)nx, hasnf, (int) narm); + bool truehasnf = hasnf>0; + if (!truehasnf || !narm) { #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = (double) w; - } else { + } else if (ISNAN((double) w)) { if (!narm) { ans->dbl_v[i] = (double) w; } - truehasna = true; + truehasnf = true; // NAs detected for this window, set flag so rest of windows will not be re-run + } else { + 
ans->dbl_v[i] = (double) w; // Inf and -Inf } } } - if (truehasna) { - if (hasna==-1) { - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } + if (truehasnf) { + if (hasnf==-1) + ansSetMsg(ans, 2, "%s: has.nf=FALSE used but non-finite values are present in input, use default has.nf=NA to avoid this warning", __func__); if (verbose) { - if (narm) { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - } else { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n"), __func__); - } + if (narm) + ansSetMsg(ans, 0, "%s: non-finite values are present in input, re-running with extra care for NFs\n", __func__); + else + ansSetMsg(ans, 0, "%s: non-finite values are present in input, na.rm=FALSE and algo='exact' propagates NFs properply, no need to re-run\n", __func__); } } } - if (truehasna && narm) { + if (truehasnf && narm) { #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; imessage[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasnf %d, narm %d\n"), "frolladaptivemaxExact", (uint64_t)nx, hasnf, (int) narm); + if (narm || hasnf==-1) { // fastest we can get for adaptive max as there is no algo='fast', therefore we drop any NA checks when has.nf=FALSE + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=0; idbl_v[i] = fill; + } else { + double w = R_NegInf; + for (int j=-k[i]+1; j<=0; j++) { + if (x[i+j] > w) + w = x[i+j]; + } + ans->dbl_v[i] = w; + } + } + } else { + bool *isnan = malloc(nx*sizeof(bool)); // isnan lookup - we use it to reduce ISNAN calls in nested loop + if (!isnan) { // # nocov start + 
ansSetMsg(ans, 3, "%s: Unable to allocate memory for isnan", __func__); // raise error + free(isnan); + return; + } // # nocov end + bool truehasnf = hasnf>0; + for (uint64_t i=0; idbl_v[i] = fill; + } else { + double w = R_NegInf; + for (int j=-k[i]+1; j<=0; j++) { + if (x[i+j] > w) + w = x[i+j]; + } + ans->dbl_v[i] = w; + } + } + } else { // there are some NAs + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=0; idbl_v[i] = fill; + } else { + double w = R_NegInf; + if (isnan[i] && ISNA(x[i])) { + w = NA_REAL; + } else { + for (int j=-k[i]+1; j<=0; j++) { + if (isnan[i+j]) { + if (ISNA(x[i+j])) { + w = NA_REAL; + break; + } else { + w = R_NaN; + } + } else if (x[i+j] > w) + w = x[i+j]; + } + } + ans->dbl_v[i] = w; + } + } + } + } +} + +/* fast rolling adaptive any R function + * not plain C, not thread safe + * R eval() allocates + * takes SEXP because it has SETLENGTH for each window + */ +void frolladaptiveapply(double *x, int64_t nx, SEXP pw, int *k, ans_t *ans, double fill, SEXP call, SEXP rho, bool verbose) { + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + + double *w = REAL(pw); + // this is i=k[0]-1 iteration - first full window - taken out from the loop + // we use it to add extra check that results of a FUN are length 1 numeric + SEXPTYPE teval0; + uint64_t i; // #loop_counter_not_local_scope_ok + for (i=0; idbl_v[i] = fill; + } else { + SETLENGTH(pw, k[i]); + memcpy(w, x+(i-k[i]+1), k[i]*sizeof(double)); + SEXP eval0 = PROTECT(eval(call, rho)); + if (xlength(eval0) != 1) + error(_("%s: results from provided FUN are not length 1"), __func__); + teval0 = TYPEOF(eval0); + if (teval0 == REALSXP) { + ans->dbl_v[i] = REAL(eval0)[0]; + } else { + if (teval0==INTSXP || teval0==LGLSXP) { + if (verbose) + Rprintf(_("%s: results from provided FUN are not of type double, coercion from integer or logical will be applied on each iteration\n"), __func__); + ans->dbl_v[i] = REAL(coerceVector(eval0, REALSXP))[0]; + } else { + 
error(_("%s: results from provided FUN are not of type double"), __func__); + } + } + UNPROTECT(1); // eval0 + break; + } + } + if (i==nx) { // none of the windows in k was small enough to cover length of x + return; + } + // for each row it sets length of current window because it is adaptive version + // then copies expected window data into w + // evaluate call which has been prepared to point into w + if (teval0 == REALSXP) { + for (; idbl_v[i] = fill; + } else { + SETLENGTH(pw, k[i]); + memcpy(w, x+(i-k[i]+1), k[i]*sizeof(double)); + ans->dbl_v[i] = REAL(eval(call, rho))[0]; // this may fail with for a not type-stable fun + } + } + } else { + for (; idbl_v[i] = fill; + } else { + SETLENGTH(pw, k[i]); + memcpy(w, x+(i-k[i]+1), k[i]*sizeof(double)); + SEXP evali = PROTECT(eval(call, rho)); + ans->dbl_v[i] = REAL(coerceVector(evali, REALSXP))[0]; + UNPROTECT(1); // evali + } + } + } + if (verbose) + Rprintf(_("%s: took %.3fs\n"), __func__, omp_get_wtime()-tic); +} diff --git a/src/fsort.c b/src/fsort.c index 2618ec577b..6dbb85d550 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -165,7 +165,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { int MSBNbits = maxBit > 15 ? 
16 : maxBit+1; // how many bits make up the MSB int shift = maxBit + 1 - MSBNbits; // the right shift to leave the MSB bits remaining size_t MSBsize = 1LL< 65,536) - if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%zu\n"), maxBit, MSBNbits, shift, MSBsize); + if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n"), maxBit, MSBNbits, shift, MSBsize); uint64_t *counts = (uint64_t *)R_alloc(nBatch*MSBsize, sizeof(uint64_t)); memset(counts, 0, nBatch*MSBsize*sizeof(uint64_t)); @@ -242,11 +242,11 @@ SEXP fsort(SEXP x, SEXP verboseArg) { if (verbose) { Rprintf(_("Top 20 MSB counts: ")); for(int i=0; i0 && msbCounts[order[MSBsize-1]] < 2) MSBsize--; if (verbose) { - Rprintf(_("%zu by excluding 0 and 1 counts\n"), MSBsize); + Rprintf(_("%d by excluding 0 and 1 counts\n"), MSBsize); } bool failed=false, alloc_fail=false, non_monotonic=false; // shared bools only ever assigned true; no need for atomic or critical assign diff --git a/src/fwrite.c b/src/fwrite.c index 322909749a..4922dd8b78 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -60,17 +60,17 @@ inline void write_chars(const char *x, char **pch) *pch = ch; } -void writeBool8(const void *col, int64_t row, char **pch) +void writeBool8(int8_t *col, int64_t row, char **pch) { - int8_t x = ((const int8_t *)col)[row]; + int8_t x = col[row]; char *ch = *pch; *ch++ = '0'+(x==1); *pch = ch-(x==INT8_MIN); // if NA then step back, to save a branch } -void writeBool32(const void *col, int64_t row, char **pch) +void writeBool32(int32_t *col, int64_t row, char **pch) { - int32_t x = ((const int32_t *)col)[row]; + int32_t x = col[row]; char *ch = *pch; if (x==INT32_MIN) { // TODO: when na=='\0' as recommended, use a branchless writer write_chars(na, &ch); @@ -80,9 +80,9 @@ void writeBool32(const void *col, int64_t row, char **pch) *pch = ch; } -void writeBool32AsString(const void *col, int64_t row, char **pch) +void writeBool32AsString(int32_t *col, int64_t row, char **pch) { - int32_t x = ((const 
int32_t *)col)[row]; + int32_t x = col[row]; char *ch = *pch; if (x == INT32_MIN) { write_chars(na, &ch); @@ -106,10 +106,10 @@ static inline void reverse(char *upp, char *low) } } -void writeInt32(const void *col, int64_t row, char **pch) +void writeInt32(int32_t *col, int64_t row, char **pch) { char *ch = *pch; - int32_t x = ((const int32_t *)col)[row]; + int32_t x = col[row]; if (x == INT32_MIN) { write_chars(na, &ch); } else { @@ -122,10 +122,10 @@ void writeInt32(const void *col, int64_t row, char **pch) *pch = ch; } -void writeInt64(const void *col, int64_t row, char **pch) +void writeInt64(int64_t *col, int64_t row, char **pch) { char *ch = *pch; - int64_t x = ((const int64_t *)col)[row]; + int64_t x = col[row]; if (x == INT64_MIN) { write_chars(na, &ch); } else { @@ -177,7 +177,7 @@ void genLookups() { } */ -void writeFloat64(const void *col, int64_t row, char **pch) +void writeFloat64(double *col, int64_t row, char **pch) { // hand-rolled / specialized for speed // *pch is safely the output destination with enough space (ensured via calculating maxLineLen up front) @@ -187,7 +187,7 @@ void writeFloat64(const void *col, int64_t row, char **pch) // ii) no C libary calls such as sprintf() where the fmt string has to be interpretted over and over // iii) no need to return variables or flags. Just writes. // iv) shorter, easier to read and reason with in one self contained place. 
- double x = ((const double *)col)[row]; + double x = col[row]; char *ch = *pch; if (!isfinite(x)) { if (isnan(x)) { @@ -301,9 +301,9 @@ void writeFloat64(const void *col, int64_t row, char **pch) *pch = ch; } -void writeComplex(const void *col, int64_t row, char **pch) +void writeComplex(Rcomplex *col, int64_t row, char **pch) { - Rcomplex x = ((const Rcomplex *)col)[row]; + Rcomplex x = col[row]; char *ch = *pch; writeFloat64(&x.r, 0, &ch); if (!ISNAN(x.i)) { @@ -340,8 +340,8 @@ static inline void write_time(int32_t x, char **pch) *pch = ch; } -void writeITime(const void *col, int64_t row, char **pch) { - write_time(((const int32_t *)col)[row], pch); +void writeITime(int32_t *col, int64_t row, char **pch) { + write_time(col[row], pch); } static inline void write_date(int32_t x, char **pch) @@ -394,16 +394,15 @@ static inline void write_date(int32_t x, char **pch) *pch = ch; } -void writeDateInt32(const void *col, int64_t row, char **pch) { - write_date(((const int32_t *)col)[row], pch); +void writeDateInt32(int32_t *col, int64_t row, char **pch) { + write_date(col[row], pch); } -void writeDateFloat64(const void *col, int64_t row, char **pch) { - double x = ((const double *)col)[row]; - write_date(isfinite(x) ? (int)(x) : INT32_MIN, pch); +void writeDateFloat64(double *col, int64_t row, char **pch) { + write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, pch); } -void writePOSIXct(const void *col, int64_t row, char **pch) +void writePOSIXct(double *col, int64_t row, char **pch) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -412,7 +411,7 @@ void writePOSIXct(const void *col, int64_t row, char **pch) // All positive integers up to 2^53 (9e15) are exactly representable by double which is relied // on in the ops here; number of seconds since epoch. 
- double x = ((const double *)col)[row]; + double x = col[row]; char *ch = *pch; if (!isfinite(x)) { write_chars(na, &ch); @@ -465,9 +464,9 @@ void writePOSIXct(const void *col, int64_t row, char **pch) *pch = ch; } -void writeNanotime(const void *col, int64_t row, char **pch) +void writeNanotime(int64_t *col, int64_t row, char **pch) { - int64_t x = ((const int64_t *)col)[row]; + int64_t x = col[row]; char *ch = *pch; if (x == INT64_MIN) { write_chars(na, &ch); @@ -550,12 +549,12 @@ static inline void write_string(const char *x, char **pch) void writeString(const void *col, int64_t row, char **pch) { - write_string(getString((const SEXP *)col, row), pch); + write_string(getString(col, row), pch); } void writeCategString(const void *col, int64_t row, char **pch) { - write_string(getCategString((const SEXP *)col, row), pch); + write_string(getCategString(col, row), pch); } #ifndef NOZLIB @@ -714,7 +713,7 @@ void fwriteMain(fwriteMainArgs args) } if (headerLen) { char *buff = malloc(headerLen); - if (!buff) STOP(_("Unable to allocate %zu MiB for header: %s"), headerLen / 1024 / 1024, strerror(errno)); + if (!buff) STOP(_("Unable to allocate %d MiB for header: %s"), headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; if (args.bom) {*ch++=(char)0xEF; *ch++=(char)0xBB; *ch++=(char)0xBF; } // 3 appears above (search for "bom") memcpy(ch, args.yaml, yamlLen); @@ -736,7 +735,7 @@ void fwriteMain(fwriteMainArgs args) } if (f==-1) { *ch = '\0'; - DTPRINT("%s", buff); + DTPRINT(buff); free(buff); } else { int ret1=0, ret2=0; @@ -753,7 +752,7 @@ void fwriteMain(fwriteMainArgs args) char *zbuff = malloc(zbuffSize); if (!zbuff) { free(buff); // # nocov - STOP(_("Unable to allocate %zu MiB for zbuffer: %s"), zbuffSize / 1024 / 1024, strerror(errno)); // # nocov + STOP(_("Unable to allocate %d MiB for zbuffer: %s"), zbuffSize / 1024 / 1024, strerror(errno)); // # nocov } size_t zbuffUsed = zbuffSize; ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (size_t)(ch-buff)); 
@@ -820,7 +819,7 @@ void fwriteMain(fwriteMainArgs args) char *buffPool = malloc(nth*(size_t)buffSize); if (!buffPool) { // # nocov start - STOP(_("Unable to allocate %zu MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), + STOP(_("Unable to allocate %d MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), (size_t)buffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } @@ -831,7 +830,7 @@ void fwriteMain(fwriteMainArgs args) if (!zbuffPool) { // # nocov start free(buffPool); - STOP(_("Unable to allocate %zu MB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), + STOP(_("Unable to allocate %d MB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), (size_t)zbuffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } @@ -926,7 +925,7 @@ void fwriteMain(fwriteMainArgs args) errno=0; if (f==-1) { *ch='\0'; // standard C string end marker so DTPRINT knows where to stop - DTPRINT("%s", myBuff); + DTPRINT(myBuff); } else if ((args.is_gzip ? 
WRITE(f, myzBuff, (int)myzbuffUsed) : WRITE(f, myBuff, (int)(ch-myBuff))) == -1) { failed=true; // # nocov diff --git a/src/fwrite.h b/src/fwrite.h index 0fef0c7f6e..6886c7791d 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -3,30 +3,29 @@ #else #define STRICT_R_HEADERS #include - #include // for SEXP in writeList() prototype #include "po.h" #define STOP error #define DTPRINT Rprintf #endif -typedef void writer_fun_t(const void *, int64_t, char **); +typedef void (*writer_fun_t)(const void *, int64_t, char **); // in the order of writer_fun_t in fwriteR.c -writer_fun_t writeBool8; -writer_fun_t writeBool32; -writer_fun_t writeBool32AsString; -writer_fun_t writeInt32; -writer_fun_t writeInt64; -writer_fun_t writeFloat64; -writer_fun_t writeComplex; -writer_fun_t writeITime; -writer_fun_t writeDateInt32; -writer_fun_t writeDateFloat64; -writer_fun_t writePOSIXct; -writer_fun_t writeNanotime; -writer_fun_t writeString; -writer_fun_t writeCategString; -writer_fun_t writeList; +void writeBool8(); +void writeBool32(); +void writeBool32AsString(); +void writeInt32(); +void writeInt64(); +void writeFloat64(); +void writeComplex(); +void writeITime(); +void writeDateInt32(); +void writeDateFloat64(); +void writePOSIXct(); +void writeNanotime(); +void writeString(); +void writeCategString(); +void writeList(); void write_chars(const char *source, char **dest); @@ -76,7 +75,7 @@ typedef struct fwriteMainArgs int64_t nrow; // a vector of pointers to all-same-length column vectors const void **columns; - writer_fun_t **funs; // a vector of writer_fun_t function pointers + writer_fun_t *funs; // a vector of writer_fun_t function pointers // length ncol vector containing which fun[] to use for each column // one byte to use 8 times less cache lines than a vector of function pointers would do diff --git a/src/fwriteR.c b/src/fwriteR.c index f64768d70b..a36e443156 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -19,7 +19,7 @@ static const char *sep2start, *sep2end; // if there 
are no list columns, set sep2=='\0' // Non-agnostic helpers ... -const char *getString(const SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c +const char *getString(SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c SEXP x = col[row]; return x==NA_STRING ? NULL : ENCODED_CHAR(x); } @@ -53,7 +53,7 @@ const char *getCategString(SEXP col, int64_t row) { return x==NA_INTEGER ? NULL : ENCODED_CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } -writer_fun_t *funs[] = { +writer_fun_t funs[] = { &writeBool8, &writeBool32, &writeBool32AsString, @@ -73,8 +73,8 @@ writer_fun_t *funs[] = { static int32_t whichWriter(SEXP); -void writeList(const void *col, int64_t row, char **pch) { - SEXP v = ((const SEXP *)col)[row]; +void writeList(SEXP *col, int64_t row, char **pch) { + SEXP v = col[row]; int32_t wf = whichWriter(v); if (TYPEOF(v)==VECSXP || wf==INT32_MIN || isFactor(v)) { error(_("Internal error: getMaxListItemLen should have caught this up front.")); // # nocov @@ -82,7 +82,7 @@ void writeList(const void *col, int64_t row, char **pch) { char *ch = *pch; write_chars(sep2start, &ch); const void *data = DATAPTR_RO(v); - writer_fun_t *fun = funs[wf]; + writer_fun_t fun = funs[wf]; for (int j=0; j16) bitshift=nb/2; // TODO: when we have stress-test off mode, do this - mask = (1<>bitshift) + 1; + shift = nb/2; // /2 so that high and low can be uint16_t, and no limit (even for nb=4) to stress-test. + // shift=MAX(nb-8,0); if (shift>16) shift=nb/2; // TODO: when we have stress-test off mode, do this + mask = (1<>shift) + 1; grp = (int *)R_alloc(nrow, sizeof(int)); // TODO: use malloc and made this local as not needed globally when all functions here use gather // maybe better to malloc to avoid R's heap. 
This grp isn't global, so it doesn't need to be R_alloc @@ -86,8 +86,8 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { // TODO: enable stress-test mode in tests only (#3205) which can be turned off by default in release to decrease overhead on small data // if that is established to be biting (it may be fine). if (nBatch<1 || batchSize<1 || lastBatchSize<1) { - error(_("Internal error: nrow=%d ngrp=%d nbit=%d bitshift=%d highSize=%zu nBatch=%zu batchSize=%zu lastBatchSize=%zu\n"), // # nocov - nrow, ngrp, nb, bitshift, highSize, nBatch, batchSize, lastBatchSize); // # nocov + error(_("Internal error: nrow=%d ngrp=%d nbit=%d shift=%d highSize=%d nBatch=%d batchSize=%d lastBatchSize=%d\n"), // # nocov + nrow, ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize); // # nocov } // initial population of g: #pragma omp parallel for num_threads(getDTthreads(ngrp, false)) @@ -108,9 +108,9 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *restrict op = INTEGER(o); // o is a permutation of 1:nrow int nb = nbit(nrow-1); - int bitshift = MAX(nb-8, 0); // TODO: experiment nb/2. Here it doesn't have to be /2 currently. - int highSize = ((nrow-1)>>bitshift) + 1; - //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d bitshift=%d nBatch=%d\n"), highSize, nb, bitshift, nBatch); + int shift = MAX(nb-8, 0); // TODO: experiment nb/2. Here it doesn't have to be /2 currently. 
+ int highSize = ((nrow-1)>>shift) + 1; + //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d shift=%d nBatch=%d\n"), highSize, nb, shift, nBatch); int *counts = calloc(nBatch*highSize, sizeof(int)); // TODO: cache-line align and make highSize a multiple of 64 int *TMP = malloc(nrow*2l*sizeof(int)); // must multiple the long int otherwise overflow may happen, #4295 if (!counts || !TMP ) error(_("Internal error: Failed to allocate counts or TMP when assigning g in gforce")); @@ -120,7 +120,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *my_o = op + b*batchSize; int *restrict my_counts = counts + b*highSize; for (int i=0; i> bitshift; + const int w = (my_o[i]-1) >> shift; my_counts[w]++; } for (int i=0, cum=0; i> bitshift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too + const int w = (my_o[i]-1) >> shift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too int *p = my_tmp + 2*my_counts[w]++; *p++ = my_o[i]-1; *p = my_g[i]; @@ -172,7 +172,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *my_pg = gp + b*batchSize; const int howMany = b==nBatch-1 ? 
lastBatchSize : batchSize; for (int i=0; i> bitshift; + const int w = my_pg[i] >> shift; my_counts[w]++; my_high[i] = (uint16_t)w; // reduce 4 bytes to 2 } @@ -185,7 +185,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize; memcpy(my_tmpcounts, my_counts, highSize*sizeof(int)); for (int i=0; i> bitshift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too + const int w = my_pg[i] >> shift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too my_low[my_tmpcounts[w]++] = (uint16_t)(my_pg[i] & mask); } // counts is now cumulated within batch (with ending values) and we leave it that way @@ -362,7 +362,7 @@ SEXP gsum(SEXP x, SEXP narmArg) if (!anyNA) { #pragma omp parallel for num_threads(getDTthreads(highSize, false)) //schedule(dynamic,1) for (int h=0; h8) error(_("Pointers are %zu bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); + if (sizeof(char *)>8) error(_("Pointers are %d bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); // One place we need the largest sizeof is the working memory malloc in reorder.c } @@ -177,24 +268,23 @@ void attribute_visible R_init_data_table(DllInfo *info) const char *msg = _("... failed. 
Please forward this message to maintainer('data.table')."); if ((int)NA_INTEGER != (int)INT_MIN) error(_("Checking NA_INTEGER [%d] == INT_MIN [%d] %s"), NA_INTEGER, INT_MIN, msg); if ((int)NA_INTEGER != (int)NA_LOGICAL) error(_("Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s"), NA_INTEGER, NA_LOGICAL, msg); - if (sizeof(int) != 4) error(_("Checking sizeof(%s) [%zu] is %d %s"), "int", sizeof(int), 4, msg); - if (sizeof(double) != 8) error(_("Checking sizeof(%s) [%zu] is %d %s"), "double", sizeof(double), 8, msg); // 8 on both 32bit and 64bit - // alignof not available in C99: if (alignof(double) != 8) error(_("Checking alignof(double) [%lu] is 8 %s"), alignof(double), msg); // 8 on both 32bit and 64bit - if (sizeof(long long) != 8) error(_("Checking sizeof(%s) [%zu] is %d %s"), "long long", sizeof(long long), 8, msg); - if (sizeof(char *) != 4 && sizeof(char *) != 8) error(_("Checking sizeof(pointer) [%zu] is 4 or 8 %s"), sizeof(char *), msg); - if (sizeof(SEXP) != sizeof(char *)) error(_("Checking sizeof(SEXP) [%zu] == sizeof(pointer) [%zu] %s"), sizeof(SEXP), sizeof(char *), msg); - if (sizeof(uint64_t) != 8) error(_("Checking sizeof(%s) [%zu] is %d %s"), "uint64_t", sizeof(uint64_t), 8, msg); - if (sizeof(int64_t) != 8) error(_("Checking sizeof(%s) [%zu] is %d %s"), "int64_t", sizeof(int64_t), 8, msg); - if (sizeof(signed char) != 1) error(_("Checking sizeof(%s) [%zu] is %d %s"), "signed char", sizeof(signed char), 1, msg); - if (sizeof(int8_t) != 1) error(_("Checking sizeof(%s) [%zu] is %d %s"), "int8_t", sizeof(int8_t), 1, msg); - if (sizeof(uint8_t) != 1) error(_("Checking sizeof(%s) [%zu] is %d %s"), "uint8_t", sizeof(uint8_t), 1, msg); - if (sizeof(int16_t) != 2) error(_("Checking sizeof(%s) [%zu] is %d %s"), "int16_t", sizeof(int16_t), 2, msg); - if (sizeof(uint16_t) != 2) error(_("Checking sizeof(%s) [%zu] is %d %s"), "uint16_t", sizeof(uint16_t), 2 ,msg); + if (sizeof(int) != 4) error(_("Checking sizeof(%s) [%d] is %d %s"), "int", sizeof(int), 4, msg); + 
if (sizeof(double) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "double", sizeof(double), 8, msg); // 8 on both 32bit and 64bit + // alignof not available in C99: if (alignof(double) != 8) error(_("Checking alignof(double) [%d] is 8 %s"), alignof(double), msg); // 8 on both 32bit and 64bit + if (sizeof(long long) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "long long", sizeof(long long), 8, msg); + if (sizeof(char *) != 4 && sizeof(char *) != 8) error(_("Checking sizeof(pointer) [%d] is 4 or 8 %s"), sizeof(char *), msg); + if (sizeof(SEXP) != sizeof(char *)) error(_("Checking sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s"), sizeof(SEXP), sizeof(char *), msg); + if (sizeof(uint64_t) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint64_t", sizeof(uint64_t), 8, msg); + if (sizeof(int64_t) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "int64_t", sizeof(int64_t), 8, msg); + if (sizeof(signed char) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "signed char", sizeof(signed char), 1, msg); + if (sizeof(int8_t) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "int8_t", sizeof(int8_t), 1, msg); + if (sizeof(uint8_t) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint8_t", sizeof(uint8_t), 1, msg); + if (sizeof(int16_t) != 2) error(_("Checking sizeof(%s) [%d] is %d %s"), "int16_t", sizeof(int16_t), 2, msg); + if (sizeof(uint16_t) != 2) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint16_t", sizeof(uint16_t), 2 ,msg); SEXP tmp = PROTECT(allocVector(INTSXP,2)); if (LENGTH(tmp)!=2) error(_("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s"), LENGTH(tmp), msg); - // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768 - if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%lld] is 0 %s"), (long long)TRUELENGTH(tmp), msg); + if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s"), TRUELENGTH(tmp), msg); UNPROTECT(1); // 
According to IEEE (http://en.wikipedia.org/wiki/IEEE_754-1985#Zero) we can rely on 0.0 being all 0 bits. @@ -311,7 +401,7 @@ inline double LLtoD(long long x) { return u.d; } -int GetVerbose(void) { +int GetVerbose() { // don't call repetitively; save first in that case SEXP opt = GetOption(sym_verbose, R_NilValue); if ((!isLogical(opt) && !isInteger(opt)) || LENGTH(opt)!=1 || INTEGER(opt)[0]==NA_INTEGER) @@ -320,7 +410,7 @@ int GetVerbose(void) { } // # nocov start -SEXP hasOpenMP(void) { +SEXP hasOpenMP() { // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is // why RPRINTF is capitalized in this comment to avoid that grep. @@ -333,16 +423,6 @@ SEXP hasOpenMP(void) { } // # nocov end -SEXP beforeR340(void) { - // used in onAttach.R for message about fread memory leak fix needing R 3.4.0 - // at C level to catch if user upgrades R but does not reinstall data.table - #if defined(R_VERSION) && R_VERSION=xn || ixo[j]<=0) { + if (ixo[j] <= 0 || j >= xn) { // NA_integer_ = INT_MIN is checked in init.c // j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) inewstarts[i] = inomatch; diff --git a/src/openmp-utils.c b/src/openmp-utils.c index 483a91654c..c9003ee07b 100644 --- a/src/openmp-utils.c +++ b/src/openmp-utils.c @@ -29,7 +29,7 @@ static int getIntEnv(const char *name, int def) static inline int imin(int a, int b) { return a < b ? a : b; } static inline int imax(int a, int b) { return a > b ? a : b; } -void initDTthreads(void) { +void initDTthreads() { // called at package startup from init.c // also called by setDTthreads(threads=NULL) (default) to reread environment variables; see setDTthreads below // No verbosity here in this setter. 
Verbosity is in getDTthreads(verbose=TRUE) @@ -169,16 +169,16 @@ SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent, SEXP thro static int pre_fork_DTthreads = 0; -void when_fork(void) { +void when_fork() { pre_fork_DTthreads = DTthreads; DTthreads = 1; } -void after_fork(void) { +void after_fork() { if (RestoreAfterFork) DTthreads = pre_fork_DTthreads; } -void avoid_openmp_hang_within_fork(void) { +void avoid_openmp_hang_within_fork() { // Called once on loading data.table from init.c #ifdef _OPENMP pthread_atfork(&when_fork, &after_fork, NULL); diff --git a/src/rbindlist.c b/src/rbindlist.c index ba19d2c389..3669028835 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -208,7 +208,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : ""; snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"), w2+1, str, i+1, missi+1, extra ); - if (usenames==TRUE) error("%s", buff); + if (usenames==TRUE) error(buff); i = LENGTH(l); // break from outer i loop break; // break from inner j loop } @@ -229,8 +229,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) } const char *o = isNull(opt) ? "message" : CHAR(STRING_ELT(opt,0)); if (strcmp(o,"message")==0) { eval(PROTECT(lang2(install("message"),PROTECT(ScalarString(mkChar(buff))))), R_GlobalEnv); UNPROTECT(2); } - else if (strcmp(o,"warning")==0) warning("%s", buff); - else if (strcmp(o,"error")==0) error("%s", buff); + else if (strcmp(o,"warning")==0) warning(buff); + else if (strcmp(o,"error")==0) error(buff); else if (strcmp(o,"none")!=0) warning(_("options()$datatable.rbindlist.check=='%s' which is not 'message'|'warning'|'error'|'none'. 
See news item 5 in v1.12.2."), o); } } @@ -282,7 +282,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) for (int i=0; i maxSize) @@ -24,7 +24,7 @@ SEXP reorder(SEXP x, SEXP order) copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16 && SIZEOF(x)!=1) - error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%zu)"), type2char(TYPEOF(x)), SIZEOF(x)); + error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)"), type2char(TYPEOF(x)), SIZEOF(x)); if (ALTREP(x)) error(_("Internal error in reorder.c: cannot reorder an ALTREP vector. Please see NEWS item 2 in v1.11.4 and report this as a bug.")); // # nocov maxSize = SIZEOF(x); nrow = length(x); diff --git a/src/shift.c b/src/shift.c index 30c13a547a..dba598fe50 100644 --- a/src/shift.c +++ b/src/shift.c @@ -8,8 +8,6 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) if (!xlength(obj)) return(obj); // NULL, list() SEXP x; if (isVectorAtomic(obj)) { - if (!isNull(getAttrib(obj, R_DimSymbol))) - error(_("shift input must not be matrix or array, consider wrapping it into data.table() or c()")); x = PROTECT(allocVector(VECSXP, 1)); nprotect++; SET_VECTOR_ELT(x, 0, obj); } else { diff --git a/src/snprintf.c b/src/snprintf.c index f322931fc7..94199af707 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -184,7 +184,7 @@ int dt_win_snprintf(char *dest, const size_t n, const char *fmt, ...) 
return nc; } -SEXP test_dt_win_snprintf(void) +SEXP test_dt_win_snprintf() { char buff[50]; @@ -214,7 +214,7 @@ SEXP test_dt_win_snprintf(void) int res = dt_win_snprintf(buff, 10, "%4$d%2$d%3$d%5$d%1$d", 111, 222, 33, 44, 555); // fmt longer than n if (strlen(buff)!=9 || strcmp(buff, "442223355")) error(_("dt_win_snprintf test %d failed: %s"), 9, buff); - if (res!=13) /* should return what would have been written if not chopped */ error(_("dt_win_snprintf test %d failed: %d"), 10, res); + if (res!=13) /* should return what would have been written if not chopped */ error(_("dt_win_snprintf test %d failed: %s"), 10, res); dt_win_snprintf(buff, 39, "%l", 3); if (strlen(buff)!=38 || strcmp(buff, "0 %l does not end with recognized t")) error(_("dt_win_snprintf test %d failed: %s"), 11, buff); diff --git a/src/subset.c b/src/subset.c index d3513a7759..2158451798 100644 --- a/src/subset.c +++ b/src/subset.c @@ -281,7 +281,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++; rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE), ScalarLogical(TRUE))); nprotect++; const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset); - if (err!=NULL) error("%s", err); + if (err!=NULL) error(err); } if (!isInteger(cols)) error(_("Internal error. Argument '%s' to %s is type '%s' not '%s'"), "cols", "Csubset", type2char(TYPEOF(cols)), "integer"); // # nocov diff --git a/src/types.c b/src/types.c index 6e9020bb59..cc64aa39dc 100644 --- a/src/types.c +++ b/src/types.c @@ -8,10 +8,20 @@ char *end(char *start) { return strchr(start, 0); } +/* + * logging status and messages, warnings, errors to ans_t + */ +void ansSetMsg(ans_t *ans, uint8_t status, const char *msg, const char *func) { + if (status > ans->status) + ans->status = status; + snprintf(end(ans->message[status]), 500, _(msg), func); // func should be passed via ... 
really, thus this helper cannot replace all cases we need + // implicit n_message limit discussed here: https://github.com/Rdatatable/data.table/issues/3423#issuecomment-487722586 +} + /* * function to print verbose messages, stderr messages, warnings and errors stored in ans_t struct */ -void ansMsg(ans_t *ans, int n, bool verbose, const char *func) { +void ansGetMsgs(ans_t *ans, int n, bool verbose, const char *func) { for (int i=0; imessage[0]), 500, "%s: stdout 1 message\n", __func__); - snprintf(end(ans->message[0]), 500, "%s: stdout 2 message\n", __func__); + ansSetMsg(ans, 0, "%s: stdout 1 message\n", __func__); + ansSetMsg(ans, 0, "%s: stdout 2 message\n", __func__); } if (istatus == 1 || istatus == 12 || istatus == 13 || istatus == 123) { - snprintf(end(ans->message[1]), 500, "%s: stderr 1 message\n", __func__); - snprintf(end(ans->message[1]), 500, "%s: stderr 2 message\n", __func__); - ans->status = 1; + ansSetMsg(ans, 1, "%s: stderr 1 message\n", __func__); + ansSetMsg(ans, 1, "%s: stderr 2 message\n", __func__); } if (istatus == 2 || istatus == 12 || istatus == 23 || istatus == 123) { - snprintf(end(ans->message[2]), 500, "%s: stderr 1 warning\n", __func__); - snprintf(end(ans->message[2]), 500, "%s: stderr 2 warning\n", __func__); - ans->status = 2; + ansSetMsg(ans, 2, "%s: stderr 1 warning\n", __func__); + ansSetMsg(ans, 2, "%s: stderr 2 warning\n", __func__); } if (istatus == 3 || istatus == 13 || istatus == 23 || istatus == 123) { - snprintf(end(ans->message[3]), 500, "%s: stderr 1 error\n", __func__); - snprintf(end(ans->message[3]), 500, "%s: stderr 2 error\n", __func__); // printed too because errors appended and raised from ansMsg later on - ans->status = 3; + ansSetMsg(ans, 3, "%s: stderr 1 error\n", __func__); + ansSetMsg(ans, 3, "%s: stderr 2 error\n", __func__); // printed too because errors appended and raised from ansGetMsgs later on } - ans->int_v[0] = ans->status; + ans->int_v[0] = ans->status; // just a return value of status } SEXP 
testMsgR(SEXP status, SEXP x, SEXP k) { if (!isInteger(status) || !isInteger(x) || !isInteger(k)) @@ -57,7 +64,7 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k) { const bool verbose = GetVerbose(); int istatus = INTEGER(status)[0], nx = INTEGER(x)[0], nk = INTEGER(k)[0]; - // TODO below chunk into allocansList helper, not for 1.12.4 + // TODO below chunk into allocAnsList helper - not easy for variable length of inner vectors SEXP ans = PROTECT(allocVector(VECSXP, nk * nx)); protecti++; ans_t *vans = (ans_t *)R_alloc(nx*nk, sizeof(ans_t)); if (verbose) @@ -76,7 +83,7 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k) { } } - ansMsg(vans, nx*nk, verbose, __func__); + ansGetMsgs(vans, nx*nk, verbose, __func__); UNPROTECT(protecti); return ans; } diff --git a/src/utils.c b/src/utils.c index e5e343ac9f..9d6f5d7592 100644 --- a/src/utils.c +++ b/src/utils.c @@ -348,7 +348,7 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { if (!isNull(getAttrib(x, R_DimSymbol))) error(_("'x' must not be matrix or array")); if (!isNull(getAttrib(as, R_DimSymbol))) - error(_("input must not be matrix or array")); + error(_("'as' must not be matrix or array")); bool verbose = GetVerbose()>=2; // verbose level 2 required if (!LOGICAL(copyArg)[0] && TYPEOF(x)==TYPEOF(as) && class1(x)==class1(as)) { if (verbose) @@ -370,7 +370,7 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { #ifndef NOZLIB #include #endif -SEXP dt_zlib_version(void) { +SEXP dt_zlib_version() { char out[71]; #ifndef NOZLIB snprintf(out, 70, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); @@ -379,13 +379,6 @@ SEXP dt_zlib_version(void) { #endif return ScalarString(mkChar(out)); } -SEXP dt_has_zlib(void) { -#ifndef NOZLIB - return ScalarLogical(1); -#else - return ScalarLogical(0); -#endif -} SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { // for is_url in fread.R added in #5097 diff --git a/tests/knitr.Rout.mock b/tests/knitr.Rout.mock index ea37b2c465..1f17724c81 100644 --- a/tests/knitr.Rout.mock 
+++ b/tests/knitr.Rout.mock @@ -8,11 +8,10 @@ DT # yes ``` ``` -## x y -## -## 1: 1 4 -## 2: 2 5 -## 3: 3 6 +## x y +## 1: 1 4 +## 2: 2 5 +## 3: 3 6 ``` ```r @@ -21,11 +20,10 @@ print(DT[, z := 10:12]) # yes ``` ``` -## x y z -## -## 1: 1 4 10 -## 2: 2 5 11 -## 3: 3 6 12 +## x y z +## 1: 1 4 10 +## 2: 2 5 11 +## 3: 3 6 12 ``` ```r @@ -34,11 +32,10 @@ DT # yes ``` ``` -## x y z a -## -## 1: 1 4 10 1 -## 2: 2 5 11 1 -## 3: 3 6 12 1 +## x y z a +## 1: 1 4 10 1 +## 2: 2 5 11 1 +## 3: 3 6 12 1 ``` Some text. diff --git a/vignettes/css/bootstrap.css b/vignettes/css/bootstrap.css new file mode 100644 index 0000000000..1453f27bf9 --- /dev/null +++ b/vignettes/css/bootstrap.css @@ -0,0 +1,118 @@ +code, +kbd, +pre, +samp { + font-family: Source Code Pro, Inconsolata, Monaco, Consolas, Menlo, Courier New, monospace; +} + +code { + padding: 0px 2px; + font-size: 90%; + color: #c7254e; + white-space: nowrap; + background-color: #f9f2f4; + border-radius: 3px; + border: 0px; +} + +pre { + display: block; + padding: 9.5px; + margin: 0 0 10px; + font-size: 14px; + line-height: 1.428571429; + color: #c7254e; + background-color: #f9f2f4 + word-break: break-all; + word-wrap: break-word; + border: 0px ; + border-radius: 3px; + /*background-color: #FDF6E3;*/ + /*background-color: #f5f5f5; */ + /*border: 1px solid #FDF6E3;*/ +} + +pre code { + padding: 0; + font-size: inherit; + color: inherit; + white-space: pre-wrap; + background-color: transparent; + border-radius: 0; +} + +.bs-callout { + margin:20px 0; + padding:20px; + border-left:3px solid #eee +} + +.bs-callout h4 { + margin-top:0; + margin-bottom:5px +} + +.bs-callout p:last-child { + margin-bottom:0 +} + +.bs-callout code { + background-color:#fff; + border-radius:3px +} + +.bs-callout pre code { + background-color:transparent; + border-radius:3px +} + +.bs-callout-danger { + background-color:#fdf7f7; + border-color:#d9534f +} + +.bs-callout-danger h4 { + color:#d9534f +} + +.bs-callout-warning { + background-color:#fcf8f2; + 
border-color:#f0ad4e +} + +.bs-callout-warning h4 { + color:#f0ad4e +} + +.bs-callout-info { + background-color:#f4f8fa; + border-color:#5bc0de +} + +.bs-callout-info h4 { + color:#5bc0de +} + +// KeyWordTok +.sourceCode .kw { color: #268BD2; } +// DataTypeTok +.sourceCode .dt { color: #268BD2; } + +// DecValTok (decimal value), BaseNTok, FloatTok +.sourceCode .dv, .sourceCode .bn, .sourceCode .fl { color: #D33682; } +// CharTok +.sourceCode .ch { color: #DC322F; } +// StringTok +.sourceCode .st { color: #2AA198; } +// CommentTok +.sourceCode .co { color: #93A1A1; } +// OtherTok +.sourceCode .ot { color: #A57800; } +// AlertTok +.sourceCode .al { color: #CB4B16; font-weight: bold; } +// FunctionTok +.sourceCode .fu { color: #268BD2; } +// RegionMarkerTok +.sourceCode .re { } +// ErrorTok +.sourceCode .er { color: #D30102; font-weight: bold; } diff --git a/vignettes/css/toc.css b/vignettes/css/toc.css deleted file mode 100644 index 86adaba5b1..0000000000 --- a/vignettes/css/toc.css +++ /dev/null @@ -1,6 +0,0 @@ -#TOC { - border: 1px solid #ccc; - border-radius: 5px; - padding-left: 1em; - background: #f6f6f6; -} diff --git a/vignettes/datatable-benchmarking.Rmd b/vignettes/datatable-benchmarking.Rmd index da580764b8..7614a27d54 100644 --- a/vignettes/datatable-benchmarking.Rmd +++ b/vignettes/datatable-benchmarking.Rmd @@ -2,24 +2,15 @@ title: "Benchmarking data.table" date: "`r Sys.Date()`" output: - markdown::html_format: - options: - toc: true - number_sections: true - meta: - css: [default, css/toc.css] + rmarkdown::html_vignette: + toc: true + number_sections: true vignette: > %\VignetteIndexEntry{Benchmarking data.table} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- - - This document is meant to guide on measuring performance of `data.table`. Single place to document best practices and traps to avoid. 
# fread: clear caches diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index a2de14a2f6..4b0645e6b6 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -2,15 +2,12 @@ title: "Frequently Asked Questions about data.table" date: "`r Sys.Date()`" output: - markdown::html_format: - options: - toc: true - number_sections: true - meta: - css: [default, css/toc.css] + rmarkdown::html_vignette: + toc: true + number_sections: true vignette: > %\VignetteIndexEntry{Frequently Asked Questions about data.table} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -29,7 +26,6 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) -.old.th = setDTthreads(1) ``` The first section, Beginner FAQs, is intended to be read in order, from start to finish. It's just written in a FAQ style to be digested more easily. It isn't really the most frequently asked questions. A better measure for that is looking on Stack Overflow. @@ -98,13 +94,13 @@ As [highlighted above](#j-num), `j` in `[.data.table` is fundamentally different Furthermore, data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table can be passed to any package that only accepts `data.frame` and that package can use `[.data.frame` syntax on the data.table. See [this answer](https://stackoverflow.com/a/10529888/403310) for how that is achieved. -We _have_ proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0: +We _have_ proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0 : > `unique()` and `match()` are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c. 
A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://stat.ethz.ch/pipermail/r-devel/2010-April/057249.html). -A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0: +A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 : > The radix sort algorithm and implementation from data.table (forder) replaces the previous radix (counting) sort and adds a new method for order(). Contributed by Matt Dowle and Arun Srinivasan, the new algorithm supports logical, integer (even with large values), real, and character vectors. It outperforms all other methods, but there are some caveats (see ?sort). @@ -240,7 +236,7 @@ Then you are using a version prior to 1.5.3. Prior to 1.5.3 `[.data.table` detec ## What are the scoping rules for `j` expressions? -Think of the subset as an environment where all the column names are variables. When a variable `foo` is used in the `j` of a query such as `X[Y, sum(foo)]`, `foo` is looked for in the following order: +Think of the subset as an environment where all the column names are variables. When a variable `foo` is used in the `j` of a query such as `X[Y, sum(foo)]`, `foo` is looked for in the following order : 1. The scope of `X`'s subset; _i.e._, `X`'s column names. 2. The scope of each row of `Y`; _i.e._, `Y`'s column names (_join inherited scope_) @@ -299,18 +295,18 @@ The `Z[Y]` part is not a single name so that is evaluated within the frame of `X ## Can you explain further why data.table is inspired by `A[B]` syntax in `base`? 
-Consider `A[B]` syntax using an example matrix `A`: +Consider `A[B]` syntax using an example matrix `A` : ```{r} A = matrix(1:12, nrow = 4) A ``` -To obtain cells `(1, 2) = 5` and `(3, 3) = 11` many users (we believe) may try this first: +To obtain cells `(1, 2) = 5` and `(3, 3) = 11` many users (we believe) may try this first : ```{r} A[c(1, 3), c(2, 3)] ``` -However, this returns the union of those rows and columns. To reference the cells, a 2-column matrix is required. `?Extract` says: +However, this returns the union of those rows and columns. To reference the cells, a 2-column matrix is required. `?Extract` says : > When indexing arrays by `[` a single argument `i` can be a matrix with as many columns as there are dimensions of `x`; the result is then a vector with elements corresponding to the sets of indices in each row of `i`. @@ -358,7 +354,7 @@ Furthermore, matrices, especially sparse matrices, are often stored in a 3-colum data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table _can_ be passed to any package that _only_ accepts `data.frame`. When that package uses `[.data.frame` syntax on the data.table, it works. It works because `[.data.table` looks to see where it was called from. If it was called from such a package, `[.data.table` diverts to `[.data.frame`. ## I've heard that data.table syntax is analogous to SQL. -Yes: +Yes : - `i` $\Leftrightarrow$ where - `j` $\Leftrightarrow$ select @@ -371,7 +367,7 @@ Yes: - `mult = "first"|"last"` $\Leftrightarrow$ N/A because SQL is inherently unordered - `roll = TRUE` $\Leftrightarrow$ N/A because SQL is inherently unordered -The general form is: +The general form is : ```{r, eval = FALSE} DT[where, select|update, group by][order by][...] ... [...] @@ -451,7 +447,7 @@ Many thanks to the R core team for fixing the issue in Sep 2019. data.table v1.1 This comes up quite a lot but it's really earth-shatteringly simple. 
A function such as `merge` is _generic_ if it consists of a call to `UseMethod`. When you see people talking about whether or not functions are _generic_ functions they are merely typing the function without `()` afterwards, looking at the program code inside it and if they see a call to `UseMethod` then it is _generic_. What does `UseMethod` do? It literally slaps the function name together with the class of the first argument, separated by period (`.`) and then calls that function, passing along the same arguments. It's that simple. For example, `merge(X, Y)` contains a `UseMethod` call which means it then _dispatches_ (i.e. calls) `paste("merge", class(X), sep = ".")`. Functions with dots in their name may or may not be methods. The dot is irrelevant really, other than dot being the separator that `UseMethod` uses. Knowing this background should now highlight why, for example, it is obvious to R folk that `as.data.table.data.frame` is the `data.frame` method for the `as.data.table` generic function. Further, it may help to elucidate that, yes, you are correct, it is not obvious from its name alone that `ls.fit` is not the fit method of the `ls` generic function. You only know that by typing `ls` (not `ls()`) and observing it isn't a single call to `UseMethod`. -You might now ask: where is this documented in R? Answer: it's quite clear, but, you need to first know to look in `?UseMethod` and _that_ help file contains: +You might now ask: where is this documented in R? Answer: it's quite clear, but, you need to first know to look in `?UseMethod` and _that_ help file contains : > When a function calling `UseMethod('fun')` is applied to an object with class attribute `c('first', 'second')`, the system searches for a function called `fun.first` and, if it finds it, applies it to the object. If no such function is found a function called `fun.second` is tried. 
If no class name produces a suitable function, the function `fun.default` is used, if it exists, or an error results. @@ -485,7 +481,7 @@ copied in bulk (`memcpy` in C) rather than looping in C. ## What are primary and secondary indexes in data.table? Manual: [`?setkey`](https://www.rdocumentation.org/packages/data.table/functions/setkey) -S.O.: [What is the purpose of setting a key in data.table?](https://stackoverflow.com/questions/20039335/what-is-the-purpose-of-setting-a-key-in-data-table/20057411#20057411) +S.O. : [What is the purpose of setting a key in data.table?](https://stackoverflow.com/questions/20039335/what-is-the-purpose-of-setting-a-key-in-data-table/20057411#20057411) `setkey(DT, col1, col2)` orders the rows by column `col1` then within each group of `col1` it orders by `col2`. This is a _primary index_. The row order is changed _by reference_ in RAM. Subsequent joins and groups on those key columns then take advantage of the sort order for efficiency. (Imagine how difficult looking for a phone number in a printed telephone directory would be if it wasn't sorted by surname then forename. That's literally all `setkey` does. It sorts the rows by the columns you specify.) The index doesn't use any RAM. It simply changes the row order in RAM and marks the key columns. Analogous to a _clustered index_ in SQL. @@ -525,7 +521,7 @@ DT[ , { mySD = copy(.SD) Please upgrade to v1.8.1 or later. From this version, if `.N` is returned by `j` it is renamed to `N` to avoid any ambiguity in any subsequent grouping between the `.N` special variable and a column called `".N"`. 
-The old behaviour can be reproduced by forcing `.N` to be called `.N`, like this: +The old behaviour can be reproduced by forcing `.N` to be called `.N`, like this : ```{r} DT = data.table(a = c(1,1,2,2,2), b = c(1,2,2,2,1)) DT @@ -537,7 +533,7 @@ cat(try( If you are already running v1.8.1 or later then the error message is now more helpful than the "cannot change value of locked binding" error, as you can see above, since this vignette was produced using v1.8.1 or later. -The more natural syntax now works: +The more natural syntax now works : ```{r} if (packageVersion("data.table") >= "1.8.1") { DT[ , .N, by = list(a, b)][ , unique(N), by = a] @@ -559,7 +555,7 @@ Hopefully, this is self explanatory. The full message is: Coerced numeric RHS to integer to match the column's type; may have truncated precision. Either change the column to numeric first by creating a new numeric vector length 5 (nrows of entire table) yourself and assigning that (i.e. 'replace' column), or coerce RHS to integer yourself (e.g. 1L or as.integer) to make your intent clear (and for speed). Or, set the column type correctly up front when you create the table and stick to it, please. -To generate it, try: +To generate it, try : ```{r} DT = data.table(a = 1:5, b = 1:5) @@ -616,6 +612,3 @@ Sure. You're more likely to get a faster answer from the Issues page or Stack Ov Please see [this answer](https://stackoverflow.com/a/10529888/403310). 
-```{r, echo=FALSE} -setDTthreads(.old.th) -``` \ No newline at end of file diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index c37cd6f755..41a3d629ae 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -2,10 +2,10 @@ title: "Importing data.table" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Importing data.table} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 3624a7c5be..3a5eda34cd 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -2,10 +2,10 @@ title: "Introduction to data.table" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to data.table} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -18,7 +18,6 @@ knitr::opts_chunk$set( cache = FALSE, collapse = TRUE ) -.old.th = setDTthreads(1) ``` This vignette introduces the `data.table` syntax, its general form, how to *subset* rows, *select and compute* on columns, and perform aggregations *by group*. Familiarity with `data.frame` data structure from base R is useful, but not essential to follow this vignette. @@ -87,7 +86,7 @@ class(DT$ID) You can also convert existing objects to a `data.table` using `setDT()` (for `data.frame`s and `list`s) and `as.data.table()` (for other structures); the difference is beyond the scope of this vignette, see `?setDT` and `?as.data.table` for more details. -#### Note that: +#### Note that: {.bs-callout .bs-callout-info} * Row numbers are printed with a `:` in order to visually separate the row number from the first column. @@ -112,7 +111,7 @@ DT[i, j, by] Users who have an SQL background might perhaps immediately relate to this syntax. 
-#### The way to read it (out loud) is: +#### The way to read it (out loud) is: {.bs-callout .bs-callout-info} Take `DT`, subset/reorder rows using `i`, then calculate `j`, grouped by `by`. @@ -127,6 +126,8 @@ ans <- flights[origin == "JFK" & month == 6L] head(ans) ``` +#### {.bs-callout .bs-callout-info} + * Within the frame of a `data.table`, columns can be referred to *as if they are variables*, much like in SQL or Stata. Therefore, we simply refer to `origin` and `month` as if they are variables. We do not need to add the prefix `flights$` each time. Nevertheless, using `flights$origin` and `flights$month` would work just fine. * The *row indices* that satisfy the condition `origin == "JFK" & month == 6L` are computed, and since there is nothing else left to do, all columns from `flights` at rows corresponding to those *row indices* are simply returned as a `data.table`. @@ -139,6 +140,7 @@ head(ans) ans <- flights[1:2] ans ``` +#### {.bs-callout .bs-callout-info} * In this case, there is no condition. The row indices are already provided in `i`. We therefore return a `data.table` with all columns from `flights` at rows for those *row indices*. @@ -151,7 +153,7 @@ ans <- flights[order(origin, -dest)] head(ans) ``` -#### `order()` is internally optimised +#### `order()` is internally optimised {.bs-callout .bs-callout-info} * We can use "-" on a `character` column within the frame of a `data.table` to sort in decreasing order. @@ -168,6 +170,8 @@ ans <- flights[, arr_delay] head(ans) ``` +#### {.bs-callout .bs-callout-info} + * Since columns can be referred to as if they are variables within the frame of `data.table`s, we directly refer to the *variable* we want to subset. Since we want *all the rows*, we simply skip `i`. * It returns *all* the rows for the column `arr_delay`.
@@ -179,13 +183,15 @@ ans <- flights[, list(arr_delay)] head(ans) ``` +#### {.bs-callout .bs-callout-info} + * We wrap the *variables* (column names) within `list()`, which ensures that a `data.table` is returned. In case of a single column name, not wrapping with `list()` returns a vector instead, as seen in the [previous example](#select-j-1d). * `data.table` also allows wrapping columns with `.()` instead of `list()`. It is an *alias* to `list()`; they both mean the same. Feel free to use whichever you prefer; we have noticed most users seem to prefer `.()` for conciseness, so we will continue to use `.()` hereafter. `data.table`s (and `data.frame`s) are internally `list`s as well, with the stipulation that each element has the same length and the `list` has a `class` attribute. Allowing `j` to return a `list` enables converting and returning `data.table` very efficiently. -#### Tip: {#tip-1} +#### Tip: {.bs-callout .bs-callout-warning #tip-1} As long as `j-expression` returns a `list`, each element of the list will be converted to a column in the resulting `data.table`. This makes `j` quite powerful, as we will see shortly. It is also very important to understand this for when you'd like to make more complicated queries!! @@ -199,6 +205,8 @@ head(ans) # ans <- flights[, list(arr_delay, dep_delay)] ``` +#### {.bs-callout .bs-callout-info} + * Wrap both columns within `.()`, or `list()`. That's it. #### -- Select both `arr_delay` and `dep_delay` columns *and* rename them to `delay_arr` and `delay_dep`. @@ -221,7 +229,7 @@ ans <- flights[, sum( (arr_delay + dep_delay) < 0 )] ans ``` -#### What's happening here? +#### What's happening here? {.bs-callout .bs-callout-info} * `data.table`'s `j` can handle more than just *selecting columns* - it can handle *expressions*, i.e., *computing on columns*. This shouldn't be surprising, as columns can be referred to as if they are variables. Then we should be able to *compute* by calling functions on those variables. 
And that's what precisely happens here. @@ -235,6 +243,8 @@ ans <- flights[origin == "JFK" & month == 6L, ans ``` +#### {.bs-callout .bs-callout-info} + * We first subset in `i` to find matching *row indices* where `origin` airport equals `"JFK"`, and `month` equals `6L`. We *do not* subset the _entire_ `data.table` corresponding to those rows _yet_. * Now, we look at `j` and find that it uses only *two columns*. And what we have to do is to compute their `mean()`. Therefore we subset just those columns corresponding to the matching rows, and compute their `mean()`. @@ -252,7 +262,7 @@ The function `length()` requires an input argument. We just needed to compute th This type of operation occurs quite frequently, especially while grouping (as we will see in the next section), to the point where `data.table` provides a *special symbol* `.N` for it. -#### Special symbol `.N`: {#special-N} +#### Special symbol `.N`: {.bs-callout .bs-callout-info #special-N} `.N` is a special built-in variable that holds the number of observations _in the current group_. It is particularly useful when combined with `by` as we'll see in the next section. In the absence of group by operations, it simply returns the number of rows in the subset. @@ -263,6 +273,8 @@ ans <- flights[origin == "JFK" & month == 6L, .N] ans ``` +#### {.bs-callout .bs-callout-info} + * Once again, we subset in `i` to get the *row indices* where `origin` airport equals *"JFK"*, and `month` equals *6*. * We see that `j` uses only `.N` and no other columns. Therefore the entire subset is not materialised. We simply return the number of rows in the subset (which is just the length of row indices). @@ -360,6 +372,8 @@ ans # ans <- flights[, .(.N), by = "origin"] ``` +#### {.bs-callout .bs-callout-info} + * We know `.N` [is a special variable](#special-N) that holds the number of rows in the current group. Grouping by `origin` obtains the number of rows, `.N`, for each group. 
* By doing `head(flights)` you can see that the origin airports occur in the order *"JFK"*, *"LGA"* and *"EWR"*. The original order of grouping variables is preserved in the result. _This is important to keep in mind!_ @@ -386,6 +400,8 @@ ans <- flights[carrier == "AA", .N, by = origin] ans ``` +#### {.bs-callout .bs-callout-info} + * We first obtain the row indices for the expression `carrier == "AA"` from `i`. * Using those *row indices*, we obtain the number of rows while grouped by `origin`. Once again no columns are actually materialised here, because the `j-expression` does not require any columns to be actually subsetted and is therefore fast and memory efficient. @@ -400,6 +416,8 @@ head(ans) # ans <- flights[carrier == "AA", .N, by = c("origin", "dest")] ``` +#### {.bs-callout .bs-callout-info} + * `by` accepts multiple columns. We just provide all the columns by which to group by. Note the use of `.()` again in `by` -- again, this is just shorthand for `list()`, and `list()` can be used here as well. Again, we'll stick with `.()` in this vignette. #### -- How can we get the average arrival and departure delay for each `orig,dest` pair for each month for carrier code `"AA"`? {#origin-dest-month} @@ -411,6 +429,8 @@ ans <- flights[carrier == "AA", ans ``` +#### {.bs-callout .bs-callout-info} + * Since we did not provide column names for the expressions in `j`, they were automatically generated as `V1` and `V2`. * Once again, note that the input order of grouping columns is preserved in the result. @@ -430,6 +450,8 @@ ans <- flights[carrier == "AA", ans ``` +#### {.bs-callout .bs-callout-info} + * All we did was to change `by` to `keyby`. This automatically orders the result by the grouping variables in increasing order. In fact, due to the internal implementation of `by` first requiring a sort before recovering the original table's order, `keyby` is typically faster than `by` because it doesn't require this second step. 
**Keys:** Actually `keyby` does a little more than *just ordering*. It also *sets a key* after ordering by setting an `attribute` called `sorted`. @@ -453,6 +475,8 @@ ans <- ans[order(origin, -dest)] head(ans) ``` +#### {.bs-callout .bs-callout-info} + * Recall that we can use `-` on a `character` column in `order()` within the frame of a `data.table`. This is possible due to `data.table`'s internal query optimisation. * Also recall that `order(...)` within the frame of a `data.table` is *automatically optimised* to use `data.table`'s internal fast radix order `forder()` for speed. @@ -464,6 +488,8 @@ ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)] head(ans, 10) ``` +#### {.bs-callout .bs-callout-info} + * We can tack expressions one after another, *forming a chain* of operations, i.e., `DT[ ... ][ ... ][ ... ]`. * Or you can also chain them vertically: @@ -486,6 +512,8 @@ ans <- flights[, .N, .(dep_delay>0, arr_delay>0)] ans ``` +#### {.bs-callout .bs-callout-info} + * The last row corresponds to `dep_delay > 0 = TRUE` and `arr_delay > 0 = FALSE`. We can see that `r flights[!is.na(arr_delay) & !is.na(dep_delay), .N, .(dep_delay>0, arr_delay>0)][, N[4L]]` flights started late but arrived early (or on time). * Note that we did not provide any names to `by-expression`. Therefore, names have been automatically assigned in the result. As with `j`, you can name these expressions as you would elements of any `list`, e.g. `DT[, .N, .(dep_delayed = dep_delay>0, arr_delayed = arr_delay>0)]`. @@ -500,7 +528,7 @@ It is of course not practical to have to type `mean(myCol)` for every column one How can we do this efficiently, concisely? To get there, refresh on [this tip](#tip-1) - *"As long as the `j`-expression returns a `list`, each element of the `list` will be converted to a column in the resulting `data.table`"*.
Suppose we can refer to the *data subset for each group* as a variable *while grouping*, then we can loop through all the columns of that variable using the already- or soon-to-be-familiar base function `lapply()`. No new names to learn specific to `data.table`. -#### Special symbol `.SD`: {#special-SD} +#### Special symbol `.SD`: {.bs-callout .bs-callout-info #special-SD} `data.table` provides a *special* symbol, called `.SD`. It stands for **S**ubset of **D**ata. It by itself is a `data.table` that holds the data for *the current group* defined using `by`. @@ -514,6 +542,8 @@ DT DT[, print(.SD), by = ID] ``` +#### {.bs-callout .bs-callout-info} + * `.SD` contains all the columns *except the grouping columns* by default. * It is also generated by preserving the original order - data corresponding to `ID = "b"`, then `ID = "a"`, and then `ID = "c"`. @@ -524,6 +554,8 @@ To compute on (multiple) columns, we can then simply use the base R function `la DT[, lapply(.SD, mean), by = ID] ``` +#### {.bs-callout .bs-callout-info} + * `.SD` holds the rows corresponding to columns `a`, `b` and `c` for that group. We compute the `mean()` on each of these columns using the already-familiar base function `lapply()`. * Each group returns a list of three elements containing the mean value which will become the columns of the resulting `data.table`. @@ -534,7 +566,7 @@ We are almost there. There is one little thing left to address. In our `flights` #### -- How can we specify just the columns we would like to compute the `mean()` on? -#### .SDcols +#### .SDcols {.bs-callout .bs-callout-info} Using the argument `.SDcols`. It accepts either column names or column indices. For example, `.SDcols = c("arr_delay", "dep_delay")` ensures that `.SD` contains only these two columns for each group. @@ -558,6 +590,8 @@ ans <- flights[, head(.SD, 2), by = month] head(ans) ``` +#### {.bs-callout .bs-callout-info} + * `.SD` is a `data.table` that holds all the rows for *that group*. 
We simply subset the first two rows as we have seen [here](#subset-rows-integer) already. * For each group, `head(.SD, 2)` returns the first two rows as a `data.table`, which is also a `list`, so we do not have to wrap it with `.()`. @@ -572,6 +606,8 @@ So that we have a consistent syntax and keep using already existing (and familia DT[, .(val = c(a,b)), by = ID] ``` +#### {.bs-callout .bs-callout-info} + * That's it. There is no special syntax required. All we need to know is the base function `c()` which concatenates vectors and [the tip from before](#tip-1). #### -- What if we would like to have all the values of column `a` and `b` concatenated, but returned as a list column? @@ -580,6 +616,8 @@ DT[, .(val = c(a,b)), by = ID] DT[, .(val = list(c(a,b))), by = ID] ``` +#### {.bs-callout .bs-callout-info} + * Here, we first concatenate the values with `c(a,b)` for each group, and wrap that with `list()`. So for each group, we return a list of all concatenated values. * Note those commas are for display only. A list column can contain any object in each cell, and in this example, each cell is itself a vector and some cells contain longer vectors than others. @@ -608,7 +646,7 @@ DT[i, j, by] We have seen so far that, -#### Using `i`: +#### Using `i`: {.bs-callout .bs-callout-info} * We can subset rows similar to a `data.frame`- except you don't have to use `DT$` repetitively since columns within the frame of a `data.table` are seen as if they are *variables*. @@ -616,7 +654,7 @@ We have seen so far that, We can do much more in `i` by keying a `data.table`, which allows blazing fast subsets and joins. We will see this in the *"Keys and fast binary search based subsets"* and *"Joins and rolling joins"* vignette. -#### Using `j`: +#### Using `j`: {.bs-callout .bs-callout-info} 1. Select columns the `data.table` way: `DT[, .(colA, colB)]`. @@ -628,7 +666,7 @@ We can do much more in `i` by keying a `data.table`, which allows blazing fast s 5. 
Combine with `i`: `DT[colA > value, sum(colB)]`. -#### Using `by`: +#### Using `by`: {.bs-callout .bs-callout-info} * Using `by`, we can group by columns by specifying a *list of columns* or a *character vector of column names* or even *expressions*. The flexibility of `j`, combined with `by` and `i` makes for a very powerful syntax. @@ -644,7 +682,7 @@ We can do much more in `i` by keying a `data.table`, which allows blazing fast s 3. `DT[col > val, head(.SD, 1), by = ...]` - combine `i` along with `j` and `by`. -#### And remember the tip: +#### And remember the tip: {.bs-callout .bs-callout-warning} As long as `j` returns a `list`, each element of the list will become a column in the resulting `data.table`. @@ -652,6 +690,3 @@ We will see how to *add/update/delete* columns *by reference* and how to combine *** -```{r, echo=FALSE} -setDTthreads(.old.th) -``` \ No newline at end of file diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index e73b71b929..465052d941 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -2,10 +2,10 @@ title: "Keys and fast binary search based subset" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Keys and fast binary search based subset} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -17,7 +17,6 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) -.old.th = setDTthreads(1) ``` This vignette is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, add/modify/delete columns *by reference* in `j` and group by using `by`. If you're not familiar with these concepts, please read the *"Introduction to data.table"* and *"Reference semantics"* vignettes first. 
@@ -495,8 +494,3 @@ In this vignette, we have learnt another method to subset rows in `i` by keying * combine key based subsets with `j` and `by`. Note that the `j` and `by` operations are exactly the same as before. Key based subsets are **incredibly fast** and are particularly useful when the task involves *repeated subsetting*. But it may not be always desirable to set key and physically reorder the *data.table*. In the next vignette, we will address this using a *new* feature -- *secondary indexes*. - - -```{r, echo=FALSE} -setDTthreads(.old.th) -``` \ No newline at end of file diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 89d1292012..46008e7045 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -2,10 +2,10 @@ title: "Programming on data.table" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Programming on data.table} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -25,16 +25,8 @@ knitr::opts_chunk$set( `data.table`, from its very first releases, enabled the usage of `subset` and `with` (or `within`) functions by defining the `[.data.table` method. `subset` and `with` are base R functions that are useful for reducing repetition in code, enhancing readability, and reducing the total number of characters the user has to type. This functionality is possible in R because of a quite unique feature called *lazy evaluation*. This feature allows a function to catch its arguments, before they are evaluated, and to evaluate them in a different scope than the one in which they were called. Let's recap usage of the `subset` function. -```{r df_print, echo=FALSE} -registerS3method("print", "data.frame", function(x, ...) { - base::print.data.frame(head(x, 2L), ...)
- cat("...\n") - invisible(x) -}) -.opts = options( - datatable.print.topn=2L, - datatable.print.nrows=20L -) +```{r opt_max_print_10, include = FALSE} +options(max.print = 10L) # 2 rows ``` ```{r subset} @@ -110,7 +102,7 @@ my_subset = function(data, col, val) { my_subset(iris, Species, "setosa") ``` -Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cran.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. +Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. 
#### Use third party packages @@ -122,7 +114,7 @@ Though these can be helpful, we will be discussing a `data.table`-unique approac Now that we've established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main subject of this vignette, *programming on data.table*. -Starting from version 1.15.0, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). +Starting from version 1.14.2, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). ### Substituting variables and names @@ -157,18 +149,20 @@ Now, to use substitution inside `[.data.table`, we don't need to call the `subst Let's use the `iris` data set as a demonstration. Just as an example, let's pretend we want to compute the `Sepal.Hypotenuse`, treating the sepal width and length as if they were legs of a right triangle. 
+```{r opt_max_print_8, include = FALSE} +options(max.print = 8L) # 2 rows +``` + ```{r hypotenuse_datatable} DT = as.data.table(iris) -str( - DT[, outer(inner(var1) + inner(var2)), - env = list( - outer = "sqrt", - inner = "square", - var1 = "Sepal.Length", - var2 = "Sepal.Width" - )] -) +DT[, outer(inner(var1) + inner(var2)), + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width" + )] # return as a data.table DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), @@ -203,7 +197,7 @@ DT[filter_col %in% filter_val, ### Substitute variables and character values -In the above example, we have seen a convenient feature of `substitute2`: automatic conversion from strings into names/symbols. An obvious question arises: what if we actually want to substitute a parameter with a *character* value, so as to have base R `substitute` behaviour. We provide a mechanism to escape automatic conversion by wrapping the elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing its arguments from character-to-symbol automatic conversion. (Read the `?AsIs` documentation for more details.) If base R behaviour is desired for the whole `env` argument, then it's best to wrap the whole argument in `I()`. Alternatively, each list element can be wrapped in `I()` individually. Let's explore both cases below. +In the above example, we have seen a convenient feature of `substitute2`: automatic conversion from strings into names/symbols. An obvious question arises: what if we actually want to substitute a parameter with a *character* value, so as to have base R `substitute` behaviour. We provide a mechanism to escape automatic conversion by wrapping the elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing its arguments from substitution. (Read the `?AsIs` documentation for more details.) 
If base R behaviour is desired for the whole `env` argument, then it's best to wrap the whole argument in `I()`. Alternatively, each list element can be wrapped in `I()` individually. Let's explore both cases below. ```{r rank} substitute( # base R behaviour @@ -241,6 +235,10 @@ The example presented above illustrates a neat and powerful way to make your cod An obvious use case could be to mimic `.SD` functionality by injecting a `list` call into the `j` argument. +```{r opt_max_print_4, include = FALSE} +options(max.print = 4L) # 2 rows +``` + ```{r splice_sd} cols = c("Sepal.Length", "Sepal.Width") DT[, .SD, .SDcols = cols] @@ -253,7 +251,7 @@ DT[, list(Sepal.Length, Sepal.Width)] ``` *Splicing* is an operation where a list of objects have to be inlined into an expression as a sequence of arguments to call. -In base R, splicing `cols` into a `list` can be achieved using `as.call(c(quote(list), lapply(cols, as.name)))`. Additionally, starting from R 4.0.0, there is new interface for such an operation in the `bquote` function. +In base R, splicing `cols` into a `list` can be achieved using `as.call(c(quote(list), cols))`. Additionally, starting from R 4.0.0, there is new interface for such an operation in the `bquote` function. In data.table, we make it easier by automatically _enlist_-ing a list of objects into a list call with those objects. This means that any `list` object inside the `env` list argument will be turned into list `call`, making the API for that use case as simple as presented below. @@ -318,6 +316,10 @@ It takes arbitrary number of variables on input, but now we cannot just *splice* First, we have to construct calls to the `square` function for each of the variables (see `inner_calls`). Then, we have to reduce the list of calls into a single call, having a nested sequence of `+` calls (see `add_calls`). Lastly, we have to substitute the constructed call into the surrounding expression (see `rms`). 
+```{r opt_max_print_12, include = FALSE} +options(max.print = 12L) # 2 rows +``` + ```{r complex} outer = "sqrt" inner = "square" @@ -342,19 +344,15 @@ rms = substitute2( ) print(rms) -str( - DT[, j, env = list(j = rms)] -) +DT[, j, env = list(j = rms)] # same, but skipping last substitute2 call and using add_calls directly -str( - DT[, outer((add_calls) / len), - env = list( - outer = outer, - add_calls = add_calls, - len = length(vars) - )] -) +DT[, outer((add_calls) / len), + env = list( + outer = outer, + add_calls = add_calls, + len = length(vars) + )] # return as data.table j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) @@ -379,6 +377,10 @@ In `[.data.table`, it is also possible to use other mechanisms for variable subs ### `get` +```{r opt_max_print_4b, include = FALSE} +options(max.print = 4L) # 2 rows +``` + ```{r old_get} v1 = "Petal.Width" v2 = "Sepal.Width" @@ -416,8 +418,3 @@ DT[, eval(cl)] DT[, cl, env = list(cl = cl)] ``` - -```{r cleanup, echo=FALSE} -options(.opts) -registerS3method("print", "data.frame", base::print.data.frame) -``` \ No newline at end of file diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 7a9990ba40..33da89bb92 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -2,10 +2,10 @@ title: "Reference semantics" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Reference semantics} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -17,7 +17,6 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) -.old.th = setDTthreads(1) ``` This vignette discusses *data.table*'s reference semantics which allows to *add/update/delete* columns of a *data.table by reference*, and also combine them with `i` and `by`. 
It is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, and perform aggregations by group. If you're not familiar with these concepts, please read the *"Introduction to data.table"* vignette first. @@ -72,7 +71,7 @@ both (1) and (2) resulted in deep copy of the entire data.frame in versions of ` Great performance improvements were made in `R v3.1` as a result of which only a *shallow* copy is made for (1) and not *deep* copy. However, for (2) still, the entire column is *deep* copied even in `R v3.1+`. This means the more columns one subassigns to in the *same query*, the more *deep* copies R does. -#### *shallow* vs *deep* copy +#### *shallow* vs *deep* copy {.bs-callout .bs-callout-info} A *shallow* copy is just a copy of the vector of column pointers (corresponding to the columns in a *data.frame* or *data.table*). The actual data is not physically copied in memory. @@ -87,27 +86,31 @@ It can be used in `j` in two ways: (a) The `LHS := RHS` form -```{r eval = FALSE} -DT[, c("colA", "colB", ...) := list(valA, valB, ...)] + ```{r eval = FALSE} + DT[, c("colA", "colB", ...) := list(valA, valB, ...)] -# when you have only one column to assign to you -# can drop the quotes and list(), for convenience -DT[, colA := valA] -``` + # when you have only one column to assign to you + # can drop the quotes and list(), for convenience + DT[, colA := valA] + ``` (b) The functional form -```{r eval = FALSE} -DT[, `:=`(colA = valA, # valA is assigned to colA - colB = valB, # valB is assigned to colB - ... -)] -``` + ```{r eval = FALSE} + DT[, `:=`(colA = valA, # valA is assigned to colA + colB = valB, # valB is assigned to colB + ... + )] + ``` + +#### {.bs-callout .bs-callout-warning} Note that the code above explains how `:=` can be used. They are not working examples. We will start using them on `flights` *data.table* from the next section. 
# +#### {.bs-callout .bs-callout-info} + * In (a), `LHS` takes a character vector of column names and `RHS` a *list of values*. `RHS` just needs to be a `list`, irrespective of how its generated (e.g., using `lapply()`, `list()`, `mget()`, `mapply()` etc.). This form is usually easy to program with and is particularly useful when you don't know the columns to assign values to in advance. * On the other hand, (b) is handy if you would like to jot some comments down for later. @@ -137,7 +140,7 @@ head(flights) # flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)] ``` -#### Note that +#### Note that {.bs-callout .bs-callout-info} * We did not have to assign the result back to `flights`. @@ -163,6 +166,8 @@ We see that there are totally `25` unique values in the data. Both *0* and *24* flights[hour == 24L, hour := 0L] ``` +#### {.bs-callout .bs-callout-info} + * We can use `i` along with `:=` in `j` the very same way as we have already seen in the *"Introduction to data.table"* vignette. * Column `hour` is replaced with `0` only on those *row indices* where the condition `hour == 24L` specified in `i` evaluates to `TRUE`. @@ -181,7 +186,7 @@ Let's look at all the `hours` to verify. flights[, sort(unique(hour))] ``` -#### Exercise: {#update-by-reference-question} +#### Exercise: {.bs-callout .bs-callout-warning #update-by-reference-question} What is the difference between `flights[hour == 24L, hour := 0L]` and `flights[hour == 24L][, hour := 0L]`? Hint: The latter needs an assignment (`<-`) if you would want to use the result later. @@ -199,7 +204,7 @@ head(flights) # flights[, `:=`(delay = NULL)] ``` -#### {#delete-convenience} +#### {.bs-callout .bs-callout-info #delete-convenience} * Assigning `NULL` to a column *deletes* that column. And it happens *instantly*. 
@@ -224,6 +229,8 @@ flights[, max_speed := max(speed), by = .(origin, dest)] head(flights) ``` +#### {.bs-callout .bs-callout-info} + * We add a new column `max_speed` using the `:=` operator by reference. * We provide the columns to group by the same way as shown in the *Introduction to data.table* vignette. For each group, `max(speed)` is computed, which returns a single value. That value is recycled to fit the length of the group. Once again, no copies are being made at all. `flights` *data.table* is modified *in-place*. @@ -242,6 +249,7 @@ out_cols = c("max_dep_delay", "max_arr_delay") flights[, c(out_cols) := lapply(.SD, max), by = month, .SDcols = in_cols] head(flights) ``` +#### {.bs-callout .bs-callout-info} * We use the `LHS := RHS` form. We store the input column names and the new columns to add in separate variables and provide them to `.SDcols` and for `LHS` (for better readability). @@ -275,6 +283,7 @@ ans = foo(flights) head(flights) head(ans) ``` +#### {.bs-callout .bs-callout-info} * Note that the new column `speed` has been added to `flights` *data.table*. This is because `:=` performs operations by reference. Since `DT` (the function argument) and `flights` refer to the same object in memory, modifying `DT` also reflects on `flights`. @@ -284,6 +293,8 @@ head(ans) In the previous section, we used `:=` for its side effect. But of course this may not be always desirable. Sometimes, we would like to pass a *data.table* object to a function, and might want to use the `:=` operator, but *wouldn't* want to update the original object. We can accomplish this using the function `copy()`. +#### {.bs-callout .bs-callout-info} + The `copy()` function *deep* copies the input object and therefore any subsequent update by reference operations performed on the copied object will not affect the original object. 
# @@ -310,6 +321,8 @@ There are two particular places where `copy()` function is essential: head(ans) ``` +#### {.bs-callout .bs-callout-info} + * Using `copy()` function did not update `flights` *data.table* by reference. It doesn't contain the column `speed`. * And `ans` contains the maximum speed corresponding to each month. @@ -341,7 +354,7 @@ However we could improve this functionality further by *shallow* copying instead ## Summary -#### The `:=` operator +#### The `:=` operator {.bs-callout .bs-callout-info} * It is used to *add/update/delete* columns by reference. @@ -349,10 +362,6 @@ However we could improve this functionality further by *shallow* copying instead * We can use `:=` for its side effect or use `copy()` to not modify the original object while updating by reference. -```{r, echo=FALSE} -setDTthreads(.old.th) -``` - # So far we have seen a whole lot in `j`, and how to combine it with `by` and little of `i`. Let's turn our attention back to `i` in the next vignette *"Keys and fast binary search based subset"* to perform *blazing fast subsets* by *keying data.tables*. diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index d282bc7de3..3f94392fc6 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -2,10 +2,10 @@ title: "Efficient reshaping using data.tables" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Efficient reshaping using data.tables} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -17,7 +17,6 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) -.old.th = setDTthreads(1) ``` This vignette discusses the default usage of reshaping functions `melt` (wide to long) and `dcast` (long to wide) for *data.tables* as well as the **new extended functionalities** of melting and casting on *multiple columns* available from `v1.9.6`. 
@@ -78,6 +77,8 @@ DT.m1 str(DT.m1) ``` +#### {.bs-callout .bs-callout-info} + * `measure.vars` specify the set of columns we would like to collapse (or combine) together. * We can also specify column *indices* instead of *names*. @@ -97,6 +98,8 @@ DT.m1 = melt(DT, measure.vars = c("dob_child1", "dob_child2", "dob_child3"), DT.m1 ``` +#### {.bs-callout .bs-callout-info} + * By default, when one of `id.vars` or `measure.vars` is missing, the rest of the columns are *automatically assigned* to the missing argument. * When neither `id.vars` nor `measure.vars` are specified, as mentioned under `?melt`, all *non*-`numeric`, `integer`, `logical` columns will be assigned to `id.vars`. @@ -115,6 +118,8 @@ That is, we'd like to collect all *child* observations corresponding to each `fa dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob") ``` +#### {.bs-callout .bs-callout-info} + * `dcast` uses *formula* interface. The variables on the *LHS* of formula represents the *id* vars and *RHS* the *measure* vars. * `value.var` denotes the column to be filled in with while casting to wide format. @@ -160,7 +165,7 @@ DT.c1 str(DT.c1) ## gender column is character type now! ``` -#### Issues +#### Issues {.bs-callout .bs-callout-info} 1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. Instead we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient). @@ -193,6 +198,8 @@ DT.m2 str(DT.m2) ## col type is preserved ``` +#### {.bs-callout .bs-callout-info} + * We can remove the `variable` column if necessary. * The functionality is implemented entirely in C, and is therefore both *fast* and *memory efficient* in addition to being *straightforward*. 
@@ -203,7 +210,7 @@ Usually in these problems, the columns we'd like to melt can be distinguished by ```{r} DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "gender")) -DT.m2 +print(DT.m2, class=TRUE) ``` #### - Using `measure()` to specify `measure.vars` via separator or pattern @@ -253,7 +260,7 @@ is used to convert the `child` string values to integers: ```{r} DT.m3 = melt(DT, measure = measure(value.name, child=as.integer, sep="_child")) -DT.m3 +print(DT.m3, class=TRUE) ``` In the code above we used `sep="_child"` which results in melting only @@ -281,12 +288,12 @@ groups, two numeric output columns, and an anonymous type conversion function, ```{r} -melt(who, measure.vars = measure( +print(melt(who, measure.vars = measure( diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))" -)) +)), class=TRUE) ``` ### b) Enhanced `dcast` @@ -305,20 +312,18 @@ DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "ge DT.c2 ``` +#### {.bs-callout .bs-callout-info} + * Attributes are preserved in result wherever possible. * Everything is taken care of internally, and efficiently. In addition to being fast, it is also very memory efficient. # -#### Multiple functions to `fun.aggregate`: +#### Multiple functions to `fun.aggregate`: {.bs-callout .bs-callout-info} You can also provide *multiple functions* to `fun.aggregate` to `dcast` for *data.tables*. Check the examples in `?dcast` which illustrates this functionality. 
-```{r, echo=FALSE} -setDTthreads(.old.th) -``` - # *** diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index ae0b5a84ac..fda2c4751f 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -2,15 +2,12 @@ title: "Using .SD for Data Analysis" date: "`r Sys.Date()`" output: - markdown::html_format: - options: - toc: true - number_sections: true - meta: - css: [default, css/toc.css] + rmarkdown::html_vignette: + toc: true + number_sections: true vignette: > %\VignetteIndexEntry{Using .SD for Data Analysis} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -25,7 +22,6 @@ knitr::opts_chunk$set( out.width = '100%', dpi = 144 ) -.old.th = setDTthreads(1) ``` This vignette will explain the most common ways to use the `.SD` variable in your `data.table` analyses. It is an adaptation of [this answer](https://stackoverflow.com/a/47406952/3576984) given on StackOverflow. @@ -38,7 +34,7 @@ The simpler usage of `.SD` is for column subsetting (i.e., when `.SDcols` is spe ## Loading and Previewing Lahman Data -To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://github.com/cdalzell/Lahman). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. +To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](http://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. 
```{r download_lahman} load('Teams.RData') @@ -50,7 +46,7 @@ setDT(Pitching) Pitching ``` -Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://github.com/cdalzell/Lahman) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. +Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](http://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. # `.SD` on Ungrouped Data @@ -203,8 +199,7 @@ Note that the `x[y]` syntax returns `nrow(y)` values (i.e., it's a right join), Often, we'd like to perform some operation on our data _at the group level_. When we specify `by =` (or `keyby = `), the mental model for what happens when `data.table` processes `j` is to think of your `data.table` as being split into many component sub-`data.table`s, each of which corresponds to a single value of your `by` variable(s): -![Grouping, Illustrated](plots/grouping_illustration.png) - +![Grouping, Illustrated](plots/grouping_illustration.png 'A visual depiction of how grouping works. On the left is a grid. The first column is titled "ID COLUMN" with values the capital letters A through G, and the rest of the data is unlabelled, but is in a darker color and simply has "Data" written to indicate that's arbitrary. A right arrow shows how this data is split into groups. 
Each capital letter A through G has a grid on the right-hand side; the grid on the left has been subdivided to create that on the right.') In the case of grouping, `.SD` is multiple in nature -- it refers to _each_ of these sub-`data.table`s, _one-at-a-time_ (slightly more accurately, the scope of `.SD` is a single sub-`data.table`). This allows us to concisely express an operation that we'd like to perform on _each sub-`data.table`_ before the re-assembled result is returned to us. @@ -255,7 +250,3 @@ abline(v = overall_coef, lty = 2L, col = 'red') While there is indeed a fair amount of heterogeneity, there's a distinct concentration around the observed overall value. The above is just a short introduction of the power of `.SD` in facilitating beautiful, efficient code in `data.table`! - -```{r, echo=FALSE} -setDTthreads(.old.th) -``` \ No newline at end of file diff --git a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd index ff50ba97e5..ef506605c3 100644 --- a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd +++ b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd @@ -2,10 +2,10 @@ title: "Secondary indices and auto indexing" date: "`r Sys.Date()`" output: - markdown::html_format + rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Secondary indices and auto indexing} - %\VignetteEngine{knitr::knitr} + %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -17,7 +17,6 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) -.old.th = setDTthreads(1) ``` This vignette assumes that the reader is familiar with data.table's `[i, j, by]` syntax, and how to perform fast key based subsets. If you're not familiar with these concepts, please read the *"Introduction to data.table"*, *"Reference semantics"* and *"Keys and fast binary search based subset"* vignettes first. 
@@ -106,7 +105,7 @@ setkey(flights, origin) flights["JFK"] # or flights[.("JFK")] ``` -#### `setkey()` requires: +#### `setkey()` requires: {.bs-callout .bs-callout-info} a) computing the order vector for the column(s) provided, here, `origin`, and @@ -140,7 +139,7 @@ Since there can be multiple secondary indices, and creating an index is as simpl As we will see in the next section, the `on` argument provides several advantages: -#### `on` argument +#### `on` argument {.bs-callout .bs-callout-info} * enables subsetting by computing secondary indices on the fly. This eliminates having to do `setindex()` every time. @@ -326,8 +325,3 @@ In recent version we extended auto indexing to expressions involving more than o We will discuss fast *subsets* using keys and secondary indices to *joins* in the next vignette, *"Joins and rolling joins"*. *** - -```{r, echo=FALSE} -setDTthreads(.old.th) -``` -