diff --git a/NEWS.md b/NEWS.md index 3face7519b..524e4e69d6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -338,6 +338,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix. +20. `forderv` could segfault on keys with long runs of identical bytes (e.g., many duplicate columns) because the single-group branch tail-recursed radix-by-radix until the C stack ran out, [#4300](https://github.com/Rdatatable/data.table/issues/4300). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies for the report and @ben-schwen for the fix. + ### NOTES 1. The following in-progress deprecations have proceeded: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 609977b991..18c28a8178 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21823,4 +21823,15 @@ test(2342.3, options = c(datatable.week = "sequential"), week(as.IDate(c("2019-1 test(2342.4, options = c(datatable.week = "sequential"), week(as.IDate(c("2020-12-31","2021-01-01"))), c(53L,1L)) test(2342.5, options = c(datatable.week = "sequential"), week(as.IDate("2021-01-06") + 0:6), c(1L,1L,2L,2L,2L,2L,2L)) test(2342.6, options = c(datatable.week = "sequential"), week(as.IDate(c("2016-02-27","2016-02-28","2016-02-29","2016-03-01","2016-03-02"))), c(9L,9L,9L,9L,9L)) -test(2342.7, options = c(datatable.week = "default"), week(as.IDate("1970-01-07")), 2L, warning = "The default behavior of week() is changing") \ No newline at end of file +test(2342.7, options = c(datatable.week = "default"), week(as.IDate("1970-01-07")), 2L, warning = "The default behavior of week() is changing") + +# forderv should not segfault on long single group keys due to recursion #4300 +N = 1e4 +set.seed(1) +idx = sort(sample(10, 20, TRUE)) +x = matrix(rnorm(N), nrow=10) +DT = as.data.table(x[idx,]) +DT[, V1000 := 20:1] +test(2343.1, forderv(DT, by=names(DT), sort=FALSE, retGrp=TRUE), forderv(DT, by=c("V1", "V1000"), sort=FALSE, retGrp=TRUE)) +x = c(rep(0, 7e5), 1e6) +test(2343.2, forderv(list(x)), integer(0)) diff --git a/src/forder.c b/src/forder.c index 302a14f2ce..a4564497b9 100644 --- a/src/forder.c +++ b/src/forder.c @@ -437,7 +437,7 @@ uint64_t dtwiddle(double x) //const void *p, int i) STOP(_("Unknown non-finite value; not NA, NaN, -Inf or +Inf")); // # nocov } -void radix_r(const int from, const int to, const int radix); +void radix_r(const int from, const int to, int radix); /* OpenMP is used here to parallelize multiple operations that come together to @@ -901,7 +901,8 @@ static bool sort_ugrp(uint8_t *x, const int n) return skip; } -void radix_r(const int from, const int to, const int radix) { +void radix_r(const int from, const int to, int radix) { + for (;;) { TBEG(); const int my_n = to-from+1; if (my_n==1) { // minor TODO: batch up the 1's instead in caller (and that's only needed when retgrp anyway) @@ -1025,6 +1026,11 @@ void radix_r(const int from, const int to, const int radix) { TEND(9) if (radix+1==nradix || ngrp==my_n) { // ngrp==my_n => unique groups all size 1 and we can stop recursing now push(my_gs, ngrp); + } else if (ngrp==1) { // ngrp == 1 => all same byte at this radix; go to next radix with same splits #4300 + // no need to push since gs stays the same + free(my_gs); + radix++; + continue; } else { for (int i=0, f=from; i256 and ngrp<=256 push(my_gs, ngrp); + } else if (ngrp==1) { // ngrp == 1 => all same byte at this radix; go to next radix with same splits #4300 + // no need to push since gs stays the same + free(my_gs); + radix++; + continue; } else { // this single thread will now descend and resolve all groups, now that the groups are close in cache for (int i=0, my_from=from; i