Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T

19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix.

20. `forderv` could segfault on keys with long runs of identical bytes (e.g., many duplicate columns) because the single-group branch tail-recursed radix-by-radix until the C stack ran out, [#4300](https://github.com/Rdatatable/data.table/issues/4300). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies for the report and @ben-schwen for the fix.

### NOTES

1. The following in-progress deprecations have proceeded:
Expand Down
13 changes: 12 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21823,4 +21823,15 @@ test(2342.3, options = c(datatable.week = "sequential"), week(as.IDate(c("2019-1
test(2342.4, options = c(datatable.week = "sequential"), week(as.IDate(c("2020-12-31","2021-01-01"))), c(53L,1L))
test(2342.5, options = c(datatable.week = "sequential"), week(as.IDate("2021-01-06") + 0:6), c(1L,1L,2L,2L,2L,2L,2L))
test(2342.6, options = c(datatable.week = "sequential"), week(as.IDate(c("2016-02-27","2016-02-28","2016-02-29","2016-03-01","2016-03-02"))), c(9L,9L,9L,9L,9L))
test(2342.7, options = c(datatable.week = "default"), week(as.IDate("1970-01-07")), 2L, warning = "The default behavior of week() is changing")
test(2342.7, options = c(datatable.week = "default"), week(as.IDate("1970-01-07")), 2L, warning = "The default behavior of week() is changing")

# forderv should not segfault on long single group keys due to recursion #4300
N = 1e4
set.seed(1)
idx = sort(sample(10, 20, TRUE))
x = matrix(rnorm(N), nrow=10)
DT = as.data.table(x[idx,])
DT[, V1000 := 20:1]
test(2343.1, forderv(DT, by=names(DT), sort=FALSE, retGrp=TRUE), forderv(DT, by=c("V1", "V1000"), sort=FALSE, retGrp=TRUE))
x = c(rep(0, 7e5), 1e6)
test(2343.2, forderv(list(x)), integer(0))
28 changes: 25 additions & 3 deletions src/forder.c
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ uint64_t dtwiddle(double x) //const void *p, int i)
STOP(_("Unknown non-finite value; not NA, NaN, -Inf or +Inf")); // # nocov
}

void radix_r(const int from, const int to, const int radix);
void radix_r(const int from, const int to, int radix);

/*
OpenMP is used here to parallelize multiple operations that come together to
Expand Down Expand Up @@ -901,7 +901,8 @@ static bool sort_ugrp(uint8_t *x, const int n)
return skip;
}

void radix_r(const int from, const int to, const int radix) {
void radix_r(const int from, const int to, int radix) {
for (;;) {
TBEG();
const int my_n = to-from+1;
if (my_n==1) { // minor TODO: batch up the 1's instead in caller (and that's only needed when retgrp anyway)
Expand Down Expand Up @@ -1025,6 +1026,11 @@ void radix_r(const int from, const int to, const int radix) {
TEND(9)
if (radix+1==nradix || ngrp==my_n) { // ngrp==my_n => unique groups all size 1 and we can stop recursing now
push(my_gs, ngrp);
} else if (ngrp==1) { // ngrp == 1 => all same byte at this radix; go to next radix with same splits #4300
// no need to push since gs stays the same
free(my_gs);
radix++;
continue;
} else {
for (int i=0, f=from; i<ngrp; i++) {
radix_r(f, f+my_gs[i]-1, radix+1);
Expand Down Expand Up @@ -1128,6 +1134,11 @@ void radix_r(const int from, const int to, const int radix) {
if (radix+1==nradix) {
// aside: cannot be all size 1 (a saving used in my_n<=256 case above) because my_n>256 and ngrp<=256
push(my_gs, ngrp);
} else if (ngrp==1) { // ngrp == 1 => all same byte at this radix; go to next radix with same splits #4300
// no need to push since gs stays the same
free(my_gs);
radix++;
continue;
} else {
// this single thread will now descend and resolve all groups, now that the groups are close in cache
for (int i=0, my_from=from; i<ngrp; i++) {
Expand Down Expand Up @@ -1313,6 +1324,16 @@ void radix_r(const int from, const int to, const int radix) {
push(my_gs, ngrp);
TEND(23)
}
else if (ngrp==1) {
// no need to push since gs stays the same
free(my_gs);
free(counts);
free(starts);
free(ugrps);
free(ngrps);
radix++;
continue;
}
else {
// TODO: explicitly repeat parallel batch for any skew bins
bool anyBig = false;
Expand Down Expand Up @@ -1364,7 +1385,8 @@ void radix_r(const int from, const int to, const int radix) {
free(ugrps);
free(ngrps);
TEND(26)
}
return;
}}


SEXP issorted(SEXP x, SEXP by)
Expand Down
Loading