diff --git a/NEWS.md b/NEWS.md index 6730cebb44..0a6cd5a65b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,7 @@ 7. `fwrite()` now avoids a crash when translating strings into a different encoding, [#6883](https://github.com/Rdatatable/data.table/issues/6883). Thanks @filipemsc for the report and @aitap for the fix. +8. `set()` now automatically pre-allocates new column slots if needed, similar to what `:=` already does, [#496](https://github.com/Rdatatable/data.table/issues/496) [#4100](https://github.com/Rdatatable/data.table/issues/4100). Thanks to Huashan Chen and @tyner for the report and Benjamin Schwendinger for the fix. ## NOTES diff --git a/R/data.table.R b/R/data.table.R index 2fe4621b3f..5b78f4aae8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2761,9 +2761,14 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent= invisible(x) } -set = function(x,i=NULL,j,value) # low overhead, loopable -{ - .Call(Cassign,x,i,j,NULL,value) +set = function(x, i=NULL, j, value) { + name = as.character(substitute(x)) + old_add = address(x) + x = .Call(Cassign, x, i, j, NULL, value) + if (old_add != address(x)) { + # assign is needed to replace x on address change due to possible new allocation + assign(name, x, envir=parent.frame()) + } invisible(x) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a86cb16e3f..0d0409c254 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1322,7 +1322,8 @@ test(416, DT[J(2)], error="the columns to join by must be specified using") # Test shallow copy verbose message from := adding a column, and (TO DO) only when X is NAMED. DT = data.table(a=1:3,b=4:6) -test(417, alloc.col(DT,3,verbose=TRUE), DT, output="Attempt to reduce allocation from.*to 5 ignored. Can only increase allocation via shallow copy") +test(417.1, alloc.col(DT,3,verbose=TRUE), DT, output="Attempt to reduce allocation from.*to 5 ignored. Can only increase allocation via shallow copy") +test(417.2, alloc.col(DT,3,verbose=1L), error="verbose must be TRUE or FALSE") options(datatable.alloccol=1L) DT = data.table(a=1:3,b=4:6) options(datatable.alloccol=1024L) @@ -14697,7 +14698,7 @@ test(2016.1, name, "DT") test(2016.2, DT, data.table(a=1:3)) test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column -test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") # just set() +test(2016.5, set(DT,2L,"newCol",5L), data.table(a=INT(1,4,5), newCol=INT(NA,5L,NA))) # works since set overallocates #4100 test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller) unlink(tt) @@ -21147,3 +21148,12 @@ dt = data.table(id = 1:25) test(2314.1, any(grepl("", tail(capture.output(print(dt, class = TRUE)), 2))), TRUE) # Test that class=TRUE with col.names="top" doesn't show classes at bottom test(2314.2, !any(grepl("", tail(capture.output(print(dt, class = TRUE, col.names = "top")), 2))), TRUE) + +# re-overallocate in set if quota is reached #496 #4100 +DT = data.table() +test(2315.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i); ncol(DT)}, 10L) +DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame")) +test(2315.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3)) +# ensure := and set are consistent if they need to overallocate +DT = data.table(); DT2 = data.table() +test(2315.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i)}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]}) diff --git a/src/assign.c b/src/assign.c index e6f8baa647..417c97bf79 100644 --- a/src/assign.c +++ b/src/assign.c @@ -332,7 +332,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; + R_len_t numToDo, targetlen, vlen, oldncol, tl, coln, protecti=0, newcolnum, indexLength; SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted @@ -514,22 +514,23 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // modify DT by reference. Other than if new columns are being added and the allocVec() fails with // out-of-memory. In that case the user will receive hard halt and know to rerun. if (length(newcolnames)) { - oldtncol = TRUELENGTH(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more. - - if (oldtncololdncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."),oldtncol, oldncol); - if (oldtncol < oldncol+LENGTH(newcolnames)) - internal_error(__func__, "input dt has not been allocated enough column slots. l=%d, tl=%d, adding %d", oldncol, oldtncol, LENGTH(newcolnames)); // # nocov + if (tl>oldncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."),tl, oldncol); if (!selfrefnamesok(dt,verbose)) error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov // Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as // strong error message for now. - else if (TRUELENGTH(names) != oldtncol) + else if (TRUELENGTH(names) != tl) // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work - internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)TRUELENGTH(names), oldtncol); // # nocov + internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)TRUELENGTH(names), tl); // # nocov if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker.")); SETLENGTH(dt, oldncol+LENGTH(newcolnames));