diff --git a/NAMESPACE b/NAMESPACE index acc44ad518..1cf699ab93 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,8 @@ exportClasses(data.table, IDate, ITime) ## export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) -export(set2key, set2keyv, key2, setindex, setindexv, indices) +export(setindex, setindexv, indices) +export(set2key, set2keyv, key2) # deprecated with helpful error; remove after May 2019 (see #3399) export(as.data.table,is.data.table,test.data.table,last,first,like,"%like%",between,"%between%",inrange,"%inrange%") export(timetaken) export(truelength, alloc.col, ":=") diff --git a/NEWS.md b/NEWS.md index 88d8d4432b..24ecb1a09d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -53,6 +53,10 @@ Column 2 of by= (2) is type 'list', not yet supported. Please use the by= argument to specify columns with types that are supported. ``` +9. Reminder that note 11 in v1.11.0 (May 2018) warned that `set2key()` and `key2()` will be removed in May 2019. They have been warning since v1.9.8 (Nov 2016) and their warnings were upgraded to errors in v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental'. + +10. The `key(DT)<-` form of `setkey()` has been warning since at least 2012 to use `setkey()`. The warning is now stronger: `key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.` This warning will be upgraded to error in one year. 
+ ### Changes in v1.12.0 (13 Jan 2019) diff --git a/R/data.table.R b/R/data.table.R index 6284585170..2052d1c921 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1173,7 +1173,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { m[is.na(m)] = ncol(x)+seq_len(length(newnames)) cols = as.integer(m) if ((ok<-selfrefok(x,verbose))==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R] - warning("Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid key<-, names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.") + warning("Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.") if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { DT = x # in case getOption contains "ncol(DT)" as it used to. 
TODO: warn and then remove n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() diff --git a/R/duplicated.R b/R/duplicated.R index 145851f391..635382a04a 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -1,5 +1,6 @@ warning_oldUniqueByKey = "The deprecated option 'datatable.old.unique.by.key' is being used. Please stop using it and pass 'by=key(DT)' instead for clarity. For more information please search the NEWS file for this option." +# upgrade the 4 calls below to error after May 2019 ( see note 10 from 1.11.0 May 2018 which said one year from then ) duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { if (!cedta()) return(NextMethod("duplicated")) #nocov diff --git a/R/onLoad.R b/R/onLoad.R index e5a950d50c..7d0e1a7e21 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -55,7 +55,7 @@ "datatable.auto.index"="TRUE", # DT[col=="val"] to auto add index so 2nd time faster "datatable.use.index"="TRUE", # global switch to address #1422 "datatable.prettyprint.char" = NULL, # FR #1091 - "datatable.old.unique.by.key" = "FALSE" # TODO: change warnings in duplicated.R to error on or after Jan 2019 then remove in Jan 2020. + "datatable.old.unique.by.key" = "FALSE" # TODO: change warnings in duplicated.R to error on or after May 2019 then remove a year after that. ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste0("options(",i,"=",opts[i],")"))) diff --git a/R/setkey.R b/R/setkey.R index a7a03cdacc..e56da80d09 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -18,14 +18,20 @@ setindexv <- function(x, cols, verbose=getOption("datatable.verbose")) { } } -set2key <- function(...) { - stop("set2key() is now deprecated. Please use setindex() instead.") -} -set2keyv <- function(...) { - stop("set2keyv() is now deprecated. Please use setindexv() instead.") -} -key2 <- function(x) { - stop("key2() is now deprecated. 
Please use indices() instead.") +# remove these 3 after May 2019; see discussion in #3399 and notes in v1.12.2. They were marked experimental after all. +set2key <- function(...) stop("set2key() is now deprecated. Please use setindex() instead.") +set2keyv <- function(...) stop("set2keyv() is now deprecated. Please use setindexv() instead.") +key2 <- function(...) stop("key2() is now deprecated. Please use indices() instead.") + +# upgrade to error after Mar 2020. Has already been warning since 2012, and stronger warning in Mar 2019 (note in news for 1.12.2); #3399 +"key<-" <- function(x,value) { + warning("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.") + setkeyv(x,value) + # The returned value here from key<- is then copied by R before assigning to x, it seems. That's + # why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<- + # method, the table gets set to NULL. So, although we call setkeyv(x,cols) here, and that doesn't copy, the + # returned value (x) then gets copied by R. + # So, solution is that caller has to call setkey or setkeyv directly themselves, to avoid <- dispatch and its copy. } setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE) @@ -127,16 +133,6 @@ getindex <- function(x, name) { ans } -"key<-" <- function(x,value) { - warning("The key(x)<-value form of setkey can copy the whole table. This is due to <- in R itself. Please change to setkeyv(x,value) or setkey(x,...) which do not copy and are faster. See help('setkey'). You can safely ignore this warning if it is inconvenient to change right now. Setting options(warn=2) turns this warning into an error, so you can then use traceback() to find and change your key<- calls.") - setkeyv(x,value) - # The returned value here from key<- is then copied by R before assigning to x, it seems. 
That's - # why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<- - # method, the table gets set to NULL. So, although we call setkeyv(x,cols) here, and that doesn't copy, the - # returned value (x) then gets copied by R. - # So, solution is that caller has to call setkey or setkeyv directly themselves, to avoid <- dispatch and its copy. -} - haskey <- function(x) !is.null(key(x)) # reverse a vector by reference (no copy) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8ee525fd22..d81b87e44b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1501,7 +1501,7 @@ test(505, DT[J(a=1,b=6),sum(i.b*b),by=.EACHI]$V1, 24) # 24 now 'double' because # Test := after a key<- DT = data.table(a=3:1,b=4:6) -test(506, key(DT)<-"a", "a", warning="can copy the whole table") +test(506, key(DT)<-"a", "a", warning="deprecated") test(508, DT, data.table(a=1:3,b=6:4,key="a")) test(509, DT[,b:=10L], data.table(a=1:3,b=10L,key="a")) test(510, DT[,c:=11L], data.table(a=1:3,b=10L,c=11L,key="a")) # Used to be warning about invalid .internal.selfref detected and fixed. As from v1.8.3 data.table() returns a NAMED==0 object, and key<- appears not to copy that. But within functions, key<- would still copy. TO DO: add tests.... diff --git a/man/set2key.Rd b/man/set2key.Rd new file mode 100644 index 0000000000..e150ec3f45 --- /dev/null +++ b/man/set2key.Rd @@ -0,0 +1,21 @@ +\name{set2key} +\alias{set2key} +\alias{set2keyv} +\alias{key2} +\alias{key<-} +\title{ Deprecated. } +\description{ + These functions are deprecated. They will be removed in future. Please use the functions in \code{\link{setkey}}. +} +\usage{ +set2key(...) # DEPRECATED; helpful error since May 2018 and warning since Nov 2016 +set2keyv(...) # DEPRECATED; helpful error since May 2018 and warning since Nov 2016 +key2(...) 
# DEPRECATED; helpful error since May 2018 and warning since Nov 2016 +key(x) <- value # DEPRECATED; strong warning since Mar 2019 and softer warning since 2012 +} +\arguments{ +\item{\dots}{ Deprecated. } +\item{x}{ Deprecated. } +\item{value}{ Deprecated. } +} + diff --git a/man/setkey.Rd b/man/setkey.Rd index 62df12ba17..b43ff1e416 100644 --- a/man/setkey.Rd +++ b/man/setkey.Rd @@ -2,38 +2,39 @@ \alias{setkey} \alias{setkeyv} \alias{key} -\alias{key<-} \alias{haskey} -\alias{set2key} -\alias{set2keyv} \alias{setindex} \alias{setindexv} -\alias{key2} \alias{indices} -\title{ Create key on a data table } +\title{ Create key on a data.table } \description{ +\code{setkey} sorts a \code{data.table} and marks it as sorted with an +attribute \code{sorted}. The sorted columns are the key. The key can be any +number of columns. The columns are always sorted in \emph{ascending} order. The table +is changed \emph{by reference} and \code{setkey} is very memory efficient. + +There are three reasons \code{setkey} is desirable: i) binary search and joins are faster +when they detect they can use an existing key, ii) grouping by a leading subset of the key +columns is faster because the groups are already gathered contiguously in RAM, iii) +simpler shorter syntax; e.g. \code{DT["id",]} finds the group "id" in the first column +of DT's key using binary search. It may be helpful to think of a key as +super-charged rownames: multi-column and multi-type rownames. + In \code{data.table} parlance, all \code{set*} functions change their input -\emph{by reference}. That is, no copy is made at all, other than temporary +\emph{by reference}. That is, no copy is made at all other than for temporary working memory, which is as large as one column. The only other \code{data.table} operator that modifies input by reference is \code{\link{:=}}. 
Check out the -\code{See Also} section below for other \code{set*} function \code{data.table} +\code{See Also} section below for other \code{set*} functions \code{data.table} provides. -\code{setkey()} sorts a \code{data.table} and marks it as sorted (with an -attribute \code{sorted}). The sorted columns are the key. The key can be any -columns in any order. The columns are sorted in ascending order always. The table -is changed \emph{by reference} and is therefore very memory efficient. +\code{setindex} creates an index for the provided columns. This index is simply an +ordering vector of the dataset's rows according to the provided columns. This order vector +is stored as an attribute of the \code{data.table} and the dataset retains the original order +of rows in memory. See the \href{vignettes/datatable-secondary-indices-and-auto-indexing.html}{Secondary indices and auto indexing} vignette for more details. -\code{setindex()} creates an index (or indices) on provided columns. This index is simply an -order of the dataset's according to the provided columns. This order is stored as a \code{data.table} -attribute, and the dataset retains the original order in memory. -See the \href{vignettes/datatable-secondary-indices-and-auto-indexing.html}{Secondary indices and auto indexing} vignette for more details. +\code{key} returns the \code{data.table}'s key if it exists; \code{NULL} if none exists. -\code{key()} returns the \code{data.table}'s key if it exists, and \code{NULL} -if none exist. - -\code{haskey()} returns a logical \code{TRUE}/\code{FALSE} depending on whether -the \code{data.table} has a key (or not). +\code{haskey} returns \code{TRUE}/\code{FALSE} if the \code{data.table} has a key. 
} \usage{ setkey(x, \dots, verbose=getOption("datatable.verbose"), physical = TRUE) @@ -43,98 +44,52 @@ setindexv(x, cols, verbose=getOption("datatable.verbose")) key(x) indices(x, vectors = FALSE) haskey(x) -key(x) <- value # DEPRECATED, please use setkey or setkeyv instead. } \arguments{ \item{x}{ A \code{data.table}. } -\item{\dots}{ The columns to sort by. Do not quote the column names. If -\code{\dots} is missing (i.e. \code{setkey(DT)}), all the columns are used. -\code{NULL} removes the key. } -\item{cols}{ A character vector of column names. For \code{setindexv}, this can be a \code{list} of character vectors, in which case each element will be applied as an index. } -\item{value}{ In (deprecated) \code{key<-}, a character vector (only) of column -names.} +\item{\dots}{ The columns to sort by. Do not quote the column names. If \code{\dots} is missing (i.e. \code{setkey(DT)}), all the columns are used. \code{NULL} removes the key. } +\item{cols}{ A character vector of column names. For \code{setindexv}, this can be a \code{list} of character vectors, in which case each element will be applied as an index in turn. } \item{verbose}{ Output status and information. } -\item{physical}{ TRUE changes the order of the data in RAM. FALSE adds a -secondary key a.k.a. index. } -\item{vectors}{ logical scalar default \code{FALSE}, when set to \code{TRUE} -then list of character vectors is returned, each vector refers to one index. } +\item{physical}{ \code{TRUE} changes the order of the data in RAM. \code{FALSE} adds an index. } +\item{vectors}{ \code{logical} scalar, default \code{FALSE}; when set to \code{TRUE}, a \code{list} of character vectors is returned, each referring to one index. } } \details{ -\code{setkey} reorders (or sorts) the rows of a data.table by the columns -provided. In versions \code{1.9+}, for \code{integer} columns, a modified version -of base's counting sort is implemented, which allows negative values as well. 
It -is extremely fast, but is limited by the range of integer values being <= 1e5. If -that fails, it falls back to a (fast) 4-pass radix sort for integers, implemented -based on Pierre Terdiman's and Michael Herf's code (see links below). Similarly, -a very fast 6-pass radix order for columns of type \code{double} is also implemented. -This gives a speed-up of about 5-8x compared to \code{1.8.10} on \code{setkey} -and all internal \code{order}/\code{sort} operations. Fast radix sorting is also -implemented for \code{character} and \code{bit64::integer64} types. - -The sort is \emph{stable}; i.e., the order of ties (if any) is preserved, in both -versions - \code{<=1.8.10} and \code{>= 1.9.0}. +\code{setkey} reorders (i.e. sorts) the rows of a \code{data.table} by the columns +provided. The sort method used has developed over the years and we have contributed +to base R too; see \code{\link[base]{sort}}. Generally speaking we avoid any type +of comparison sort (other than insert sort for very small input) preferring instead +counting sort and forwards radix. We also avoid hash tables. -In \code{data.table} versions \code{<= 1.8.10}, for columns of type \code{integer}, -the sort is attempted with the very fast \code{"radix"} method in -\code{\link[base:order]{sort.list}}. If that fails, the sort reverts to the default -method in \code{\link[base]{order}}. For character vectors, \code{data.table} -takes advantage of R's internal global string cache and implements a very efficient -order, also exported as \code{\link{chorder}}. +The sort is \emph{stable}; i.e., the order of ties (if any) is preserved. -In v1.7.8, the \code{key<-} syntax was deprecated. The \code{<-} method copies -the whole table and we know of no way to avoid that copy without a change in -\R itself. Please use the \code{set}* functions instead, which make no copy at -all. \code{setkey} accepts unquoted column names for convenience, whilst -\code{setkeyv} accepts one vector of column names. 
+For character vectors, \code{data.table} takes advantage of R's internal global string cache to implement a very efficient order, also exported as \code{\link{chorder}}. -The problem (for \code{data.table}) with the copy by \code{key<-} (other than -being slower) is that \R doesn't maintain the over allocated truelength, but it -looks as though it has. Adding a column by reference using \code{:=} after a -\code{key<-} was therefore a memory overwrite and eventually a segfault; the -over allocated memory wasn't really there after \code{key<-}'s copy. \code{data.table}s -now have an attribute \code{.internal.selfref} to catch and warn about such copies. -This attribute has been implemented in a way that is friendly with -\code{identical()} and \code{object.size()}. - -For the same reason, please use the other \code{set*} functions which modify -objects by reference, rather than using the \code{<-} operator which results -in copying the entire object. - -It isn't good programming practice, in general, to use column numbers rather -than names. This is why \code{setkey} and \code{setkeyv} only accept column names. +In general, it's good practice to use column names rather than numbers. This is +why \code{setkey} and \code{setkeyv} only accept column names. If you use column numbers then bugs (possibly silent) can more easily creep into your code as time progresses if changes are made elsewhere in your code; e.g., if you add, remove or reorder columns in a few months time, a \code{setkey} by column number will then refer to a different column, possibly returning incorrect results -with no warning. (A similar concept exists in SQL, where \code{"select * from \dots"} -is considered poor programming style when a robust, maintainable system is +with no warning. (A similar concept exists in SQL, where \code{"select * from \dots"} is considered poor programming style when a robust, maintainable system is required.) 
If you really wish to use column numbers, it is possible but deliberately a little harder; e.g., \code{setkeyv(DT,colnames(DT)[1:2])}. +If you wanted to use \code{\link[base]{grep}} to select key columns according to +a pattern, note that you can just set \code{value = TRUE} to return a character vector instead of the default integer indices. } \value{ -The input is modified by reference, and returned (invisibly) so it can be used -in compound statements; e.g., \code{setkey(DT,a)[J("foo")]}. If you require a -copy, take a copy first (using \code{DT2=copy(DT)}). \code{copy()} may also +The input is modified by reference and returned (invisibly) so it can be used +in compound statements; e.g., \code{setkey(DT,a)[.("foo")]}. If you require a +copy, take a copy first (using \code{DT2=copy(DT)}). \code{\link{copy}} may also sometimes be useful before \code{:=} is used to subassign to a column by -reference. See \code{?copy}. +reference. } \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr \url{http://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr - \url{https://cran.r-project.org/package=bit64} -} -\note{ Despite its name, \code{base::sort.list(x,method="radix")} actually -invokes a \emph{counting sort} in R, not a radix sort. See \code{do_radixsort} in -src/main/sort.c. A counting sort, however, is particularly suitable for -sorting integers and factors, and we like it. In fact we like it so much -that \code{data.table} contains a counting sort algorithm for character vectors -using R's internal global string cache. This is particularly fast for character -vectors containing many duplicates, such as grouped data in a key column. This -means that character is often preferred to factor. Factors are still fully -supported, in particular ordered factors (where the levels are not in -alphabetic order). 
+ \url{https://cran.r-project.org/package=bit64}\cr + \url{https://github.com/Rdatatable/data.table/wiki/Presentations} } \seealso{ \code{\link{data.table}}, \code{\link{tables}}, \code{\link{J}}, \code{\link[base:order]{sort.list}}, \code{\link{copy}}, \code{\link{setDT}}, @@ -143,7 +98,7 @@ alphabetic order). \code{\link{chorder}}, \code{\link{setNumericRounding}} } \examples{ -# Type 'example(setkey)' to run these at prompt and browse output +# Type 'example(setkey)' to run these at the prompt and browse output DT = data.table(A=5:1,B=letters[5:1]) DT # before @@ -152,7 +107,7 @@ DT # after tables() # KEY column reports the key'd columns key(DT) keycols = c("A","B") -setkeyv(DT,keycols) # rather than key(DT)<-keycols (which copies entire table) +setkeyv(DT,keycols) DT = data.table(A=5:1,B=letters[5:1]) DT2 = DT # does not copy @@ -172,4 +127,3 @@ indices(DT) # get indices single vector indices(DT, vectors = TRUE) # get indices list } \keyword{ data } - diff --git a/src/assign.c b/src/assign.c index 7390560081..c59b573aa9 100644 --- a/src/assign.c +++ b/src/assign.c @@ -99,6 +99,15 @@ However, we still have problem (ii) above and it didn't pass tests involving bas We really need R itself to start setting TRUELENGTH to be the allocated length and then for GC to release TRUELENGTH not LENGTH. Would really tidy this up. + +Moved out of ?setkey Details section in 1.12.2 (Mar 2019). Revisit this w.r.t. recent versions of R. + The problem (for \code{data.table}) with the copy by \code{key<-} (other than + being slower) is that \R doesn't maintain the over-allocated truelength, but it + looks as though it has. Adding a column by reference using \code{:=} after a + \code{key<-} was therefore a memory overwrite and eventually a segfault; the + over-allocated memory wasn't really there after \code{key<-}'s copy. \code{data.table}s now have an attribute \code{.internal.selfref} to catch and warn about such copies. 
+ This attribute has been implemented in a way that is friendly with + \code{identical()} and \code{object.size()}. */ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) { @@ -944,7 +953,7 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) case LGLSXP : { Rboolean *vd = (Rboolean *)LOGICAL(v); for (int i=0; i