diff --git a/NEWS.md b/NEWS.md
index d418908b32..eef134c1ff 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -42,6 +42,78 @@
 
 8. `groupingsets()` gets a new argument `enclos` for use together with the `jj` argument in functions wrapping `groupingsets()`, including the existing wrappers `rollup()` and `cube()`. When forwarding a `j`-expression as `groupingsets(jj = substitute(j))`, make sure to pass `enclos = parent.frame()` as well, so that the `j`-expression will be evaluated in the right context. This makes it possible for `j` to refer to variables outside the `data.table`.
 
+9. `first()` and `last()` gain `na.rm` taking values `FALSE` (default), `TRUE` or `"row"`, [#4239](https://github.com/Rdatatable/data.table/issues/4239). For vector input, `TRUE` and `"row"` are the same. For `data.table|frame` input, `TRUE` returns the first/last non-NA observation in each column, while `"row"` returns the first/last row where all columns are non-NA. `TRUE` is optimized by group and `"row"` may be optimized by group in future. `n>1` with `na.rm=TRUE` is also optimized by group. Thanks to Nicolas Bennett and Michael Chirico for the requests, and Benjamin Schwendinger for the PR.
+
+    ```R
+    x
+    # [1] NA  1  2 NA
+
+    first(x)
+    # NA
+
+    first(x, na.rm=TRUE)
+    # 1
+
+    last(x, na.rm=TRUE)
+    # 2
+
+    DT
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+    #2:     1     4    NA
+    #3:     2     5    NA
+    #4:     2     6    NA
+
+    last(DT, na.rm=TRUE)
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     2     6     7
+
+    last(DT, na.rm="row")
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+
+    DT[, last(.SD, na.rm=TRUE), by=grp]
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     4     7
+    #2:     2     6    NA
+
+    DT[, last(.SD, na.rm="row"), by=grp]
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+    #2:     2    NA    NA
+    
+    DT[, last(na.omit(.SD)), by=grp]  # same as na.rm='row' but drops all-NA groups
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+
+    set.seed(1)
+    DT = data.table(id=rep(1:1e6, each=10),
+                     v=sample(c(1:5,NA), 10e6, replace=TRUE))
+    DT
+    #                id     v
+    #             <int> <int>
+    #        1:       1     2
+    #        2:       1     3
+    #        3:       1     4
+    #        4:       1    NA
+    #        5:       1     2
+    #       ---              
+    #  9999996: 1000000     3
+    #  9999997: 1000000    NA
+    #  9999998: 1000000    NA
+    #  9999999: 1000000     1
+    # 10000000: 1000000     4
+    ans1 = DT[, last(na.omit(v)), by=id]       # 18.7 sec
+    ans2 = DT[, last(v, na.rm=TRUE), by=id]    #  0.1 sec
+    identical(ans1, ans2)                      # TRUE
+    ```
+
 ### BUG FIXES
 
 1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
diff --git a/R/data.table.R b/R/data.table.R
index 9fd092beda..2de71fc791 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1668,7 +1668,8 @@ replace_dot_alias = function(e) {
            (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) &&
         (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
       headopt = jsub %iscall% c("head", "tail")
-      firstopt = jsub %iscall% c("first", "last") # fix for #2030
+      firstopt = jsub %iscall% c("first", "last") &&                    # 2030, 4239
+                 !identical(match.call(first, jsub)[["na.rm"]], "row")  # first's signature same as last's
       if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") &&
           (subopt || headopt || firstopt)) {
         if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
@@ -1893,7 +1894,9 @@ replace_dot_alias = function(e) {
     assign(".N", len__, thisEnv) # For #334
     #fix for #1683
     if (use.I) assign(".I", seq_len(nrow(x)), thisEnv)
-    ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971.
+    ans = gforce(thisEnv, jsub, o__, f__, len__, irows,  # irows needed for #971
+                 .Call(CsubsetVector, groups, grpcols),  # just a list() subset to make C level neater; doesn't copy column contents
+                 lhs)  # for now this just prevents := with new feature first/last n>1; in future see TODO below
     gi = if (length(o__)) o__[f__] else f__
     g = lapply(grpcols, function(i) .Call(CsubsetVector, groups[[i]], gi)) # use CsubsetVector instead of [ to preserve attributes #5567
 
@@ -1955,10 +1958,10 @@ replace_dot_alias = function(e) {
   # Grouping by by: i is by val, icols NULL, o__ may be subset of x, f__ points to o__ (or x if !length o__)
   # TO DO: setkey could mark the key whether it is unique or not.
   if (!is.null(lhs)) {
-    if (GForce) { # GForce should work with := #1414
-      vlen = length(ans[[1L]])
+    if (GForce) { # GForce should work with := #1414. TODO: move down into gforce at C level to save creating/rep'ing ans and grpcols wastefully
+      vlen = length(ans[[1L]])   # TODO: this might be ngrp when na.rm=TRUE and one group has 2 and another 0, so needs enhancing here (by passing all-1 back from gans?)
       # replicate vals if GForce returns 1 value per group
-      jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep, times=len__) else tail(ans, -length(g))  # see comment in #4245 for why rep instead of rep.int
+      jvals = if (vlen==length(len__)) lapply(tail(ans, -length(grpcols)), rep, times=len__) else tail(ans, -length(grpcols))  # see comment in #4245 for why rep instead of rep.int
       jrows = vecseq(f__,len__,NULL)
       if (length(o__)) jrows = o__[jrows]
       if (length(irows)) jrows = irows[jrows]
@@ -3124,8 +3127,8 @@ gfuns = c(gdtfuns,
 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here.
 ghead = function(x, n) .Call(Cghead, x, as.integer(n))
 gtail = function(x, n) .Call(Cgtail, x, as.integer(n))
-gfirst = function(x) .Call(Cgfirst, x)
-glast = function(x) .Call(Cglast, x)
+gfirst = function(x, n=1L, na.rm=FALSE) .Call(Cgfirst, x, as.integer(n), na.rm)
+glast = function(x, n=1L, na.rm=FALSE) .Call(Cglast, x, as.integer(n), na.rm)
 gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
 gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
 gweighted.mean = function(x, w, ..., na.rm=FALSE) {
@@ -3150,7 +3153,7 @@ gshift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift", "cyclic")) {
   stopifnot(is.numeric(n))
   .Call(Cgshift, x, as.integer(n), fill, type)
 }
-gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows)
+gforce = function(env, jsub, o, f, l, rows, grpcols, lhs) .Call(Cgforce, env, jsub, o, f, l, rows, grpcols, lhs)
 
 # GForce needs to evaluate all arguments not present in the data.table before calling C part #5547
 # Safe cases: variables [i], calls without variables [c(0,1), list(1)] # TODO extend this list
diff --git a/R/last.R b/R/last.R
index a93ec31731..53f9223a8f 100644
--- a/R/last.R
+++ b/R/last.R
@@ -1,86 +1,83 @@
-# data.table defined last(x) with no arguments, just for last. If you need the last 10 then use tail(x,10).
-# for xts class objects it will dispatch to xts::last
-# reworked to avoid loading xts namespace (#3857) then again to fix dispatching of xts class (#4053)
-# nocov start. Tests 19.* in other.Rraw, not in the main suite.
-last = function(x, n=1L, ...) {
-  verbose = isTRUE(getOption("datatable.verbose", FALSE))
-  if (!inherits(x, "xts")) {
-    if (nargs()>1L) {
-      if ("package:xts" %chin% search()) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "xts::last", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-        xts::last(x, n=n, ...)
-      } else {
-        # nocov start
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-        utils::tail(x, n=n, ...)
-        # nocov end
-      }
+# data.table originally defined first(x) and last(x) with no arguments just for the single
+# first/last observation. Over time n= has been added since xts::last has n so now it makes
+# sense to support n. The difference to head/tail is the default n=1 vs n=6, and
+# that first/last are not generic for speed by group.
+
+first = function(x, n=1L, na.rm=FALSE, ...) {
+  .firstlast(x, n=n, na.rm=na.rm, first=TRUE, ...)
+}
+
+last = function(x, n=1L, na.rm=FALSE, ...) {
+  .firstlast(x, n=n, na.rm=na.rm, first=FALSE, ...)
+}
+
+.firstlast = function(x, n=1L, na.rm=FALSE, first=TRUE, ...) {
+  # nocov start. Tests 19.* in other.Rraw, not in the main suite.
+  if (inherits(x, "xts")) {
+    if (isTRUE(getOption("datatable.verbose", FALSE)))
+      catf("using %s\n", if (first) "xts::first" else "xts::last")
+    return((if (first) xts::first else xts::last)(x, n=n, na.rm=na.rm, ...))
+  }
+  # nocov end.
+  stopifnot(isTRUEorFALSE(na.rm) || identical(na.rm,"row"))
+  stopifnot(is.numeric(n), length(n)==1L, n>=0L)
+  n = as.integer(n)
+  if (is.data.frame(x)) {
+    if (!nrow(x)) return(x)
+    if (identical(na.rm, "row")) {   # any NA on the row removes that row
+      nna = which_(.Call(Cdt_na, x, seq_along(x)), bool=FALSE)
+      # very similar to na.omit.data.table
+      # TODO: n and first/last could be passed to Cdt_na and it could stop after finding n (it already does that in gsumm.c when gforce optimized)
+      nna = .firstlastVector(nna, n=n, first=first, na.rm=FALSE)
+      ans = .Call(CsubsetDT, x, nna, seq_along(x))  # works on DF too
     } else {
-      dx = dim(x)
-      if (is.null(dx)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "'x[[length(x)]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))")
-        lx = length(x)
-        if (!lx) x else x[[lx]]
-      } else if (is.data.frame(x)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "'x[nrow(x),]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)")
-        x[dx[1L], , drop=FALSE]
-      } else {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-        utils::tail(x, n=n, ...)
+      ans = lapply(x, .firstlastVector, n=n, first=first, na.rm=na.rm)
+      if (na.rm) {
+        l = vapply_1i(ans, length)
+        m = max(l)
+        for (i in which(l<m)) {
+          ans[[i]] = c(ans[[i]], rep(NA, m-l[i]))
+        }
+        # any row.names won't align to the values now in the result so don't retain them
       }
     }
-  } else {
-    if (!requireNamespace("xts", quietly=TRUE))
-      stopf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last") # nocov
-    if (verbose)
-      catf("%s: using %s: %s\n", "last", "xts::last", "is.xts(x)")
-    xts::last(x, n=n, ...)
+    if (is.data.table(x)) setDT(ans) else setDF(ans)
+    setattr(ans, "class", class(x))
+    if (!isTRUE(na.rm) && length(rn<-attr(x,"row.names")))
+      setattr(ans, "row.names", if (isFALSE(na.rm)) .firstlastVector(rn, n=n, first=first, na.rm=FALSE)
+                                               else rn[nna])
+    return(ans)
+  }
+  if (!length(x))
+    return(x)
+  if (!is.vector(x)) {
+    if (!isFALSE(na.rm))
+      stopf("na.rm=TRUE|'row' is not currently supported for '%s'", class(x)[1L])
+    return((if (first) utils::head else utils::tail)(x, n=n, ...))  # e.g. matrix
   }
+  return(.firstlastVector(x, n=n, first=first, na.rm=!isFALSE(na.rm)))  # !isFALSE to convert 'row' to TRUE
 }
 
-first = function(x, n=1L, ...) {
-  verbose = isTRUE(getOption("datatable.verbose", FALSE))
-  if (!inherits(x, "xts")) {
-    if (nargs()>1L) {
-      if ("package:xts" %chin% search()) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "xts::first", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-        xts::first(x, n=n, ...)
-      } else {
-        # nocov start
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-        utils::head(x, n=n, ...)
-        # nocov end
-      }
-    } else {
-      dx = dim(x)
-      if (is.null(dx)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "'x[[1L]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))")
-        lx = length(x)
-        if (!lx) x else x[[1L]]
-      } else if (is.data.frame(x)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "'x[1L,]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)")
-        if (!dx[1L]) x else x[1L, , drop=FALSE]
-      } else {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-        utils::head(x, n=n, ...)
-      }
-    }
+.firstlastVector = function(x, n, first, na.rm) {
+  if (!length(x)) return(x)
+  if (n==0L) return(x[0L])
+  ans = if (na.rm) {
+    nna = which_(if (is.list(x)) vapply_1b(x,function(y){is.null(y)||(length(y)==1L&&is.na(y))})
+                            else is.na(x), bool=FALSE)   # TODO: again, n and first/last could be passed to C here
+    if (!length(nna)) x[0L]
+    else {y=min(n,length(nna)); x[nna[if (first) seq.int(1L,y) else seq.int(length(nna)-y+1L,length(nna))]]}
   } else {
-    if (!requireNamespace("xts", quietly=TRUE))
-      stopf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first") # nocov
-    if (verbose)
-      catf("%s: using %s: %s\n", "first", "xts::first", "is.xts(x)")
-    xts::first(x, n=n, ...)
+    y=min(n,length(x)); x[if (first) seq.int(1L,y) else seq.int(length(x)-y+1L,length(x))] 
   }
+  if (n>1L || na.rm)   # n!=length(ans)
+    .Call("Csettruelength", ans, length(ans))
+  # for dogroups.c to know that shorter results (including when na.rm results in a length-1) should be padded with NA to match the length of longer items
+  # head and tail with na.rm=TRUE are by their nature returning a vector and therefore shouldn't be recycled when length-1; test 2240.81
+  # TODO: new function pad() could be provided so user can do things like DT[, .(pad(na.omit(B)), pad(na.omit(C))), by=grp]
+  #         to avoid the error 'Supplied 2 items for column 1 of group 1 which has 3 rows ...'
+  #       and/or pad= could be added to [.data.table to allow padding all results
+  # Since gforce_dynamic optimizes head/tail it knows to pad and that's optimized. However, default last(x) and first(x) (i.e. n=1 na.rm=FALSE) are
+  # single-valued like mean,median etc and are recycled in the same way. This is consistent with n=1 na.rm=FALSE already not being treated as
+  # gforce_dynamic in gsumm.c either. n=1 na.rm=TRUE returns empty when all-NA so is still a vector result not recycled when length-1.
+  ans
 }
-# nocov end
diff --git a/R/shift.R b/R/shift.R
index 1c68d13c41..295c5f049b 100644
--- a/R/shift.R
+++ b/R/shift.R
@@ -23,6 +23,7 @@ shift = function(x, n=1L, fill, type=c("lag", "lead", "shift", "cyclic"), give.n
     }
     setattr(ans, "names",  paste(rep(nx,each=length(n)), type, n, sep="_"))
   }
+  if (length(n)>1L) setDT(ans)
   ans
 }
 
diff --git a/R/test.data.table.R b/R/test.data.table.R
index d17d24f4f9..3ac260372e 100644
--- a/R/test.data.table.R
+++ b/R/test.data.table.R
@@ -514,6 +514,40 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
       # nocov end
     }
   }
+  if (!fail) for (type in c("warning","error","message")) {
+    observed = actual[[type]]
+    expected = get(type)
+    if (type=="warning" && length(observed) && !is.null(ignore.warning)) {
+      # if a warning containing this string occurs, ignore it. First need for #4182 where warning about 'timedatectl' only
+      # occurs in R 3.4, and maybe only on docker too not for users running test.data.table().
+      stopifnot(length(ignore.warning)==1L, is.character(ignore.warning), !is.na(ignore.warning), nchar(ignore.warning)>=1L)
+      observed = grep(ignore.warning, observed, value=TRUE, invert=TRUE)
+    }
+    if (length(expected) != length(observed)) {
+      # nocov start
+      catf("Test %s produced %d %ss but expected %d\n%s\n%s\n", numStr, length(observed), type, length(expected),
+            paste("Expected:", expected, collapse="\n"),
+            paste("Observed:", observed, collapse="\n"))
+      fail = TRUE
+      # nocov end
+    } else {
+      # the expected type occurred and, if more than 1 of that type, in the expected order
+      for (i in seq_along(expected)) {
+        if (!foreign && !string_match(expected[i], observed[i])) {
+          # nocov start
+          catf("Test %s didn't produce the correct %s:\nExpected: %s\nObserved: %s\n", numStr, type, expected[i], observed[i])
+          fail = TRUE
+          # nocov end
+        }
+      }
+    }
+  }
+  if (fail && exists("out",inherits=FALSE)) {
+    # nocov start
+    catf("Output captured before unexpected warning/error/message:\n")
+    writeLines(out)
+    # nocov end
+  }
   if (!fail && !length(error) && (!length(output) || !missing(y))) {   # TODO test y when output=, too
     capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it
     if (inherits(x, c("Date", "POSIXct"))) storage.mode(x) <- "numeric"
diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw
index ccd2ffdc88..e1dba45590 100644
--- a/inst/tests/other.Rraw
+++ b/inst/tests/other.Rraw
@@ -130,9 +130,10 @@ if (loaded[["xts"]]) {
 if (loaded[["gdata"]]) {
   if (!loaded[["xts"]]) warning("The gdata test expects xts loaded as well since all 3 have a last() function.")
   x = list("a",1:2,89)
-  test(6.1, xts::last(x), list(89))   # would prefer 89 here like data.table does, since "last" means the last one (never more than one) so why retain the one-item list() level?
-  test(6.2, gdata::last(x), list(89))
-  test(6.3, data.table::last(x), 89)
+  test(6.1, xts::last(x), list(89))
+  test(6.2, gdata::last(x), list(89))    
+  test(6.3, data.table::last(x), list(89))  # from 1.14.4 which supports n>1 more, last() retains list() so that a consistent type is return between n=1 and n>1.
+                                            # or we could divert users to head and tail and keep first() and last() as originally intended for n=1
   DT = data.table(a=7:9)
   test(6.4, setDT(xts::last(DT)), data.table(a=9L))        # xts 0.11-0 changes from 9L to data.table(a=9L), setDT added due to "Test 6.4 ran without errors but selfrefok(x) is FALSE", see #3047
   test(6.5, data.table::last(DT), DT[3L])
@@ -531,20 +532,20 @@ if (loaded[["xts"]]) {  # was 1465 in tests.Rraw, #5516
 # stopifnot("xts"%in%installed.packages(), !isNamespaceLoaded("xts")); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!isNamespaceLoaded("xts"))
 x = as.POSIXct("2019-09-09")+0:1
 old = options(datatable.verbose=TRUE)
-test(19.01, last(x), x[length(x)], output="!is.xts(x)")
-test(19.02, first(x), x[1L], output="!is.xts(x)")
+test(19.01, last(x), x[length(x)], notOutput="xts")
+test(19.02, first(x), x[1L], notOutput="xts")
 if (loaded[["xts"]]) {
   xt = xts(1:2, x)
-  test(19.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)")
-  test(19.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)")
+  test(19.03, last(xt, 2L), xt, output="using xts::last")
+  test(19.04, first(xt, 2L), xt, output="using xts::first")
   xt = xts(matrix(1:4, 2L, 2L), x)
-  test(19.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)")
-  test(19.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)")
+  test(19.05, last(xt, 2L), xt, output="using xts::last")
+  test(19.06, first(xt, 2L), xt, output="using xts::first")
 }
 # first on empty df now match head(df, n=1L), #3858
 df = data.frame(a=integer(), b=integer())
-test(19.11, first(df), df, output="!is.xts(x)")
-test(19.12, last(df), df, output="!is.xts(x)")
+test(19.11, first(df), df, notOutput="xts")
+test(19.12, last(df), df, notOutput="xts")
 options(datatable.verbose=FALSE)  # so the as.data.table() doesn't pollute output
 # xts last-first dispatch fix #4053
 x = 1:3
@@ -563,10 +564,10 @@ xt = structure(
 )
 options(datatable.verbose=TRUE)
 if (loaded[["xts"]]) {
-  test(19.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-  test(19.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-  test(19.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-  test(19.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
+  test(19.21, last(x, n=2L), 2:3, notOutput="xts")
+  test(19.22, last(y, n=2L), y[2:3], notOutput="xts")
+  test(19.23, last(x, n=1L), 3L, notOutput="xts")
+  test(19.24, last(y, n=1L), y[3L], notOutput="xts")
   xt_last = structure(
     c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008),
     class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
@@ -580,12 +581,12 @@ if (loaded[["xts"]]) {
     index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"),
     .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted"))
   )
-  test(19.25, last(xt), xt_last, output="using xts::last: is.xts(x)")
-  test(19.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)")
-  test(19.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-  test(19.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-  test(19.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-  test(19.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
+  test(19.25, last(xt), xt_last, output="using xts::last")
+  test(19.26, last(xt, n=2L), xt_last2, output="using xts::last")
+  test(19.31, first(x, n=2L), 1:2, notOutput="xts")
+  test(19.32, first(y, n=2L), y[1:2], notOutput="xts")
+  test(19.33, first(x, n=1L), 1L, notOutput="xts")
+  test(19.34, first(y, n=1L), y[1L], notOutput="xts")
   xt_first = structure(
     c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954),
     class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
@@ -601,34 +602,25 @@ if (loaded[["xts"]]) {
   test(19.35, first(xt), xt_first, output="using xts::first: is.xts(x)")
   test(19.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)")
 } else {
-  test(19.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.24, last(y, n=1L), y[3L], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.25, last(xt), error="you should have 'xts' installed already")
-  test(19.26, last(xt, n=2L), error="you should have 'xts' installed already")
-  test(19.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-  test(19.35, first(xt), error="you should have 'xts' installed already")
-  test(19.36, first(xt, n=2L), error="you should have 'xts' installed already")
+  test(19.35, first(xt), xt_first, output="using xts::first")
+  test(19.36, first(xt, n=2L), xt_first2, output="using xts::first")
 }
-test(19.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
-test(19.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
-test(19.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
-test(19.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
-test(19.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
-test(19.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
-test(19.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
-test(19.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
-# matrix/array utils::tail behavior is likely to change in future R, Michael is more in the topic
-test(19.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127
-test(19.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-test(19.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127
-test(19.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
+test(19.41, last(x), 3L)
+test(19.42, last(y), y[3L])
+test(19.51, first(x), 1L)
+test(19.52, first(y), y[1L])
+test(19.610, last(df), data.frame(a=2L, b=2L, row.names=2L))
+test(19.611, last(as.data.table(df), na.rm=TRUE), data.table(a=2L,b=2L))
+test(19.62, last(dt), data.table(a=2L, b=2L))
+test(19.710, first(df), data.frame(a=1L, b=3L))
+test(19.711, first(as.data.table(df), na.rm=TRUE), data.table(a=1L, b=3L))
+test(19.72, first(dt), data.table(a=1L, b=3L))
+test(19.81, last(mx), matrix(c(3L,6L,9L), nrow=1, dimnames=list("[3,]", NULL)))  # same dimnames as utils::tail returns 
+test(19.82, last(ar), if (base::getRversion()<"4.0.0") 27L  # head/tail changed for arrays in R 4.0.0, #4127
+                      else array(INT(3,6,9,12,15,18,21,24,27), dim=c(1,3,3), dimnames=list("[3,]",NULL,NULL)))
+test(19.91, first(mx), matrix(c(1L,4L,7L), ncol=3L))
+test(19.92, first(ar), if (base::getRversion()<"4.0.0") 1L
+                       else array(INT(1,4,7,10,13,16,19,22,25), dim=c(1,3,3)))
 options(old)
 
 if (loaded[["xts"]]) {  # was 2133 in tests.Rraw, #5516
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index cf2564e558..74d2fc327a 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -6733,68 +6733,68 @@ test(1462.3, DT[, sum(unlist(mget(cols, as.environment(-1)))), by=x], DT[, sum(u
 x=1:5
 y=factor(x)
 test(1463.01, shift(x,1L),                   as.integer(c(NA, 1:4)))
-test(1463.02, shift(x,1:2),                  list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3))))
+test(1463.02, shift(x,1:2),                  data.table(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3))))
 test(1463.03, shift(x,1L, 0L),               as.integer(c(0L, 1:4)))
 test(1463.04, shift(x,1L,    type="lead"),   as.integer(c(2:5, NA)))
-test(1463.05, shift(x,1:2,   type="lead"),   list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA))))
+test(1463.05, shift(x,1:2,   type="lead"),   data.table(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA))))
 test(1463.06, shift(x,1L, 0L,type="lead"),   as.integer(c(2:5, 0L)))
 test(1463.07, shift(y,1L),                   factor(c(NA,1:4), levels=1:5))
 test(1463.08, shift(y,1L,    type="lead"),   factor(c(2:5, NA), levels=1:5))
 test(1463.09, shift(x,1L,    type="cyclic"), as.integer(c(5, 1:4)))
-test(1463.10, shift(x,1:2,   type="cyclic"), list(as.integer(c(5, 1:4)), as.integer(c(4:5, 1:3))))
+test(1463.10, shift(x,1:2,   type="cyclic"), data.table(as.integer(c(5, 1:4)), as.integer(c(4:5, 1:3))))
 test(1463.11, shift(x,-1L,   type="cyclic"), as.integer(c(2:5, 1)))
-test(1463.12, shift(x,-(1:2),type="cyclic"), list(as.integer(c(2:5, 1)), as.integer(c(3:5,1:2))))
+test(1463.12, shift(x,-(1:2),type="cyclic"), data.table(as.integer(c(2:5, 1)), as.integer(c(3:5,1:2))))
 
 x=as.numeric(x)
 test(1463.13, shift(x,1L),                   as.numeric(c(NA, 1:4)))
-test(1463.14, shift(x,1:2),                  list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3))))
+test(1463.14, shift(x,1:2),                  data.table(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3))))
 test(1463.15, shift(x,1L, 0L),               as.numeric(c(0L, 1:4)))
 test(1463.16, shift(x,1L,    type="lead"),   as.numeric(c(2:5, NA)))
-test(1463.17, shift(x,1:2,   type="lead"),   list(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA))))
+test(1463.17, shift(x,1:2,   type="lead"),   data.table(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA))))
 test(1463.18, shift(x,1L, 0L,type="lead"),   as.numeric(c(2:5, 0L)))
 test(1463.19, shift(x,1L,    type="cyclic"), as.numeric(c(5, 1:4)))
-test(1463.20, shift(x,1:2,   type="cyclic"), list(as.numeric(c(5, 1:4)), as.numeric(c(4:5, 1:3))))
+test(1463.20, shift(x,1:2,   type="cyclic"), data.table(as.numeric(c(5, 1:4)), as.numeric(c(4:5, 1:3))))
 test(1463.21, shift(x,-1L,   type="cyclic"), as.numeric(c(2:5, 1)))
-test(1463.22, shift(x,-(1:2),type="cyclic"), list(as.numeric(c(2:5, 1)), as.numeric(c(3:5,1:2))))
+test(1463.22, shift(x,-(1:2),type="cyclic"), data.table(as.numeric(c(2:5, 1)), as.numeric(c(3:5,1:2))))
 
 
 if (test_bit64) {
   x=as.integer64(x)
   test(1463.23, shift(x,1L),                    as.integer64(c(NA, 1:4)))
-  test(1463.24, shift(x,1:2),                   list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3))))
+  test(1463.24, shift(x,1:2),                   data.table(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3))))
   test(1463.25, shift(x,1L, 0L),                as.integer64(c(0L, 1:4)))
   test(1463.26, shift(x,1L,     type="lead"),   as.integer64(c(2:5, NA)))
-  test(1463.27, shift(x,1:2,    type="lead"),   list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA))))
+  test(1463.27, shift(x,1:2,    type="lead"),   data.table(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA))))
   test(1463.28, shift(x,1L, 0L, type="lead"),   as.integer64(c(2:5, 0L)))
   test(1463.29, shift(x,1L,     type="cyclic"), as.integer64(c(5, 1:4)))
-  test(1463.30, shift(x,1:2,    type="cyclic"), list(as.integer64(c(5, 1:4)), as.integer64(c(4:5, 1:3))))
+  test(1463.30, shift(x,1:2,    type="cyclic"), data.table(as.integer64(c(5, 1:4)), as.integer64(c(4:5, 1:3))))
   test(1463.31, shift(x,-1L,    type="cyclic"), as.integer64(c(2:5, 1)))
-  test(1463.32, shift(x,-(1:2), type="cyclic"), list(as.integer64(c(2:5, 1)), as.integer64(c(3:5,1:2))))
+  test(1463.32, shift(x,-(1:2), type="cyclic"), data.table(as.integer64(c(2:5, 1)), as.integer64(c(3:5,1:2))))
 }
 
 x=as.character(x)
 test(1463.33, shift(x,1L),                    as.character(c(NA, 1:4)))
-test(1463.34, shift(x,1:2),                   list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3))))
+test(1463.34, shift(x,1:2),                   data.table(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3))))
 test(1463.35, shift(x,1L, 0L),                as.character(c(0L, 1:4)))
 test(1463.36, shift(x,1L,     type="lead"),   as.character(c(2:5, NA)))
-test(1463.37, shift(x,1:2,    type="lead"),   list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA))))
+test(1463.37, shift(x,1:2,    type="lead"),   data.table(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA))))
 test(1463.38, shift(x,1L, 0L, type="lead"),   as.character(c(2:5, 0L)))
 test(1463.39, shift(x,1L,     type="cyclic"), as.character(c(5, 1:4)))
-test(1463.40, shift(x,1:2,    type="cyclic"), list(as.character(c(5, 1:4)), as.character(c(4:5, 1:3))))
+test(1463.40, shift(x,1:2,    type="cyclic"), data.table(as.character(c(5, 1:4)), as.character(c(4:5, 1:3))))
 test(1463.41, shift(x,-1L,    type="cyclic"), as.character(c(2:5, 1)))
-test(1463.42, shift(x,-(1:2), type="cyclic"), list(as.character(c(2:5, 1)), as.character(c(3:5,1:2))))
+test(1463.42, shift(x,-(1:2), type="cyclic"), data.table(as.character(c(2:5, 1)), as.character(c(3:5,1:2))))
 
 x=c(TRUE,FALSE,TRUE,FALSE,TRUE)
 test(1463.43, shift(x,1L),                    c(NA, x[-5L]))
-test(1463.44, shift(x,1:2),                   list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)])))
+test(1463.44, shift(x,1:2),                   data.table(c(NA, x[-5L]), c(NA, NA, x[-(4:5)])))
 test(1463.45, shift(x,1L, 0L),                c(FALSE, x[-5L]))
 test(1463.46, shift(x,1L,     type="lead"),   c(x[-1L], NA))
-test(1463.47, shift(x,1:2,    type="lead"),   list(c(x[-1L],NA), c(x[-(1:2)],NA,NA)))
+test(1463.47, shift(x,1:2,    type="lead"),   data.table(c(x[-1L],NA), c(x[-(1:2)],NA,NA)))
 test(1463.48, shift(x,1L, 0L, type="lead"),   c(x[-(1)], FALSE))
 test(1463.49, shift(x,1L,     type="cyclic"), c(x[5L], x[-5L]))
-test(1463.50, shift(x,1:2,    type="cyclic"), list(c(x[5L], x[-5L]), c(x[4L:5L], x[-4L:-5L])))
+test(1463.50, shift(x,1:2,    type="cyclic"), data.table(c(x[5L], x[-5L]), c(x[4L:5L], x[-4L:-5L])))
 test(1463.51, shift(x,-1L,    type="cyclic"), c(x[-1L], x[1L]))
-test(1463.52, shift(x,-(1:2), type="cyclic"), list(c(x[-1L], x[1L]), c(x[-1L:-2L], x[1L:2L])))
+test(1463.52, shift(x,-(1:2), type="cyclic"), data.table(c(x[-1L], x[1L]), c(x[-1L:-2L], x[1L:2L])))
 
 # for list of list, #1595
 x = data.table(foo = c(list(c("a","b","c")), list(c("b","c")), list(c("a","b")), list(c("a"))), id = c(1,1,2,2))
@@ -6820,7 +6820,7 @@ test(1463.60, shift(mean), error="type 'closure' passed to shift(). Must be a ve
 # test for 'give.names=TRUE' on vectors
 x = 1:10
 nm = c("x_lag_1", "x_lag_2")
-ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8)))
+ans = data.table(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8)))
 setattr(ans, 'names', nm)
 test(1463.61, shift(x, 1:2, give.names=TRUE), ans)
 
@@ -6923,7 +6923,7 @@ test(1469.6, key(DT[J(c(2,0)), roll="nearest"]), NULL)
 # 1007 fix, dealing with Inf and -Inf correctly in rolling joins.
 DT = data.table(x=c(-Inf, 3, Inf), y=1:3, key="x")
 test(1470.1, DT[J(c(2,-Inf,5,Inf)), roll=Inf], data.table(x=c(2,-Inf,5,Inf), y=c(1L, 1:3)))
-test(1470.2, DT[J(c(2,-Inf,5,Inf)), roll=10], data.table(x=c(2,-Inf,5,Inf), y=INT(c(NA, 1, 2, 3))))
+test(1470.2, DT[J(c(2,-Inf,5,Inf)), roll=10], data.table(x=c(2,-Inf,5,Inf), y=INT(NA, 1, 2, 3)))
 test(1470.3, DT[SJ(c(2,-Inf,5,Inf)), roll=Inf], data.table(x=c(-Inf,2,5,Inf), y=c(1L, 1:3), key="x"))
 
 # 1006, second bug with -Inf, now that #1007 is fixed.
@@ -6934,7 +6934,7 @@ dt.query <- data.table(q1=c(-0.2, -0.05, 0.05, 0.15), q2=c(-0.2, -0.05, 0.05, 0.
 test(1471, foverlaps(dt.query, dt.ref), data.table(dt.ref, dt.query, key=c("q1", "q2")))
 
 # #1014 (segfault) fix
-test(1472, shift(1, 1:2, NA, 'lag'), list(NA_real_, NA_real_))
+test(1472, shift(1, 1:2, NA, 'lag'), data.table(NA_real_, NA_real_))
 
 # #528, type=equal simple test
 dt1 = data.table(x=1:5, y=6:10)
@@ -8046,9 +8046,9 @@ test(1557.5, names(fread(str, col.names=1:2)), error="Passed a vector of type")
 
 # Fix for #773
 f = testDir("issue_773_fread.txt")
-ans = data.table(AAA=INT(c(4,7,rep(1,17),31,21)),
-                 BBB=INT(c(5,8,rep(2,17),32,22)),
-                 CCC=INT(c(6,9,rep(3,17),33,23)))
+ans = data.table(AAA=INT(4,7,rep(1,17),31,21),
+                 BBB=INT(5,8,rep(2,17),32,22),
+                 CCC=INT(6,9,rep(3,17),33,23))
 test(1558.1, fread(f), ans, warning=w<-"Stopped early on line 23. Expected 3 fields but found 2[.].*First discarded non-empty line: <<ZZZ.*YYY>>")
 test(1558.2, fread(f, nrows=21L), ans)
 test(1558.3, fread(f, nrows=21L, fill=TRUE), ans)
@@ -8917,9 +8917,9 @@ if (test_bit64) {
 # fix for #1571
 x = data.table(c(1,1,2,7,2,3,4,4,7), 1:9)
 y = data.table(c(2,3,4,4,4,5))
-test(1615.1, x[!y, on="V1", mult="first"], data.table(V1=c(1,7), V2=INT(c(1,4))))
-test(1615.2, x[!y, on="V1", mult="last"], data.table(V1=c(1,7), V2=INT(c(2,9))))
-test(1615.3, x[!y, on="V1", mult="all"], data.table(V1=c(1,1,7,7), V2=INT(c(1,2,4,9))))
+test(1615.1, x[!y, on="V1", mult="first"], data.table(V1=c(1,7), V2=INT(1,4)))
+test(1615.2, x[!y, on="V1", mult="last"], data.table(V1=c(1,7), V2=INT(2,9)))
+test(1615.3, x[!y, on="V1", mult="all"], data.table(V1=c(1,1,7,7), V2=INT(1,2,4,9)))
 
 # fix for #1287 and #1271
 set.seed(1L)
@@ -13805,13 +13805,13 @@ test(1963.03, shift(DT$x, -1, fill = 0L),
 test(1963.04, shift(DT$x, -1, give.names = TRUE), # give.names is ignored because we do not return list
      c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA))
 test(1963.05, shift(DT$x, -1:1),
-     list(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), 1:10,
-          c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)))
+     data.table(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), 1:10,
+                c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)))
 test(1963.06, shift(DT, -1),
-     list(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA),
-          c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA)))
+     data.table(x=c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA),
+                y=c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA)))
 test(1963.07, shift(DT, -1:1),
-     list(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), 1:10,
+     data.table(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), 1:10,
           c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L),
           c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA), 10:1,
           c(NA, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L)))
@@ -13820,7 +13820,7 @@ test(1963.08, shift(DT$x, type = 'some_other_type'), error='should be one of.*la
 test(1963.09, shift(as.raw(0:1)), as.raw(c(0,0)))
 test(1963.095, shift(list(expression(1))), error = "Type 'expression' is not supported")
 test(1963.10, shift(DT, -1:1, type="shift", give.names = TRUE), # new type="shift" #3223
-     ans <- list(`x_shift_-1` = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA),
+     ans <- data.table(`x_shift_-1` = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA),
           x_shift_0 = 1:10,
           x_shift_1 = c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L),
           `y_shift_-1` = c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA),
@@ -16029,8 +16029,8 @@ test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of co
 # shift naming
 test(2074.34, shift(list(a=1:5, b=6:10), give.names=TRUE), list(a_lag_1=c(NA, 1:4), b_lag_1=c(NA, 6:9)))
 test(2074.345, shift(list(a=1:5, b=6:10), type="cyclic", give.names=TRUE), list(a_cyclic_1=c(5L, 1:4), b_cyclic_1=c(10L, 6:9)))
-test(2074.35, shift(1:5, 1:2, give.names=TRUE), list(V1_lag_1=c(NA, 1:4), V1_lag_2=c(NA, NA, 1:3)))
-test(2074.355, shift(1:5, 1:2, type="cyclic", give.names=TRUE), list(V1_cyclic_1=c(5L, 1:4), V1_cyclic_2=c(4L:5L, 1:3)))
+test(2074.35, shift(1:5, 1:2, give.names=TRUE), data.table(V1_lag_1=c(NA, 1:4), V1_lag_2=c(NA, NA, 1:3)))
+test(2074.355, shift(1:5, 1:2, type="cyclic", give.names=TRUE), data.table(V1_cyclic_1=c(5L, 1:4), V1_cyclic_2=c(4L:5L, 1:3)))
 
 # bmerge.c
 x = data.table(a='a')
@@ -16329,7 +16329,7 @@ ID = as.IDate(INT(
   2113, 2305, 2497, 2689, 2882, 3074, 3266, 3458, 3650
 ))
 D = as.Date(ID)
-br = as.IDate(INT(c(0, 151, 243)))
+br = as.IDate(INT(0, 151, 243))
 test(2102.1, cut(ID, breaks = br), as.IDate(cut(D, breaks=br)))
 test(2102.2, cut(ID, breaks = '1 year'), as.IDate(cut(D, breaks = '1 year')))
 test(2102.3, cut(ID, breaks = '6 months'), as.IDate(cut(D, breaks = '6 months')))
@@ -18165,60 +18165,60 @@ test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)")
 # support := with GForce #1414
 options(datatable.optimize = 2L)
 DT = data.table(a=1:3,b=(1:9)/10)
-test(2233.01, DT[, v := min(b), a, verbose=TRUE],      data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to")
+test(2233.01, DT[, v := min(b), a, verbose=TRUE],   data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to")
 # GForce returning full length
-test(2233.02, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to")
+test(2233.02, DT[, v := shift(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=c(NA,NA,NA,(1:6)/10)), output="GForce optimized j to")
 # GForce neither returning 1 per group nor full length
-test(2233.03, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.")
-# compare to non GForce version
+test(2233.03, DT[, v := head(b, 2L), a], error="could be implemented")
+# ensure base:: and utils:: return the same result
 DT = data.table(a=1:3,b=(1:9)/10)
-test(2233.04, copy(DT)[, v := min(b), a,      verbose=TRUE], copy(DT)[, v := base::min(b), a,     ], output="GForce optimized j to")
-test(2233.05, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to")
+test(2233.04, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a,     ], output="GForce optimized j to")
+test(2233.05, copy(DT)[, v := b[1:3], a], copy(DT)[, v := utils::head(b, 3L), a])
 
 # with key and grouping by key
 DT = data.table(a=1:3,b=(1:9)/10, key="a")
-test(2233.06, DT[, v := min(b), a, verbose=TRUE],      data.table(a=1:3, b=(1:9)/10, v=(1:3)/10, key="a"), output="GForce optimized j to")
-test(2233.07, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10, key="a"), output="GForce optimized j to")
-test(2233.08, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.")
+test(2233.06, DT[, v := min(b), a, verbose=TRUE],   data.table(a=1:3, b=(1:9)/10, v=(1:3)/10, key="a"), output="GForce optimized j to")
+test(2233.07, DT[, v := shift(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=c(NA,NA,NA,(1:6)/10), key="a"), output="GForce optimized j to")
+test(2233.08, DT[, v := head(b, 2L), a], error="could be implemented")
 DT = data.table(a=1:3,b=(1:9)/10, key="a")
-test(2233.09, copy(DT)[, v := min(b), a,      verbose=TRUE], copy(DT)[, v := base::min(b), a,     ], output="GForce optimized j to")
-test(2233.10, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to")
+test(2233.09, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a,     ], output="GForce optimized j to")
+test(2233.10, copy(DT)[, v := b[1:3], a], copy(DT)[, v := utils::head(b, 3L), a])
 
 # with key and grouping by nonkey
 DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c")
-test(2233.11, DT[, v := min(b), a, verbose=TRUE],      data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="c"), output="GForce optimized j to")
-test(2233.12, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:9)/10, key="c"), output="GForce optimized j to")
-test(2233.13, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.")
+test(2233.11, DT[, v := min(b), a, verbose=TRUE],   data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="c"), output="GForce optimized j to")
+test(2233.12, DT[, v := shift(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=c(NA,NA,NA,1:6)/10, key="c"), output="GForce optimized j to")
+test(2233.13, DT[, v := head(b, 2L), a], error="could be implemented")
 DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c")
-test(2233.14, copy(DT)[, v := min(b), a,      verbose=TRUE], copy(DT)[, v := base::min(b), a,     ], output="GForce optimized j to")
-test(2233.15, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to")
+test(2233.14, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a,     ], output="GForce optimized j to")
+test(2233.15, copy(DT)[, v := b[1:3], a], copy(DT)[, v := utils::head(b, 3L), a])
 
 # with key and keyby by nonkey
 DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c")
-test(2233.16, copy(DT)[, v := min(b),      keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="a"), output="GForce optimized j to")
-test(2233.17, copy(DT)[, v := head(b, 3L), keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:9)/10, key="a"), output="GForce optimized j to")
-test(2233.18, copy(DT)[, v := head(b, 2L), keyby=a], error="Supplied 6 items to be assigned to 9 items of column 'v'.")
+test(2233.16, copy(DT)[, v := min(b),   keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="a"), output="GForce optimized j to")
+test(2233.17, copy(DT)[, v := shift(b), keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=c(NA,NA,NA,1:6)/10, key="a"), output="GForce optimized j to")
+test(2233.18, copy(DT)[, v := head(b, 2L), keyby=a], error="could be implemented")
 DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c")
-test(2233.19, copy(DT)[, v := min(b),      keyby=a, verbose=TRUE], copy(DT)[, v := base::min(b),       keyby=a], output="GForce optimized j to")
-test(2233.20, copy(DT)[, v := head(b, 3L), keyby=a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), keyby=a], output="GForce optimized j to")
+test(2233.19, copy(DT)[, v := min(b), keyby=a, verbose=TRUE], copy(DT)[, v := base::min(b),       keyby=a], output="GForce optimized j to")
+test(2233.20, copy(DT)[, v := b[1:3], keyby=a], copy(DT)[, v := utils::head(b, 3L), keyby=a])
 # with irows
 DT = data.table(a=1:3,b=(1:9)/10)
-test(2233.21, DT[a==2, v := min(b), a, verbose=TRUE],      data.table(a=1:3, b=(1:9)/10, v=c(NA,0.2,NA)), output="GForce optimized j to")
-test(2233.22, DT[a!=4, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10),     output="GForce optimized j to")
-test(2233.23, DT[a!=4, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.")
+test(2233.21, DT[a==2, v := min(b), a, verbose=TRUE],   data.table(a=1:3, b=(1:9)/10, v=c(NA,0.2,NA)), output="GForce optimized j to")
+test(2233.22, DT[a!=4, v := shift(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=c(NA,NA,NA,1:6)/10), output="GForce optimized j to")
+test(2233.23, DT[a!=4, v := head(b, 2L), a], error="could be implemented")
 DT = data.table(a=1:3,b=(1:9)/10)
-test(2233.24, copy(DT)[a==2, v := min(b), a,      verbose=TRUE], copy(DT)[a==2, v := base::min(b), a,     ], output="GForce optimized j to")
-test(2233.25, copy(DT)[a!=4, v := head(b, 3L), a, verbose=TRUE], copy(DT)[a!=4, v := utils::head(b, 3L), a], output="GForce optimized j to")
+test(2233.24, copy(DT)[a==2, v := min(b), a, verbose=TRUE], copy(DT)[a==2, v := base::min(b), a,     ], output="GForce optimized j to")
+test(2233.25, copy(DT)[a!=4, v := b[1:3], a], copy(DT)[a!=4, v := utils::head(b, 3L), a])
 
 # multiple assignments
 DT = data.table(a=1:3,b=(1:9)/10)
-test(2233.26, DT[, c("v1","v2") := .(min(b), max(b)), a,         verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=(1:3)/10, v2=(7:9)/10), output="GForce optimized j to")
-test(2233.27, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=(1:9)/10, v2=(1:9)/10), output="GForce optimized j to")
-test(2233.28, DT[, c("v1","v2") := .(head(b,3L), tail(b,2L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v2'.")
-test(2233.29, DT[, c("v1","v2") := .(head(b,2L), tail(b,3L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v1'.")
-test(2233.30, DT[, c("v1","v2") := .(head(b,2L), tail(b,2L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v1'.")
-test(2233.31, DT[, c("v1","v2") := .(min(b), max(b)), a,         verbose=TRUE], DT[, c("v1","v2") := .(base::min(b), base::max(b)), a          ], output="GForce optimized j to")
-test(2233.32, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a, verbose=TRUE], DT[, c("v1","v2") := .(utils::head(b,3L), utils::tail(b,3L)), a], output="GForce optimized j to")
+test(2233.26, DT[, c("v1","v2") := .(min(b), max(b)), a, verbose=TRUE],        data.table(a=1:3, b=(1:9)/10, v1=(1:3)/10, v2=(7:9)/10), output="GForce optimized j to")
+test(2233.27, DT[, c("v1","v2") := .(shift(b), shift(b,2L)), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=c(NA,NA,NA,1:6)/10, v2=c(NA,NA,NA,NA,NA,NA,1:3)/10), output="GForce optimized j to")
+test(2233.28, DT[, c("v1","v2") := .(shift(b), head(b,3L)), a],                error="head.*:=.*could be implemented")
+test(2233.29, DT[, c("v1","v2") := .(min(b), shift(b)), a, verbose=TRUE],      data.table(a=1:3, b=(1:9)/10, v1=(1:3)/10, v2=c(NA,NA,NA,1:6)/10), output="GForce optimized j to")
+test(2233.30, DT[, c("v1","v2") := .(head(b,2L), tail(b,2L)), a],              error="head.*:=.*could be implemented")
+test(2233.31, DT[, c("v1","v2") := .(min(b), max(b)), a, verbose=TRUE],        DT[, c("v1","v2") := .(base::min(b), base::max(b)), a], output="GForce optimized j to")
+test(2233.32, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a],              error="head.*:=.*could be implemented")
 
 # gforce needs to evaluate variable arguments before calling C part (part of test 101.17 in programming.Rraw)
 set.seed(108)
@@ -21347,3 +21347,139 @@ local({
   test(2326.2, key(d), "x")
   test(2326.3, indices(d), c("y", "z"))
 })
+
+# add na.rm argument to first/last, #4446 #4239
+DT = data.table( grp = INT(1,1,     2,2,2,               3,          4,4),
+                 A =   INT(1,NA,    NA,2,NA,             NA,         3,3),
+                 B =     c(NA,1,    pi,3,NA,             3,          pi,NA),
+                 C =     c("a",NA,  NA,"b",NA,           "c",        "d",NA),
+                 D =     c(NA,1i,   2i,3i,NA,            NA,         4i,NA),
+                 E =  list(NA,NA,   NA,c("a","b"),NULL,  list(1:2),  3:4,NA))
+# to encourage adding cases to DT above (some NAs, all NAs, NAs beginning and/or end within group, types, etc) we
+# define helper functions FNNA and LNNA here to construct the correct result to test against. Explicitly coding
+# all the results instead may reveal mistakes and be good but then might be inflexible to adding more cases.
+FNNA = function(x) {  # First Not NA
+  nna = which(if (is.list(x)) !is.na(x) & !sapply(x,is.null) else !is.na(x))
+  if (length(nna)) x[nna[1L]]          else x[0L]
+}
+LNNA = function(x) {  # Last Not NA
+  nna = which(if (is.list(x)) !is.na(x) & !sapply(x,is.null) else !is.na(x))
+  if (length(nna)) x[nna[length(nna)]] else x[0L]
+}
+for (opt in c(0, Inf)) {
+  num_bump = .1
+  options(datatable.optimize=opt)
+  out = if (opt) "GForce optimized" else "GForce FALSE"
+  num_bump_minor = .0001
+  for (col in setdiff(names(DT), "grp")) {
+    # test vector input to first/last outside of DT[] query
+    test(2327+num_bump+num_bump_minor+.00001, first(DT[[col]], na.rm=TRUE), ans<-FNNA(DT[[col]]))
+    test(2327+num_bump+num_bump_minor+.00002, first(DT[[col]], na.rm="row"), ans)   # "row" same as TRUE for vectors
+    test(2327+num_bump+num_bump_minor+.00003, last(DT[[col]], na.rm=TRUE), ans<-LNNA(DT[[col]]))
+    test(2327+num_bump+num_bump_minor+.00004, last(DT[[col]], na.rm="row"), ans)
+    
+    # one function by group with and without .() wrapper
+    test(2327+num_bump+num_bump_minor+.00005,
+         EVAL("DT[, .(first(",col,",na.rm=TRUE)), by=grp, verbose=TRUE]"),
+         ans<-EVAL("DT[, .(FNNA(",col,")),        by=grp]"), output=out)
+    if (col!="E")  # subsets of list columns need to be done within .() otherwise j's result looks like multiple columns
+      test(2327+num_bump+num_bump_minor+.00006,
+           EVAL("DT[, first(",col,",na.rm=TRUE),  by=grp, verbose=TRUE]"),
+           ans,                                              output=out)
+    test(2327+num_bump+num_bump_minor+.00007,
+         EVAL("DT[, .(last(",col,",na.rm=TRUE)),  by=grp, verbose=TRUE]"),
+         ans<-EVAL("DT[, .(LNNA(",col,")),        by=grp]"), output=out)
+    if (col!="E")  # see comment above why !="E"
+      test(2327+num_bump+num_bump_minor+.00008,
+           EVAL("DT[, last(",col,",na.rm=TRUE),   by=grp, verbose=TRUE]"),
+           ans,                                              output=out)
+    num_bump_minor = num_bump_minor + .0001
+  }
+
+  test(2327+num_bump+.001, first(DT), DT[1,])
+  test(2327+num_bump+.002, first(DT, na.rm=TRUE), data.table(grp=1L, A=1L, B=1, C="a", D=1i, E=list(c("a","b"))))  # first non-NA in each column
+  test(2327+num_bump+.003, first(DT, na.rm="row"), DT[4])                                                   # first row with no NA
+  test(2327+num_bump+.004, last(DT), DT[.N,])
+  test(2327+num_bump+.005, last(DT, na.rm=TRUE),  data.table(grp=4L, A=3L, B=pi, C="d", D=4i, E=list(3:4)))
+  test(2327+num_bump+.006, last(DT, na.rm="row"), DT[7])
+  test(2327+num_bump+.007, first(DT[0]), DT[0])
+  test(2327+num_bump+.008, last(DT[0]), DT[0])
+  DF = as.data.frame(DT)
+  test(2327+num_bump+.011, first(DF), DF[1,])
+  test(2327+num_bump+.012, first(DF, na.rm=TRUE), {x=data.frame(grp=1L, A=1L, B=1, C="a", D=1i); x$E=list(c("a","b")); x})  # with E inside data.frame() it recycles to 2 rows
+  test(2327+num_bump+.013, first(DF, na.rm="row"), DF[4,])
+  test(2327+num_bump+.014, last(DF), DF[8,])
+  test(2327+num_bump+.015, last(DF, na.rm=TRUE),  {x=data.frame(grp=4L, A=3L, B=pi, C="d", D=4i); x$E=list(3:4); x})
+  test(2327+num_bump+.016, last(DF, na.rm="row"), DF[7,])
+  test(2327+num_bump+.017, first(list(), na.rm=TRUE), list())
+  test(2327+num_bump+.018, last(list(), na.rm=TRUE), list())
+  test(2327+num_bump+.021,
+       DT[, first(.SD, na.rm=TRUE), by=grp, verbose=TRUE],
+       ans<-data.table(grp=1:4, A=c(1L,2L,NA,3L), B=c(1,pi,3,pi), C=c("a","b","c","d"), D=c(1i,2i,NA,4i), E=list(NA,c("a","b"),list(1:2),3:4)),
+       output=out)
+  test(2327+num_bump+.022, DT[, lapply(.SD, first, na.rm=TRUE), by=grp, verbose=TRUE], ans, output=out)
+  test(2327+num_bump+.023,
+       DT[, first(.SD, na.rm='row'), by=grp, verbose=TRUE],
+       data.table(grp=c(2L,4L), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
+       notOutput="GForce optimized")   # TODO: could try to implement gforce optimized na.rm='row' in future
+  test(2327+num_bump+.024,
+       DT[, last(.SD, na.rm=TRUE), by=grp, verbose=TRUE],
+       ans<-data.table(grp=1:4, A=c(1L,2L,NA,3L), B=c(1,3,3,pi), C=c("a","b","c","d"), D=c(1i,3i,NA,4i), E=list(NA,c("a","b"),list(1:2),3:4)),
+       output=out)
+  test(2327+num_bump+.025, DT[, lapply(.SD, last, na.rm=TRUE), by=grp, verbose=TRUE], ans, output=out)
+  test(2327+num_bump+.026,
+       DT[, last(.SD, na.rm='row'), by=grp, verbose=TRUE],
+       data.table(grp=c(2L,4L), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
+       output="GForce FALSE")
+  test(2327+num_bump+.027,
+       DT[, .(last(A,na.rm=TRUE), first(B, na.rm=TRUE), last(C)), by=grp, verbose=TRUE],
+       data.table(grp=1:4, V1=c(1L, 2L, NA, 3L), V2=c(1,pi,3,pi), V3=c(NA,NA,"c",NA)),
+       output=out)
+  test(2327+num_bump+.031, DT[, last(D,na.rm=TRUE,n=2)], c(3i,4i))
+  test(2327+num_bump+.032, DT[, last(B,na.rm=TRUE,n=2), by=grp, verbose=TRUE], data.table(grp=INT(1,2,2,3,4), V1=c(1,pi,3,3,pi)), output=out)
+  test(2327+num_bump+.033, DT[, first(D,na.rm=TRUE,n=2)], c(1i,2i))
+  test(2327+num_bump+.034, DT[, first(B,na.rm=TRUE,n=2), by=grp, verbose=TRUE], data.table(grp=INT(1,2,2,3,4), V1=c(1,pi,3,3,pi)), output=out)
+  test(2327+num_bump+.035, DT[, last(D,na.rm=TRUE,n=1), by=grp, verbose=TRUE], data.table(grp=INT(1,2,4), V1=c(1i,3i,4i)), output=out)
+  test(2327+num_bump+.041,
+       DT[, last(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE],
+       data.table(grp=INT(1,2,2,3,4,4), A=INT(1,2,NA,NA,3,3), B=c(1,pi,3,3,pi,NA), C=c("a","b",NA,"c","d",NA), D=c(1i,2i,3i,NA,4i,NA), E=list(NA,c("a","b"),NA,list(1:2),3:4,NA)),
+       output=out)
+  test(2327+num_bump+.042,
+       DT[, last(.SD, na.rm="row", n=2), by=grp, verbose=TRUE],
+       data.table(grp=INT(2,4), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
+       output="GForce FALSE")
+  test(2327+num_bump+.043,
+       DT[, first(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE],
+       data.table(grp=INT(1,2,2,3,4,4), A=INT(1,2,NA,NA,3,3), B=c(1,pi,3,3,pi,NA), C=c("a","b",NA,"c","d",NA), D=c(1i,2i,3i,NA,4i,NA), E=list(NA,c("a","b"),NA,list(1:2),3:4,NA)),
+       output=out)
+  test(2327+num_bump+.044,
+       DT[, first(.SD, na.rm="row", n=2), by=grp, verbose=TRUE],
+       data.table(grp=INT(2,4), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
+       output="GForce FALSE")
+  test(2327+num_bump+.051, DT[, first(.SD, na.rm=TRUE, n=0), by=grp, verbose=TRUE], DT[0], output="GForce FALSE")
+  test(2327+num_bump+.052, DT[, last(.SD, na.rm=TRUE, n=0),  by=grp, verbose=TRUE], DT[0], output="GForce FALSE")
+  test(2327+num_bump+.053, DT[, head(.SD, n=0),              by=grp, verbose=TRUE], DT[0], output="GForce FALSE")
+  test(2327+num_bump+.054, DT[, tail(.SD, na.rm=TRUE, n=0),  by=grp, verbose=TRUE], DT[0], output="GForce FALSE")
+  test(2327+num_bump+.061, first(DT$A, n=-1), error="n.*is not TRUE")
+  test(2327+num_bump+.062, DT[,first(A, n=-1)], error="n.*is not TRUE")
+  test(2327+num_bump+.063, DT[,first(A, n=-1),by=grp,verbose=TRUE], error="n.*is not TRUE", output="GForce FALSE")
+  test(2327+num_bump+.064, last(DT$A, n=-1), error="n.*is not TRUE")
+  test(2327+num_bump+.065, DT[,last(A, n=-1)], error="n.*is not TRUE")
+  test(2327+num_bump+.066, DT[,last(A, n=-1),by=grp,verbose=TRUE], error="n.*is not TRUE", output="GForce FALSE")
+  test(2327+num_bump+.067, first(matrix(1:12,nrow=3), na.rm=TRUE), error="na.rm.*not currently supported for.*matrix")
+  test(2327+num_bump+.068, first(matrix(1:12,nrow=3), na.rm='row'), error="na.rm.*not currently supported for.*matrix")
+  test(2327+num_bump+.069, first(matrix(1:12,nrow=3)), matrix(INT(1,4,7,10),nrow=1))
+  test(2327+num_bump+.071, first(DT$A, na.rm=NA), error="na.rm")
+  test(2327+num_bump+.072, DT[,first(A, na.rm=NA),by=grp,verbose=TRUE], error="na.rm", output=out)
+  test(2327+num_bump+.073, last(DT$A, na.rm=NA), error="na.rm")
+  test(2327+num_bump+.074, DT[,last(A, na.rm=NA),by=grp,verbose=TRUE], error="na.rm", output=out)
+  
+  # aligning two gforce dynamic columns the same between optimized and unoptimized
+  # needs to be top aligned otherwise dogroups.c would need knowledge of whether first or last was called (last was aligned at the bottom in the PR before
+  # this test added). dogroups.c now just needs to know whether the vector is a true vector to know not to recycle length-1 and to pad with NA
+  test(2327+num_bump+.081,
+       DT[, .(first(B, n=2, na.rm=TRUE), last(C, na.rm=TRUE)), by=grp, verbose=TRUE],
+       data.table(grp=INT(1,2,2,3,4), V1=c(1,pi,3,3,pi), V2=c("a","b",NA,"c","d")),
+       output=out)
+  num_nump = num_bump + .1
+}
diff --git a/man/last.Rd b/man/last.Rd
index ce28145286..4ad2396093 100644
--- a/man/last.Rd
+++ b/man/last.Rd
@@ -3,18 +3,17 @@
 \alias{last}
 \title{ First/last item of an object }
 \description{
-Returns the first/last item of a vector or list, or the first/last row of a data.frame
-or data.table. The main difference to head/tail is that the default for \code{n} is 1
-rather than 6.
+Returns the first/last item of a vector, list, \code{data.frame} or \code{data.table}. The main difference
+to head/tail is that the default for \code{n} is 1 rather than 6, and support for skipping missing entries.
 }
 \usage{
-first(x, n=1L, \dots)
-last(x, n=1L, \dots)
+first(x, n=1L, na.rm=FALSE, \dots)
+last(x, n=1L, na.rm=FALSE, \dots)
 }
 \arguments{
-\item{x}{ A vector, list, data.frame or data.table. Otherwise the S3 method
-of \code{xts::first} is deployed. }
+\item{x}{ A vector, list, \code{data.frame} or \code{data.table}. \code{xts} objects dispatch to the \code{xts} method. }
 \item{n}{ A numeric vector length 1. How many items to select. }
+\item{na.rm}{ \code{FALSE}, \code{TRUE} or \code{"row"}. Should missing values be skipped when selecting values? See examples. }
 \item{\dots}{ Not applicable for \code{data.table} first/last. Any arguments here
 are passed through to \code{xts}'s first/last. }
 }
@@ -22,20 +21,37 @@ are passed through to \code{xts}'s first/last. }
 For zero-length vectors, \code{first(x)} and \code{last(x)} mimic \code{head(x, 1)} and \code{tail(x, 1)} by returning an empty vector instead of \code{NA}. However, unlike \code{head()}/\code{tail()} and base R subsetting (e.g., \code{x[1]}), they do not preserve attributes like names.
 }
 \value{
-If no other arguments are supplied it depends on the type of \code{x}. The first/last item
-of a vector or list. The first/last row of a \code{data.frame} or \code{data.table}.
-For other types, or if any argument is supplied in addition to \code{x} (such as \code{n}, or
-\code{keep} in \code{xts}) regardless of \code{x}'s type, then \code{xts::first}/
-\code{xts::last} is called if \code{xts} has been loaded, otherwise \code{utils::head}/\code{utils::tail}.
+The first/last \code{n} items of a vector or list.
+When \code{na.rm=TRUE} the first/last item in each column of a \code{data.frame} or \code{data.table}.
+When \code{na.rm="row"} the first/last \code{n} rows which contain no NA in any column; i.e. if an NA is found that row is removed.
+}
+\details{
+It may be natural to expect \code{na.rm="col"} to be supported to compliment \code{na.rm="row"}. But whereas \code{na.rm="row"}
+unambiguously conveys intent to remove rows, \code{na.rm="col"} may convey intent to remove columns where in fact NA are removed
+within each column and the column retained. Your feedback is sought on this point.
 }
 \seealso{ \code{\link{NROW}}, \code{\link{head}}, \code{\link{tail}} }
 \examples{
-first(1:5) # [1] 1
+first(1:5)
 x = data.table(x=1:5, y=6:10)
 first(x) # same as head(x, 1)
 
 last(1:5) # [1] 5
 x = data.table(x=1:5, y=6:10)
 last(x) # same as tail(x, 1)
+
+x = c(NA, 1, 2, NA)
+first(x)
+first(x, na.rm=TRUE)
+last(x)
+last(x, na.rm=TRUE)
+
+x = data.table(x=c(NA, 1, 2, NA), y=1:4)
+first(x)
+first(x, na.rm=TRUE)
+first(x, na.rm="row")
+last(x)
+last(x, na.rm=TRUE)
+last(x, na.rm="row")
 }
 \keyword{ data }
diff --git a/src/assign.c b/src/assign.c
index 6ba175cc83..24d3f09641 100644
--- a/src/assign.c
+++ b/src/assign.c
@@ -251,7 +251,7 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose)
   names = getAttrib(dt,R_NamesSymbol);
   // names may be NULL when null.data.table() passes list() to alloccol for example.
   // So, careful to use length() on names, not LENGTH().
-  if (length(names)!=l) internal_error(__func__, "length of names (%d) is not length of dt (%d)", length(names),l); // # nocov
+  if (length(names) != l && length(names) > 0) internal_error(__func__, "length of names (%d) is not length of dt (%d)", length(names), l); // # nocov
   if (!selfrefok(dt,verbose))
     return shallow(dt,R_NilValue,(n>l) ? n : l);  // e.g. test 848 and 851 in R > 3.0.2
     // added (n>l) ? ... for #970, see test 1481.
@@ -316,6 +316,12 @@ SEXP truelength(SEXP x) {
   return ScalarInteger(isNull(x) ? 0 : TRUELENGTH(x));
 }
 
+SEXP settruelength(SEXP x, SEXP n) {
+  // currently just for first/last and dogroups.c; see comments at the end of last.R
+  SET_TRUELENGTH(x, INTEGER(n)[0]);
+  return R_NilValue;
+}
+
 SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
   return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0]));
 }
diff --git a/src/data.table.h b/src/data.table.h
index 2bb26111cb..2e9e190125 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -112,6 +112,7 @@ extern SEXP sym_index;
 extern SEXP sym_BY;
 extern SEXP sym_starts, char_starts;
 extern SEXP sym_maxgrpn;
+extern SEXP sym_gforce_dynamic;
 extern SEXP sym_anyna;
 extern SEXP sym_anyinfnan;
 extern SEXP sym_anynotascii;
@@ -278,6 +279,7 @@ bool isDataTable(SEXP x);
 bool isRectangularList(SEXP x);
 bool perhapsDataTable(SEXP x);
 SEXP perhapsDataTableR(SEXP x);
+SEXP setDT(SEXP list);
 NORET void internal_error(const char *call_name, const char *format, ...);
 
 // types.c
diff --git a/src/dogroups.c b/src/dogroups.c
index d4f774568b..e0b6d2368b 100644
--- a/src/dogroups.c
+++ b/src/dogroups.c
@@ -426,20 +426,26 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
           warning(_("Item %d of j's result for group %d is zero length. This will be filled with %d NAs to match the longest column in this result. Later groups may have a similar problem but only the first is reported to save filling the warning buffer."), j+1, i+1, maxn);
           NullWarnDone = TRUE;
         }
-        writeNA(target, thisansloc, maxn, false);
+        writeNA(target, thisansloc, maxn, true);
       } else {
         // thislen>0
         if (TYPEOF(source) != TYPEOF(target))
           error(_("Column %d of result for group %d is type '%s' but expecting type '%s'. Column types must be consistent for each group."), j+1, i+1, type2char(TYPEOF(source)), type2char(TYPEOF(target)));
-        if (thislen>1 && thislen!=maxn && grpn>0) {  // grpn>0 for grouping empty tables; test 1986
-          error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
-        }
         bool copied = false;
         if (isNewList(target) && anySpecialStatic(source)) {  // see comments in anySpecialStatic()
           source = PROTECT(copyAsPlain(source));
           copied = true;
         }
-        memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
+        if (TRUELENGTH(source)==LENGTH(source)) {
+          // first() and last() set truelength to mark that it is a true vector; see comments at the end of last.R and test 2240.81
+          // a true vector is not recycled when length-1 and is padded with NA to match the length of the longest result
+          memrecycle(target, R_NilValue, thisansloc, thislen, source, 0, -1, 0, "");  // just using memrecycle to copy contents
+          writeNA(target, thisansloc+thislen, maxn-thislen, true);                    // pad with NA
+        } else {
+          if (thislen>1 && thislen!=maxn && grpn>0)  // grpn>0 for grouping empty tables; test 1986
+            error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
+          memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
+        }
         if (copied) UNPROTECT(1);
       }
     }
diff --git a/src/gsumm.c b/src/gsumm.c
index 5970f59194..8a4e5a5104 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -21,6 +21,9 @@ static int *oo = NULL;
 static int *ff = NULL;
 static int isunsorted = 0;
 
+// for first/last with n>1 to error when used with :=, until implemented
+static bool assignByRef = false;
+
 // from R's src/cov.c (for variance / sd)
 #ifdef HAVE_LONG_DOUBLE
 # define SQRTL sqrtl
@@ -42,9 +45,11 @@ static int nbit(int n)
     grouped summaries over a large data.table. OpenMP is used here to
     parallelize operations involved in calculating common group-wise statistics.
 */
-SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
+SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg, SEXP grpcols, SEXP lhs) {
+  int nprotect=0;
   double started = wallclock();
   const bool verbose = GetVerbose();
+  assignByRef = !isNull(lhs);
   if (TYPEOF(env) != ENVSXP) error(_("env is not an environment"));
   // The type of jsub is pretty flexible in R, so leave checking to eval() below.
   if (!isInteger(o)) error(_("%s is not an integer vector"), "o");
@@ -205,16 +210,212 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   oo = INTEGER(o);
   ff = INTEGER(f);
 
-  SEXP ans = PROTECT( eval(jsub, env) );
+  SEXP gans = PROTECT( eval(jsub, env) );  nprotect++; // just the gforce result columns; the group columns are added below when we know the shape
   if (verbose) { Rprintf(_("gforce eval took %.3f\n"), wallclock()-started); started=wallclock(); }
   // if this eval() fails with R error, R will release grp for us. Which is why we use R_alloc above.
-  if (isVectorAtomic(ans)) {
-    SEXP tt = PROTECT(allocVector(VECSXP, 1));
-    SET_VECTOR_ELT(tt, 0, ans);
-    UNPROTECT(2);
-    return tt;
+  if (isVectorAtomic(gans)) {
+    SEXP tt = PROTECT(allocVector(VECSXP, 1)); nprotect++;
+    SET_VECTOR_ELT(tt, 0, gans);
+    gans = tt;
   }
-  UNPROTECT(1);
+  
+  // TODO: refine these comments
+  //
+  // Now replicate group values to match the number of rows in each group.
+  // In most cases (e.g. mean, sum) gfuns return one value per group and there is nothing left to do because
+  // each column in ans is ngrp long.
+  // gforce_fixed_over_1 and gforce_dynamic
+  // However, first/last with n>1 and na.rm=false results in MIN(grpsize[g], n) items per group. This is
+  // still referred to as gforce_dynamic (because it's not a fixed 1 per group) but the gfun doesn't need
+  // to return how many each group has (the lens att is null); all it needs return is the n it was passed.
+  // shift is the same as first/last with n>1 and na.rm=false; i.e., just returns n and lens can be empty.
+  // Currently just first/last with na.rm=true (regardless of n) returns the lens attribute because the
+  // number of items for each group does depend on the data in that case; e.g. it can be 0 when all NA. 
+  // Further, if the query consists of several calls to first/last, each call could have a
+  // different value of n and/or na.rm=TRUE which would result in the group results not being aligned
+  // across the columns of ans. Any gfun call which can result in not-1-value-per-group should attach
+  // sym_gforce_dynamic to tell us here how to align the groups across the columns in ans.
+  // For example, first/last with the default n=1 and na.rm=FALSE does not attach sym_gforce_dynamic
+  // because it it returns a length-ngrp result.
+  // If one of the ans columns contains a gforce_dynamic result, then the non-dynamic other columns
+  // are replicated to match.
+  // If there is only one column (very common) and it is gforce_dynamic then the gaps will now be removed
+  // since each group was allocated to accomodate the largest result per group: max(grpsize[], n).
+  
+  // If there are many columns (e.g. 10,000) and many small groups (e.g. size 1-5 rows) then we wish to
+  // avoid each dynamic gfun from allocating a new lens (ngrp long) if that can be avoided; e.g. shift(.SD)
+  // always returns .N rows for each group; that shape is known before calling shift. If lens was allocated
+  // and populated it would be the same as grpsize[], every time for each column.
+  
+  // 3 states :    i) no gforce_dynamic at all (including n=1 na.rm=false)
+  //              ii) 1 < n < maxgrpn meaning min(n, grp_size[g]) must be summed.  We might have less than ngrp when na.rm=TRUE due to all-NA groups
+  //             iii) n >= maxgrpn and na.rm=false meaning all groups are size grpsize[]; e.g. shift sets n=INT_MAX 
+  // If we have reached maxgrpn, then we can stop there; e.g. shifting a lot of small groups which is within scope to do a good job on
+  
+  // We can point lens to be the grpsize[] and find maximum n that clamps the grpsize
+  
+  // Having one final lens, though, with max_dynamic_n clamped, is desirable, to pass to the eval of rep.int for example. Just one allocation is dramatically better than on each of the 10,000 gans columns.
+  
+  const int ngans=length(gans);
+  SEXP lens=NULL;
+  int max_w=0;
+  bool lensCopied=false;
+  for (int i=0; i<ngans; ++i) {
+    SEXP tt;
+    if (!isNull(tt=getAttrib(VECTOR_ELT(gans, i), sym_gforce_dynamic))) {
+      if (isNull(VECTOR_ELT(tt, 0))) {
+        int this_w=INTEGER(VECTOR_ELT(tt, 2))[0];
+        if (this_w>max_w) max_w=this_w;
+      } else {
+        if (!lens) {
+          lens=VECTOR_ELT(tt, 0);  // find the first dynamic column's lens and use it directly without copying it if there are no other dynamic columns
+        } else {
+          if (!lensCopied) {
+            // upon the 2nd lens, we need to allocate a new lens to hold the max of the 2-or-more dynamic result with lens
+            // allocate a new lens to calc the max size of each group across the dynamic columns
+            // original gforce_dynamic attributes need to be retained so we can navigate those columns after we find the max
+            //int *newlens = (int *)R_alloc(ngrp, sizeof(int));
+            lens=PROTECT(duplicate(lens)); nprotect++;
+            lensCopied=true;
+          }
+          int *lensp=INTEGER(lens);
+          const int *ss=INTEGER(VECTOR_ELT(tt, 0));
+          for (int g=0; g<ngrp; ++g)
+            if (ss[g]>lensp[g]) lensp[g]=ss[g];
+        }
+      }
+    }
+  }
+  if (max_w) {
+    if (!lens) {
+      // construct a lens because we currently need it to pass to rep.int below
+      lens = PROTECT(allocVector(INTSXP, ngrp)); nprotect++;
+      int *lensp = INTEGER(lens);
+      for (int g=0; g<ngrp; ++g)
+        lensp[g]=MIN(grpsize[g], max_w);
+    } else {    
+      // there is a mixture of non-lens and lens; e.g. .(first(colA,n=3), first(colB,n=3,na.rm=TRUE))
+      if (!lensCopied) {
+        // there is a single dynamic column so it wasn't copied yet
+        lens=PROTECT(duplicate(lens)); nprotect++;
+        lensCopied=true;
+      }
+      int *lensp = INTEGER(lens);
+      for (int g=0; g<ngrp; ++g) {
+        int this_w=MIN(grpsize[g], max_w);
+        if (this_w>lensp[g]) lensp[g]=this_w;
+      }
+    }
+  }
+  // we have now maximized over any combination of over_1 and dynamic which determines the final shape in lens
+  // TODO if max_w>=grpsize_max then further savings can be made
+
+  // TODO: First we replicate group values to match the number of rows in each group if necessary
+  // TODO: DT[, .(first(A,n=2), last(B,n=3)), by=group] -- missing test
+  
+  // We could require that vector output functions have to be returned in a list column but that could be expensive and likely would be flattened afterwards by user anyway.
+  // If user really doesn't want flattening, then they can wrap with list().
+  
+  const int ngrpcol=length(grpcols);
+  SEXP ans = PROTECT(allocVector(VECSXP, ngrpcol+ngans)); nprotect++;
+  SEXP first_each_group = PROTECT( length(o) ? subsetVector(o, f) : f ); nprotect++;
+  
+  for (int i=0; i<ngrpcol; ++i) {
+    const SEXP in = VECTOR_ELT(grpcols, i);
+    SEXP out;
+    SET_VECTOR_ELT(ans, i, out=allocVector(TYPEOF(in), ngrp));
+    copyMostAttrib(in, out);
+    subsetVectorRaw(out, in, first_each_group, /*anyNA=*/false);
+    // avoiding subsetVector()'s check that idx is in bounds in case both ngrp and ngrpcol are large 
+    // this is rep.int'd below when group results have more than 1-row.
+    // TODO: for those cases avoid this subset first by enabling the rep.int to accept first_each_group
+  }
+  
+  if (!lens) {
+    // every group is length-1 (no dynamic or fixed_over_1 results) so can just use the gans as-is and we're done
+    for (int i=0; i<ngans; ++i) {
+      SET_VECTOR_ELT(ans, ngrpcol+i, VECTOR_ELT(gans, i));
+    }
+  } else {
+    // There is one or more gforce_dynamic columns, gap removal and/or padding within groups may be necessary
+    // and the non-gforce-dynamic columns (including group values) need to be replicated to align with the dynamic columns
+    // Either we create another column, or we budge up/down the padding within the same allocated column
+    // budging up/down many small groups will take a lot of reads and writes, albeit saving total memory. The
+    // extra memory is only one-at-a-time per column though, so choose speed over memory here.
+    // gforce_dynamic includes any result where the number of values per group may not be 1; e.g. shift() simply sets
+    // its lens to equal the grp lens to save an allocation
+    
+    int anslen=0;  // to be populated when no dynamic is just the length of the first ans column
+    const int *lensp = INTEGER(lens);
+    // if (lens==grpsize) {
+    //  if (max_dynamic_n <= 1) error("Internal error: max_dynamic_n<=1"); // is not considered dynamic and should not be marked as such
+    //  if (max_dynamic_n >= maxgrpn) anslen=nrow;                               // e.g. last(.SD, 2) where biggest group is 2 rows
+    //  else for (int g=0; g<ngrp; ++g) anslen+=MIN(grpsize[g], max_dynamic_n);  // e.g. last(.SD, 2) where biggest group is >2 rows, and there might be some 1-row groups
+    //} else {
+    
+    for (int g=0; g<ngrp; ++g) anslen+=lensp[g];  // contiguous sweep insignificant time not worth trying to save; e.g. by including the sum in the gforce_dynamic attribute returned for when there's just one column and it's dynamic
+    
+    for (int i=0; i<ngrpcol; ++i) {
+      SET_VECTOR_ELT(ans, i, eval(PROTECT(lang3(install("rep.int"), VECTOR_ELT(ans, i), lens)), R_GlobalEnv));
+      UNPROTECT(1);
+      // use R's rep.int for now, no C API for it afaik. R's rep.int will be summing lens here on each call which we can avoid (TODO).
+      // TODO: expand subsetVector to accept 'rep=' and 'len=' so that subset and rep.int can be done in one step avoiding intermediate subset of first_each_group, and pass len=anslen which we know already
+    }
+
+    for (int i=0; i<ngans; ++i) {
+      SEXP tt = VECTOR_ELT(gans, i);
+      SEXP att = getAttrib(tt, sym_gforce_dynamic);  // how long the items are, we won't parallelize within column so we can sweep forwards
+      if (isNull(att)) {
+        // e.g. the mean in .(mean(colA), first(colB, n=2));    TODO test
+        SET_VECTOR_ELT(ans, ngrpcol+i, eval(PROTECT(lang3(install("rep.int"), tt, lens)), R_GlobalEnv));
+        UNPROTECT(1);
+      } else {
+        // att can be i) empty lens vec in which case this ans col's shape is min(grpsize,w). If this matches the end result shape (i.e. all ans columns are the same, then no need to allocate copy)
+        //           ii) presence of lens vec always needs a copy and pad? (or could loop it and test if the same since looping through ngrp is much faster than nrow vector) 
+        const int *ss = isNull(VECTOR_ELT(att,0)) ? NULL : INTEGER(VECTOR_ELT(att,0));
+        const bool first = LOGICAL(VECTOR_ELT(att, 1))[0];
+        const int w = INTEGER(VECTOR_ELT(att,2))[0];
+        SEXP newcol = PROTECT(allocVector(TYPEOF(tt), anslen));
+        copyMostAttrib(tt, newcol);
+        setAttrib(newcol, sym_gforce_dynamic, R_NilValue);
+        int ansi=0, k=0;
+
+        #define DO(CTYPE, RTYPE, RNA, ASSIGN) {                                                        \
+        const CTYPE *xd = (const CTYPE *)RTYPE(tt);                                                    \
+        CTYPE *ansd = (CTYPE *)RTYPE(newcol);                                                          \
+        CTYPE val = RNA;                                                                               \
+        for (int g=0; g<ngrp; ++g) {                                                                   \
+          const int grp_allocated = MIN(grpsize[g], w);                                                \
+          const int thislen = ss ? ss[g] : grp_allocated;                                              \
+          const int targetlen = lensp[g];                                                              \
+          const int napad = targetlen-thislen;                                                         \
+          k += (!first)*(grp_allocated-thislen);                                                       \
+          for (int i=0; i<thislen; ++i) { val=xd[k++]; ASSIGN; }                                       \
+          val=RNA; for (int i=0; i<napad; ++i) ASSIGN;                                                 \
+          k += (first)*(grp_allocated-thislen);                                                        \
+        }                                                                                              \
+        ansd++; /* just to suppress unused-variable warning in STRSXP and VECSXP cases */              \
+        } break;
+
+        switch(TYPEOF(tt)) {
+        case RAWSXP:  DO(Rbyte,    RAW,     0,            ansd[ansi++]=val)
+        case LGLSXP:  DO(int,      LOGICAL, NA_LOGICAL,   ansd[ansi++]=val)
+        case INTSXP:  DO(int,      INTEGER, NA_INTEGER,   ansd[ansi++]=val)
+        case REALSXP: if (INHERITS(tt, char_integer64)) {
+                      DO(int64_t,  REAL,    NA_INTEGER64, ansd[ansi++]=val)
+             } else { DO(double,   REAL,    NA_REAL,      ansd[ansi++]=val) }
+        case CPLXSXP: DO(Rcomplex, COMPLEX, NA_CPLX,      ansd[ansi++]=val)
+        case STRSXP:  DO(SEXP,  STRING_PTR, NA_STRING,    SET_STRING_ELT(newcol,ansi++,val))
+        case VECSXP:  DO(SEXP,  SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(newcol,ansi++,val))       /* TODO: global replace ScalarLogical() with fixed constant R_NAValue, depending on R dependency */
+        default:
+          error(_("Type '%s' is not supported by gforce padding."), type2char(TYPEOF(tt)));
+        }
+        SET_VECTOR_ELT(ans, ngrpcol+i, newcol);
+        UNPROTECT(1); // newcol
+      }
+    }
+  }
+  UNPROTECT(nprotect);
   return ans;
 }
 
@@ -920,52 +1121,82 @@ SEXP gmedian(SEXP x, SEXP narmArg) {
   return ans;
 }
 
-static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) {
-  // w: which item (1 other than for gnthvalue when could be >1)
-  // headw: select 1:w of each group when first=true, and (n-w+1):n when first=false (i.e. tail)
+static SEXP gfirstlast(const SEXP x, const bool first, const SEXP nArg, const bool nthvalue, const SEXP narmArg) {
+  if (!IS_TRUE_OR_FALSE(narmArg))
+    error(_("%s must be TRUE or FALSE"), "na.rm");  // # nocov
+  const bool narm = LOGICAL(narmArg)[0];
+  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<0)
+    error(_("Internal error, gfirstlast is not implemented for n<0. This should have been caught before. Please report to data.table issue tracker.")); // # nocov
+  const int w = INTEGER(nArg)[0];
+  if (w>1 && assignByRef)
+    error(_("Is first/last/head/tail with n>1 and := by group intentional? Please provide a use case to the GitHub issue tracker. It could be implemented."));
+  // select 1:w when first=TRUE, and (n-w+1):n when first=FALSE
+  // or select w'th item when nthvalue=TRUE; e.g. the n=0 case in test 280
   const bool nosubset = irowslen == -1;
   const bool issorted = !isunsorted; // make a const-bool for use inside loops
   const int n = nosubset ? length(x) : irowslen;
   if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, first?"gfirst":"glast");
-  if (w==1 && headw) internal_error(__func__, "headw should only be true when w>1");
   int anslen = ngrp;
-  if (headw) {
+  if (!nthvalue && w>1) {
     anslen = 0;
     for (int i=0; i<ngrp; ++i) {
       anslen += MIN(w, grpsize[i]);
     }
   }
   SEXP ans = PROTECT(allocVector(TYPEOF(x), anslen));
+  int *anslens = NULL;
+  if (narm || (w>1 && !nthvalue)) {  // w>1 because some groups may be smaller than w so we need to save w
+    // narm=true needing gforce_dynamic is clear
+    // when w>1 and narm=false, we could avoid gforce_dynamic since each group result in MIN(w,grpsize[g]) 
+    // how many non-NA were found for each group im
+    SEXP att, v;
+    setAttrib(ans, sym_gforce_dynamic, att=allocVector(VECSXP, 3));
+    SET_VECTOR_ELT(att, 0, v = narm ? allocVector(INTSXP, ngrp) : R_NilValue);
+    SET_VECTOR_ELT(att, 1, ScalarLogical(first)); // so gforce knows which end the data is (last writes from the end of the alloc)
+    SET_VECTOR_ELT(att, 2, ScalarInteger(w));     // to know how many were allocated for each group; i.e. MIN(w,grpsize[i])
+    if (narm) anslens = INTEGER(v);
+  }
   int ansi = 0;
+  #undef DO
   #define DO(CTYPE, RTYPE, RNA, ASSIGN) {                                                          \
     const CTYPE *xd = (const CTYPE *)RTYPE(x);                                                     \
-    if (headw) {                                                                                   \
-      /* returning more than 1 per group; w>1 */                                                   \
-      for (int i=0; i<ngrp; ++i) {                                                                 \
-        const int grpn = grpsize[i];                                                               \
+    CTYPE *ansd = (CTYPE *)RTYPE(ans);                                                             \
+    if (w==1 || !nthvalue) {                                                                       \
+      const int inc = first ? +1 : -1;                                                             \
+      for (int g=0; g<ngrp; ++g) {                                                                 \
+        const int grpn = grpsize[g];                                                               \
         const int thisn = MIN(w, grpn);                                                            \
-        const int jstart = ff[i]-1+ (!first)*(grpn-thisn);                                         \
-        const int jend = jstart+thisn;                                                             \
-        for (int j=jstart; j<jend; ++j) {                                                          \
-          const int k = issorted ? j : oo[j]-1;                                                    \
-          /* ternary on const-bool assumed to be branch-predicted and ok inside loops */           \
+        const int jstart = ff[g]-1 + !first*(grpn-1);                                              \
+        if (!first) ansi+=thisn-1;                                                                 \
+        int read=0, write=0;                                                                       \
+        while (write<thisn && read<grpn) {                                                         \
+          const int k = issorted ? jstart+inc*read : oo[jstart+inc*read]-1;                        \
           const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]);      \
-          ASSIGN;                                                                                  \
+          /* ternaries on const-bools isorted and nosubset above assumed branch-predicted and */   \
+          /* insignificant inside loop  */                                                         \
+          read++;                                                                                  \
+          if (narm && ISNAT(val)) continue;                                                        \
+          /* const-bool narm and short-circuit-&& means above if() insignificant */                \
+          ASSIGN; write++;                                                                         \
         }                                                                                          \
+        if (anslens) anslens[g]=write;                                                              \
+        const CTYPE val = RNA;                                                                     \
+        while (write<thisn) { ASSIGN; write++; }  /* when not enough non-NA pad with NA */         \
+        /* if NAs can be removed when narm=TRUE, that's done up in gforce() afterwards when     */ \
+        /* we know what the results are from the other gforce columns are to align with. When   */ \
+        /* there's just one column, and narm=TRUE too, and there are NAs removed, it would      */ \
+        /* save removing the NAs in gforce() by not doing the padding now and just continuing   */ \
+        /* to write the next group's results straight after this group. A special case and      */ \
+        /* likely only measureable speedup for string columns but relatively easy to do.        */ \
+        if (!first) ansi+=write+1; /* we wrote backwards so pass over what we wrote */             \
       }                                                                                            \
-    } else if (w==1) {                                                                             \
-      for (int i=0; i<ngrp; ++i) {                                                                 \
-        const int j = ff[i]-1 + (first ? 0 : grpsize[i]-1);                                        \
-        const int k = issorted ? j : oo[j]-1;                                                      \
-        const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]);        \
-        ASSIGN;                                                                                    \
-      }                                                                                            \
-    } else if (w>1 && first) {                                                                     \
+    } else if (first) {                                                                            \
       /* gnthvalue */                                                                              \
-      for (int i=0; i<ngrp; ++i) {                                                                 \
-        const int grpn = grpsize[i];                                                               \
-        if (w>grpn) { const CTYPE val=RNA; ASSIGN; continue; }                                     \
-        const int j = ff[i]-1+w-1;                                                                 \
+      const int inc=1;                                                                             \
+      for (int g=0; g<ngrp; ++g) {                                                                 \
+        const int grpn = grpsize[g];                                                               \
+        if (w>grpn || w==0) { const CTYPE val=RNA; ASSIGN; continue; }                             \
+        const int j = ff[g]-1+w-1;                                                                 \
         const int k = issorted ? j : oo[j]-1;                                                      \
         const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]);        \
         ASSIGN;                                                                                    \
@@ -973,18 +1204,45 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw)
     } else {                                                                                       \
       /* w>1 && !first not supported because -i in R means everything-but-i and gnthvalue */       \
       /* currently takes n>0 only. However, we could still support n'th from the end, somehow */   \
-      internal_error(__func__, "unanticipated case first=%d w=%d headw=%d", first, w, headw);      \
+      internal_error(__func__, "unanticipated case first=%d w=%d nthvalue=%d", first, w, nthvalue); \
     }                                                                                              \
+    ansd++; /* just to suppress unused-variable warning in STRSXP and VECSXP cases */              \
   }
   switch(TYPEOF(x)) {
-  case LGLSXP:  { int      *ansd=LOGICAL(ans); DO(int,      LOGICAL, NA_LOGICAL,   ansd[ansi++]=val) } break;
-  case INTSXP:  { int      *ansd=INTEGER(ans); DO(int,      INTEGER, NA_INTEGER,   ansd[ansi++]=val) } break;
+  case LGLSXP:  {
+    #undef ISNAT
+    #define ISNAT(x) ((x)==NA_INTEGER)
+    DO(int,      LOGICAL, NA_LOGICAL,   ansd[ansi]=val; ansi+=inc)
+  } break;
+  case INTSXP:  {
+    #undef ISNAT
+    #define ISNAT(x) ((x)==NA_INTEGER)
+    DO(int,      INTEGER, NA_INTEGER,   ansd[ansi]=val; ansi+=inc)
+  } break;
   case REALSXP: if (INHERITS(x, char_integer64)) {
-           int64_t *ansd=(int64_t *)REAL(ans); DO(int64_t,  REAL,    NA_INTEGER64, ansd[ansi++]=val) }
-           else { double      *ansd=REAL(ans); DO(double,   REAL,    NA_REAL,      ansd[ansi++]=val) } break;
-  case CPLXSXP: { Rcomplex *ansd=COMPLEX(ans); DO(Rcomplex, COMPLEX, NA_CPLX,      ansd[ansi++]=val) } break;
-  case STRSXP:  DO(SEXP, STRING_PTR_RO, NA_STRING,              SET_STRING_ELT(ans,ansi++,val))        break;
-  case VECSXP:  DO(SEXP, SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(ans,ansi++,val))        break;
+    #undef ISNAT
+    #define ISNAT(x) ((x)==NA_INTEGER64)
+    DO(int64_t,  REAL,    NA_INTEGER64, ansd[ansi]=val; ansi+=inc)
+    } else {
+    #undef ISNAT
+    #define ISNAT(x) (ISNAN(x))
+    DO(double,   REAL,    NA_REAL,      ansd[ansi]=val; ansi+=inc)
+  } break;
+  case CPLXSXP: {
+    #undef ISNAT
+    #define ISNAT(x) (ISNAN_COMPLEX(x))
+    DO(Rcomplex, COMPLEX, NA_CPLX,      ansd[ansi]=val; ansi+=inc)
+  } break;
+  case STRSXP: {
+    #undef ISNAT
+    #define ISNAT(x) ((x)==NA_STRING)
+    DO(SEXP,  STRING_PTR_RO, NA_STRING,    SET_STRING_ELT(ans,ansi,val); ansi+=inc)
+  } break;
+  case VECSXP: {
+    #undef ISNAT
+    #define ISNAT(x) (isNull(x) || (isLogical(x) && LENGTH(x)==1 && LOGICAL(x)[0]==NA_LOGICAL))
+    DO(SEXP, SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(ans,ansi,val); ansi+=inc)       /* global replace ScalarLogical() with fixed constant R_FalseValue somehow */
+  } break;
   default:
     error(_("Type '%s' is not supported by GForce head/tail/first/last/`[`. Either add the namespace prefix (e.g. utils::head(.)) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)));
   }
@@ -993,29 +1251,24 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw)
   return(ans);
 }
 
-SEXP glast(SEXP x) {
-  return gfirstlast(x, false, 1, false);
+SEXP glast(const SEXP x, const SEXP nArg, const SEXP narmArg) {
+  return gfirstlast(x, /*first=*/false, nArg, /*nthvalue=*/false, narmArg);
 }
 
-SEXP gfirst(SEXP x) {
-  return gfirstlast(x, true, 1, false);
+SEXP gfirst(const SEXP x, const SEXP nArg, const SEXP narmArg) {
+  return gfirstlast(x, true, nArg, false, narmArg);
 }
 
-SEXP gtail(SEXP x, SEXP nArg) {
-  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) internal_error(__func__, "gtail is only implemented for n>0. This should have been caught before"); // # nocov
-  const int n=INTEGER(nArg)[0];
-  return n==1 ? glast(x) : gfirstlast(x, false, n, true);
+SEXP gtail(const SEXP x, const SEXP nArg) {
+  return gfirstlast(x, false, nArg, false, ScalarLogical(0));
 }
 
-SEXP ghead(SEXP x, SEXP nArg) {
-  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) internal_error(__func__, "gtail is only implemented for n>0. This should have been caught before"); // # nocov
-  const int n=INTEGER(nArg)[0];
-  return n==1 ? gfirst(x) : gfirstlast(x, true, n, true);
+SEXP ghead(const SEXP x, const SEXP nArg) {
+  return gfirstlast(x, true, nArg, false, ScalarLogical(0));
 }
 
-SEXP gnthvalue(SEXP x, SEXP nArg) {
-  if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) internal_error(__func__, "`g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before"); // # nocov
-  return gfirstlast(x, true, INTEGER(nArg)[0], false);
+SEXP gnthvalue(const SEXP x, const SEXP nArg) {
+  return gfirstlast(x, /*first=*/true, nArg, /*nthvalue=*/true, ScalarLogical(0));
 }
 
 // TODO: gwhich.min, gwhich.max
@@ -1221,6 +1474,16 @@ SEXP gshift(SEXP x, SEXP nArg, SEXP fillArg, SEXP typeArg) {
   for (int i=0; i<nk; i++) if (kd[i]==NA_INTEGER) error(_("Item %d of n is NA"), i+1);
 
   SEXP ans = PROTECT(allocVector(VECSXP, nk)); nprotect++;
+  setDT(ans);  // to tell gforce() clearly that these are columns without ambiguity over being a list column when DT or the group has 1 row
+  
+  // TODO: do we still need to set gforce dynamic here???
+  SEXP att = PROTECT(allocVector(VECSXP, 3)); nprotect++;
+  SET_VECTOR_ELT(att, 0, R_NilValue);
+  SET_VECTOR_ELT(att, 1, ScalarLogical(true));    // first/last doesn't matter for gshift which returns the same length as its input
+  SET_VECTOR_ELT(att, 2, ScalarInteger(INT_MAX)); // i.e. grpsize; TODO: perhaps point lens directly to grpsize instead
+  setAttrib(ans, sym_gforce_dynamic, att);
+  //
+  
   SEXP thisfill = PROTECT(coerceAs(fillArg, x, ScalarLogical(0))); nprotect++;
   for (int g=0; g<nk; g++) {
     lag = stype == LAG || stype == CYCLIC;
diff --git a/src/init.c b/src/init.c
index eaae557ae5..99457554e5 100644
--- a/src/init.c
+++ b/src/init.c
@@ -29,6 +29,7 @@ SEXP sym_index;
 SEXP sym_BY;
 SEXP sym_starts, char_starts;
 SEXP sym_maxgrpn;
+SEXP sym_gforce_dynamic;
 SEXP sym_anyna;
 SEXP sym_anyinfnan;
 SEXP sym_anynotascii;
@@ -61,6 +62,7 @@ R_CallMethodDef callMethods[] = {
 {"Calloccolwrapper", (DL_FUNC) &alloccolwrapper, -1},
 {"Cselfrefokwrapper", (DL_FUNC) &selfrefokwrapper, -1},
 {"Ctruelength", (DL_FUNC) &truelength, -1},
+{"Csettruelength", (DL_FUNC) &settruelength, -1},
 {"Csetcharvec", (DL_FUNC) &setcharvec, -1},
 {"Csetcolorder", (DL_FUNC) &setcolorder, -1},
 {"Cchmatch", (DL_FUNC) &chmatch_R, -1},
@@ -288,6 +290,7 @@ void attribute_visible R_init_data_table(DllInfo *info)
   sym_index   = install("index");
   sym_BY      = install(".BY");
   sym_maxgrpn = install("maxgrpn");
+  sym_gforce_dynamic = install("gforce_dynamic");
   sym_anyna   = install("anyna");
   sym_anyinfnan = install("anyinfnan");
   sym_anynotascii = install("anynotascii");
diff --git a/src/utils.c b/src/utils.c
index 0a864ee8c7..93f3dc1a78 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -542,6 +542,15 @@ SEXP perhapsDataTableR(SEXP x) {
   return ScalarLogical(perhapsDataTable(x));
 }
 
+SEXP setDT(SEXP x) {
+  // used by gsumm.c to return a barebones DT (e.g. no names, not over-allocated) to gforce when n is a vector of shifts
+  SEXP class;
+  setAttrib(x, R_ClassSymbol, class=allocVector(STRSXP, 2));
+  SET_STRING_ELT(class,0,char_datatable);
+  SET_STRING_ELT(class,1,char_dataframe);
+  return x;
+}
+
 void internal_error(const char *call_name, const char *format, ...) {
   char buff[1024];
   va_list args;