diff --git a/NEWS.md b/NEWS.md index f7c073d0b4..a7db78c3fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,6 +74,8 @@ 15. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix. +16. `fread()` now handles the `na.strings` argument for quoted text columns, making it possible to specify `na.strings = '""'` and read empty quoted strings as `NA`s, [#6974](https://github.com/Rdatatable/data.table/issues/6974). Thanks to @AngelFelizR for the report and @aitap for the PR. + ### NOTES 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8de433dc05..07702ff847 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21276,3 +21276,18 @@ test(2324.2, rollup(DT, j = sum(value) + ..sets, by=c("color","year","status"), label="total"), rollup(DT, j = sum(value), by=c("color","year","status"), label="total") ) + +# allow na.strings to be quoted, #6974 +f = tempfile() +DT = data.table( + "Date Example"=c("12/5/2012", NA), + "Question 1"=c("Yes", NA), + "Question 2"=c("Yes", NA), + "Site: Country"=c("Chile", "Virgin Islands, British") +) +fwrite(DT, f, na='""') +test(2325.1, fread(f, na.strings='""'), DT) +unlink(f) +test(2325.2, + fread('"foo","bar","baz"\n"a","b","c"', na.strings=c('"foo"', '"bar"', '"baz"'), header=FALSE), + data.table(V1=c(NA, "a"), V2=c(NA, "b"), V3=c(NA, "c"))) diff --git a/src/fread.c b/src/fread.c index 4abed040c4..53e22f3d43 100644 --- a/src/fread.c +++ b/src/fread.c @@ -515,6 +515,8 @@ static void Field(FieldParseContext *ctx) // the field is quoted and quotes are correctly escaped 
(quoteRule 0 and 1) // or the field is quoted but quotes are not escaped (quoteRule 2) // or the field is not quoted but the data contains a quote at the start (quoteRule 2 too) + // The raw text starting at the opening quote may itself match one of na.strings (e.g. '""'); record where such a match would end now and compare once the quotes have been parsed + const char *field_after_NA = end_NA_string(fieldStart); fieldStart++; // step over opening quote switch(quoteRule) { case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol @@ -573,6 +575,8 @@ static void Field(FieldParseContext *ctx) if (ch == eof && quoteRule != 2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2 while(target->len > 0 && ((ch[-1] == ' ' && stripWhite) || ch[-1] == '\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST " } + // If the parsed field ends exactly where the candidate NA string ends, treat the whole quoted field as NA by setting len to INT32_MIN + if (field_after_NA == ch) target->len = INT32_MIN; } static void str_to_i32_core(const char **pch, int32_t *target, bool parse_date)