diff --git a/NEWS.md b/NEWS.md index 61168af9..35e4d262 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,11 @@ # vroom (development version) +* `vroom()` reads more reliably from filepaths containing non-ascii characters, in a non-UTF-8 locale (#394, #438). + * `vroom_format()` and `vroom_write()` only quote values that contain a delimiter, quote, or newline. Specifically values that are equal to the `na` string (or that start with it) are no longer quoted (#426). - -* `vroom()` reads more reliably from filepaths containing non-ascii characters (#394). - + * Fixed segfault when reading in multiple files and the first file is header-only but subsequent files have at least one row (#430). * Fixed segfault when `vroom_format()` is given an empty data frame (#425) diff --git a/R/path.R b/R/path.R index 2c3cfb6c..baa2d778 100644 --- a/R/path.R +++ b/R/path.R @@ -20,14 +20,6 @@ reencode_file <- function(path, encoding) { return(list(out_file)) } -reencode_filepath <- function(path) { - if (is_windows()) { - enc2utf8(path) - } else { - enc2native(path) - } -} - # These functions adapted from https://github.com/tidyverse/readr/blob/192cb1ca5c445e359f153d2259391e6d324fd0a2/R/source.R standardise_path <- function(path) { if (is.raw(path)) { @@ -68,7 +60,7 @@ standardise_path <- function(path) { } } - as.list(reencode_filepath(path)) + as.list(enc2utf8(path)) } standardise_one_path <- function (path, write = FALSE) { @@ -104,10 +96,12 @@ standardise_one_path <- function (path, write = FALSE) { ) } - p <- split_path_ext(basename(path)) + path <- enc2utf8(path) + + p <- split_path_ext(basename_utf8(path)) if (write) { - path <- normalizePath(path, mustWork = FALSE) + path <- normalizePath_utf8(path, mustWork = FALSE) } else { path <- check_path(path) } @@ -228,8 +222,9 @@ is_url <- function(path) { } check_path <- function(path) { - if (file.exists(path)) - return(normalizePath(path, "/", mustWork = FALSE)) + if (file.exists(path)) { + return(normalizePath_utf8(path, mustWork = FALSE)) + } stop("'", path, "' does not exist", if (!is_absolute_path(path)) { @@ -265,7 +260,7 @@ chr_to_file <- function(x, envir = parent.frame()) { withr::defer(unlink(out), envir = envir) - normalizePath(out) + normalizePath_utf8(out) } detect_compression <- function(path) { @@ -319,3 +314,11 @@ detect_compression <- function(path) { NA_character_ } + +basename_utf8 <- function(path) { + enc2utf8(basename(path)) +} + +normalizePath_utf8 <- function(path, winslash = "/", mustWork = NA) { + enc2utf8(normalizePath(path, winslash = winslash, mustWork = mustWork)) +} diff --git a/src/unicode_fopen.h b/src/unicode_fopen.h index b3a735d0..eac96dc7 100644 --- a/src/unicode_fopen.h +++ b/src/unicode_fopen.h @@ -12,16 +12,31 @@ #endif // clang-format on -#ifdef _WIN32 #include + +#ifdef _WIN32 #include +#else +#include "cpp11/r_string.hpp" #endif +// useful for print debugging file path encoding +// inline void print_hex(const char* string) { +// unsigned char* p = (unsigned char*) string; +// for (int i = 0; i < 300 ; i++) { +// if (p[i] == '\0') break; +// Rprintf("%c 0x%02x ", p[i], p[i]); +// if ((i%16 == 0) && i) +// Rprintf("\n"); +// } +// Rprintf("\n"); +// } + // This is needed to support wide character paths on windows inline FILE* unicode_fopen(const char* path, const char* mode) { FILE* out; #ifdef _WIN32 - // First conver the mode to the wide equivalent + // First convert the mode to the wide equivalent // Only usage is 2 characters so max 8 bytes + 2 byte null. wchar_t mode_w[10]; MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9); @@ -40,7 +55,11 @@ inline FILE* unicode_fopen(const char* path, const char* mode) { MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len); out = _wfopen(buf, mode_w); #else - out = fopen(path, mode); + // the path has UTF-8 encoding, because we do that unconditionally on the R + // side (but also because cpp11 is eager to use UTF-8) + // however, we need to pass the path to fopen() in the native encoding + const char* native_path = Rf_translateChar(cpp11::r_string(path)); + out = fopen(native_path, mode); #endif return out; @@ -64,6 +83,10 @@ make_mmap_source(const char* file, std::error_code& error) { free(buf); return out; #else - return mio::make_mmap_source(file, error); + // the path has UTF-8 encoding, because we do that unconditionally on the R + // side (but also because cpp11 is eager to use UTF-8) + // however, we need to pass the path to fopen() in the native encoding + const char* native_path = Rf_translateChar(cpp11::r_string(file)); + return mio::make_mmap_source(native_path, error); #endif } diff --git a/tests/testthat/test-path.R b/tests/testthat/test-path.R index 01b5234c..e1e61659 100644 --- a/tests/testthat/test-path.R +++ b/tests/testthat/test-path.R @@ -1,13 +1,10 @@ -mt <- vroom(vroom_example("mtcars.csv"), col_types = list()) - test_that("vroom errors if the file does not exist", { - tf <- tempfile() - expect_error(vroom(tf, col_types = list()), "does not exist") }) test_that("vroom works with compressed files", { + mt <- vroom(vroom_example("mtcars.csv"), col_types = list()) expect_equal(vroom(vroom_example("mtcars.csv.gz"), col_types = list()), mt) expect_equal(vroom(vroom_example("mtcars.csv.bz2"), col_types = list()), mt) expect_equal(vroom(vroom_example("mtcars.csv.xz"), col_types = list()), mt) @@ -17,6 +14,7 @@ test_that("vroom works with compressed files", { test_that("read_file works via https", { skip_on_cran() + mt <- vroom(vroom_example("mtcars.csv"), col_types = list()) url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv" expect_equal(vroom(url, col_types = list()), mt) }) @@ -24,6 +22,7 @@ test_that("read_file works via https", { test_that("vroom works via https on gz file", { skip_on_cran() + mt <- vroom(vroom_example("mtcars.csv"), col_types = list()) url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv.gz" expect_equal(vroom(url, col_types = list()), mt) }) @@ -110,11 +109,6 @@ test_that("can read file w/o final newline, w/ multi-byte characters in path", { # for completeness, w.r.t. test above test_that("can read file w/ final newline, w/ multi-byte characters in path", { - # (our usage of) mio seems to fail for a non-ascii path, on linux, in a - # non-UTF-8 local - # I'm not convinced it's worth troubleshooting at this point - skip_if(!is_windows() && isTRUE(l10n_info()$`Latin-1`)) - pattern <- "yes-trailing-n\u00e8wline-m\u00fblti-byt\u00e9-path-" tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv") writeLines(c("a,b", "A,B"), tfile) @@ -124,3 +118,57 @@ test_that("can read file w/ final newline, w/ multi-byte characters in path", { tibble::tibble(a = "A", b = "B") ) }) + +test_that("can write to path with non-ascii characters", { + pattern <- "cr\u00E8me-br\u00FBl\u00E9e-" + tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv") + dat <- tibble::tibble(a = "A", b = "B") + vroom_write(dat, tfile, delim = ",") + expect_equal(readLines(tfile), c("a,b", "A,B")) +}) + +test_that("can read/write a compressed file with non-ascii characters in path", { + skip_on_cran() + skip_if_not(rlang::is_installed("archive")) + # https://github.com/r-lib/archive/issues/75 + skip_if(l10n_info()$`Latin-1`) + + make_temp_path <- function(ext) file.path(tempdir(), paste0("d\u00E4t", ext)) + + gzfile <- withr::local_file(make_temp_path(".tar.gz")) + bz2file <- withr::local_file(make_temp_path(".tar.bz2")) + xzfile <- withr::local_file(make_temp_path(".tar.xz")) + zipfile <- withr::local_file(make_temp_path(".zip")) + + dat <- tibble::tibble(a = "A", b = "B") + + vroom_write(dat, gzfile) + vroom_write(dat, bz2file) + vroom_write(dat, xzfile) + vroom_write(dat, zipfile) + + expect_equal(detect_compression(gzfile), "gz") + expect_equal(detect_compression(bz2file), "bz2") + expect_equal(detect_compression(xzfile), "xz") + expect_equal(detect_compression(zipfile), "zip") + + expect_equal(vroom(gzfile, show_col_types = FALSE), dat) + expect_equal(vroom(bz2file, show_col_types = FALSE), dat) + expect_equal(vroom(xzfile, show_col_types = FALSE), dat) + expect_equal(vroom(zipfile, show_col_types = FALSE), dat) +}) + +test_that("can read fwf file w/ non-ascii characters in path", { + tfile <- withr::local_tempfile(pattern = "fwf-y\u00F6-", fileext = ".txt") + writeLines(c("A B", "C D"), tfile) + + expect_equal( + spec <- fwf_empty(tfile, col_names = c("a", "b")), + list(begin = c(0L, 2L), end = c(1L, NA), col_names = c("a", "b")) + ) + + expect_equal( + vroom_fwf(tfile, spec, show_col_types = FALSE), + tibble::tibble(a = c("A", "C"), b = c("B", "D")) + ) +})