Rework filepath (re-)encoding (#438)

* Revert filepath reencoding * Unskip this test * Re-encode to native just prior to fopen() or mio::make_mmap_source() * Expose the bytes * Unconditionally encode paths as UTF-8 * Clean up comments, remove print debugging * Ensure path returned by `chr_to_file()` is encoded as UTF-8 This guards against the scenario where the tempdir's path has non-ascii characters in it. Presumably that could arise on, say, Windows if the user name has non-ascii characters: > tempdir() [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\Rtmpg30qBQ" * Add more `enc2utf8()` Everytime we use a base R path-handling function, explicitly re-encode the result as UTF-8. * Inline the reference object * Test writing to a non-ascii path * Test read/write of .zip with non-ascii characters in path * Rewrite this test * I now think the problem IS in archive, for Windows and unix Skip this test in non-UTF-8 locales for now * Add a test re: reading fwf from non-ascii filepath * Tweaks NEWS * Make versions of basename() and normalizePath() that uphold UTF-8 everywhere
tidyverse · May 17, 2022 · 85143f7 · 85143f7
1 parent 0c90270
commit 85143f7
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 30 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,11 @@
 # vroom (development version)
 
+* `vroom()` reads more reliably from filepaths containing non-ascii characters, in a non-UTF-8 locale (#394, #438).
+
 * `vroom_format()` and `vroom_write()` only quote values that contain a
   delimiter, quote, or newline. Specifically values that are equal to the `na`
   string (or that start with it) are no longer quoted (#426).
-
-* `vroom()` reads more reliably from filepaths containing non-ascii characters (#394).
-
+
 * Fixed segfault when reading in multiple files and the first file is header-only but subsequent files have at least one row (#430).
 
 * Fixed segfault when `vroom_format()` is given an empty data frame (#425)

diff --git a/R/path.R b/R/path.R
@@ -20,14 +20,6 @@ reencode_file <- function(path, encoding) {
   return(list(out_file))
 }
 
-reencode_filepath <- function(path) {
-  if (is_windows()) {
-    enc2utf8(path)
-  } else {
-    enc2native(path)
-  }
-}
-
 # These functions adapted from https://github.com/tidyverse/readr/blob/192cb1ca5c445e359f153d2259391e6d324fd0a2/R/source.R
 standardise_path <- function(path) {
   if (is.raw(path)) {
@@ -68,7 +60,7 @@ standardise_path <- function(path) {
     }
   }
 
-  as.list(reencode_filepath(path))
+  as.list(enc2utf8(path))
 }
 
 standardise_one_path <- function (path, write = FALSE) {
@@ -104,10 +96,12 @@ standardise_one_path <- function (path, write = FALSE) {
     )
   }
 
-  p <- split_path_ext(basename(path))
+  path <- enc2utf8(path)
+
+  p <- split_path_ext(basename_utf8(path))
 
   if (write) {
-    path <- normalizePath(path, mustWork = FALSE)
+    path <- normalizePath_utf8(path, mustWork = FALSE)
   } else {
     path <- check_path(path)
   }
@@ -228,8 +222,9 @@ is_url <- function(path) {
 }
 
 check_path <- function(path) {
-  if (file.exists(path))
-    return(normalizePath(path, "/", mustWork = FALSE))
+  if (file.exists(path)) {
+    return(normalizePath_utf8(path, mustWork = FALSE))
+  }
 
   stop("'", path, "' does not exist",
     if (!is_absolute_path(path)) {
@@ -265,7 +260,7 @@ chr_to_file <- function(x, envir = parent.frame()) {
 
   withr::defer(unlink(out), envir = envir)
 
-  normalizePath(out)
+  normalizePath_utf8(out)
 }
 
 detect_compression <- function(path) {
@@ -319,3 +314,11 @@ detect_compression <- function(path) {
 
   NA_character_
 }
+
+basename_utf8 <- function(path) {
+  enc2utf8(basename(path))
+}
+
+normalizePath_utf8 <- function(path, winslash = "/", mustWork = NA) {
+  enc2utf8(normalizePath(path, winslash = winslash, mustWork = mustWork))
+}
diff --git a/src/unicode_fopen.h b/src/unicode_fopen.h
@@ -12,16 +12,31 @@
 #endif
 // clang-format on
 
-#ifdef _WIN32
 #include <Rinternals.h>
+
+#ifdef _WIN32
 #include <windows.h>
+#else
+#include "cpp11/r_string.hpp"
 #endif
 
+// useful for print debugging file path encoding
+// inline void print_hex(const char* string) {
+//   unsigned char* p = (unsigned char*) string;
+//   for (int i = 0; i < 300 ; i++) {
+//     if (p[i] == '\0') break;
+//     Rprintf("%c 0x%02x ", p[i], p[i]);
+//     if ((i%16 == 0) && i)
+//       Rprintf("\n");
+//   }
+//   Rprintf("\n");
+// }
+
 // This is needed to support wide character paths on windows
 inline FILE* unicode_fopen(const char* path, const char* mode) {
   FILE* out;
 #ifdef _WIN32
-  // First conver the mode to the wide equivalent
+  // First convert the mode to the wide equivalent
   // Only usage is 2 characters so max 8 bytes + 2 byte null.
   wchar_t mode_w[10];
   MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9);
@@ -40,7 +55,11 @@ inline FILE* unicode_fopen(const char* path, const char* mode) {
   MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len);
   out = _wfopen(buf, mode_w);
 #else
-  out = fopen(path, mode);
+  // the path has UTF-8 encoding, because we do that unconditionally on the R
+  // side (but also because cpp11 is eager to use UTF-8)
+  // however, we need to pass the path to fopen() in the native encoding
+  const char* native_path = Rf_translateChar(cpp11::r_string(path));
+  out = fopen(native_path, mode);
 #endif
 
   return out;
@@ -64,6 +83,10 @@ make_mmap_source(const char* file, std::error_code& error) {
   free(buf);
   return out;
 #else
-  return mio::make_mmap_source(file, error);
+  // the path has UTF-8 encoding, because we do that unconditionally on the R
+  // side (but also because cpp11 is eager to use UTF-8)
+  // however, we need to pass the path to fopen() in the native encoding
+  const char* native_path = Rf_translateChar(cpp11::r_string(file));
+  return mio::make_mmap_source(native_path, error);
 #endif
 }
diff --git a/tests/testthat/test-path.R b/tests/testthat/test-path.R
@@ -1,13 +1,10 @@
-mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
-
 test_that("vroom errors if the file does not exist", {
-
   tf <- tempfile()
-
   expect_error(vroom(tf, col_types = list()), "does not exist")
 })
 
 test_that("vroom works with compressed files", {
+  mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
   expect_equal(vroom(vroom_example("mtcars.csv.gz"), col_types = list()), mt)
   expect_equal(vroom(vroom_example("mtcars.csv.bz2"), col_types = list()), mt)
   expect_equal(vroom(vroom_example("mtcars.csv.xz"), col_types = list()), mt)
@@ -17,13 +14,15 @@ test_that("vroom works with compressed files", {
 test_that("read_file works via https", {
   skip_on_cran()
 
+  mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
   url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv"
   expect_equal(vroom(url, col_types = list()), mt)
 })
 
 test_that("vroom works via https on gz file", {
   skip_on_cran()
 
+  mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
   url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv.gz"
   expect_equal(vroom(url, col_types = list()), mt)
 })
@@ -110,11 +109,6 @@ test_that("can read file w/o final newline, w/ multi-byte characters in path", {
 
 # for completeness, w.r.t. test above
 test_that("can read file w/ final newline, w/ multi-byte characters in path", {
-  # (our usage of) mio seems to fail for a non-ascii path, on linux, in a
-  # non-UTF-8 local
-  # I'm not convinced it's worth troubleshooting at this point
-  skip_if(!is_windows() && isTRUE(l10n_info()$`Latin-1`))
-
   pattern <- "yes-trailing-n\u00e8wline-m\u00fblti-byt\u00e9-path-"
   tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
   writeLines(c("a,b", "A,B"), tfile)
@@ -124,3 +118,57 @@ test_that("can read file w/ final newline, w/ multi-byte characters in path", {
     tibble::tibble(a = "A", b = "B")
   )
 })
+
+test_that("can write to path with non-ascii characters", {
+  pattern <- "cr\u00E8me-br\u00FBl\u00E9e-"
+  tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
+  dat <- tibble::tibble(a = "A", b = "B")
+  vroom_write(dat, tfile, delim = ",")
+  expect_equal(readLines(tfile), c("a,b", "A,B"))
+})
+
+test_that("can read/write a compressed file with non-ascii characters in path", {
+  skip_on_cran()
+  skip_if_not(rlang::is_installed("archive"))
+  # https://github.com/r-lib/archive/issues/75
+  skip_if(l10n_info()$`Latin-1`)
+
+  make_temp_path <- function(ext) file.path(tempdir(), paste0("d\u00E4t", ext))
+
+  gzfile   <- withr::local_file(make_temp_path(".tar.gz"))
+  bz2file  <- withr::local_file(make_temp_path(".tar.bz2"))
+  xzfile   <- withr::local_file(make_temp_path(".tar.xz"))
+  zipfile  <- withr::local_file(make_temp_path(".zip"))
+
+  dat <- tibble::tibble(a = "A", b = "B")
+
+  vroom_write(dat, gzfile)
+  vroom_write(dat, bz2file)
+  vroom_write(dat, xzfile)
+  vroom_write(dat, zipfile)
+
+  expect_equal(detect_compression(gzfile), "gz")
+  expect_equal(detect_compression(bz2file), "bz2")
+  expect_equal(detect_compression(xzfile), "xz")
+  expect_equal(detect_compression(zipfile), "zip")
+
+  expect_equal(vroom(gzfile,  show_col_types = FALSE), dat)
+  expect_equal(vroom(bz2file, show_col_types = FALSE), dat)
+  expect_equal(vroom(xzfile,  show_col_types = FALSE), dat)
+  expect_equal(vroom(zipfile, show_col_types = FALSE), dat)
+})
+
+test_that("can read fwf file w/ non-ascii characters in path", {
+  tfile <- withr::local_tempfile(pattern = "fwf-y\u00F6-", fileext = ".txt")
+  writeLines(c("A B", "C D"), tfile)
+
+  expect_equal(
+    spec <- fwf_empty(tfile, col_names = c("a", "b")),
+    list(begin = c(0L, 2L), end = c(1L, NA), col_names = c("a", "b"))
+  )
+
+  expect_equal(
+    vroom_fwf(tfile, spec, show_col_types = FALSE),
+    tibble::tibble(a = c("A", "C"), b = c("B", "D"))
+  )
+})