tidyverse · jennybc · May 17, 2022 · May 8, 2022 · May 8, 2022 · May 8, 2022
diff --git a/R/path.R b/R/path.R
@@ -20,14 +20,6 @@ reencode_file <- function(path, encoding) {
   return(list(out_file))
 }
 
-reencode_filepath <- function(path) {
-  if (is_windows()) {
-    enc2utf8(path)
-  } else {
-    enc2native(path)
-  }
-}
-
 # These functions adapted from https://github.com/tidyverse/readr/blob/192cb1ca5c445e359f153d2259391e6d324fd0a2/R/source.R
 standardise_path <- function(path) {
   if (is.raw(path)) {
@@ -68,7 +60,7 @@ standardise_path <- function(path) {
     }
   }
 
-  as.list(reencode_filepath(path))
+  as.list(enc2utf8(path))
 }
 
 standardise_one_path <- function (path, write = FALSE) {
@@ -148,6 +140,8 @@ standardise_one_path <- function (path, write = FALSE) {
     stop("Can only read from, not write to, .zip", call. = FALSE)
   }
 
+  path <- enc2utf8(path)
+
   switch(compression,
     gz = gzfile(path, ""),
     bz2 = bzfile(path, ""),

diff --git a/src/unicode_fopen.h b/src/unicode_fopen.h
@@ -12,16 +12,31 @@
 #endif
 // clang-format on
 
-#ifdef _WIN32
 #include <Rinternals.h>
+
+#ifdef _WIN32
 #include <windows.h>
+#else
+#include "cpp11/r_string.hpp"
 #endif
 
+// useful for print debugging file path encoding
+// inline void print_hex(const char* string) {
+//   unsigned char* p = (unsigned char*) string;
+//   for (int i = 0; i < 300 ; i++) {
+//     if (p[i] == '\0') break;
+//     Rprintf("%c 0x%02x ", p[i], p[i]);
+//     if ((i%16 == 0) && i)
+//       Rprintf("\n");
+//   }
+//   Rprintf("\n");
+// }
+
 // This is needed to support wide character paths on windows
 inline FILE* unicode_fopen(const char* path, const char* mode) {
   FILE* out;
 #ifdef _WIN32
-  // First conver the mode to the wide equivalent
+  // First convert the mode to the wide equivalent
   // Only usage is 2 characters so max 8 bytes + 2 byte null.
   wchar_t mode_w[10];
   MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9);
@@ -40,7 +55,11 @@ inline FILE* unicode_fopen(const char* path, const char* mode) {
   MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len);
   out = _wfopen(buf, mode_w);
 #else
-  out = fopen(path, mode);
+  // the path is UTF-8 encoded, because the R side calls enc2utf8() on it
+  // unconditionally (and because cpp11 is eager to use UTF-8);
+  // however, fopen() needs the path in the native encoding
+  const char* native_path = Rf_translateChar(cpp11::r_string(path));
+  out = fopen(native_path, mode);
 #endif
 
   return out;
@@ -64,6 +83,10 @@ make_mmap_source(const char* file, std::error_code& error) {
   free(buf);
   return out;
 #else
-  return mio::make_mmap_source(file, error);
+  // the path is UTF-8 encoded, because the R side calls enc2utf8() on it
+  // unconditionally (and because cpp11 is eager to use UTF-8);
+  // however, mio::make_mmap_source() needs the path in the native encoding
+  const char* native_path = Rf_translateChar(cpp11::r_string(file));
+  return mio::make_mmap_source(native_path, error);
 #endif
 }
diff --git a/tests/testthat/test-path.R b/tests/testthat/test-path.R
@@ -110,11 +110,6 @@ test_that("can read file w/o final newline, w/ multi-byte characters in path", {
 
 # for completeness, w.r.t. test above
 test_that("can read file w/ final newline, w/ multi-byte characters in path", {
-  # (our usage of) mio seems to fail for a non-ascii path, on linux, in a
-  # non-UTF-8 local
-  # I'm not convinced it's worth troubleshooting at this point
-  skip_if(!is_windows() && isTRUE(l10n_info()$`Latin-1`))
-
   pattern <- "yes-trailing-n\u00e8wline-m\u00fblti-byt\u00e9-path-"
   tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
   writeLines(c("a,b", "A,B"), tfile)