Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework filepath (re-)encoding #438

Merged
merged 17 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions R/path.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,6 @@ reencode_file <- function(path, encoding) {
return(list(out_file))
}

# Re-encode a file path for the current platform: UTF-8 when is_windows()
# is TRUE, otherwise the native encoding.
reencode_filepath <- function(path) {
  if (!is_windows()) {
    return(enc2native(path))
  }
  enc2utf8(path)
}

# These functions adapted from https://github.com/tidyverse/readr/blob/192cb1ca5c445e359f153d2259391e6d324fd0a2/R/source.R
standardise_path <- function(path) {
if (is.raw(path)) {
Expand Down Expand Up @@ -68,7 +60,7 @@ standardise_path <- function(path) {
}
jennybc marked this conversation as resolved.
Show resolved Hide resolved
}

as.list(reencode_filepath(path))
as.list(enc2utf8(path))
}

standardise_one_path <- function (path, write = FALSE) {
Expand Down Expand Up @@ -148,6 +140,8 @@ standardise_one_path <- function (path, write = FALSE) {
stop("Can only read from, not write to, .zip", call. = FALSE)
}

path <- enc2utf8(path)
jennybc marked this conversation as resolved.
Show resolved Hide resolved

switch(compression,
gz = gzfile(path, ""),
bz2 = bzfile(path, ""),
Expand Down
31 changes: 27 additions & 4 deletions src/unicode_fopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,31 @@
#endif
// clang-format on

#ifdef _WIN32
#include <Rinternals.h>

#ifdef _WIN32
#include <windows.h>
#else
#include "cpp11/r_string.hpp"
#endif

// useful for print debugging file path encoding
// inline void print_hex(const char* string) {
// unsigned char* p = (unsigned char*) string;
// for (int i = 0; i < 300 ; i++) {
// if (p[i] == '\0') break;
// Rprintf("%c 0x%02x ", p[i], p[i]);
// if ((i%16 == 0) && i)
// Rprintf("\n");
// }
// Rprintf("\n");
// }

// This is needed to support wide character paths on windows
inline FILE* unicode_fopen(const char* path, const char* mode) {
FILE* out;
#ifdef _WIN32
// First conver the mode to the wide equivalent
// First convert the mode to the wide equivalent
// Only usage is 2 characters so max 8 bytes + 2 byte null.
wchar_t mode_w[10];
MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9);
Expand All @@ -40,7 +55,11 @@ inline FILE* unicode_fopen(const char* path, const char* mode) {
MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len);
out = _wfopen(buf, mode_w);
#else
out = fopen(path, mode);
// the path has UTF-8 encoding, because we do that unconditionally on the R
// side (but also because cpp11 is eager to use UTF-8)
// however, we need to pass the path to fopen() in the native encoding
const char* native_path = Rf_translateChar(cpp11::r_string(path));
out = fopen(native_path, mode);
#endif

return out;
Expand All @@ -64,6 +83,10 @@ make_mmap_source(const char* file, std::error_code& error) {
free(buf);
return out;
#else
return mio::make_mmap_source(file, error);
// the path has UTF-8 encoding, because we do that unconditionally on the R
// side (but also because cpp11 is eager to use UTF-8)
// however, we need to pass the path to mio::make_mmap_source() in the native encoding
const char* native_path = Rf_translateChar(cpp11::r_string(file));
return mio::make_mmap_source(native_path, error);
#endif
}
5 changes: 0 additions & 5 deletions tests/testthat/test-path.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,6 @@ test_that("can read file w/o final newline, w/ multi-byte characters in path", {

# for completeness, w.r.t. test above
test_that("can read file w/ final newline, w/ multi-byte characters in path", {
# (our usage of) mio seems to fail for a non-ascii path, on linux, in a
# non-UTF-8 locale
# I'm not convinced it's worth troubleshooting at this point
skip_if(!is_windows() && isTRUE(l10n_info()$`Latin-1`))

pattern <- "yes-trailing-n\u00e8wline-m\u00fblti-byt\u00e9-path-"
tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
writeLines(c("a,b", "A,B"), tfile)
Expand Down