Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework filepath (re-)encoding #438

Merged
merged 17 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# vroom (development version)

* `vroom()` reads more reliably from filepaths containing non-ascii characters, in a non-UTF-8 locale (#394, #438).

* `vroom_format()` and `vroom_write()` only quote values that contain a
delimiter, quote, or newline. Specifically values that are equal to the `na`
string (or that start with it) are no longer quoted (#426).

* `vroom()` reads more reliably from filepaths containing non-ascii characters (#394).


* Fixed segfault when reading in multiple files and the first file is header-only but subsequent files have at least one row (#430).

* Fixed segfault when `vroom_format()` is given an empty data frame (#425)
Expand Down
31 changes: 17 additions & 14 deletions R/path.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,6 @@ reencode_file <- function(path, encoding) {
return(list(out_file))
}

reencode_filepath <- function(path) {
if (is_windows()) {
enc2utf8(path)
} else {
enc2native(path)
}
}

# These functions adapted from https://github.com/tidyverse/readr/blob/192cb1ca5c445e359f153d2259391e6d324fd0a2/R/source.R
standardise_path <- function(path) {
if (is.raw(path)) {
Expand Down Expand Up @@ -68,7 +60,7 @@ standardise_path <- function(path) {
}
jennybc marked this conversation as resolved.
Show resolved Hide resolved
}

as.list(reencode_filepath(path))
as.list(enc2utf8(path))
}

standardise_one_path <- function (path, write = FALSE) {
Expand Down Expand Up @@ -104,10 +96,12 @@ standardise_one_path <- function (path, write = FALSE) {
)
}

p <- split_path_ext(basename(path))
path <- enc2utf8(path)

p <- split_path_ext(basename_utf8(path))

if (write) {
path <- normalizePath(path, mustWork = FALSE)
path <- normalizePath_utf8(path, mustWork = FALSE)
} else {
path <- check_path(path)
}
Expand Down Expand Up @@ -228,8 +222,9 @@ is_url <- function(path) {
}

check_path <- function(path) {
if (file.exists(path))
return(normalizePath(path, "/", mustWork = FALSE))
if (file.exists(path)) {
return(normalizePath_utf8(path, mustWork = FALSE))
}

stop("'", path, "' does not exist",
if (!is_absolute_path(path)) {
Expand Down Expand Up @@ -265,7 +260,7 @@ chr_to_file <- function(x, envir = parent.frame()) {

withr::defer(unlink(out), envir = envir)

normalizePath(out)
normalizePath_utf8(out)
}

detect_compression <- function(path) {
Expand Down Expand Up @@ -319,3 +314,11 @@ detect_compression <- function(path) {

NA_character_
}

basename_utf8 <- function(path) {
enc2utf8(basename(path))
}

normalizePath_utf8 <- function(path, winslash = "/", mustWork = NA) {
enc2utf8(normalizePath(path, winslash = winslash, mustWork = mustWork))
}
31 changes: 27 additions & 4 deletions src/unicode_fopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,31 @@
#endif
// clang-format on

#ifdef _WIN32
#include <Rinternals.h>

#ifdef _WIN32
#include <windows.h>
#else
#include "cpp11/r_string.hpp"
#endif

// useful for print debugging file path encoding
// inline void print_hex(const char* string) {
// unsigned char* p = (unsigned char*) string;
// for (int i = 0; i < 300 ; i++) {
// if (p[i] == '\0') break;
// Rprintf("%c 0x%02x ", p[i], p[i]);
// if ((i%16 == 0) && i)
// Rprintf("\n");
// }
// Rprintf("\n");
// }

// This is needed to support wide character paths on windows
inline FILE* unicode_fopen(const char* path, const char* mode) {
FILE* out;
#ifdef _WIN32
// First conver the mode to the wide equivalent
// First convert the mode to the wide equivalent
// Only usage is 2 characters so max 8 bytes + 2 byte null.
wchar_t mode_w[10];
MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9);
Expand All @@ -40,7 +55,11 @@ inline FILE* unicode_fopen(const char* path, const char* mode) {
MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len);
out = _wfopen(buf, mode_w);
#else
out = fopen(path, mode);
// the path has UTF-8 encoding, because we do that unconditionally on the R
// side (but also because cpp11 is eager to use UTF-8)
// however, we need to pass the path to fopen() in the native encoding
const char* native_path = Rf_translateChar(cpp11::r_string(path));
out = fopen(native_path, mode);
#endif

return out;
Expand All @@ -64,6 +83,10 @@ make_mmap_source(const char* file, std::error_code& error) {
free(buf);
return out;
#else
return mio::make_mmap_source(file, error);
// the path has UTF-8 encoding, because we do that unconditionally on the R
// side (but also because cpp11 is eager to use UTF-8)
// however, we need to pass the path to fopen() in the native encoding
const char* native_path = Rf_translateChar(cpp11::r_string(file));
return mio::make_mmap_source(native_path, error);
#endif
}
66 changes: 57 additions & 9 deletions tests/testthat/test-path.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
mt <- vroom(vroom_example("mtcars.csv"), col_types = list())

test_that("vroom errors if the file does not exist", {

tf <- tempfile()

expect_error(vroom(tf, col_types = list()), "does not exist")
})

test_that("vroom works with compressed files", {
mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
expect_equal(vroom(vroom_example("mtcars.csv.gz"), col_types = list()), mt)
expect_equal(vroom(vroom_example("mtcars.csv.bz2"), col_types = list()), mt)
expect_equal(vroom(vroom_example("mtcars.csv.xz"), col_types = list()), mt)
Expand All @@ -17,13 +14,15 @@ test_that("vroom works with compressed files", {
test_that("read_file works via https", {
skip_on_cran()

mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv"
expect_equal(vroom(url, col_types = list()), mt)
})

test_that("vroom works via https on gz file", {
skip_on_cran()

mt <- vroom(vroom_example("mtcars.csv"), col_types = list())
url <- "https://raw.githubusercontent.com/r-lib/vroom/main/inst/extdata/mtcars.csv.gz"
expect_equal(vroom(url, col_types = list()), mt)
})
Expand Down Expand Up @@ -110,11 +109,6 @@ test_that("can read file w/o final newline, w/ multi-byte characters in path", {

# for completeness, w.r.t. test above
test_that("can read file w/ final newline, w/ multi-byte characters in path", {
# (our usage of) mio seems to fail for a non-ascii path, on linux, in a
# non-UTF-8 local
# I'm not convinced it's worth troubleshooting at this point
skip_if(!is_windows() && isTRUE(l10n_info()$`Latin-1`))

pattern <- "yes-trailing-n\u00e8wline-m\u00fblti-byt\u00e9-path-"
tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
writeLines(c("a,b", "A,B"), tfile)
Expand All @@ -124,3 +118,57 @@ test_that("can read file w/ final newline, w/ multi-byte characters in path", {
tibble::tibble(a = "A", b = "B")
)
})

test_that("can write to path with non-ascii characters", {
pattern <- "cr\u00E8me-br\u00FBl\u00E9e-"
tfile <- withr::local_tempfile(pattern = pattern, fileext = ".csv")
dat <- tibble::tibble(a = "A", b = "B")
vroom_write(dat, tfile, delim = ",")
expect_equal(readLines(tfile), c("a,b", "A,B"))
})

test_that("can read/write a compressed file with non-ascii characters in path", {
skip_on_cran()
skip_if_not(rlang::is_installed("archive"))
# https://github.com/r-lib/archive/issues/75
skip_if(l10n_info()$`Latin-1`)

make_temp_path <- function(ext) file.path(tempdir(), paste0("d\u00E4t", ext))

gzfile <- withr::local_file(make_temp_path(".tar.gz"))
bz2file <- withr::local_file(make_temp_path(".tar.bz2"))
xzfile <- withr::local_file(make_temp_path(".tar.xz"))
zipfile <- withr::local_file(make_temp_path(".zip"))

dat <- tibble::tibble(a = "A", b = "B")

vroom_write(dat, gzfile)
vroom_write(dat, bz2file)
vroom_write(dat, xzfile)
vroom_write(dat, zipfile)

expect_equal(detect_compression(gzfile), "gz")
expect_equal(detect_compression(bz2file), "bz2")
expect_equal(detect_compression(xzfile), "xz")
expect_equal(detect_compression(zipfile), "zip")

expect_equal(vroom(gzfile, show_col_types = FALSE), dat)
expect_equal(vroom(bz2file, show_col_types = FALSE), dat)
expect_equal(vroom(xzfile, show_col_types = FALSE), dat)
expect_equal(vroom(zipfile, show_col_types = FALSE), dat)
})

test_that("can read fwf file w/ non-ascii characters in path", {
tfile <- withr::local_tempfile(pattern = "fwf-y\u00F6-", fileext = ".txt")
writeLines(c("A B", "C D"), tfile)

expect_equal(
spec <- fwf_empty(tfile, col_names = c("a", "b")),
list(begin = c(0L, 2L), end = c(1L, NA), col_names = c("a", "b"))
)

expect_equal(
vroom_fwf(tfile, spec, show_col_types = FALSE),
tibble::tibble(a = c("A", "C"), b = c("B", "D"))
)
})