From bcaea8f23e5377762ce7700b88e7abe1c18ed15f Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Fri, 4 May 2018 14:14:59 -0700 Subject: [PATCH 1/8] Create temporary test sheet with non-ASCII character in filename Use characters representable in Windows-1252 codepage --- tests/testthat/test-read-excel.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/testthat/test-read-excel.R b/tests/testthat/test-read-excel.R index b7268986..b0126716 100644 --- a/tests/testthat/test-read-excel.R +++ b/tests/testthat/test-read-excel.R @@ -93,3 +93,14 @@ test_that("trim_ws must be a logical", { "`trim_ws` must be either TRUE or FALSE" ) }) + +## https://github.com/tidyverse/readxl/issues/476 +test_that("non-ASCII xls filenames can be read", { + ## chosen to be non-ASCII but representable in Windows-1252 codepage + ## a-grave e-diaeresis Eth + '.xls' + filename <- "\u00C0\u00CB\u00D0\u002E\u0078\u006C\u0073" + tricky_file <- file.path(tempdir(), filename) + file.copy(test_sheet("mtcars.xls"), tricky_file) + on.exit(file.remove(tricky_file)) + expect_error_free(read_xls(tricky_file)) +}) From 945178d41dbef4b02a0ce56c7fffc1793d2eaed6 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Fri, 4 May 2018 14:15:47 -0700 Subject: [PATCH 2/8] Don't normalizePath() on xls filepaths --- src/XlsWorkBook.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/XlsWorkBook.h b/src/XlsWorkBook.h index a02b4c60..8086725a 100644 --- a/src/XlsWorkBook.h +++ b/src/XlsWorkBook.h @@ -26,7 +26,8 @@ class XlsWorkBook { public: XlsWorkBook(const std::string& path) { - path_ = normalizePath(path); + //path_ = normalizePath(path); + path_ = path; xls::xlsWorkBook* pWB_ = xls::xls_open(path_.c_str(), "UTF-8"); if (pWB_ == NULL) { From 8a8c06136c9254b029368c24dc34915f49af6d0c Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Fri, 4 May 2018 16:22:21 -0700 Subject: [PATCH 3/8] Explicit enc2native() --- src/XlsWorkBook.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/XlsWorkBook.h b/src/XlsWorkBook.h index 8086725a..ba23aed1 100644 --- a/src/XlsWorkBook.h +++ b/src/XlsWorkBook.h @@ -9,7 +9,8 @@ inline std::string normalizePath(std::string path) { Rcpp::Environment baseEnv = Rcpp::Environment::base_env(); Rcpp::Function normalizePath = baseEnv["normalizePath"]; - return Rcpp::as(normalizePath(path, "/", true)); + Rcpp::Function enc2native = baseEnv["enc2native"]; + return Rcpp::as(enc2native(normalizePath(path, "/", true))); } class XlsWorkBook { @@ -26,8 +27,7 @@ class XlsWorkBook { public: XlsWorkBook(const std::string& path) { - //path_ = normalizePath(path); - path_ = path; + path_ = normalizePath(path); xls::xlsWorkBook* pWB_ = xls::xls_open(path_.c_str(), "UTF-8"); if (pWB_ == NULL) { From f6982353812eb193afae2c0674ef059aad749a6e Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Fri, 4 May 2018 18:44:35 -0700 Subject: [PATCH 4/8] Work on the test --- tests/testthat/test-read-excel.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-read-excel.R b/tests/testthat/test-read-excel.R index b0126716..21b44c94 100644 --- a/tests/testthat/test-read-excel.R +++ b/tests/testthat/test-read-excel.R @@ -96,11 +96,15 @@ test_that("trim_ws must be a logical", { ## https://github.com/tidyverse/readxl/issues/476 test_that("non-ASCII xls filenames can be read", { - ## chosen to be non-ASCII but representable in Windows-1252 codepage - ## a-grave e-diaeresis Eth + '.xls' + skip_on_cran() + ## chosen to be non-ASCII but + ## [1] representable in Windows-1252 and + ## [2] not any of the few differences between Windows-1252 and ISO-8859-1 + ## a-grave + e-diaeresis + Eth + '.xls' filename <- "\u00C0\u00CB\u00D0\u002E\u0078\u006C\u0073" tricky_file <- file.path(tempdir(), filename) file.copy(test_sheet("mtcars.xls"), tricky_file) + expect_true(file.exists(tricky_file)) on.exit(file.remove(tricky_file)) expect_error_free(read_xls(tricky_file)) }) From 96cd855ecd24f3e261b501c76c4b89d1d8750a54 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Thu, 10 May 2018 22:04:41 -0700 Subject: [PATCH 5/8] Extend test to xlsx --- tests/testthat/test-read-excel.R | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-read-excel.R b/tests/testthat/test-read-excel.R index 21b44c94..fefbd8be 100644 --- a/tests/testthat/test-read-excel.R +++ b/tests/testthat/test-read-excel.R @@ -94,17 +94,24 @@ test_that("trim_ws must be a logical", { ) }) -## https://github.com/tidyverse/readxl/issues/476 -test_that("non-ASCII xls filenames can be read", { +## xlsx: https://github.com/tidyverse/readxl/issues/370 +## xls: https://github.com/tidyverse/readxl/issues/476 +test_that("non-ASCII filenames can be read", { skip_on_cran() ## chosen to be non-ASCII but ## [1] representable in Windows-1252 and ## [2] not any of the few differences between Windows-1252 and ISO-8859-1 - ## a-grave + e-diaeresis + Eth + '.xls' - filename <- "\u00C0\u00CB\u00D0\u002E\u0078\u006C\u0073" - tricky_file <- file.path(tempdir(), filename) - file.copy(test_sheet("mtcars.xls"), tricky_file) - expect_true(file.exists(tricky_file)) - on.exit(file.remove(tricky_file)) - expect_error_free(read_xls(tricky_file)) + ## a-grave + e-diaeresis + Eth + '.xls[x]' + xls_filename <- "\u00C0\u00CB\u00D0\u002E\u0078\u006C\u0073" + xlsx_filename <- "\u00C0\u00CB\u00D0\u002E\u0078\u006C\u0073\u0078" + tricky_xls_file <- file.path(tempdir(), xls_filename) + tricky_xlsx_file <- file.path(tempdir(), xlsx_filename) + file.copy(test_sheet("list_type.xls"), tricky_xls_file) + file.copy(test_sheet("list_type.xlsx"), tricky_xlsx_file) + expect_true(file.exists(tricky_xls_file)) + expect_true(file.exists(tricky_xlsx_file)) + on.exit(file.remove(tricky_xls_file)) + on.exit(file.remove(tricky_xlsx_file)) + expect_error_free(read_xls(tricky_xls_file)) + expect_error_free(read_xlsx(tricky_xlsx_file)) }) From 2afc45eae5d94f40fab991d711fe776ab4487400 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Thu, 10 May 2018 22:49:30 -0700 Subject: [PATCH 6/8] Don't modify xls path on the C++ side --- src/XlsWorkBook.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/XlsWorkBook.h b/src/XlsWorkBook.h index ba23aed1..ae756418 100644 --- a/src/XlsWorkBook.h +++ b/src/XlsWorkBook.h @@ -6,13 +6,6 @@ #include "ColSpec.h" #include "utils.h" -inline std::string normalizePath(std::string path) { - Rcpp::Environment baseEnv = Rcpp::Environment::base_env(); - Rcpp::Function normalizePath = baseEnv["normalizePath"]; - Rcpp::Function enc2native = baseEnv["enc2native"]; - return Rcpp::as(enc2native(normalizePath(path, "/", true))); -} - class XlsWorkBook { // common to Xls[x]WorkBook @@ -27,7 +20,7 @@ class XlsWorkBook { public: XlsWorkBook(const std::string& path) { - path_ = normalizePath(path); + path_ = path; xls::xlsWorkBook* pWB_ = xls::xls_open(path_.c_str(), "UTF-8"); if (pWB_ == NULL) { From 098c3b0fd5fe5eb668e5cb0ed3e37edacbbafb18 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Thu, 10 May 2018 22:53:36 -0700 Subject: [PATCH 7/8] Ensure all paths are in native encoding at the handoff --- R/read_excel.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/read_excel.R b/R/read_excel.R index a6de3db2..29ead9ee 100644 --- a/R/read_excel.R +++ b/R/read_excel.R @@ -152,7 +152,7 @@ read_excel_ <- function(path, sheet = NULL, range = NULL, tibble::set_tidy_names( tibble::as_tibble( read_fun( - path = path, sheet_i = sheet, + path = enc2native(path), sheet_i = sheet, limits = limits, shim = shim, col_names = col_names, col_types = col_types, na = na, trim_ws = trim_ws, guess_max = guess_max From a15391bc92f54181fb2429881bff1b22275984fb Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Thu, 10 May 2018 23:20:10 -0700 Subject: [PATCH 8/8] Edit NEWS [skip ci] --- NEWS.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7a2227f5..7fb6926c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,19 @@ # readxl 1.1.0.9000 +## Breaking change + * Missing or duplicated column names are now repaired with `tibble::set_tidy_names()` in `read_excel()` and friends. `set_tidy_names()` is intended to encourage name repair that is more principled and consistent, across multiple tidyverse packages. Its design is discussed in [tidyverse/tibble#217](https://github.com/tidyverse/tibble/issues/217). (#357, #453) - Example of change a user will see: consider a spreadsheet with three columns, one unnamed and two named `x`. - Column names in readxl > 1.1.0: `..1`, `x..2`, `x..3` - Column names in readxl <= 1.1.0: `X__1`, `x`, `x__1` + +## Other changes + +* Path handling (#477): + + - `.xls` paths are no longer normalized. (#476 xls) + - All paths are explicitly converted to the native encoding via `enc2native()` (#370) # readxl 1.1.0