diff --git a/r/R/csv.R b/r/R/csv.R index 6ac0118d58f..bf69830079d 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -42,16 +42,16 @@ #' characters? This is more general than `escape_double` as backslashes #' can be used to escape the delimiter character, the quote character, or #' to add special characters like `\\n`. -# #' @param col_names If `TRUE`, the first row of the input will be used as the -# #' column names and will not be included in the data frame. Note that `FALSE` -# #' is not currently supported, nor is specifying a character vector of column -# #' names. +#' @param col_names If `TRUE`, the first row of the input will be used as the +#' column names and will not be included in the data frame. (Note that `FALSE` +#' is not currently supported.) Alternatively, you can specify a character +#' vector of column names. #' @param col_select A [tidy selection specification][tidyselect::vars_select] #' of columns, as used in `dplyr::select()`. #' @param skip_empty_rows Should blank rows be ignored altogether? If #' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be #' filled with missings. -# #' @param skip Number of lines to skip before reading data. +#' @param skip Number of lines to skip before reading data. #' @param parse_options see [csv_parse_options()]. If given, this overrides any #' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.). #' @param convert_options see [csv_convert_options()] @@ -66,39 +66,41 @@ read_delim_arrow <- function(file, quote = '"', escape_double = TRUE, escape_backslash = FALSE, - # col_names = TRUE, + col_names = TRUE, # col_types = TRUE, col_select = NULL, # na = c("", "NA"), # quoted_na = TRUE, skip_empty_rows = TRUE, - # skip = 0L, + skip = 0L, parse_options = NULL, convert_options = NULL, - read_options = csv_read_options(), + read_options = NULL, as_tibble = TRUE) { - # These are hardcoded pending https://issues.apache.org/jira/browse/ARROW-5747 - col_names <- TRUE - skip <- 0L - + if (identical(col_names, FALSE)) { + stop("Not implemented", call.=FALSE) + } if (is.null(parse_options)) { - if (isTRUE(col_names)) { - # Add one row to skip, to match arrow's header_rows - skip <- skip + 1L - # Note that with the hardcoding, header_rows is always 1, which - # turns out to be the only value that works meaningfully - } parse_options <- readr_to_csv_parse_options( delim, quote, escape_double, escape_backslash, - skip_empty_rows, - skip + skip_empty_rows ) } + if (is.null(read_options)) { + if (isTRUE(col_names)) { + # C++ default to parse is 0-length string array + col_names <- character(0) + } + read_options <- csv_read_options( + skip_rows = skip, + column_names = col_names + ) + } if (is.null(convert_options)) { # TODO: # * na strings (needs wiring in csv_convert_options) @@ -117,10 +119,6 @@ read_delim_arrow <- function(file, ) tab <- reader$Read()$select(!!enquo(col_select)) - if (is.character(col_names)) { - # TODO: Rename `tab`'s columns - # See https://github.com/apache/arrow/pull/4557 - } if (isTRUE(as_tibble)) { tab <- as.data.frame(tab) @@ -135,16 +133,16 @@ read_csv_arrow <- function(file, quote = '"', escape_double = TRUE, escape_backslash = FALSE, - # col_names = TRUE, + col_names = TRUE, # col_types = TRUE, col_select = NULL, # na = c("", "NA"), # quoted_na = TRUE, skip_empty_rows = TRUE, - # skip = 0L, + skip = 0L, parse_options = NULL, convert_options = NULL, - read_options = csv_read_options(), + read_options = NULL, as_tibble = TRUE) { mc <- match.call() @@ -159,16 +157,16 @@ read_tsv_arrow <- function(file, quote = '"', escape_double = TRUE, escape_backslash = FALSE, - # col_names = TRUE, + col_names = TRUE, # col_types = TRUE, col_select = NULL, # na = c("", "NA"), # quoted_na = TRUE, skip_empty_rows = TRUE, - # skip = 0L, + skip = 0L, parse_options = NULL, convert_options = NULL, - read_options = csv_read_options(), + read_options = NULL, as_tibble = TRUE) { mc <- match.call() @@ -192,15 +190,25 @@ read_tsv_arrow <- function(file, #' Read options for the Arrow file readers #' #' @param use_threads Whether to use the global CPU thread pool -#' @param block_size Block size we request from the IO layer; also determines the size of chunks when use_threads is `TRUE`. NB: if false, JSON input must end with an empty line +#' @param block_size Block size we request from the IO layer; also determines +#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input +#' must end with an empty line. +#' @param skip_rows Number of lines to skip before reading data. +#' @param column_names Character vector to supply column names. If length-0 +#' (the default), the first non-skipped row will be parsed to generate column +#' names. #' #' @export csv_read_options <- function(use_threads = option_use_threads(), - block_size = 1048576L) { + block_size = 1048576L, + skip_rows = 0L, + column_names = character(0)) { shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize( list( use_threads = use_threads, - block_size = block_size + block_size = block_size, + skip_rows = skip_rows, + column_names = column_names ) )) } @@ -209,8 +217,7 @@ readr_to_csv_parse_options <- function(delim = ",", quote = '"', escape_double = TRUE, escape_backslash = FALSE, - skip_empty_rows = TRUE, - skip = 0L) { + skip_empty_rows = TRUE) { # This function translates from the readr argument list to the arrow arg names # TODO: validate inputs csv_parse_options( @@ -221,8 +228,7 @@ readr_to_csv_parse_options <- function(delim = ",", escaping = escape_backslash, escape_char = '\\', newlines_in_values = escape_backslash, - ignore_empty_lines = skip_empty_rows, - header_rows = skip + ignore_empty_lines = skip_empty_rows ) } @@ -236,7 +242,6 @@ readr_to_csv_parse_options <- function(delim = ",", #' @param escape_char Escaping character (if `escaping` is `TRUE`) #' @param newlines_in_values Whether values are allowed to contain CR (`0x0d`) and LF (`0x0a`) characters #' @param ignore_empty_lines Whether empty lines are ignored. If `FALSE`, an empty line represents -#' @param header_rows Number of header rows to skip (including the first row containing column names) #' #' @export csv_parse_options <- function(delimiter = ",", @@ -246,8 +251,7 @@ csv_parse_options <- function(delimiter = ",", escaping = FALSE, escape_char = '\\', newlines_in_values = FALSE, - ignore_empty_lines = TRUE, - header_rows = 1L) { + ignore_empty_lines = TRUE) { shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize( list( @@ -258,8 +262,7 @@ csv_parse_options <- function(delimiter = ",", escaping = escaping, escape_char = escape_char, newlines_in_values = newlines_in_values, - ignore_empty_lines = ignore_empty_lines, - header_rows = header_rows + ignore_empty_lines = ignore_empty_lines ) )) } diff --git a/r/README.md b/r/README.md index 43280f33c1b..47458cf118f 100644 --- a/r/README.md +++ b/r/README.md @@ -48,14 +48,6 @@ library. ``` r library(arrow) -#> -#> Attaching package: 'arrow' -#> The following object is masked from 'package:utils': -#> -#> timestamp -#> The following objects are masked from 'package:base': -#> -#> array, table set.seed(24) tab <- arrow::table(x = 1:10, y = rnorm(10)) diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd index 17c5ba238ab..a46cfb3761e 100644 --- a/r/man/csv_parse_options.Rd +++ b/r/man/csv_parse_options.Rd @@ -8,7 +8,7 @@ csv_parse_options(delimiter = ",", quoting = TRUE, quote_char = "\\"", double_quote = TRUE, escaping = FALSE, escape_char = "\\\\", newlines_in_values = FALSE, - ignore_empty_lines = TRUE, header_rows = 1L) + ignore_empty_lines = TRUE) json_parse_options(newlines_in_values = FALSE) } @@ -28,8 +28,6 @@ json_parse_options(newlines_in_values = FALSE) \item{newlines_in_values}{Whether values are allowed to contain CR (\code{0x0d}) and LF (\code{0x0a}) characters} \item{ignore_empty_lines}{Whether empty lines are ignored. If \code{FALSE}, an empty line represents} - -\item{header_rows}{Number of header rows to skip (including the first row containing column names)} } \description{ Parsing options for Arrow file readers diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd index ddfc9d121ae..38b6e470d6e 100644 --- a/r/man/csv_read_options.Rd +++ b/r/man/csv_read_options.Rd @@ -6,14 +6,22 @@ \title{Read options for the Arrow file readers} \usage{ csv_read_options(use_threads = option_use_threads(), - block_size = 1048576L) + block_size = 1048576L, skip_rows = 0L, column_names = character(0)) json_read_options(use_threads = TRUE, block_size = 1048576L) } \arguments{ \item{use_threads}{Whether to use the global CPU thread pool} -\item{block_size}{Block size we request from the IO layer; also determines the size of chunks when use_threads is \code{TRUE}. NB: if false, JSON input must end with an empty line} +\item{block_size}{Block size we request from the IO layer; also determines +the size of chunks when use_threads is \code{TRUE}. NB: if \code{FALSE}, JSON input +must end with an empty line.} + +\item{skip_rows}{Number of lines to skip before reading data.} + +\item{column_names}{Character vector to supply column names. If length-0 +(the default), the first non-skipped row will be parsed to generate column +names.} } \description{ Read options for the Arrow file readers diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index ff732ae8004..0726889cdab 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -7,22 +7,20 @@ \title{Read a CSV or other delimited file with Arrow} \usage{ read_delim_arrow(file, delim = ",", quote = "\\"", - escape_double = TRUE, escape_backslash = FALSE, col_select = NULL, - skip_empty_rows = TRUE, parse_options = NULL, - convert_options = NULL, read_options = csv_read_options(), + escape_double = TRUE, escape_backslash = FALSE, col_names = TRUE, + col_select = NULL, skip_empty_rows = TRUE, skip = 0L, + parse_options = NULL, convert_options = NULL, read_options = NULL, as_tibble = TRUE) read_csv_arrow(file, quote = "\\"", escape_double = TRUE, - escape_backslash = FALSE, col_select = NULL, - skip_empty_rows = TRUE, parse_options = NULL, - convert_options = NULL, read_options = csv_read_options(), - as_tibble = TRUE) + escape_backslash = FALSE, col_names = TRUE, col_select = NULL, + skip_empty_rows = TRUE, skip = 0L, parse_options = NULL, + convert_options = NULL, read_options = NULL, as_tibble = TRUE) read_tsv_arrow(file, quote = "\\"", escape_double = TRUE, - escape_backslash = FALSE, col_select = NULL, - skip_empty_rows = TRUE, parse_options = NULL, - convert_options = NULL, read_options = csv_read_options(), - as_tibble = TRUE) + escape_backslash = FALSE, col_names = TRUE, col_select = NULL, + skip_empty_rows = TRUE, skip = 0L, parse_options = NULL, + convert_options = NULL, read_options = NULL, as_tibble = TRUE) } \arguments{ \item{file}{A character path to a local file, or an Arrow input stream} @@ -40,6 +38,11 @@ characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \code{\\n}.} +\item{col_names}{If \code{TRUE}, the first row of the input will be used as the +column names and will not be included in the data frame. (Note that \code{FALSE} +is not currently supported.) Alternatively, you can specify a character +vector of column names.} + \item{col_select}{A \link[tidyselect:vars_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} @@ -47,6 +50,8 @@ of columns, as used in \code{dplyr::select()}.} \code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be filled with missings.} +\item{skip}{Number of lines to skip before reading data.} + \item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides any parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} diff --git a/r/src/csv.cpp b/r/src/csv.cpp index bfcbae7a7cf..6165636d315 100644 --- a/r/src/csv.cpp +++ b/r/src/csv.cpp @@ -28,6 +28,8 @@ std::shared_ptr csv___ReadOptions__initialize(List_ opt std::make_shared(arrow::csv::ReadOptions::Defaults()); res->use_threads = options["use_threads"]; res->block_size = options["block_size"]; + res->skip_rows = options["skip_rows"]; + res->column_names = Rcpp::as>(options["column_names"]); return res; } @@ -43,7 +45,6 @@ std::shared_ptr csv___ParseOptions__initialize(List_ o res->double_quote = options["double_quote"]; res->escape_char = get_char(options["escape_char"]); res->newlines_in_values = options["newlines_in_values"]; - res->header_rows = options["header_rows"]; res->ignore_empty_lines = options["ignore_empty_lines"]; return res; } diff --git a/r/tests/testthat/test-arrow-csv.R b/r/tests/testthat/test-arrow-csv.R index aed96387a82..81e35b3a5db 100644 --- a/r/tests/testthat/test-arrow-csv.R +++ b/r/tests/testthat/test-arrow-csv.R @@ -81,29 +81,39 @@ test_that("read_delim_arrow parsing options: quote", { }) test_that("read_csv_arrow parsing options: col_names", { - skip("Invalid: Empty CSV file") tf <- tempfile() on.exit(unlink(tf)) + # Writing the CSV without the header write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE) - tab1 <- read_csv_arrow(tf, col_names = FALSE) + + expect_error(read_csv_arrow(tf, col_names = FALSE), "Not implemented") + + tab1 <- read_csv_arrow(tf, col_names = names(iris)) expect_identical(names(tab1), names(iris)) iris$Species <- as.character(iris$Species) expect_equivalent(iris, tab1) + + # This errors (correctly) because I haven't given enough names + # but the error message is "Invalid: Empty CSV file", which is not accurate + expect_error( + read_csv_arrow(tf, col_names = names(iris)[1]) + ) + # Same here + expect_error( + read_csv_arrow(tf, col_names = c(names(iris), names(iris))) + ) }) test_that("read_csv_arrow parsing options: skip", { - skip("Invalid: Empty CSV file") tf <- tempfile() on.exit(unlink(tf)) + # Adding two garbage lines to start the csv cat("asdf\nqwer\n", file = tf) suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append = TRUE)) - # This works: - # print(head(readr::read_csv(tf, skip = 2))) - # This errors: tab1 <- read_csv_arrow(tf, skip = 2) expect_identical(names(tab1), names(iris))