diff --git a/r/NAMESPACE b/r/NAMESPACE index 31f056226ce..3f880fb4d04 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -3,16 +3,12 @@ S3method("!=",Object) S3method("$",RecordBatch) S3method("$",Table) -S3method("==",Array) -S3method("==",DataType) -S3method("==",Field) -S3method("==",Message) -S3method("==",RecordBatch) -S3method("==",Schema) +S3method("==",Object) S3method("[",RecordBatch) S3method("[",Table) S3method("[[",RecordBatch) S3method("[[",Table) +S3method(all,equal.Object) S3method(as.data.frame,RecordBatch) S3method(as.data.frame,Table) S3method(as.raw,Buffer) @@ -84,6 +80,7 @@ export(MessageType) export(MockOutputStream) export(ParquetFileReader) export(ParquetReaderProperties) +export(ParquetVersionType) export(RandomAccessFile) export(ReadableFile) export(RecordBatchFileReader) @@ -173,6 +170,7 @@ importFrom(rlang,dots_n) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,is_false) +importFrom(rlang,is_integerish) importFrom(rlang,list2) importFrom(rlang,quo_is_null) importFrom(rlang,warn) diff --git a/r/R/array.R b/r/R/array.R index 3333218cd4d..2c50edb4b45 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -156,6 +156,3 @@ length.Array <- function(x) x$length() #' @export as.vector.Array <- function(x, mode) x$as_vector() - -#' @export -`==.Array` <- function(x, y) x$Equals(y) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7ce881f90b0..107541d64ca 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -18,7 +18,7 @@ #' @importFrom R6 R6Class #' @importFrom purrr map map_int map2 #' @importFrom assertthat assert_that -#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish #' @importFrom Rcpp sourceCpp #' @importFrom tidyselect vars_select #' @useDynLib arrow, .registration = TRUE @@ -66,6 +66,16 @@ Object <- R6Class("Object", #' @export `!=.Object` <- function(lhs, rhs) !(lhs == rhs) +#' 
@export +`==.Object` <- function(x, y) { + x$Equals(y) +} + +#' @export +all.equal.Object <- function(target, current, ...) { + target == current +} + shared_ptr <- function(class, xp) { if (!shared_ptr_is_null(xp)) class$new(xp) } diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 271fcd3c5de..40c05f9b6d0 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -180,6 +180,10 @@ Buffer__data <- function(buffer){ .Call(`_arrow_Buffer__data` , buffer) } +Buffer__Equals <- function(x, y){ + .Call(`_arrow_Buffer__Equals` , x, y) +} + ChunkedArray__length <- function(chunked_array){ .Call(`_arrow_ChunkedArray__length` , chunked_array) } @@ -220,10 +224,18 @@ ChunkedArray__Validate <- function(chunked_array){ invisible(.Call(`_arrow_ChunkedArray__Validate` , chunked_array)) } +ChunkedArray__Equals <- function(x, y){ + .Call(`_arrow_ChunkedArray__Equals` , x, y) +} + util___Codec__Create <- function(codec, compression_level){ .Call(`_arrow_util___Codec__Create` , codec, compression_level) } +util___Codec__name <- function(codec){ + .Call(`_arrow_util___Codec__name` , codec) +} + io___CompressedOutputStream__Make <- function(codec, raw){ .Call(`_arrow_io___CompressedOutputStream__Make` , codec, raw) } @@ -880,8 +892,108 @@ parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices){ .Call(`_arrow_parquet___arrow___FileReader__ReadTable2` , reader, column_indices) } -write_parquet_file <- function(table, filename){ - invisible(.Call(`_arrow_write_parquet_file` , table, filename)) +parquet___default_arrow_writer_properties <- function(){ + .Call(`_arrow_parquet___default_arrow_writer_properties` ) +} + +parquet___ArrowWriterProperties___Builder__create <- function(){ + .Call(`_arrow_parquet___ArrowWriterProperties___Builder__create` ) +} + +parquet___ArrowWriterProperties___Builder__store_schema <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__store_schema` , builder)) +} + 
+parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__coerce_timestamps <- function(builder, unit){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps` , builder, unit)) +} + +parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__build <- function(builder){ + .Call(`_arrow_parquet___ArrowWriterProperties___Builder__build` , builder) +} + +parquet___default_writer_properties <- function(){ + .Call(`_arrow_parquet___default_writer_properties` ) +} + +parquet___WriterProperties___Builder__create <- function(){ + .Call(`_arrow_parquet___WriterProperties___Builder__create` ) +} + +parquet___WriterProperties___Builder__version <- function(builder, version){ + invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version` , builder, version)) +} + +parquet___ArrowWriterProperties___Builder__default_compression <- function(builder, compression){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_compression` , builder, compression)) +} + +parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types){ + 
invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions` , builder, paths, types)) +} + +parquet___ArrowWriterProperties___Builder__default_compression_level <- function(builder, compression_level){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_compression_level` , builder, compression_level)) +} + +parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels` , builder, paths, levels)) +} + +parquet___ArrowWriterProperties___Builder__default_write_statistics <- function(builder, write_statistics){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics` , builder, write_statistics)) +} + +parquet___ArrowWriterProperties___Builder__default_use_dictionary <- function(builder, use_dictionary){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary` , builder, use_dictionary)) +} + +parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary` , builder, paths, use_dictionary)) +} + +parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics` , builder, paths, write_statistics)) +} + +parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size` , builder, data_page_size)) +} + +parquet___WriterProperties___Builder__build <- function(builder){ + .Call(`_arrow_parquet___WriterProperties___Builder__build` , builder) +} + +parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties){ 
+ .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open` , schema, sink, properties, arrow_properties) +} + +parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size){ + invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable` , writer, table, chunk_size)) +} + +parquet___arrow___FileWriter__Close <- function(writer){ + invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close` , writer)) +} + +parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties){ + invisible(.Call(`_arrow_parquet___arrow___WriteTable` , table, sink, properties, arrow_properties)) } parquet___arrow___FileReader__GetSchema <- function(reader){ @@ -1088,6 +1200,10 @@ Table__Slice2 <- function(table, offset, length){ .Call(`_arrow_Table__Slice2` , table, offset, length) } +Table__Equals <- function(lhs, rhs){ + .Call(`_arrow_Table__Equals` , lhs, rhs) +} + Table__GetColumnByName <- function(table, name){ .Call(`_arrow_Table__GetColumnByName` , table, name) } diff --git a/r/R/buffer.R b/r/R/buffer.R index d1f789175cc..0f11fdcf0a9 100644 --- a/r/R/buffer.R +++ b/r/R/buffer.R @@ -38,7 +38,8 @@ Buffer <- R6Class("Buffer", inherit = Object, public = list( ZeroPadding = function() Buffer__ZeroPadding(self), - data = function() Buffer__data(self) + data = function() Buffer__data(self), + Equals = function(other) Buffer__Equals(self, other) ), active = list( diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 3bf72985bd9..58f4b4e81d1 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -83,6 +83,9 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = Object, } } out + }, + Equals = function(other) { + ChunkedArray__Equals(self, other) } ), active = list( diff --git a/r/R/compression.R b/r/R/compression.R index 5fbe53e0c48..15375f4399c 100644 --- a/r/R/compression.R +++ b/r/R/compression.R @@ -36,18 +36,26 @@ #' @rdname Codec #' @name Codec #' @export -Codec <- R6Class("Codec", inherit = Object) +Codec <- R6Class("Codec", 
inherit = Object, + active = list( + name = function() util___Codec__name(self), + level = function() abort("Codec$level() not yet implemented") + ) +) Codec$create <- function(type = "gzip", compression_level = NA) { if (is.character(type)) { type <- unique_ptr(Codec, util___Codec__Create( - CompressionType[[match.arg(toupper(type), names(CompressionType))]], - compression_level + compression_from_name(type), compression_level )) } assert_is(type, "Codec") type } +compression_from_name <- function(name) { + map_int(name, ~CompressionType[[match.arg(toupper(.x), names(CompressionType))]]) +} + #' @title Compressed stream classes #' @rdname compression #' @name compression diff --git a/r/R/enums.R b/r/R/enums.R index ade6d8e94f3..cad25f882a1 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -84,3 +84,9 @@ CompressionType <- enum("Compression::type", FileType <- enum("FileType", NonExistent = 0L, Unknown = 1L, File = 2L, Directory = 3L ) + +#' @export +#' @rdname enums +ParquetVersionType <- enum("ParquetVersionType", + PARQUET_1_0 = 0L, PARQUET_2_0 = 1L +) diff --git a/r/R/feather.R b/r/R/feather.R index e1b74bdda9c..420307063ba 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -17,8 +17,8 @@ #' Write data in the Feather format #' -#' @param data `data.frame` or RecordBatch -#' @param stream A file path or an OutputStream +#' @param x `data.frame` or RecordBatch +#' @param sink A file path or an OutputStream #' #' @export #' @examples @@ -30,20 +30,20 @@ #' }) #' } #' @include arrow-package.R -write_feather <- function(data, stream) { - if (is.data.frame(data)) { - data <- record_batch(data) +write_feather <- function(x, sink) { + if (is.data.frame(x)) { + x <- record_batch(x) } - assert_is(data, "RecordBatch") + assert_is(x, "RecordBatch") - if (is.character(stream)) { - stream <- FileOutputStream$create(stream) - on.exit(stream$close()) + if (is.character(sink)) { + sink <- FileOutputStream$create(sink) + on.exit(sink$close()) } - assert_is(stream, "OutputStream") + 
assert_is(sink, "OutputStream") - writer <- FeatherTableWriter$create(stream) - ipc___TableWriter__RecordBatch__WriteFeather(writer, data) + writer <- FeatherTableWriter$create(sink) + ipc___TableWriter__RecordBatch__WriteFeather(writer, x) } #' @title FeatherTableWriter class diff --git a/r/R/field.R b/r/R/field.R index 152099c0d10..18337a15659 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -67,11 +67,6 @@ Field$create <- function(name, type, metadata) { shared_ptr(Field, Field__initialize(name, type, TRUE)) } -#' @export -`==.Field` <- function(lhs, rhs){ - lhs$Equals(rhs) -} - #' @param name field name #' @param type logical type, instance of [DataType] #' @param metadata currently ignored diff --git a/r/R/message.R b/r/R/message.R index 701d157fd43..51e0f965e27 100644 --- a/r/R/message.R +++ b/r/R/message.R @@ -45,9 +45,6 @@ Message <- R6Class("Message", inherit = Object, ) ) -#' @export -`==.Message` <- function(x, y) x$Equals(y) - #' @title class arrow::MessageReader #' #' @usage NULL diff --git a/r/R/parquet.R b/r/R/parquet.R index d36e5c33dd7..706494ab37f 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. - #' Read a Parquet file #' #' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format. @@ -34,10 +33,10 @@ #' } #' @export read_parquet <- function(file, - col_select = NULL, - as_data_frame = TRUE, - props = ParquetReaderProperties$create(), - ...) { + col_select = NULL, + as_data_frame = TRUE, + props = ParquetReaderProperties$create(), + ...) { reader <- ParquetFileReader$create(file, props = props, ...) tab <- reader$ReadTable(!!enquo(col_select)) @@ -47,6 +46,285 @@ read_parquet <- function(file, tab } +#' Write Parquet file to disk +#' +#' [Parquet](https://parquet.apache.org/) is a columnar storage file format. +#' This function enables you to write Parquet files from R. 
+#' +#' @param x An [arrow::Table][Table], or an object convertible to it. +#' @param sink an [arrow::io::OutputStream][OutputStream] or a string which is interpreted as a file path +#' @param chunk_size chunk size in number of rows. If NULL, the total number of rows is used. +#' +#' @param version parquet version, "1.0" or "2.0". +#' @param compression compression algorithm. No compression by default. +#' @param compression_level compression level. +#' @param use_dictionary Specify if we should use dictionary encoding. +#' @param write_statistics Specify if we should write statistics +#' @param data_page_size Set a target threshold for the approximate encoded size of data +#' pages within a column chunk. If omitted, the default data page size (1Mb) is used. +#' @param properties properties for parquet writer, derived from arguments +#' `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics` and `data_page_size` +#' +#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format +#' @param coerce_timestamps Cast timestamps to a particular resolution. Can be NULL, "ms" or "us" +#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a particular +#' resolution. E.g. if microsecond or nanosecond data is lost when coercing to +#' 'ms', do not raise an exception +#' +#' @param arrow_properties arrow specific writer properties, derived from +#' arguments `use_deprecated_int96_timestamps`, `coerce_timestamps` and `allow_truncated_timestamps` +#' +#' @details The parameters `compression`, `compression_level`, `use_dictionary` and `write_statistics` support +#' various patterns: +#' - The default `NULL` leaves the parameter unspecified, and the C++ library uses an appropriate default for +#' each column +#' - A single, unnamed, value (e.g. 
a single string for `compression`) applies to all columns +#' - An unnamed vector, of the same size as the number of columns, to specify a value for each column, in +#' positional order +#' - A named vector, to specify the value for the named columns, the default value for the setting is used +#' when not supplied. +#' +#' @return NULL, invisibly +#' +#' @examples +#' \donttest{ +#' tf1 <- tempfile(fileext = ".parquet") +#' write_parquet(data.frame(x = 1:5), tf1) +#' +#' # using compression +#' tf2 <- tempfile(fileext = ".gz.parquet") +#' write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5) +#' +#' } +#' @export +write_parquet <- function(x, + sink, + chunk_size = NULL, + + # writer properties + version = NULL, + compression = NULL, + compression_level = NULL, + use_dictionary = NULL, + write_statistics = NULL, + data_page_size = NULL, + + properties = ParquetWriterProperties$create( + x, + version = version, + compression = compression, + compression_level = compression_level, + use_dictionary = use_dictionary, + write_statistics = write_statistics, + data_page_size = data_page_size + ), + + # arrow writer properties + use_deprecated_int96_timestamps = FALSE, + coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + + arrow_properties = ParquetArrowWriterProperties$create( + use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, + coerce_timestamps = coerce_timestamps, + allow_truncated_timestamps = allow_truncated_timestamps + ) +) { + x <- to_arrow(x) + + if (is.character(sink)) { + sink <- FileOutputStream$create(sink) + on.exit(sink$close()) + } else if (!inherits(sink, "OutputStream")) { + abort("sink must be a file path or an OutputStream") + } + + schema <- x$schema + writer <- ParquetFileWriter$create(schema, sink, properties = properties, arrow_properties = arrow_properties) + writer$WriteTable(x, chunk_size = chunk_size %||% x$num_rows) + writer$Close() +} + + +ParquetArrowWriterPropertiesBuilder <- 
R6Class("ParquetArrowWriterPropertiesBuilder", inherit = Object, + public = list( + store_schema = function() { + parquet___ArrowWriterProperties___Builder__store_schema(self) + self + }, + set_int96_support = function(use_deprecated_int96_timestamps = FALSE) { + if (use_deprecated_int96_timestamps) { + parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(self) + } else { + parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(self) + } + self + }, + set_coerce_timestamps = function(coerce_timestamps = NULL) { + if (!is.null(coerce_timestamps)) { + unit <- make_valid_time_unit(coerce_timestamps, + c("ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO) + ) + parquet___ArrowWriterProperties___Builder__coerce_timestamps(self, unit) + } + self + }, + set_allow_truncated_timestamps = function(allow_truncated_timestamps = FALSE) { + if (allow_truncated_timestamps) { + parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(self) + } else { + parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(self) + } + + self + } + + ) +) +ParquetArrowWriterProperties <- R6Class("ParquetArrowWriterProperties", inherit = Object) + +ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, allow_truncated_timestamps = FALSE) { + if (!use_deprecated_int96_timestamps && is.null(coerce_timestamps) && !allow_truncated_timestamps) { + shared_ptr(ParquetArrowWriterProperties, parquet___default_arrow_writer_properties()) + } else { + builder <- shared_ptr(ParquetArrowWriterPropertiesBuilder, parquet___ArrowWriterProperties___Builder__create()) + builder$store_schema() + builder$set_int96_support(use_deprecated_int96_timestamps) + builder$set_coerce_timestamps(coerce_timestamps) + builder$set_allow_truncated_timestamps(allow_truncated_timestamps) + shared_ptr(ParquetArrowWriterProperties, parquet___ArrowWriterProperties___Builder__build(builder)) + } +} + 
+valid_parquet_version <- c( + "1.0" = ParquetVersionType$PARQUET_1_0, + "2.0" = ParquetVersionType$PARQUET_2_0 +) + +make_valid_version <- function(version, valid_versions = valid_parquet_version) { + if (is_integerish(version)) { + version <- as.character(version) + } + tryCatch( + valid_versions[[match.arg(version, choices = names(valid_versions))]], + error = function(cond) { + stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call.=FALSE) + } + ) +} + +ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = Object) +ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder", inherit = Object, + public = list( + set_version = function(version) { + parquet___ArrowWriterProperties___Builder__version(self, make_valid_version(version)) + }, + + set_compression = function(table, compression){ + private$.set(table, compression_from_name(compression), "compression", is.integer, + parquet___ArrowWriterProperties___Builder__default_compression, + parquet___ArrowWriterProperties___Builder__set_compressions + ) + }, + + set_compression_level = function(table, compression_level){ + private$.set(table, compression_level, "compression_level", is_integerish, + parquet___ArrowWriterProperties___Builder__default_compression_level, + parquet___ArrowWriterProperties___Builder__set_compression_levels + ) + }, + + set_dictionary = function(table, use_dictionary) { + private$.set(table, use_dictionary, "use_dictionary", is.logical, + parquet___ArrowWriterProperties___Builder__default_use_dictionary, + parquet___ArrowWriterProperties___Builder__set_use_dictionary + ) + }, + + set_write_statistics = function(table, write_statistics) { + private$.set(table, write_statistics, "write_statistics", is.logical, + parquet___ArrowWriterProperties___Builder__default_write_statistics, + parquet___ArrowWriterProperties___Builder__set_write_statistics + ) + }, + + set_data_page_size = function(data_page_size) { + 
parquet___ArrowWriterProperties___Builder__data_page_size(self, data_page_size) + } + ), + + private = list( + .set = function(table, value, name, is, default, multiple) { + msg <- paste0("unsupported ", name, "= specification") + assert_that(is(value), msg = msg) + column_names <- names(table) + if (is.null(given_names <- names(value))) { + if (length(value) == 1L) { + default(self, value) + } else if (length(value) == length(column_names)) { + multiple(self, column_names, value) + } + } else if(all(given_names %in% column_names)) { + multiple(self, given_names, value) + } else { + abort(msg) + } + } + ) + +) + +ParquetWriterProperties$create <- function(table, version = NULL, compression = NULL, compression_level = NULL, use_dictionary = NULL, write_statistics = NULL, data_page_size = NULL) { + if (is.null(version) && is.null(compression) && is.null(compression_level) && is.null(use_dictionary) && is.null(write_statistics) && is.null(data_page_size)) { + shared_ptr(ParquetWriterProperties, parquet___default_writer_properties()) + } else { + builder <- shared_ptr(ParquetWriterPropertiesBuilder, parquet___WriterProperties___Builder__create()) + if (!is.null(version)) { + builder$set_version(version) + } + if (!is.null(compression)) { + builder$set_compression(table, compression = compression) + } + if (!is.null(compression_level)) { + builder$set_compression_level(table, compression_level = compression_level) + } + if (!is.null(use_dictionary)) { + builder$set_dictionary(table, use_dictionary) + } + if (!is.null(write_statistics)) { + builder$set_write_statistics(table, write_statistics) + } + if (!is.null(data_page_size)) { + builder$set_data_page_size(data_page_size) + } + shared_ptr(ParquetWriterProperties, parquet___WriterProperties___Builder__build(builder)) + } +} + +ParquetFileWriter <- R6Class("ParquetFileWriter", inherit = Object, + public = list( + WriteTable = function(table, chunk_size) { + parquet___arrow___FileWriter__WriteTable(self, table, 
chunk_size) + }, + Close = function() { + parquet___arrow___FileWriter__Close(self) + } + ) + +) +ParquetFileWriter$create <- function( + schema, + sink, + properties = ParquetWriterProperties$create(), + arrow_properties = ParquetArrowWriterProperties$create() +) { + unique_ptr( + ParquetFileWriter, + parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties) + ) +} + + #' @title ParquetFileReader class #' @rdname ParquetFileReader #' @name ParquetFileReader @@ -162,23 +440,3 @@ ParquetReaderProperties$create <- function(use_threads = option_use_threads()) { parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads)) ) } - - -#' Write Parquet file to disk -#' -#' [Parquet](https://parquet.apache.org/) is a columnar storage file format. -#' This function enables you to write Parquet files from R. -#' -#' @param table An [arrow::Table][Table], or an object convertible to it -#' @param file a file path -#' -#' @examples -#' \donttest{ -#' tf <- tempfile(fileext = ".parquet") -#' on.exit(unlink(tf)) -#' write_parquet(tibble::tibble(x = 1:5), tf) -#' } -#' @export -write_parquet <- function(table, file) { - write_parquet_file(to_arrow(table), file) -} diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 3814a2a2b78..6dcb18f1c3e 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -163,11 +163,6 @@ names.RecordBatch <- function(x) { x$names() } -#' @export -`==.RecordBatch` <- function(x, y) { - x$Equals(y) -} - #' @importFrom methods as #' @export `[.RecordBatch` <- function(x, i, j, ..., drop = FALSE) { diff --git a/r/R/schema.R b/r/R/schema.R index a2ee00c0ac6..11230158e77 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -62,9 +62,6 @@ Schema <- R6Class("Schema", Schema$create <- function(...) shared_ptr(Schema, schema_(.fields(list2(...)))) -#' @export -`==.Schema` <- function(lhs, rhs) lhs$Equals(rhs) - #' @param ... 
named list of [data types][data-type] #' @export #' @rdname Schema diff --git a/r/R/table.R b/r/R/table.R index bd3f447c84e..b3175e57941 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -124,12 +124,17 @@ Table <- R6Class("Table", inherit = Object, shared_ptr(Table, Table__select(self, indices)) } }, + Slice = function(offset, length = NULL) { if (is.null(length)) { shared_ptr(Table, Table__Slice1(self, offset)) } else { shared_ptr(Table, Table__Slice2(self, offset, length)) } + }, + + Equals = function(other) { + Table__Equals(self, other) } ), diff --git a/r/R/type.R b/r/R/type.R index 1e130787c68..51601e46790 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -128,9 +128,6 @@ FixedWidthType <- R6Class("FixedWidthType", ) ) -#' @export -`==.DataType` <- function(lhs, rhs) lhs$Equals(rhs) - Int8 <- R6Class("Int8", inherit = FixedWidthType) Int16 <- R6Class("Int16", inherit = FixedWidthType) Int32 <- R6Class("Int32", inherit = FixedWidthType) diff --git a/r/R/write-arrow.R b/r/R/write-arrow.R index dbab158204f..3903b1901cf 100644 --- a/r/R/write-arrow.R +++ b/r/R/write-arrow.R @@ -30,7 +30,7 @@ to_arrow.data.frame <- function(x) Table$create(!!!x) #' #' @param x an [arrow::Table][Table], an [arrow::RecordBatch][RecordBatch] or a data.frame #' -#' @param stream where to serialize to +#' @param sink where to serialize to #' #' - A [arrow::RecordBatchWriter][RecordBatchWriter]: the `$write()` #' of `x` is used. The stream is left open. This uses the streaming format @@ -50,20 +50,20 @@ to_arrow.data.frame <- function(x) Table$create(!!!x) #' and [arrow::RecordBatchStreamWriter][RecordBatchStreamWriter] can be used for more flexibility. #' #' @export -write_arrow <- function(x, stream, ...) { - UseMethod("write_arrow", stream) +write_arrow <- function(x, sink, ...) 
{ + UseMethod("write_arrow", sink) } #' @export -write_arrow.RecordBatchWriter <- function(x, stream, ...){ - stream$write(x) +write_arrow.RecordBatchWriter <- function(x, sink, ...){ + sink$write(x) } #' @export -write_arrow.character <- function(x, stream, ...) { - assert_that(length(stream) == 1L) +write_arrow.character <- function(x, sink, ...) { + assert_that(length(sink) == 1L) x <- to_arrow(x) - file_stream <- FileOutputStream$create(stream) + file_stream <- FileOutputStream$create(sink) on.exit(file_stream$close()) file_writer <- RecordBatchFileWriter$create(file_stream, x$schema) on.exit({ @@ -77,7 +77,7 @@ write_arrow.character <- function(x, stream, ...) { } #' @export -write_arrow.raw <- function(x, stream, ...) { +write_arrow.raw <- function(x, sink, ...) { x <- to_arrow(x) schema <- x$schema diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 3d841fa0c64..7f7358f760b 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -11,6 +11,7 @@ \alias{MessageType} \alias{CompressionType} \alias{FileType} +\alias{ParquetVersionType} \title{Arrow enums} \format{An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.} \usage{ @@ -29,6 +30,8 @@ MessageType CompressionType FileType + +ParquetVersionType } \description{ Arrow enums diff --git a/r/man/write_arrow.Rd b/r/man/write_arrow.Rd index 1820e0e1536..c4d67033fbe 100644 --- a/r/man/write_arrow.Rd +++ b/r/man/write_arrow.Rd @@ -4,12 +4,12 @@ \alias{write_arrow} \title{Write Arrow formatted data} \usage{ -write_arrow(x, stream, ...) +write_arrow(x, sink, ...) } \arguments{ \item{x}{an \link[=Table]{arrow::Table}, an \link[=RecordBatch]{arrow::RecordBatch} or a data.frame} -\item{stream}{where to serialize to +\item{sink}{where to serialize to \itemize{ \item A \link[=RecordBatchWriter]{arrow::RecordBatchWriter}: the \code{$write()} of \code{x} is used. The stream is left open. 
This uses the streaming format diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 24636a09cb0..9bc37975281 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -4,12 +4,12 @@ \alias{write_feather} \title{Write data in the Feather format} \usage{ -write_feather(data, stream) +write_feather(x, sink) } \arguments{ -\item{data}{\code{data.frame} or RecordBatch} +\item{x}{\code{data.frame} or RecordBatch} -\item{stream}{A file path or an OutputStream} +\item{sink}{A file path or an OutputStream} } \description{ Write data in the Feather format diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index b0fb7bc6761..b2471d7a5b7 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -4,21 +4,79 @@ \alias{write_parquet} \title{Write Parquet file to disk} \usage{ -write_parquet(table, file) +write_parquet(x, sink, chunk_size = NULL, version = NULL, + compression = NULL, compression_level = NULL, + use_dictionary = NULL, write_statistics = NULL, + data_page_size = NULL, properties = ParquetWriterProperties$create(x, + version = version, compression = compression, compression_level = + compression_level, use_dictionary = use_dictionary, write_statistics = + write_statistics, data_page_size = data_page_size), + use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps + = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, + allow_truncated_timestamps = allow_truncated_timestamps)) } \arguments{ -\item{table}{An \link[=Table]{arrow::Table}, or an object convertible to it} +\item{x}{An \link[=Table]{arrow::Table}, or an object convertible to it.} -\item{file}{a file path} +\item{sink}{an \link[=OutputStream]{arrow::io::OutputStream} or a string which is interpreted as a file path} + +\item{chunk_size}{chunk size in number of rows. 
If NULL, the total number of rows is used.} + +\item{version}{parquet version, "1.0" or "2.0".} + +\item{compression}{compression algorithm. No compression by default.} + +\item{compression_level}{compression level.} + +\item{use_dictionary}{Specify if we should use dictionary encoding.} + +\item{write_statistics}{Specify if we should write statistics} + +\item{data_page_size}{Set a target threshold for the approximate encoded size of data +pages within a column chunk. If omitted, the default data page size (1Mb) is used.} + +\item{properties}{properties for parquet writer, derived from arguments +\code{version}, \code{compression}, \code{compression_level}, \code{use_dictionary}, \code{write_statistics} and \code{data_page_size}} + +\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format} + +\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be NULL, "ms" or "us"} + +\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a particular +resolution. E.g. if microsecond or nanosecond data is lost when coercing to +'ms', do not raise an exception} + +\item{arrow_properties}{arrow specific writer properties, derived from +arguments \code{use_deprecated_int96_timestamps}, \code{coerce_timestamps} and \code{allow_truncated_timestamps}} +} +\value{ +NULL, invisibly +} \description{ \href{https://parquet.apache.org/}{Parquet} is a columnar storage file format. This function enables you to write Parquet files from R. } +\details{ +The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and \code{write_statistics} support +various patterns: +- The default \code{NULL} leaves the parameter unspecified, and the C++ library uses an appropriate default for +each column +- A single, unnamed, value (e.g. 
a single string for \code{compression}) applies to all columns +- An unnamed vector, of the same size as the number of columns, to specify a value for each column, in +positional order +- A named vector, to specify the value for the named columns, the default value for the setting is used +when not supplied. +} \examples{ \donttest{ -tf <- tempfile(fileext = ".parquet") -on.exit(unlink(tf)) -write_parquet(tibble::tibble(x = 1:5), tf) +tf1 <- tempfile(fileext = ".parquet") +write_parquet(data.frame(x = 1:5), tf1) + +# using compression +tf2 <- tempfile(fileext = ".gz.parquet") +write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5) + } } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 10a657514e7..b00eca925a4 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -701,6 +701,22 @@ RcppExport SEXP _arrow_Buffer__data(SEXP buffer_sexp){ } #endif +// buffer.cpp +#if defined(ARROW_R_WITH_ARROW) +bool Buffer__Equals(const std::shared_ptr& x, const std::shared_ptr& y); +RcppExport SEXP _arrow_Buffer__Equals(SEXP x_sexp, SEXP y_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type x(x_sexp); + Rcpp::traits::input_parameter&>::type y(y_sexp); + return Rcpp::wrap(Buffer__Equals(x, y)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Buffer__Equals(SEXP x_sexp, SEXP y_sexp){ + Rf_error("Cannot call Buffer__Equals(). Please use arrow::install_arrow() to install required runtime libraries.
"); +} +#endif + // chunkedarray.cpp #if defined(ARROW_R_WITH_ARROW) int ChunkedArray__length(const std::shared_ptr& chunked_array); @@ -857,6 +873,22 @@ RcppExport SEXP _arrow_ChunkedArray__Validate(SEXP chunked_array_sexp){ } #endif +// chunkedarray.cpp +#if defined(ARROW_R_WITH_ARROW) +bool ChunkedArray__Equals(const std::shared_ptr& x, const std::shared_ptr& y); +RcppExport SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type x(x_sexp); + Rcpp::traits::input_parameter&>::type y(y_sexp); + return Rcpp::wrap(ChunkedArray__Equals(x, y)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){ + Rf_error("Cannot call ChunkedArray__Equals(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // compression.cpp #if defined(ARROW_R_WITH_ARROW) std::unique_ptr util___Codec__Create(arrow::Compression::type codec, int compression_level); @@ -873,6 +905,21 @@ RcppExport SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_le } #endif +// compression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::string util___Codec__name(const std::unique_ptr& codec); +RcppExport SEXP _arrow_util___Codec__name(SEXP codec_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type codec(codec_sexp); + return Rcpp::wrap(util___Codec__name(codec)); +END_RCPP +} +#else +RcppExport SEXP _arrow_util___Codec__name(SEXP codec_sexp){ + Rf_error("Cannot call util___Codec__name(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // compression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr io___CompressedOutputStream__Make(const std::unique_ptr& codec, const std::shared_ptr& raw); @@ -3416,18 +3463,429 @@ RcppExport SEXP _arrow_parquet___arrow___FileReader__ReadTable2(SEXP reader_sexp // parquet.cpp #if defined(ARROW_R_WITH_ARROW) -void write_parquet_file(const std::shared_ptr& table, std::string filename); -RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){ +std::shared_ptr parquet___default_arrow_writer_properties(); +RcppExport SEXP _arrow_parquet___default_arrow_writer_properties(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___default_arrow_writer_properties()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___default_arrow_writer_properties(){ + Rf_error("Cannot call parquet___default_arrow_writer_properties(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___ArrowWriterProperties___Builder__create(); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__create(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___ArrowWriterProperties___Builder__create()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__create(){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__create(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__store_schema(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__store_schema(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__store_schema(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__store_schema(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__store_schema(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__coerce_timestamps(const std::shared_ptr& builder, arrow::TimeUnit::type unit); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps(SEXP builder_sexp, SEXP unit_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type unit(unit_sexp); + parquet___ArrowWriterProperties___Builder__coerce_timestamps(builder, unit); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps(SEXP builder_sexp, SEXP unit_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__coerce_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___ArrowWriterProperties___Builder__build(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__build(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + return Rcpp::wrap(parquet___ArrowWriterProperties___Builder__build(builder)); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__build(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__build(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___default_writer_properties(); +RcppExport SEXP _arrow_parquet___default_writer_properties(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___default_writer_properties()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___default_writer_properties(){ + Rf_error("Cannot call parquet___default_writer_properties(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___WriterProperties___Builder__create(); +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__create(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___WriterProperties___Builder__create()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__create(){ + Rf_error("Cannot call parquet___WriterProperties___Builder__create(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___WriterProperties___Builder__version(const std::shared_ptr& builder, const parquet::ParquetVersion::type& version); +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__version(SEXP builder_sexp, SEXP version_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type version(version_sexp); + parquet___WriterProperties___Builder__version(builder, version); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__version(SEXP builder_sexp, SEXP version_sexp){ + Rf_error("Cannot call parquet___WriterProperties___Builder__version(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_compression(const std::shared_ptr& builder, const arrow::Compression::type& compression); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression(SEXP builder_sexp, SEXP compression_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type compression(compression_sexp); + parquet___ArrowWriterProperties___Builder__default_compression(builder, compression); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression(SEXP builder_sexp, SEXP compression_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_compression(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_compressions(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::IntegerVector& types); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compressions(SEXP builder_sexp, SEXP paths_sexp, SEXP types_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type types(types_sexp); + parquet___ArrowWriterProperties___Builder__set_compressions(builder, paths, types); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compressions(SEXP builder_sexp, SEXP paths_sexp, SEXP types_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_compressions(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_compression_level(const std::shared_ptr& builder, int compression_level); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression_level(SEXP builder_sexp, SEXP compression_level_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type compression_level(compression_level_sexp); + parquet___ArrowWriterProperties___Builder__default_compression_level(builder, compression_level); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression_level(SEXP builder_sexp, SEXP compression_level_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_compression_level(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_compression_levels(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::IntegerVector& levels); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels(SEXP builder_sexp, SEXP paths_sexp, SEXP levels_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type levels(levels_sexp); + parquet___ArrowWriterProperties___Builder__set_compression_levels(builder, paths, levels); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels(SEXP builder_sexp, SEXP paths_sexp, SEXP levels_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_compression_levels(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_write_statistics(const std::shared_ptr& builder, bool write_statistics); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics(SEXP builder_sexp, SEXP write_statistics_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type write_statistics(write_statistics_sexp); + parquet___ArrowWriterProperties___Builder__default_write_statistics(builder, write_statistics); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics(SEXP builder_sexp, SEXP write_statistics_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_write_statistics(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_use_dictionary(const std::shared_ptr& builder, bool use_dictionary); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary(SEXP builder_sexp, SEXP use_dictionary_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type use_dictionary(use_dictionary_sexp); + parquet___ArrowWriterProperties___Builder__default_use_dictionary(builder, use_dictionary); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary(SEXP builder_sexp, SEXP use_dictionary_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_use_dictionary(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_use_dictionary(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::LogicalVector& use_dictionary); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary(SEXP builder_sexp, SEXP paths_sexp, SEXP use_dictionary_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type use_dictionary(use_dictionary_sexp); + parquet___ArrowWriterProperties___Builder__set_use_dictionary(builder, paths, use_dictionary); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary(SEXP builder_sexp, SEXP paths_sexp, SEXP use_dictionary_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_use_dictionary(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_write_statistics(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::LogicalVector& write_statistics); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics(SEXP builder_sexp, SEXP paths_sexp, SEXP write_statistics_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type write_statistics(write_statistics_sexp); + parquet___ArrowWriterProperties___Builder__set_write_statistics(builder, paths, write_statistics); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics(SEXP builder_sexp, SEXP paths_sexp, SEXP write_statistics_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_write_statistics(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__data_page_size(const std::shared_ptr& builder, int64_t data_page_size); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__data_page_size(SEXP builder_sexp, SEXP data_page_size_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type data_page_size(data_page_size_sexp); + parquet___ArrowWriterProperties___Builder__data_page_size(builder, data_page_size); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__data_page_size(SEXP builder_sexp, SEXP data_page_size_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__data_page_size(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___WriterProperties___Builder__build(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__build(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + return Rcpp::wrap(parquet___WriterProperties___Builder__build(builder)); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__build(SEXP builder_sexp){ + Rf_error("Cannot call parquet___WriterProperties___Builder__build(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::unique_ptr parquet___arrow___ParquetFileWriter__Open(const std::shared_ptr& schema, const std::shared_ptr& sink, const std::shared_ptr& properties, const std::shared_ptr& arrow_properties); +RcppExport SEXP _arrow_parquet___arrow___ParquetFileWriter__Open(SEXP schema_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type schema(schema_sexp); + Rcpp::traits::input_parameter&>::type sink(sink_sexp); + Rcpp::traits::input_parameter&>::type properties(properties_sexp); + Rcpp::traits::input_parameter&>::type arrow_properties(arrow_properties_sexp); + return Rcpp::wrap(parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties)); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___arrow___ParquetFileWriter__Open(SEXP schema_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ + Rf_error("Cannot call parquet___arrow___ParquetFileWriter__Open(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___arrow___FileWriter__WriteTable(const std::unique_ptr& writer, const std::shared_ptr& table, int64_t chunk_size); +RcppExport SEXP _arrow_parquet___arrow___FileWriter__WriteTable(SEXP writer_sexp, SEXP table_sexp, SEXP chunk_size_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type writer(writer_sexp); + Rcpp::traits::input_parameter&>::type table(table_sexp); + Rcpp::traits::input_parameter::type chunk_size(chunk_size_sexp); + parquet___arrow___FileWriter__WriteTable(writer, table, chunk_size); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___arrow___FileWriter__WriteTable(SEXP writer_sexp, SEXP table_sexp, SEXP chunk_size_sexp){ + Rf_error("Cannot call parquet___arrow___FileWriter__WriteTable(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___arrow___FileWriter__Close(const std::unique_ptr& writer); +RcppExport SEXP _arrow_parquet___arrow___FileWriter__Close(SEXP writer_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type writer(writer_sexp); + parquet___arrow___FileWriter__Close(writer); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___arrow___FileWriter__Close(SEXP writer_sexp){ + Rf_error("Cannot call parquet___arrow___FileWriter__Close(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___arrow___WriteTable(const std::shared_ptr& table, const std::shared_ptr& sink, const std::shared_ptr& properties, const std::shared_ptr& arrow_properties); +RcppExport SEXP _arrow_parquet___arrow___WriteTable(SEXP table_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter&>::type table(table_sexp); - Rcpp::traits::input_parameter::type filename(filename_sexp); - write_parquet_file(table, filename); + Rcpp::traits::input_parameter&>::type sink(sink_sexp); + Rcpp::traits::input_parameter&>::type properties(properties_sexp); + Rcpp::traits::input_parameter&>::type arrow_properties(arrow_properties_sexp); + parquet___arrow___WriteTable(table, sink, properties, arrow_properties); return R_NilValue; END_RCPP } #else -RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){ - Rf_error("Cannot call write_parquet_file(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_parquet___arrow___WriteTable(SEXP table_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ + Rf_error("Cannot call parquet___arrow___WriteTable(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif @@ -4225,6 +4683,22 @@ RcppExport SEXP _arrow_Table__Slice2(SEXP table_sexp, SEXP offset_sexp, SEXP len } #endif +// table.cpp +#if defined(ARROW_R_WITH_ARROW) +bool Table__Equals(const std::shared_ptr& lhs, const std::shared_ptr& rhs); +RcppExport SEXP _arrow_Table__Equals(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter&>::type rhs(rhs_sexp); + return Rcpp::wrap(Table__Equals(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Table__Equals(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call Table__Equals(). 
Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // table.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Table__GetColumnByName(const std::shared_ptr& table, const std::string& name); @@ -4361,6 +4835,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, + { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, @@ -4371,7 +4846,9 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ChunkArray__Slice2", (DL_FUNC) &_arrow_ChunkArray__Slice2, 3}, { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, + { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, + { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, { "_arrow_compute___CastOptions__initialize", (DL_FUNC) &_arrow_compute___CastOptions__initialize, 3}, @@ -4536,7 +5013,32 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, { 
"_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, - { "_arrow_write_parquet_file", (DL_FUNC) &_arrow_write_parquet_file, 2}, + { "_arrow_parquet___default_arrow_writer_properties", (DL_FUNC) &_arrow_parquet___default_arrow_writer_properties, 0}, + { "_arrow_parquet___ArrowWriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__create, 0}, + { "_arrow_parquet___ArrowWriterProperties___Builder__store_schema", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__store_schema, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__build, 1}, + { "_arrow_parquet___default_writer_properties", (DL_FUNC) &_arrow_parquet___default_writer_properties, 0}, + { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, + { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 
2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_compression", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_compression, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_compression_level", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_compression_level, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, + { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, + { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, + { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, + { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, + { 
"_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, @@ -4588,6 +5090,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, + { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 2}, { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, { "_arrow_Table__select", (DL_FUNC) &_arrow_Table__select, 2}, { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 2}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index f8c52fa0716..69f5cb39e62 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -207,6 +207,7 @@ RCPP_EXPOSED_ENUM_NODECL(arrow::io::FileMode::type) RCPP_EXPOSED_ENUM_NODECL(arrow::ipc::Message::Type) RCPP_EXPOSED_ENUM_NODECL(arrow::Compression::type) RCPP_EXPOSED_ENUM_NODECL(arrow::fs::FileType) +RCPP_EXPOSED_ENUM_NODECL(parquet::ParquetVersion::type) SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array); SEXP Array__as_vector(const std::shared_ptr& array); diff --git a/r/src/buffer.cpp b/r/src/buffer.cpp index 00df28d12ea..09ab39a5f98 100644 --- a/r/src/buffer.cpp +++ b/r/src/buffer.cpp @@ -62,4 +62,10 @@ Rcpp::RawVector Buffer__data(const std::shared_ptr& buffer) { return Rcpp::RawVector(buffer->data(), buffer->data() + buffer->size()); } +// [[arrow::export]] +bool Buffer__Equals(const std::shared_ptr& x, + const std::shared_ptr& y) { + return x->Equals(*y.get()); +} + #endif diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index 317728757a7..aef2a0eca21 100644 --- 
a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -80,4 +80,10 @@ void ChunkedArray__Validate(const std::shared_ptr& chunked_ STOP_IF_NOT_OK(chunked_array->Validate()); } +// [[arrow::export]] +bool ChunkedArray__Equals(const std::shared_ptr& x, + const std::shared_ptr& y) { + return x->Equals(y); +} + #endif diff --git a/r/src/compression.cpp b/r/src/compression.cpp index 4f9bc1772f4..4e6ec3105d7 100644 --- a/r/src/compression.cpp +++ b/r/src/compression.cpp @@ -27,6 +27,11 @@ std::unique_ptr util___Codec__Create(arrow::Compression::typ return out; } +// [[arrow::export]] +std::string util___Codec__name(const std::unique_ptr& codec) { + return codec->name(); +} + // [[arrow::export]] std::shared_ptr io___CompressedOutputStream__Make( const std::unique_ptr& codec, diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp index 9f9216a9522..add820b7dee 100644 --- a/r/src/parquet.cpp +++ b/r/src/parquet.cpp @@ -83,12 +83,213 @@ std::shared_ptr parquet___arrow___FileReader__ReadTable2( } // [[arrow::export]] -void write_parquet_file(const std::shared_ptr& table, - std::string filename) { - std::shared_ptr sink; - PARQUET_THROW_NOT_OK(arrow::io::FileOutputStream::Open(filename, &sink)); +std::shared_ptr +parquet___default_arrow_writer_properties() { + return parquet::default_arrow_writer_properties(); +} + +// [[arrow::export]] +std::shared_ptr +parquet___ArrowWriterProperties___Builder__create() { + return std::make_shared(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__store_schema( + const std::shared_ptr& builder) { + builder->store_schema(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps( + const std::shared_ptr& builder) { + builder->enable_deprecated_int96_timestamps(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps( + const std::shared_ptr& builder) { + builder->disable_deprecated_int96_timestamps(); +} 
+ +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__coerce_timestamps( + const std::shared_ptr& builder, + arrow::TimeUnit::type unit) { + builder->coerce_timestamps(unit); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps( + const std::shared_ptr& builder) { + builder->allow_truncated_timestamps(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps( + const std::shared_ptr& builder) { + builder->disallow_truncated_timestamps(); +} + +// [[arrow::export]] +std::shared_ptr +parquet___ArrowWriterProperties___Builder__build( + const std::shared_ptr& builder) { + return builder->build(); +} + +// [[arrow::export]] +std::shared_ptr parquet___default_writer_properties() { + return parquet::default_writer_properties(); +} + +// [[arrow::export]] +std::shared_ptr +parquet___WriterProperties___Builder__create() { + return std::make_shared(); +} + +// [[arrow::export]] +void parquet___WriterProperties___Builder__version( + const std::shared_ptr& builder, + const parquet::ParquetVersion::type& version) { + builder->version(version); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_compression( + const std::shared_ptr& builder, + const arrow::Compression::type& compression) { + builder->compression(compression); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_compressions( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::IntegerVector& types) { + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + builder->compression(paths[i], static_cast(types[i])); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_compression_level( + const std::shared_ptr& builder, + int compression_level) { + builder->compression_level(compression_level); +} + +// [[arrow::export]] +void 
parquet___ArrowWriterProperties___Builder__set_compression_levels( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::IntegerVector& levels) { + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + builder->compression_level(paths[i], levels[i]); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_write_statistics( + const std::shared_ptr& builder, + bool write_statistics) { + if (write_statistics) { + builder->enable_statistics(); + } else { + builder->disable_statistics(); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_use_dictionary( + const std::shared_ptr& builder, + bool use_dictionary) { + if (use_dictionary) { + builder->enable_dictionary(); + } else { + builder->disable_dictionary(); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_use_dictionary( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::LogicalVector& use_dictionary) { + builder->disable_dictionary(); + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + if (use_dictionary[i]) { + builder->enable_dictionary(paths[i]); + } else { + builder->disable_dictionary(paths[i]); + } + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_write_statistics( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::LogicalVector& write_statistics) { + builder->disable_statistics(); + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + if (write_statistics[i]) { + builder->enable_statistics(paths[i]); + } else { + builder->disable_statistics(paths[i]); + } + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__data_page_size( + const std::shared_ptr& builder, + int64_t data_page_size) { + builder->data_pagesize(data_page_size); +} + +// [[arrow::export]] +std::shared_ptr parquet___WriterProperties___Builder__build( + const std::shared_ptr& builder) 
{ + return builder->build(); +} + +// [[arrow::export]] +std::unique_ptr parquet___arrow___ParquetFileWriter__Open( + const std::shared_ptr& schema, + const std::shared_ptr& sink, + const std::shared_ptr& properties, + const std::shared_ptr& arrow_properties) { + std::unique_ptr writer; + PARQUET_THROW_NOT_OK( + parquet::arrow::FileWriter::Open(*schema, arrow::default_memory_pool(), sink, + properties, arrow_properties, &writer)); + return writer; +} + +// [[arrow::export]] +void parquet___arrow___FileWriter__WriteTable( + const std::unique_ptr& writer, + const std::shared_ptr& table, int64_t chunk_size) { + PARQUET_THROW_NOT_OK(writer->WriteTable(*table, chunk_size)); +} + +// [[arrow::export]] +void parquet___arrow___FileWriter__Close( + const std::unique_ptr& writer) { + PARQUET_THROW_NOT_OK(writer->Close()); +} + +// [[arrow::export]] +void parquet___arrow___WriteTable( + const std::shared_ptr& table, + const std::shared_ptr& sink, + const std::shared_ptr& properties, + const std::shared_ptr& arrow_properties) { PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), - sink, table->num_rows())); + sink, table->num_rows(), properties, + arrow_properties)); } // [[arrow::export]] diff --git a/r/src/table.cpp b/r/src/table.cpp index a78f1196294..e17db49ffb5 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -75,17 +75,23 @@ std::vector Table__ColumnNames(const std::shared_ptr& } // [[arrow::export]] -std::shared_ptr Table__Slice1( - const std::shared_ptr& table, int offset) { +std::shared_ptr Table__Slice1(const std::shared_ptr& table, + int offset) { return table->Slice(offset); } // [[arrow::export]] -std::shared_ptr Table__Slice2( - const std::shared_ptr& table, int offset, int length) { +std::shared_ptr Table__Slice2(const std::shared_ptr& table, + int offset, int length) { return table->Slice(offset, length); } +// [[arrow::export]] +bool Table__Equals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + return 
lhs->Equals(*rhs.get()); +} + // [[arrow::export]] std::shared_ptr Table__GetColumnByName( const std::shared_ptr& table, const std::string& name) { diff --git a/r/tests/testthat/helper-parquet.R b/r/tests/testthat/helper-parquet.R new file mode 100644 index 00000000000..1eec5b08862 --- /dev/null +++ b/r/tests/testthat/helper-parquet.R @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_parquet_roundtrip <- function(tab, ...) { + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(tab, tf, ...) 
+ expect_equal(read_parquet(tf, as_data_frame = FALSE), tab) +} diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 97b8f694868..6ee630061df 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -28,13 +28,13 @@ test_that("RecordBatch", { ) batch <- record_batch(tbl) - expect_true(batch == batch) + expect_equal(batch, batch) expect_equal( batch$schema, schema( int = int32(), dbl = float64(), lgl = boolean(), chr = utf8(), - fct = dictionary() + fct = dictionary(int8(), utf8()) ) ) expect_equal(batch$num_columns, 5L) @@ -69,12 +69,12 @@ test_that("RecordBatch", { col_fct <- batch$column(4) expect_true(inherits(col_fct, 'Array')) expect_equal(col_fct$as_vector(), tbl$fct) - expect_equal(col_fct$type, dictionary()) + expect_equal(col_fct$type, dictionary(int8(), utf8())) batch2 <- batch$RemoveColumn(0) expect_equal( batch2$schema, - schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary()) + schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) ) expect_equal(batch2$column(0), batch$column(1)) expect_identical(as.data.frame(batch2), tbl[,-1]) @@ -156,7 +156,7 @@ test_that("RecordBatch with 0 rows are supported", { dbl = float64(), lgl = boolean(), chr = utf8(), - fct = dictionary() + fct = dictionary(int8(), utf8()) ) ) }) @@ -208,10 +208,11 @@ test_that("record_batch() handles data frame columns", { tib <- tibble::tibble(x = 1:10, y = 1:10) # because tib is named here, this becomes a struct array batch <- record_batch(a = 1:10, b = tib) - expect_equal(batch$schema, + expect_equal( + batch$schema, schema( a = int32(), - struct(x = int32(), y = int32()) + b = struct(x = int32(), y = int32()) ) ) out <- as.data.frame(batch) @@ -219,7 +220,8 @@ test_that("record_batch() handles data frame columns", { # if not named, columns from tib are auto spliced batch2 <- record_batch(a = 1:10, tib) - expect_equal(batch$schema, + expect_equal( + 
batch2$schema, schema(a = int32(), x = int32(), y = int32()) ) out <- as.data.frame(batch2) @@ -273,3 +275,4 @@ test_that("record_batch() only auto splice data frames", { regexp = "only data frames are allowed as unnamed arguments to be auto spliced" ) }) + diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index d1b00ed40e9..61fb5465423 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -111,6 +111,15 @@ test_that("[, [[, $ for Table", { }) test_that("head and tail on Table", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + expect_identical(as.data.frame(head(tab)), head(tbl)) expect_identical(as.data.frame(head(tab, 4)), head(tbl, 4)) expect_identical(as.data.frame(head(tab, -4)), head(tbl, -4)) @@ -137,6 +146,15 @@ test_that("Table print method", { }) test_that("table active bindings", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + expect_identical(dim(tbl), dim(tab)) expect_is(tab$columns, "list") expect_equal(tab$columns[[1]], tab[[1]]) @@ -196,3 +214,23 @@ test_that("table() auto splices (ARROW-5718)", { expect_equal(tab3$schema, s) expect_equivalent(as.data.frame(tab3), df) }) + +test_that("==.Table", { + tab1 <- Table$create(x = 1:2, y = c("a", "b")) + tab2 <- Table$create(x = 1:2, y = c("a", "b")) + tab3 <- Table$create(x = 1:2) + tab4 <- Table$create(x = 1:2, y = c("a", "b"), z = 3:4) + + expect_true(tab1 == tab2) + expect_true(tab2 == tab1) + + expect_false(tab1 == tab3) + expect_false(tab3 == tab1) + + expect_false(tab1 == tab4) + expect_false(tab4 == tab1) + + expect_true(all.equal(tab1, tab2)) + expect_equal(tab1, tab2) +}) + diff --git a/r/tests/testthat/test-compressed.R 
b/r/tests/testthat/test-compressed.R index 8bf1092616e..dedb1a8c84a 100644 --- a/r/tests/testthat/test-compressed.R +++ b/r/tests/testthat/test-compressed.R @@ -36,7 +36,6 @@ test_that("can write Buffer to CompressedOutputStream and read back in Compresse stream2$close() sink2$close() - input1 <- CompressedInputStream$create(tf1) buf1 <- input1$Read(1024L) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index fd6f40fcd56..18aa4298d46 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -49,3 +49,46 @@ test_that("read_parquet() with raw data", { df <- read_parquet(test_raw) expect_identical(dim(df), c(10L, 11L)) }) + +test_that("write_parquet() handles various compression= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "snappy") + expect_parquet_roundtrip(tab, compression = rep("snappy", 3L)) + expect_parquet_roundtrip(tab, compression = c(x1 = "snappy", x2 = "snappy")) +}) + +test_that("write_parquet() handles various compression_level= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = 4) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = rep(4L, 3L)) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L)) +}) + +test_that("write_parquet() handles various use_dictionary= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, use_dictionary = TRUE) + expect_parquet_roundtrip(tab, use_dictionary = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, use_dictionary = c(x1 = TRUE, x2 = TRUE)) +}) + +test_that("write_parquet() handles various write_statistics= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, write_statistics = TRUE) + expect_parquet_roundtrip(tab, write_statistics = c(TRUE, FALSE, TRUE)) + 
expect_parquet_roundtrip(tab, write_statistics = c(x1 = TRUE, x2 = TRUE)) +}) + +test_that("make_valid_version()", { + expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0) +}) diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index ec56d6a783b..dcda9ce4277 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -97,7 +97,7 @@ test_that("table round trip handles NA in integer and numeric", { expect_equal(tab$column(0)$type, int32()) expect_equal(tab$column(1)$type, float64()) - expect_equal(tab$column(2)$type, int8()) + expect_equal(tab$column(2)$type, uint8()) tf <- tempfile() write_arrow(tbl, tf) diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index b6bfab0fdbd..f50b0783db6 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -33,7 +33,10 @@ test_that("type() infers from R type", { expect_equal(type(TRUE), boolean()) expect_equal(type(raw()), int8()) expect_equal(type(""), utf8()) - expect_equal(type(iris$Species), dictionary()) + expect_equal( + type(iris$Species), + dictionary(int8(), utf8(), FALSE) + ) expect_equal( type(lubridate::ymd_hms("2019-02-14 13:55:05")), timestamp(TimeUnit$MICRO, "GMT")