diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
index e03615b97da..2877736c70d 100644
--- a/.github/workflows/ruby.yml
+++ b/.github/workflows/ruby.yml
@@ -97,6 +97,7 @@ jobs:
         ARROW_WITH_LZ4: ON
         ARROW_WITH_SNAPPY: ON
         ARROW_WITH_ZLIB: ON
+        ARROW_WITH_ZSTD: ON
         XML_CATALOG_FILES: /usr/local/etc/xml/catalog
     steps:
       - name: Checkout Arrow
diff --git a/c_glib/arrow-glib/file-system.cpp b/c_glib/arrow-glib/file-system.cpp
index 779d19a4afb..5336f07d3a5 100644
--- a/c_glib/arrow-glib/file-system.cpp
+++ b/c_glib/arrow-glib/file-system.cpp
@@ -85,27 +85,27 @@ garrow_file_info_set_property(GObject *object,
                               const GValue *value,
                               GParamSpec *pspec)
 {
-  auto &arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object));
+  auto arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object));

   switch (prop_id) {
   case PROP_FILE_INFO_TYPE:
     {
       auto arrow_file_type =
         static_cast<arrow::fs::FileType>(g_value_get_enum(value));
-      arrow_file_info.set_type(arrow_file_type);
+      arrow_file_info->set_type(arrow_file_type);
     }
     break;
   case PROP_FILE_INFO_PATH:
-    arrow_file_info.set_path(g_value_get_string(value));
+    arrow_file_info->set_path(g_value_get_string(value));
     break;
   case PROP_FILE_INFO_SIZE:
-    arrow_file_info.set_size(g_value_get_int64(value));
+    arrow_file_info->set_size(g_value_get_int64(value));
     break;
   case PROP_FILE_INFO_MTIME:
     {
       const gint64 mtime = g_value_get_int64(value);
       const arrow::fs::TimePoint::duration duration(mtime);
-      arrow_file_info.set_mtime(arrow::fs::TimePoint(duration));
+      arrow_file_info->set_mtime(arrow::fs::TimePoint(duration));
     }
     break;
   default:
@@ -120,35 +120,35 @@ garrow_file_info_get_property(GObject *object,
                               GValue *value,
                               GParamSpec *pspec)
 {
-  const auto &arrow_file_info =
+  const auto arrow_file_info =
     garrow_file_info_get_raw(GARROW_FILE_INFO(object));

   switch (prop_id) {
   case PROP_FILE_INFO_TYPE:
     {
-      const auto arrow_file_type = arrow_file_info.type();
+      const auto arrow_file_type = arrow_file_info->type();
       const auto file_type = static_cast<GArrowFileType>(arrow_file_type);
       g_value_set_enum(value, file_type);
     }
     break;
   case PROP_FILE_INFO_PATH:
-    g_value_set_string(value, arrow_file_info.path().c_str());
+    g_value_set_string(value, arrow_file_info->path().c_str());
     break;
   case PROP_FILE_INFO_BASE_NAME:
-    g_value_set_string(value, arrow_file_info.base_name().c_str());
+    g_value_set_string(value, arrow_file_info->base_name().c_str());
     break;
   case PROP_FILE_INFO_DIR_NAME:
-    g_value_set_string(value, arrow_file_info.dir_name().c_str());
+    g_value_set_string(value, arrow_file_info->dir_name().c_str());
     break;
   case PROP_FILE_INFO_EXTENSION:
-    g_value_set_string(value, arrow_file_info.extension().c_str());
+    g_value_set_string(value, arrow_file_info->extension().c_str());
     break;
   case PROP_FILE_INFO_SIZE:
-    g_value_set_int64(value, arrow_file_info.size());
+    g_value_set_int64(value, arrow_file_info->size());
     break;
   case PROP_FILE_INFO_MTIME:
     {
-      const auto arrow_mtime = arrow_file_info.mtime();
+      const auto arrow_mtime = arrow_file_info->mtime();
       const auto mtime = arrow_mtime.time_since_epoch().count();
       g_value_set_int64(value, mtime);
     }
@@ -317,9 +317,9 @@ gboolean
 garrow_file_info_equal(GArrowFileInfo *file_info,
                        GArrowFileInfo *other_file_info)
 {
-  const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
-  const auto &arrow_other_file_info = garrow_file_info_get_raw(other_file_info);
-  return arrow_file_info.Equals(arrow_other_file_info);
+  const auto arrow_file_info = garrow_file_info_get_raw(file_info);
+  const auto arrow_other_file_info = garrow_file_info_get_raw(other_file_info);
+  return arrow_file_info->Equals(*arrow_other_file_info);
 }

 /**
@@ -333,8 +333,8 @@ garrow_file_info_equal(GArrowFileInfo *file_info,
 gboolean
 garrow_file_info_is_file(GArrowFileInfo *file_info)
 {
-  const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
-  return arrow_file_info.IsFile();
+  const auto arrow_file_info = garrow_file_info_get_raw(file_info);
+  return arrow_file_info->IsFile();
 }

 /**
@@ -348,8 +348,8 @@ garrow_file_info_is_file(GArrowFileInfo *file_info)
 gboolean
 garrow_file_info_is_dir(GArrowFileInfo *file_info)
 {
-  const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
-  return arrow_file_info.IsDirectory();
+  const auto arrow_file_info = garrow_file_info_get_raw(file_info);
+  return arrow_file_info->IsDirectory();
 }

 /**
@@ -365,8 +365,8 @@ garrow_file_info_is_dir(GArrowFileInfo *file_info)
 gchar *
 garrow_file_info_to_string(GArrowFileInfo *file_info)
 {
-  const auto &arrow_file_info = garrow_file_info_get_raw(file_info);
-  auto string = arrow_file_info.ToString();
+  const auto arrow_file_info = garrow_file_info_get_raw(file_info);
+  auto string = arrow_file_info->ToString();
   return g_strndup(string.data(), string.size());
 }

@@ -1308,10 +1308,11 @@ garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info)
   return file_info;
 }

-arrow::fs::FileInfo &
+arrow::fs::FileInfo *
 garrow_file_info_get_raw(GArrowFileInfo *file_info)
 {
-  return GARROW_FILE_INFO_GET_PRIVATE(file_info)->file_info;
+  auto priv = GARROW_FILE_INFO_GET_PRIVATE(file_info);
+  return &(priv->file_info);
 }

 std::shared_ptr<arrow::fs::FileSystem>
diff --git a/c_glib/arrow-glib/file-system.hpp b/c_glib/arrow-glib/file-system.hpp
index db0e0a45680..175ac093ab2 100644
--- a/c_glib/arrow-glib/file-system.hpp
+++ b/c_glib/arrow-glib/file-system.hpp
@@ -26,7 +26,7 @@
 GArrowFileInfo *
 garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info);

-arrow::fs::FileInfo &
+arrow::fs::FileInfo *
 garrow_file_info_get_raw(GArrowFileInfo *file_info);

 std::shared_ptr<arrow::fs::FileSystem>
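Usage note (not part of the patch): the `GArrowFileInfo` getters above now go through the pointer-returning `garrow_file_info_get_raw()` internally, but the public C API is unchanged. A minimal sketch that drives only the functions touched in this file; error handling is omitted:

```c
#include <arrow-glib/arrow-glib.h>

/* Print a file info when it differs from a reference one. */
static void
show_file_info_if_changed(GArrowFileInfo *info, GArrowFileInfo *reference)
{
  if (!garrow_file_info_equal(info, reference)) {
    gchar *description = garrow_file_info_to_string(info);
    g_print("changed: %s (regular file: %s)\n",
            description,
            garrow_file_info_is_file(info) ? "yes" : "no");
    g_free(description);
  }
}
```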
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 3190d240026..6315e51cb78 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -521,12 +521,11 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *reader,

 typedef struct GArrowFeatherFileReaderPrivate_ {
-  arrow::ipc::feather::TableReader *feather_table_reader;
+  std::shared_ptr<arrow::ipc::feather::Reader> feather_reader;
 } GArrowFeatherFileReaderPrivate;

 enum {
-  PROP_0__,
-  PROP_FEATHER_TABLE_READER
+  PROP_FEATHER_READER = 1,
 };

 G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherFileReader,
@@ -543,7 +542,7 @@ garrow_feather_file_reader_finalize(GObject *object)
 {
   auto priv = GARROW_FEATHER_FILE_READER_GET_PRIVATE(object);

-  delete priv->feather_table_reader;
+  priv->feather_reader.~shared_ptr();

   G_OBJECT_CLASS(garrow_feather_file_reader_parent_class)->finalize(object);
 }
@@ -557,9 +556,9 @@ garrow_feather_file_reader_set_property(GObject *object,
   auto priv = GARROW_FEATHER_FILE_READER_GET_PRIVATE(object);

   switch (prop_id) {
-  case PROP_FEATHER_TABLE_READER:
-    priv->feather_table_reader =
-      static_cast<arrow::ipc::feather::TableReader *>(g_value_get_pointer(value));
+  case PROP_FEATHER_READER:
+    priv->feather_reader =
+      *static_cast<std::shared_ptr<arrow::ipc::feather::Reader> *>(g_value_get_pointer(value));
     break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
     break;
@@ -588,21 +587,19 @@ garrow_feather_file_reader_init(GArrowFeatherFileReader *object)
 static void
 garrow_feather_file_reader_class_init(GArrowFeatherFileReaderClass *klass)
 {
-  GObjectClass *gobject_class;
-  GParamSpec *spec;
-
-  gobject_class = G_OBJECT_CLASS(klass);
+  auto gobject_class = G_OBJECT_CLASS(klass);

   gobject_class->finalize     = garrow_feather_file_reader_finalize;
   gobject_class->set_property = garrow_feather_file_reader_set_property;
   gobject_class->get_property = garrow_feather_file_reader_get_property;

-  spec = g_param_spec_pointer("feather-table-reader",
-                              "arrow::ipc::feather::TableReader",
-                              "The raw std::shared<arrow::ipc::feather::TableReader> *",
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("feather-reader",
+                              "arrow::ipc::feather::Reader",
+                              "The raw std::shared_ptr<arrow::ipc::feather::Reader> *",
                               static_cast<GParamFlags>(G_PARAM_WRITABLE |
                                                        G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_FEATHER_TABLE_READER, spec);
+  g_object_class_install_property(gobject_class, PROP_FEATHER_READER, spec);
 }

@@ -621,58 +618,14 @@ garrow_feather_file_reader_new(GArrowSeekableInputStream *file,
                                GError **error)
 {
   auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(file);
-  std::unique_ptr<arrow::ipc::feather::TableReader> arrow_reader;
-  auto status =
-    arrow::ipc::feather::TableReader::Open(arrow_random_access_file,
-                                           &arrow_reader);
-  if (garrow_error_check(error, status, "[feather-file-reader][new]")) {
-    return garrow_feather_file_reader_new_raw(arrow_reader.release());
-  } else {
-    return NULL;
-  }
-}
-
-/**
- * garrow_feather_file_reader_get_description:
- * @reader: A #GArrowFeatherFileReader.
- *
- * Returns: (nullable) (transfer full):
- *   The description of the file if it exists,
- *   %NULL otherwise. You can confirm whether description exists or not by
- *   garrow_feather_file_reader_has_description().
- *
- *   It should be freed with g_free() when no longer needed.
- *
- * Since: 0.4.0
- */
-gchar *
-garrow_feather_file_reader_get_description(GArrowFeatherFileReader *reader)
-{
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  if (arrow_reader->HasDescription()) {
-    auto description = arrow_reader->GetDescription();
-    return g_strndup(description.data(),
-                     description.size());
+  auto reader = arrow::ipc::feather::Reader::Open(arrow_random_access_file);
+  if (garrow::check(error, reader, "[feather-file-reader][new]")) {
+    return garrow_feather_file_reader_new_raw(&(*reader));
   } else {
     return NULL;
   }
 }

-/**
- * garrow_feather_file_reader_has_description:
- * @reader: A #GArrowFeatherFileReader.
- *
- * Returns: Whether the file has description or not.
- *
- * Since: 0.4.0
- */
-gboolean
-garrow_feather_file_reader_has_description(GArrowFeatherFileReader *reader)
-{
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  return arrow_reader->HasDescription();
-}
-
 /**
  * garrow_feather_file_reader_get_version:
  * @reader: A #GArrowFeatherFileReader.
@@ -688,102 +641,6 @@ garrow_feather_file_reader_get_version(GArrowFeatherFileReader *reader)
   return arrow_reader->version();
 }

-/**
- * garrow_feather_file_reader_get_n_rows:
- * @reader: A #GArrowFeatherFileReader.
- *
- * Returns: The number of rows in the file.
- *
- * Since: 0.4.0
- */
-gint64
-garrow_feather_file_reader_get_n_rows(GArrowFeatherFileReader *reader)
-{
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  return arrow_reader->num_rows();
-}
-
-/**
- * garrow_feather_file_reader_get_n_columns:
- * @reader: A #GArrowFeatherFileReader.
- *
- * Returns: The number of columns in the file.
- *
- * Since: 0.4.0
- */
-gint64
-garrow_feather_file_reader_get_n_columns(GArrowFeatherFileReader *reader)
-{
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  return arrow_reader->num_columns();
-}
-
-/**
- * garrow_feather_file_reader_get_column_name:
- * @reader: A #GArrowFeatherFileReader.
- * @i: The index of the target column. If it's negative, index is
- *   counted backward from the end of the columns. `-1` means the last
- *   column.
- *
- * Returns: (nullable) (transfer full): The i-th column name in the file.
- *
- *   It should be freed with g_free() when no longer needed.
- *
- * Since: 0.4.0
- */
-gchar *
-garrow_feather_file_reader_get_column_name(GArrowFeatherFileReader *reader,
-                                           gint i)
-{
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  if (!garrow_internal_index_adjust(i, arrow_reader->num_columns())) {
-    return NULL;
-  }
-  const auto &column_name = arrow_reader->GetColumnName(i);
-  return g_strndup(column_name.data(),
-                   column_name.size());
-}
-
-/**
- * garrow_feather_file_reader_get_column_data:
- * @reader: A #GArrowFeatherFileReader.
- * @i: The index of the target column. If it's negative, index is
- *   counted backward from the end of the columns. `-1` means the last
- *   column.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: (nullable) (transfer full):
- *   The i-th column's data in the file or %NULL on error.
- *
- * Since: 1.0.0
- */
-GArrowChunkedArray *
-garrow_feather_file_reader_get_column_data(GArrowFeatherFileReader *reader,
-                                           gint i,
-                                           GError **error)
-{
-  const auto tag = "[feather-file-reader][get-column-data]";
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-
-  const auto n_columns = arrow_reader->num_columns();
-  if (!garrow_internal_index_adjust(i, n_columns)) {
-    garrow_error_check(error,
-                       arrow::Status::IndexError("Out of index: "
-                                                 "<0..", n_columns, ">: "
-                                                 "<", i, ">"),
-                       tag);
-    return NULL;
-  }
-
-  std::shared_ptr<arrow::ChunkedArray> arrow_chunked_array;
-  auto status = arrow_reader->GetColumn(i, &arrow_chunked_array);
-  if (garrow_error_check(error, status, tag)) {
-    return garrow_chunked_array_new_raw(&arrow_chunked_array);
-  } else {
-    return NULL;
-  }
-}
-
 /**
  * garrow_feather_file_reader_read:
  * @reader: A #GArrowFeatherFileReader.
@@ -858,7 +715,7 @@ garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader,
                                       GError **error)
 {
   auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  std::vector<std::string> cpp_names(n_names);
+  std::vector<std::string> cpp_names;
   for (guint i = 0; i < n_names; ++i) {
     cpp_names.push_back(names[i]);
   }
@@ -2212,21 +2069,21 @@ garrow_record_batch_file_reader_get_raw(GArrowRecordBatchFileReader *reader)
 }

 GArrowFeatherFileReader *
-garrow_feather_file_reader_new_raw(arrow::ipc::feather::TableReader *arrow_reader)
+garrow_feather_file_reader_new_raw(std::shared_ptr<arrow::ipc::feather::Reader> *arrow_reader)
 {
   auto reader =
     GARROW_FEATHER_FILE_READER(g_object_new(GARROW_TYPE_FEATHER_FILE_READER,
-                                            "feather-table-reader", arrow_reader,
+                                            "feather-reader", arrow_reader,
                                             NULL));
   return reader;
 }

-arrow::ipc::feather::TableReader *
+std::shared_ptr<arrow::ipc::feather::Reader>
 garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader)
 {
   auto priv = GARROW_FEATHER_FILE_READER_GET_PRIVATE(reader);
-  return priv->feather_table_reader;
+  return priv->feather_reader;
 }

 GArrowCSVReader *
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index 6241792c8b2..5054d530703 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -209,24 +209,8 @@ GArrowFeatherFileReader *garrow_feather_file_reader_new(
   GArrowSeekableInputStream *file,
   GError **error);

-gchar *garrow_feather_file_reader_get_description(
-  GArrowFeatherFileReader *reader);
-gboolean garrow_feather_file_reader_has_description(
-  GArrowFeatherFileReader *reader);
 gint garrow_feather_file_reader_get_version(
   GArrowFeatherFileReader *reader);
-gint64 garrow_feather_file_reader_get_n_rows(
-  GArrowFeatherFileReader *reader);
-gint64 garrow_feather_file_reader_get_n_columns(
-  GArrowFeatherFileReader *reader);
-gchar *garrow_feather_file_reader_get_column_name(
-  GArrowFeatherFileReader *reader,
-  gint i);
-GARROW_AVAILABLE_IN_1_0
-GArrowChunkedArray *
-garrow_feather_file_reader_get_column_data(GArrowFeatherFileReader *reader,
-                                           gint i,
-                                           GError **error);
 GArrowTable *
 garrow_feather_file_reader_read(GArrowFeatherFileReader *reader,
                                 GError **error);
diff --git a/c_glib/arrow-glib/reader.hpp b/c_glib/arrow-glib/reader.hpp
index 7fa89ce84c9..c1df700fe13 100644
--- a/c_glib/arrow-glib/reader.hpp
+++ b/c_glib/arrow-glib/reader.hpp
@@ -34,11 +34,14 @@ GArrowTableBatchReader *garrow_table_batch_reader_new_raw(std::shared_ptr<arrow::TableBatchReader> *arrow_reader);

-GArrowRecordBatchFileReader *garrow_record_batch_file_reader_new_raw(std::shared_ptr<arrow::ipc::RecordBatchFileReader> *arrow_reader);
+GArrowRecordBatchFileReader *
+garrow_record_batch_file_reader_new_raw(std::shared_ptr<arrow::ipc::RecordBatchFileReader> *arrow_reader);
 std::shared_ptr<arrow::ipc::RecordBatchFileReader> garrow_record_batch_file_reader_get_raw(GArrowRecordBatchFileReader *reader);

-GArrowFeatherFileReader *garrow_feather_file_reader_new_raw(arrow::ipc::feather::TableReader *arrow_reader);
-arrow::ipc::feather::TableReader *garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader);
+GArrowFeatherFileReader *
+garrow_feather_file_reader_new_raw(std::shared_ptr<arrow::ipc::feather::Reader> *arrow_reader);
+std::shared_ptr<arrow::ipc::feather::Reader>
+garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader);

 GArrowCSVReader *
 garrow_csv_reader_new_raw(std::shared_ptr<arrow::csv::TableReader> *arrow_reader);
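Usage note (not part of the patch): with the switch from `arrow::ipc::feather::TableReader` to the new `arrow::ipc::feather::Reader`, the public GLib surface shrinks to `new`, `get_version`, `read`, `read_indices` and `read_names`. A hedged sketch of the remaining read path in C; the path is illustrative:

```c
#include <arrow-glib/arrow-glib.h>

static GArrowTable *
read_feather_file(const gchar *path, GError **error)
{
  GArrowMemoryMappedInputStream *input =
    garrow_memory_mapped_input_stream_new(path, error);
  if (!input) {
    return NULL;
  }

  GArrowTable *table = NULL;
  GArrowFeatherFileReader *reader =
    garrow_feather_file_reader_new(GARROW_SEEKABLE_INPUT_STREAM(input), error);
  if (reader) {
    /* Feather V2 files report version >= 2. */
    table = garrow_feather_file_reader_read(reader, error);
    g_object_unref(reader);
  }
  g_object_unref(input);
  return table;
}
```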
diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp
index e23cc0434b0..7b05e252e00 100644
--- a/c_glib/arrow-glib/table.cpp
+++ b/c_glib/arrow-glib/table.cpp
@@ -23,9 +23,11 @@

 #include <arrow-glib/array.hpp>
 #include <arrow-glib/chunked-array.hpp>
+#include <arrow-glib/error.hpp>
 #include <arrow-glib/field.hpp>
 #include <arrow-glib/input-stream.hpp>
 #include <arrow-glib/internal-hash-table.hpp>
+#include <arrow-glib/output-stream.hpp>
 #include <arrow-glib/record-batch.hpp>
 #include <arrow-glib/schema.hpp>
 #include <arrow-glib/table.hpp>
@@ -36,10 +38,16 @@ G_BEGIN_DECLS

 /**
  * SECTION: table
+ * @section_id: GArrowTable
+ * @title: GArrowTable
  * @short_description: Table class
+ * @include: arrow-glib/arrow-glib.h
  *
  * #GArrowTable is a class for table. Table has zero or more
  * #GArrowChunkedArrays and zero or more records.
+ *
+ * #GArrowFeatherWriteProperties is a class to customize how to write
+ * Feather data.
  */

 typedef struct GArrowTablePrivate_ {
@@ -61,13 +69,13 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowTable,
                GARROW_TABLE(obj)))

 static void
-garrow_table_dispose(GObject *object)
+garrow_table_finalize(GObject *object)
 {
   auto priv = GARROW_TABLE_GET_PRIVATE(object);

   priv->table.~shared_ptr();

-  G_OBJECT_CLASS(garrow_table_parent_class)->dispose(object);
+  G_OBJECT_CLASS(garrow_table_parent_class)->finalize(object);
 }

 static void
@@ -117,13 +125,13 @@ garrow_table_class_init(GArrowTableClass *klass)

   gobject_class = G_OBJECT_CLASS(klass);

-  gobject_class->dispose      = garrow_table_dispose;
+  gobject_class->finalize     = garrow_table_finalize;
   gobject_class->set_property = garrow_table_set_property;
   gobject_class->get_property = garrow_table_get_property;

   spec = g_param_spec_pointer("table",
                               "Table",
-                              "The raw std::shared<arrow::Table> *",
+                              "The raw std::shared_ptr<arrow::Table> *",
                               static_cast<GParamFlags>(G_PARAM_WRITABLE |
                                                        G_PARAM_CONSTRUCT_ONLY));
   g_object_class_install_property(gobject_class, PROP_TABLE, spec);
@@ -632,6 +640,167 @@ garrow_table_combine_chunks(GArrowTable *table,
   }
 }

+
+typedef struct GArrowFeatherWritePropertiesPrivate_ {
+  arrow::ipc::feather::WriteProperties properties;
+} GArrowFeatherWritePropertiesPrivate;
+
+enum {
+  PROP_VERSION = 1,
+  PROP_CHUNK_SIZE,
+  PROP_COMPRESSION,
+  PROP_COMPRESSION_LEVEL,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherWriteProperties,
+                           garrow_feather_write_properties,
+                           G_TYPE_OBJECT)
+
+#define GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(obj)        \
+  static_cast<GArrowFeatherWritePropertiesPrivate *>(           \
+    garrow_feather_write_properties_get_instance_private(       \
+      GARROW_FEATHER_WRITE_PROPERTIES(obj)))
+
+static void
+garrow_feather_write_properties_finalize(GObject *object)
+{
+  auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object);
+
+  priv->properties.~WriteProperties();
+
+  G_OBJECT_CLASS(garrow_feather_write_properties_parent_class)->finalize(object);
+}
+
+static void
+garrow_feather_write_properties_set_property(GObject *object,
+                                             guint prop_id,
+                                             const GValue *value,
+                                             GParamSpec *pspec)
+{
+  auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_COMPRESSION:
+    priv->properties.compression =
+      static_cast<arrow::Compression::type>(g_value_get_enum(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_feather_write_properties_get_property(GObject *object,
+                                             guint prop_id,
+                                             GValue *value,
+                                             GParamSpec *pspec)
+{
+  auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_COMPRESSION:
+    g_value_set_enum(value, priv->properties.compression);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_feather_write_properties_init(GArrowFeatherWriteProperties *object)
+{
+  auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object);
+  new(&priv->properties) arrow::ipc::feather::WriteProperties;
+  priv->properties = arrow::ipc::feather::WriteProperties::Defaults();
+}
+
+static void
+garrow_feather_write_properties_class_init(GArrowFeatherWritePropertiesClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize     = garrow_feather_write_properties_finalize;
+  gobject_class->set_property = garrow_feather_write_properties_set_property;
+  gobject_class->get_property = garrow_feather_write_properties_get_property;
+
+  auto properties = arrow::ipc::feather::WriteProperties::Defaults();
+  GParamSpec *spec;
+  // TODO: version
+  // TODO: chunk_size
+
+  /**
+   * GArrowFeatherWriteProperties:compression:
+   *
+   * Compression type to use. Only
+   * %GARROW_COMPRESSION_TYPE_UNCOMPRESSED,
+   * %GARROW_COMPRESSION_TYPE_LZ4 and %GARROW_COMPRESSION_TYPE_ZSTD
+   * are supported. The default compression is
+   * %GARROW_COMPRESSION_TYPE_LZ4 if Apache Arrow C++ is built with
+   * support for it, otherwise %GARROW_COMPRESSION_TYPE_UNCOMPRESSED.
+   * %GARROW_COMPRESSION_TYPE_UNCOMPRESSED is set as the object
+   * default here.
+   *
+   * Since: 0.17.0
+   */
+  spec = g_param_spec_enum("compression",
+                           "Compression",
+                           "The compression type to use",
+                           GARROW_TYPE_COMPRESSION_TYPE,
+                           properties.compression,
+                           static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_COMPRESSION, spec);
+  // TODO: compression_level
+}
+
+/**
+ * garrow_feather_write_properties_new:
+ *
+ * Returns: A newly created #GArrowFeatherWriteProperties.
+ *
+ * Since: 0.17.0
+ */
+GArrowFeatherWriteProperties *
+garrow_feather_write_properties_new(void)
+{
+  auto properties = g_object_new(GARROW_TYPE_FEATHER_WRITE_PROPERTIES, NULL);
+  return GARROW_FEATHER_WRITE_PROPERTIES(properties);
+}
+
+/**
+ * garrow_table_write_as_feather:
+ * @table: A #GArrowTable.
+ * @sink: The output.
+ * @properties: (nullable): The properties for this write.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Writes the @table as Feather format data to the @sink.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ *
+ * Since: 0.17.0
+ */
+gboolean
+garrow_table_write_as_feather(GArrowTable *table,
+                              GArrowOutputStream *sink,
+                              GArrowFeatherWriteProperties *properties,
+                              GError **error)
+{
+  auto arrow_table = garrow_table_get_raw(table);
+  auto arrow_sink = garrow_output_stream_get_raw(sink);
+  arrow::Status status;
+  if (properties) {
+    auto arrow_properties = garrow_feather_write_properties_get_raw(properties);
+    status = arrow::ipc::feather::WriteTable(*arrow_table,
+                                             arrow_sink.get(),
+                                             *arrow_properties);
+  } else {
+    status = arrow::ipc::feather::WriteTable(*arrow_table, arrow_sink.get());
+  }
+  return garrow::check(error, status, "[feather-write-file]");
+}
+
 G_END_DECLS

 GArrowTable *
@@ -649,3 +818,10 @@ garrow_table_get_raw(GArrowTable *table)
   auto priv = GARROW_TABLE_GET_PRIVATE(table);
   return priv->table;
 }
+
+arrow::ipc::feather::WriteProperties *
+garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties)
+{
+  auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(properties);
+  return &(priv->properties);
+}
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h
index 5c00acf11d0..97f4544ef2c 100644
--- a/c_glib/arrow-glib/table.h
+++ b/c_glib/arrow-glib/table.h
@@ -20,6 +20,7 @@
 #pragma once

 #include <arrow-glib/chunked-array.h>
+#include <arrow-glib/output-stream.h>
 #include <arrow-glib/record-batch.h>
 #include <arrow-glib/schema.h>
 #include <arrow-glib/version.h>
@@ -110,4 +111,28 @@ GArrowTable*
 garrow_table_combine_chunks(GArrowTable *table,
                             GError **error);

+
+#define GARROW_TYPE_FEATHER_WRITE_PROPERTIES    \
+  (garrow_feather_write_properties_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowFeatherWriteProperties,
+                         garrow_feather_write_properties,
+                         GARROW,
+                         FEATHER_WRITE_PROPERTIES,
+                         GObject)
+struct _GArrowFeatherWritePropertiesClass
+{
+  GObjectClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_0_17
+GArrowFeatherWriteProperties *
+garrow_feather_write_properties_new(void);
+
+GARROW_AVAILABLE_IN_0_17
+gboolean
+garrow_table_write_as_feather(GArrowTable *table,
+                              GArrowOutputStream *sink,
+                              GArrowFeatherWriteProperties *properties,
+                              GError **error);
+
 G_END_DECLS
diff --git a/c_glib/arrow-glib/table.hpp b/c_glib/arrow-glib/table.hpp
index 22b0fad5024..dc972d80cff 100644
--- a/c_glib/arrow-glib/table.hpp
+++ b/c_glib/arrow-glib/table.hpp
@@ -20,8 +20,12 @@
 #pragma once

 #include <arrow/api.h>
+#include <arrow/ipc/api.h>

 #include <arrow-glib/table.h>

 GArrowTable *garrow_table_new_raw(std::shared_ptr<arrow::Table> *arrow_table);
 std::shared_ptr<arrow::Table> garrow_table_get_raw(GArrowTable *table);
+
+arrow::ipc::feather::WriteProperties *
+garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties);
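Usage note (not part of the patch): `GArrowFeatherWriteProperties` only wires up the "compression" property for now; `version`, `chunk_size` and `compression_level` are left as TODOs above. A sketch of the new writer entry point; whether ZSTD actually works depends on how Arrow C++ was built (hence the `ARROW_WITH_ZSTD: ON` CI change at the top of this patch):

```c
#include <arrow-glib/arrow-glib.h>

static gboolean
write_feather_zstd(GArrowTable *table, const gchar *path, GError **error)
{
  GArrowFileOutputStream *output =
    garrow_file_output_stream_new(path, FALSE, error);
  if (!output) {
    return FALSE;
  }

  GArrowFeatherWriteProperties *properties =
    garrow_feather_write_properties_new();
  g_object_set(properties, "compression", GARROW_COMPRESSION_TYPE_ZSTD, NULL);

  gboolean success =
    garrow_table_write_as_feather(table,
                                  GARROW_OUTPUT_STREAM(output),
                                  properties, /* may be NULL for defaults */
                                  error);
  g_object_unref(properties);
  g_object_unref(output);
  return success;
}
```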
diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp
index 8e9132c5a89..074c83af120 100644
--- a/c_glib/arrow-glib/writer.cpp
+++ b/c_glib/arrow-glib/writer.cpp
@@ -47,9 +47,6 @@ G_BEGIN_DECLS
  *
  * #GArrowRecordBatchFileWriter is a class for writing record
  * batches in file format into output.
- *
- * #GArrowFeatherFileWriter is a class for writing arrays
- * in Feather file format into output.
  */

 typedef struct GArrowRecordBatchWriterPrivate_ {
@@ -298,215 +295,8 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
   }
 }

-
-typedef struct GArrowFeatherFileWriterPrivate_ {
-  arrow::ipc::feather::TableWriter *feather_table_writer;
-} GArrowFeatherFileWriterPrivate;
-
-enum {
-  PROP_0_,
-  PROP_FEATHER_TABLE_WRITER
-};
-
-G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherFileWriter,
-                           garrow_feather_file_writer,
-                           G_TYPE_OBJECT);
-
-#define GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(obj)     \
-  static_cast<GArrowFeatherFileWriterPrivate *>(        \
-    garrow_feather_file_writer_get_instance_private(    \
-      GARROW_FEATHER_FILE_WRITER(obj)))
-
-static void
-garrow_feather_file_writer_finalize(GObject *object)
-{
-  auto priv = GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(object);
-
-  delete priv->feather_table_writer;
-
-  G_OBJECT_CLASS(garrow_feather_file_writer_parent_class)->finalize(object);
-}
-
-static void
-garrow_feather_file_writer_set_property(GObject *object,
-                                        guint prop_id,
-                                        const GValue *value,
-                                        GParamSpec *pspec)
-{
-  auto priv = GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(object);
-
-  switch (prop_id) {
-  case PROP_FEATHER_TABLE_WRITER:
-    priv->feather_table_writer =
-      static_cast<arrow::ipc::feather::TableWriter *>(g_value_get_pointer(value));
-    break;
-  default:
-    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
-    break;
-  }
-}
-
-static void
-garrow_feather_file_writer_get_property(GObject *object,
-                                        guint prop_id,
-                                        GValue *value,
-                                        GParamSpec *pspec)
-{
-  switch (prop_id) {
-  default:
-    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
-    break;
-  }
-}
-
-static void
-garrow_feather_file_writer_init(GArrowFeatherFileWriter *object)
-{
-}
-
-static void
-garrow_feather_file_writer_class_init(GArrowFeatherFileWriterClass *klass)
-{
-  GObjectClass *gobject_class;
-  GParamSpec *spec;
-
-  gobject_class = G_OBJECT_CLASS(klass);
-
-  gobject_class->finalize     = garrow_feather_file_writer_finalize;
-  gobject_class->set_property = garrow_feather_file_writer_set_property;
-  gobject_class->get_property = garrow_feather_file_writer_get_property;
-
-  spec = g_param_spec_pointer("feather-table-writer",
-                              "arrow::ipc::feather::TableWriter",
-                              "The raw std::shared<arrow::ipc::feather::TableWriter> *",
-                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
-                                                       G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_FEATHER_TABLE_WRITER, spec);
-}
-
-/**
- * garrow_feather_file_writer_new:
- * @sink: The output of the writer.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: (nullable): A newly created #GArrowFeatherFileWriter
- *   or %NULL on error.
- *
- * Since: 0.4.0
- */
-GArrowFeatherFileWriter *
-garrow_feather_file_writer_new(GArrowOutputStream *sink,
-                               GError **error)
-{
-  auto arrow_sink = garrow_output_stream_get_raw(sink);
-  std::unique_ptr<arrow::ipc::feather::TableWriter> arrow_writer;
-  auto status = arrow::ipc::feather::TableWriter::Open(arrow_sink,
-                                                       &arrow_writer);
-  if (garrow_error_check(error, status, "[feature-file-writer][new]")) {
-    return garrow_feather_file_writer_new_raw(arrow_writer.release());
-  } else {
-    return NULL;
-  }
-}
-
-/**
- * garrow_feather_file_writer_set_description:
- * @writer: A #GArrowFeatherFileWriter.
- * @description: The description of the file.
- *
- * Since: 0.4.0
- */
-void
-garrow_feather_file_writer_set_description(GArrowFeatherFileWriter *writer,
-                                           const gchar *description)
-{
-  auto arrow_writer = garrow_feather_file_writer_get_raw(writer);
-  arrow_writer->SetDescription(std::string(description));
-}
-
-/**
- * garrow_feather_file_writer_set_n_rows:
- * @writer: A #GArrowFeatherFileWriter.
- * @n_rows: The number of rows in the file.
- *
- * Since: 0.4.0
- */
-void
-garrow_feather_file_writer_set_n_rows(GArrowFeatherFileWriter *writer,
-                                      gint64 n_rows)
-{
-  auto arrow_writer = garrow_feather_file_writer_get_raw(writer);
-  arrow_writer->SetNumRows(n_rows);
-}
-
-/**
- * garrow_feather_file_writer_append:
- * @writer: A #GArrowFeatherFileWriter.
- * @name: The name of the array to be appended.
- * @array: The array to be appended.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: %TRUE on success, %FALSE if there was an error.
- *
- * Since: 0.4.0
- */
-gboolean
-garrow_feather_file_writer_append(GArrowFeatherFileWriter *writer,
-                                  const gchar *name,
-                                  GArrowArray *array,
-                                  GError **error)
-{
-  auto arrow_writer = garrow_feather_file_writer_get_raw(writer);
-  auto arrow_array = garrow_array_get_raw(array);
-
-  auto status = arrow_writer->Append(std::string(name), *arrow_array);
-  return garrow_error_check(error, status, "[feather-file-writer][append]");
-}
-
-/**
- * garrow_feather_file_writer_writer:
- * @writer: A #GArrowFeatherFileWriter.
- * @array: The table to be written.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: %TRUE on success, %FALSE if there was an error.
- *
- * Since: 0.12.0
- */
-gboolean
-garrow_feather_file_writer_write(GArrowFeatherFileWriter *writer,
-                                 GArrowTable *table,
-                                 GError **error)
-{
-  auto arrow_writer = garrow_feather_file_writer_get_raw(writer);
-  auto arrow_table = garrow_table_get_raw(table);
-
-  auto status = arrow_writer->Write(*arrow_table);
-  return garrow_error_check(error, status, "[feather-file-writer][write]");
-}
-
-/**
- * garrow_feather_file_writer_close:
- * @writer: A #GArrowFeatherFileWriter.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: %TRUE on success, %FALSE if there was an error.
- *
- * Since: 0.4.0
- */
-gboolean
-garrow_feather_file_writer_close(GArrowFeatherFileWriter *writer,
-                                 GError **error)
-{
-  auto arrow_writer = garrow_feather_file_writer_get_raw(writer);
-
-  auto status = arrow_writer->Finalize();
-  return garrow_error_check(error, status, "[feather-file-writer][close]");
-}
-
 G_END_DECLS

-
 GArrowRecordBatchWriter *
 garrow_record_batch_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer)
 {
@@ -546,21 +336,3 @@ garrow_record_batch_file_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer)
   return writer;
 }
-
-GArrowFeatherFileWriter *
-garrow_feather_file_writer_new_raw(arrow::ipc::feather::TableWriter *arrow_writer)
-{
-  auto writer =
-    GARROW_FEATHER_FILE_WRITER(g_object_new(GARROW_TYPE_FEATHER_FILE_WRITER,
-                                            "feather-table-writer", arrow_writer,
-                                            NULL));
-  return writer;
-}
-
-arrow::ipc::feather::TableWriter *
-garrow_feather_file_writer_get_raw(GArrowFeatherFileWriter *writer)
-{
-  auto priv = GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(writer);
-  return priv->feather_table_writer;
-}
diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h
index 8950b213cf7..a0d22fe626c 100644
--- a/c_glib/arrow-glib/writer.h
+++ b/c_glib/arrow-glib/writer.h
@@ -189,32 +189,4 @@ GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new(
   GArrowSchema *schema,
   GError **error);

-
-#define GARROW_TYPE_FEATHER_FILE_WRITER (garrow_feather_file_writer_get_type())
-G_DECLARE_DERIVABLE_TYPE(GArrowFeatherFileWriter,
-                         garrow_feather_file_writer,
-                         GARROW,
-                         FEATHER_FILE_WRITER,
-                         GObject)
-struct _GArrowFeatherFileWriterClass
-{
-  GObjectClass parent_class;
-};
-
-GArrowFeatherFileWriter *garrow_feather_file_writer_new(GArrowOutputStream *sink,
-                                                        GError **error);
-void garrow_feather_file_writer_set_description(GArrowFeatherFileWriter *writer,
-                                                const gchar *description);
-void garrow_feather_file_writer_set_n_rows(GArrowFeatherFileWriter *writer,
-                                           gint64 n_rows);
-gboolean garrow_feather_file_writer_append(GArrowFeatherFileWriter *writer,
-                                           const gchar *name,
-                                           GArrowArray *array,
-                                           GError **error);
-gboolean garrow_feather_file_writer_write(GArrowFeatherFileWriter *writer,
-                                          GArrowTable *table,
-                                          GError **error);
-gboolean garrow_feather_file_writer_close(GArrowFeatherFileWriter *writer,
-                                          GError **error);
-
 G_END_DECLS
diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp
index 61d9d679dc3..1e188bd3c68 100644
--- a/c_glib/arrow-glib/writer.hpp
+++ b/c_glib/arrow-glib/writer.hpp
@@ -31,6 +31,3 @@ std::shared_ptr<arrow::ipc::RecordBatchWriter> garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer);

 GArrowRecordBatchStreamWriter *garrow_record_batch_stream_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer);
 GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer);
-
-GArrowFeatherFileWriter *garrow_feather_file_writer_new_raw(arrow::ipc::feather::TableWriter *arrow_writer);
-arrow::ipc::feather::TableWriter *garrow_feather_file_writer_get_raw(GArrowFeatherFileWriter *writer);
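Migration note (not part of the patch): `GArrowFeatherFileWriter` is removed without a direct replacement for the column-by-column `append` workflow; callers are expected to build a `GArrowTable` first and then use `garrow_table_write_as_feather()`. Roughly, assuming `table`, `output` and `error` already exist:

```c
/* Before (removed API):
 *   writer = garrow_feather_file_writer_new(output, &error);
 *   garrow_feather_file_writer_write(writer, table, &error);
 *   garrow_feather_file_writer_close(writer, &error);
 *
 * After (this patch), passing NULL properties for the defaults: */
garrow_table_write_as_feather(table, GARROW_OUTPUT_STREAM(output), NULL, &error);
```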
diff --git a/c_glib/test/test-feather-file-reader.rb b/c_glib/test/test-feather-file-reader.rb
index 48a4fc75488..33ff6a46b67 100644
--- a/c_glib/test/test-feather-file-reader.rb
+++ b/c_glib/test/test-feather-file-reader.rb
@@ -18,26 +18,11 @@
 class TestFeatherFileReader < Test::Unit::TestCase
   include Helper::Buildable

-  def setup_file(data)
+  def setup_file(table)
     tempfile = Tempfile.open("arrow-feather-file-reader")
     output = Arrow::FileOutputStream.new(tempfile.path, false)
     begin
-      writer = Arrow::FeatherFileWriter.new(output)
-      begin
-        if data[:description]
-          writer.description = data[:description]
-        end
-        writer.n_rows = data[:n_rows] || 0
-        if data[:table]
-          writer.write(data[:table])
-        elsif data[:columns]
-          data[:columns].each do |name, array|
-            writer.append(name, array)
-          end
-        end
-      ensure
-        writer.close
-      end
+      table.write_as_feather(output)
     ensure
       output.close
     end
@@ -51,100 +36,13 @@ def setup_file(data)
     end
   end

-  sub_test_case("#description") do
-    test("exist") do
-      setup_file(:description => "Log") do |reader|
-        assert_equal("Log", reader.description)
-      end
-    end
-
-    test("not exist") do
-      setup_file(:description => nil) do |reader|
-        assert_nil(reader.description)
-      end
-    end
-  end
-
-  sub_test_case("#has_description?") do
-    test("exist") do
-      setup_file(:description => "Log") do |reader|
-        assert do
-          reader.has_description?
-        end
-      end
-    end
-
-    test("not exist") do
-      setup_file(:description => nil) do |reader|
-        assert do
-          not reader.has_description?
-        end
-      end
-    end
-  end
-
-  test("#version") do
-    setup_file({}) do |reader|
-      assert do
-        reader.version >= 2
-      end
-    end
-  end
-
-  test("#n_rows") do
-    setup_file(:n_rows => 3) do |reader|
-      assert_equal(3, reader.n_rows)
-    end
-  end
-
-  test("#n_columns") do
-    columns = {
-      "message" => build_string_array([]),
-      "is_critical" => build_boolean_array([]),
-    }
-    setup_file(:columns => columns) do |reader|
-      assert_equal(2, reader.n_columns)
-    end
-  end
-
-  test("#get_column_name") do
-    columns = {
-      "message" => build_string_array([]),
-      "is_critical" => build_boolean_array([]),
-    }
-    setup_file(:columns => columns) do |reader|
-      actual_column_names = reader.n_columns.times.collect do |i|
-        reader.get_column_name(i)
-      end
-      assert_equal([
-                     "message",
-                     "is_critical",
-                   ],
-                   actual_column_names)
-    end
-  end
-
-  test("#get_column_data") do
-    columns = {
-      "message" => build_string_array(["Hello"]),
-      "is_critical" => build_boolean_array([false]),
-    }
-    setup_file(:columns => columns) do |reader|
-      actual_columns = reader.n_columns.times.collect do |i|
-        reader.get_column_data(i).get_chunk(0)
-      end
-      assert_equal([
-                     columns["message"],
-                     columns["is_critical"],
-                   ],
-                   actual_columns)
-    end
-  end
-
   test("#read") do
     table = build_table("message" => build_string_array(["Login"]),
                         "is_critical" => build_boolean_array([true]))
-    setup_file(:table => table) do |reader|
+    setup_file(table) do |reader|
+      assert do
+        reader.version >= 2
+      end
       assert_equal(table, reader.read)
     end
   end
@@ -153,7 +51,7 @@ def setup_file(data)
     table = build_table("message" => build_string_array(["Login"]),
                         "is_critical" => build_boolean_array([true]),
                         "host" => build_string_array(["www"]))
-    setup_file(:table => table) do |reader|
+    setup_file(table) do |reader|
       assert_equal(build_table("message" => build_string_array(["Login"]),
                                "host" => build_string_array(["www"])),
                    reader.read_indices([2, 0]))
@@ -164,7 +62,7 @@ def setup_file(data)
     table = build_table("message" => build_string_array(["Login"]),
                         "is_critical" => build_boolean_array([true]),
                         "host" => build_string_array(["www"]))
-    setup_file(:table => table) do |reader|
+    setup_file(table) do |reader|
       assert_equal(build_table("message" => build_string_array(["Login"]),
                                "host" => build_string_array(["www"])),
                    reader.read_names(["host", "message"]))
diff --git a/c_glib/test/test-feather-file-writer.rb b/c_glib/test/test-feather-file-writer.rb
deleted file mode 100644
index 247d937e93e..00000000000
--- a/c_glib/test/test-feather-file-writer.rb
+++ /dev/null
@@ -1,94 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-class TestFeatherFileWriter < Test::Unit::TestCase
-  include Helper::Buildable
-
-  def test_append
-    tempfile = Tempfile.open("arrow-feather-file-writer")
-    output = Arrow::FileOutputStream.new(tempfile.path, false)
-    begin
-      writer = Arrow::FeatherFileWriter.new(output)
-      begin
-        writer.description = "Log"
-        writer.n_rows = 3
-        writer.append("message",
-                      build_string_array(["Crash", "Error", "Shutdown"]))
-        writer.append("is_critical",
-                      build_boolean_array([true, true, false]))
-      ensure
-        writer.close
-      end
-    ensure
-      output.close
-    end
-
-    input = Arrow::MemoryMappedInputStream.new(tempfile.path)
-    begin
-      reader = Arrow::FeatherFileReader.new(input)
-      columns = reader.n_columns.times.collect do |i|
-        [
-          reader.get_column_name(i),
-          reader.get_column_data(i).get_chunk(0),
-        ]
-      end
-      assert_equal([
-                     true,
-                     "Log",
-                     [
-                       [
-                         "message",
-                         build_string_array(["Crash", "Error", "Shutdown"]),
-                       ],
-                       [
-                         "is_critical",
-                         build_boolean_array([true, true, false]),
-                       ],
-                     ],
-                   ],
-                   [
-                     reader.has_description?,
-                     reader.description,
-                     columns,
-                   ])
-    ensure
-      input.close
-    end
-  end
-
-  def test_write
-    messages = build_string_array(["Crash", "Error", "Shutdown"])
-    is_criticals = build_boolean_array([true, true, false])
-    table = build_table("message" => messages,
-                        "is_critical" => is_criticals)
-
-    tempfile = Tempfile.open("arrow-feather-file-writer")
-
-    output = Arrow::FileOutputStream.new(tempfile.path, false)
-    writer = Arrow::FeatherFileWriter.new(output)
-    writer.n_rows = table.n_rows
-    writer.write(table)
-    writer.close
-    output.close
-
-    input = Arrow::MemoryMappedInputStream.new(tempfile.path)
-    reader = Arrow::FeatherFileReader.new(input)
-    assert_equal([table.n_rows, table],
-                 [reader.n_rows, reader.read])
-    input.close
-  end
-end
diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb
index acf97fd4c0f..bef1cffd57b 100644
--- a/c_glib/test/test-table.rb
+++ b/c_glib/test/test-table.rb
@@ -225,5 +225,49 @@ def test_combine_chunks
       assert_equal([[[true, false, true, false, true, false]]],
                    all_values)
     end
+
+    sub_test_case("#write_as_feather") do
+      def setup
+        super
+        @tempfile = Tempfile.open("arrow-table-write-as-feather")
+        begin
+          yield
+        ensure
+          @tempfile.close!
+        end
+      end
+
+      def read_feather
+        input = Arrow::MemoryMappedInputStream.new(@tempfile.path)
+        reader = Arrow::FeatherFileReader.new(input)
+        begin
+          yield(reader.read)
+        ensure
+          input.close
+        end
+      end
+
+      test("default") do
+        output = Arrow::FileOutputStream.new(@tempfile.path, false)
+        @table.write_as_feather(output)
+        output.close
+
+        read_feather do |read_table|
+          assert_equal(@table, read_table)
+        end
+      end
+
+      test("compression") do
+        output = Arrow::FileOutputStream.new(@tempfile.path, false)
+        properties = Arrow::FeatherWriteProperties.new
+        properties.compression = :zstd
+        @table.write_as_feather(output, properties)
+        output.close
+
+        read_feather do |read_table|
+          assert_equal(@table, read_table)
+        end
+      end
+    end
   end
 end
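Background note (not part of the patch): the C++ changes below keep the legacy Feather V1 reader (`ReaderV1`) working against the 8-byte-aligned buffers that old writers produced; `PaddedLength()` rounds every buffer up to `kFeatherDefaultAlignment`. The arithmetic, as a standalone C sketch:

```c
#include <assert.h>
#include <stdint.h>

/* Mirrors PaddedLength() in feather.cc: round nbytes up to the
 * 8-byte alignment used by Feather V1 buffers. */
static int64_t
padded_length(int64_t nbytes)
{
  const int64_t alignment = 8;  /* kFeatherDefaultAlignment */
  return ((nbytes + alignment - 1) / alignment) * alignment;
}

int
main(void)
{
  assert(padded_length(0) == 0);
  assert(padded_length(1) == 8);
  assert(padded_length(8) == 8);
  assert(padded_length(9) == 16);
  return 0;
}
```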
WritePaddedWithOffset(io::OutputStream* stream, const uint8_t* dat return Status::OK(); } +Status WritePadded(io::OutputStream* stream, const uint8_t* data, int64_t length, + int64_t* bytes_written) { + return WritePaddedWithOffset(stream, data, /*bit_offset=*/0, length, bytes_written); +} + +struct ColumnType { + enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; +}; + +static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { + return static_cast(static_cast(unit)); +} + /// For compatibility, we need to write any data sometimes just to keep producing /// files that can be read with an older reader. static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, @@ -126,7 +129,6 @@ static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, for (int64_t i = 0; i < length; i++) { RETURN_NOT_OK(stream->Write(&null, 1)); } - int64_t remainder = PaddedLength(length) - length; if (remainder != 0) { RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); @@ -136,168 +138,21 @@ static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, } // ---------------------------------------------------------------------- -// TableBuilder - -TableBuilder::TableBuilder(int64_t num_rows) : finished_(false), num_rows_(num_rows) {} - -FBB& TableBuilder::fbb() { return fbb_; } - -Status TableBuilder::Finish() { - if (finished_) { - return Status::Invalid("can only call this once"); - } - - FBString desc = 0; - if (!description_.empty()) { - desc = fbb_.CreateString(description_); - } - - flatbuffers::Offset metadata = 0; - - auto root = fbs::CreateCTable(fbb_, desc, num_rows_, fbb_.CreateVector(columns_), - kFeatherVersion, metadata); - fbb_.Finish(root); - finished_ = true; - - return Status::OK(); -} - -std::shared_ptr TableBuilder::GetBuffer() const { - return std::make_shared(fbb_.GetBufferPointer(), - static_cast(fbb_.GetSize())); -} - -void TableBuilder::SetDescription(const std::string& description) { - description_ = description; -} - -void TableBuilder::SetNumRows(int64_t num_rows) { num_rows_ = num_rows; } - -void TableBuilder::add_column(const flatbuffers::Offset& col) { - columns_.push_back(col); -} - -ColumnBuilder::ColumnBuilder(TableBuilder* parent, const std::string& name) - : parent_(parent) { - fbb_ = &parent->fbb(); - name_ = name; - type_ = ColumnType::PRIMITIVE; - meta_time_.unit = TimeUnit::SECOND; -} - -flatbuffers::Offset ColumnBuilder::CreateColumnMetadata() { - switch (type_) { - case ColumnType::PRIMITIVE: - // flatbuffer void - return 0; - case ColumnType::CATEGORY: { - auto cat_meta = fbs::CreateCategoryMetadata( - fbb(), GetPrimitiveArray(fbb(), meta_category_.levels), meta_category_.ordered); - return cat_meta.Union(); - } - case ColumnType::TIMESTAMP: { - // flatbuffer void - flatbuffers::Offset tz = 0; - if (!meta_timestamp_.timezone.empty()) { - tz = fbb().CreateString(meta_timestamp_.timezone); - } - - auto ts_meta = - fbs::CreateTimestampMetadata(fbb(), ToFlatbufferEnum(meta_timestamp_.unit), tz); - return ts_meta.Union(); - } - case ColumnType::DATE: { - auto date_meta = fbs::CreateDateMetadata(fbb()); - return date_meta.Union(); - } - case ColumnType::TIME: { - auto time_meta = fbs::CreateTimeMetadata(fbb(), ToFlatbufferEnum(meta_time_.unit)); - return time_meta.Union(); - } - default: - // null - return flatbuffers::Offset(); - } -} - -Status ColumnBuilder::Finish() { - FBB& buf = fbb(); - - // values - auto values = GetPrimitiveArray(buf, values_); - flatbuffers::Offset metadata = CreateColumnMetadata(); - - auto 
column = fbs::CreateColumn(buf, buf.CreateString(name_), values, - ToFlatbufferEnum(type_), // metadata_type - metadata, buf.CreateString(user_metadata_)); - - // bad coupling, but OK for now - parent_->add_column(column); - return Status::OK(); -} - -void ColumnBuilder::SetValues(const ArrayMetadata& values) { values_ = values; } - -void ColumnBuilder::SetUserMetadata(const std::string& data) { user_metadata_ = data; } +// ReaderV1 -void ColumnBuilder::SetCategory(const ArrayMetadata& levels, bool ordered) { - type_ = ColumnType::CATEGORY; - meta_category_.levels = levels; - meta_category_.ordered = ordered; -} - -void ColumnBuilder::SetTimestamp(TimeUnit::type unit) { - type_ = ColumnType::TIMESTAMP; - meta_timestamp_.unit = unit; -} - -void ColumnBuilder::SetTimestamp(TimeUnit::type unit, const std::string& timezone) { - SetTimestamp(unit); - meta_timestamp_.timezone = timezone; -} - -void ColumnBuilder::SetDate() { type_ = ColumnType::DATE; } - -void ColumnBuilder::SetTime(TimeUnit::type unit) { - type_ = ColumnType::TIME; - meta_time_.unit = unit; -} - -FBB& ColumnBuilder::fbb() { return *fbb_; } - -std::unique_ptr TableBuilder::AddColumn(const std::string& name) { - return std::unique_ptr(new ColumnBuilder(this, name)); -} - -// ---------------------------------------------------------------------- -// reader.cc - -class TableReader::TableReaderImpl { +class ReaderV1 : public Reader { public: - TableReaderImpl() {} - Status Open(const std::shared_ptr& source) { source_ = source; - int magic_size = static_cast(strlen(kFeatherMagicBytes)); - int footer_size = magic_size + static_cast(sizeof(uint32_t)); - - // Pathological issue where the file is smaller than ARROW_ASSIGN_OR_RAISE(int64_t size, source->GetSize()); - if (size < magic_size + footer_size) { - return Status::Invalid("File is too small to be a well-formed file"); - } - - ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(0, magic_size)); - - if (memcmp(buffer->data(), kFeatherMagicBytes, magic_size)) { - return Status::Invalid("Not a feather file"); - } + int magic_size = static_cast(strlen(kFeatherV1MagicBytes)); + int footer_size = magic_size + static_cast(sizeof(uint32_t)); // Now get the footer and verify - ARROW_ASSIGN_OR_RAISE(buffer, source->ReadAt(size - footer_size, footer_size)); + ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(size - footer_size, footer_size)); - if (memcmp(buffer->data() + sizeof(uint32_t), kFeatherMagicBytes, magic_size)) { + if (memcmp(buffer->data() + sizeof(uint32_t), kFeatherV1MagicBytes, magic_size)) { return Status::Invalid("Feather file footer incomplete"); } @@ -306,15 +161,28 @@ class TableReader::TableReaderImpl { return Status::Invalid("File is smaller than indicated metadata size"); } ARROW_ASSIGN_OR_RAISE( - buffer, source->ReadAt(size - footer_size - metadata_length, metadata_length)); + metadata_buffer_, + source->ReadAt(size - footer_size - metadata_length, metadata_length)); - metadata_.reset(new TableMetadata()); - return metadata_->Open(buffer); + metadata_ = fbs::GetCTable(metadata_buffer_->data()); + return ReadSchema(); + } + + Status ReadSchema() { + std::vector> fields; + for (int i = 0; i < static_cast(metadata_->columns()->size()); ++i) { + const fbs::Column* col = metadata_->columns()->Get(i); + std::shared_ptr type; + RETURN_NOT_OK( + GetDataType(col->values(), col->metadata_type(), col->metadata(), &type)); + fields.push_back(::arrow::field(col->name()->str(), type)); + } + schema_ = ::arrow::schema(fields); + return Status::OK(); } Status GetDataType(const 
fbs::PrimitiveArray* values, fbs::TypeMetadata metadata_type, - const void* metadata, std::shared_ptr* out, - std::shared_ptr* out_dictionary = nullptr) { + const void* metadata, std::shared_ptr* out) { #define PRIMITIVE_CASE(CAP_TYPE, FACTORY_FUNC) \ case fbs::Type::CAP_TYPE: \ *out = FACTORY_FUNC(); \ @@ -324,13 +192,11 @@ class TableReader::TableReaderImpl { case fbs::TypeMetadata::CategoryMetadata: { auto meta = static_cast(metadata); - std::shared_ptr index_type; + std::shared_ptr index_type, dict_type; RETURN_NOT_OK(GetDataType(values, fbs::TypeMetadata::NONE, nullptr, &index_type)); - RETURN_NOT_OK( - LoadValues(meta->levels(), fbs::TypeMetadata::NONE, nullptr, out_dictionary)); - - *out = dictionary(index_type, (*out_dictionary)->type(), meta->ordered()); + GetDataType(meta->levels(), fbs::TypeMetadata::NONE, nullptr, &dict_type)); + *out = dictionary(index_type, dict_type, meta->ordered()); break; } case fbs::TypeMetadata::TimestampMetadata: { @@ -380,16 +246,24 @@ class TableReader::TableReaderImpl { return Status::OK(); } + int64_t GetOutputLength(int64_t nbytes) { + // XXX: Hack for Feather 0.3.0 for backwards compatibility with old files + // Size in-file of written byte buffer + if (version() < 2) { + // Feather files < 0.3.0 + return nbytes; + } else { + return PaddedLength(nbytes); + } + } + // Retrieve a primitive array from the data source // // @returns: a Buffer instance, the precise type will depend on the kind of // input data source (which may or may not have memory-map like semantics) - Status LoadValues(const fbs::PrimitiveArray* meta, fbs::TypeMetadata metadata_type, - const void* metadata, std::shared_ptr* out) { - std::shared_ptr type; - std::shared_ptr dictionary; - RETURN_NOT_OK(GetDataType(meta, metadata_type, metadata, &type, &dictionary)); - + Status LoadValues(std::shared_ptr type, const fbs::PrimitiveArray* meta, + fbs::TypeMetadata metadata_type, const void* metadata, + std::shared_ptr* out) { std::vector> buffers; // Buffer data from the source (may or may not perform a copy depending on @@ -399,6 +273,11 @@ class TableReader::TableReaderImpl { int64_t offset = 0; + if (type->id() == Type::DICTIONARY) { + // Load the index type values + type = checked_cast(*type).index_type(); + } + // If there are nulls, the null bitmask is first if (meta->null_count() > 0) { int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(meta->length())); @@ -420,181 +299,131 @@ class TableReader::TableReaderImpl { buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset)); - auto arr_data = - ArrayData::Make(type, meta->length(), std::move(buffers), meta->null_count()); - arr_data->dictionary = dictionary; - *out = MakeArray(arr_data); + *out = ArrayData::Make(type, meta->length(), std::move(buffers), meta->null_count()); return Status::OK(); } - bool HasDescription() const { return metadata_->HasDescription(); } + int version() const override { return metadata_->version(); } + int64_t num_rows() const { return metadata_->num_rows(); } - std::string GetDescription() const { return metadata_->GetDescription(); } + std::shared_ptr schema() const override { return schema_; } - int version() const { return metadata_->version(); } - int64_t num_rows() const { return metadata_->num_rows(); } - int64_t num_columns() const { return metadata_->num_columns(); } + Status GetDictionary(int field_index, std::shared_ptr* out) { + const fbs::Column* col_meta = metadata_->columns()->Get(field_index); + auto dict_meta = col_meta->metadata_as(); + const auto& dict_type = 
+ checked_cast(*schema_->field(field_index)->type()); - std::string GetColumnName(int i) const { - const fbs::Column* col_meta = metadata_->column(i); - return col_meta->name()->str(); + std::shared_ptr out_data; + RETURN_NOT_OK(LoadValues(dict_type.value_type(), dict_meta->levels(), + fbs::TypeMetadata::NONE, nullptr, &out_data)); + *out = MakeArray(out_data); + return Status::OK(); } - Status GetColumn(int i, std::shared_ptr* out) { - const fbs::Column* col_meta = metadata_->column(i); + Status GetColumn(int field_index, std::shared_ptr* out) { + const fbs::Column* col_meta = metadata_->columns()->Get(field_index); + std::shared_ptr data; - // auto user_meta = column->user_metadata(); - // if (user_meta->size() > 0) { user_metadata_ = user_meta->str(); } + auto type = schema_->field(field_index)->type(); + RETURN_NOT_OK(LoadValues(type, col_meta->values(), col_meta->metadata_type(), + col_meta->metadata(), &data)); - std::shared_ptr values; - RETURN_NOT_OK(LoadValues(col_meta->values(), col_meta->metadata_type(), - col_meta->metadata(), &values)); - *out = std::make_shared(values); + if (type->id() == Type::DICTIONARY) { + RETURN_NOT_OK(GetDictionary(field_index, &data->dictionary)); + data->type = type; + } + *out = std::make_shared(MakeArray(data)); return Status::OK(); } - Status Read(std::shared_ptr* out) { - std::vector> fields; + Status Read(std::shared_ptr
* out) override { std::vector> columns; - for (int i = 0; i < num_columns(); ++i) { - std::shared_ptr column; - RETURN_NOT_OK(GetColumn(i, &column)); - columns.push_back(column); - fields.push_back(::arrow::field(GetColumnName(i), column->type())); + for (int i = 0; i < static_cast(metadata_->columns()->size()); ++i) { + columns.emplace_back(); + RETURN_NOT_OK(GetColumn(i, &columns.back())); } - *out = Table::Make(schema(fields), columns); + *out = Table::Make(this->schema(), columns, this->num_rows()); return Status::OK(); } - Status Read(const std::vector& indices, std::shared_ptr
* out) { + Status Read(const std::vector& indices, std::shared_ptr
* out) override { std::vector> fields; std::vector> columns; - for (int i = 0; i < num_columns(); ++i) { - bool found = false; - for (auto j : indices) { - if (i == j) { - found = true; - break; - } - } - if (!found) { - continue; + + auto my_schema = this->schema(); + for (auto field_index : indices) { + if (field_index < 0 || field_index >= my_schema->num_fields()) { + return Status::Invalid("Field index ", field_index, " is out of bounds"); } - std::shared_ptr column; - RETURN_NOT_OK(GetColumn(i, &column)); - columns.push_back(column); - fields.push_back(::arrow::field(GetColumnName(i), column->type())); + columns.emplace_back(); + RETURN_NOT_OK(GetColumn(field_index, &columns.back())); + fields.push_back(my_schema->field(field_index)); } - *out = Table::Make(schema(fields), columns); + *out = Table::Make(::arrow::schema(fields), columns, this->num_rows()); return Status::OK(); } - Status Read(const std::vector& names, std::shared_ptr
* out) { + Status Read(const std::vector& names, + std::shared_ptr
* out) override { std::vector> fields; std::vector> columns; - for (int i = 0; i < num_columns(); ++i) { - auto name = GetColumnName(i); - bool found = false; - for (auto& n : names) { - if (name == n) { - found = true; - break; - } - } - if (!found) { - continue; + + std::shared_ptr sch = this->schema(); + for (auto name : names) { + int field_index = sch->GetFieldIndex(name); + if (field_index == -1) { + return Status::Invalid("Field named ", name, " is not found"); } - std::shared_ptr column; - RETURN_NOT_OK(GetColumn(i, &column)); - columns.push_back(column); - fields.push_back(::arrow::field(name, column->type())); + columns.emplace_back(); + RETURN_NOT_OK(GetColumn(field_index, &columns.back())); + fields.push_back(sch->field(field_index)); } - *out = Table::Make(schema(fields), columns); + *out = Table::Make(::arrow::schema(fields), columns, this->num_rows()); return Status::OK(); } private: std::shared_ptr source_; - std::unique_ptr metadata_; - + std::shared_ptr metadata_buffer_; + const fbs::CTable* metadata_; std::shared_ptr schema_; }; // ---------------------------------------------------------------------- -// TableReader public API - -TableReader::TableReader() { impl_.reset(new TableReaderImpl()); } - -TableReader::~TableReader() {} - -Status TableReader::Open(const std::shared_ptr& source, - std::unique_ptr* out) { - out->reset(new TableReader()); - return (*out)->impl_->Open(source); -} - -bool TableReader::HasDescription() const { return impl_->HasDescription(); } - -std::string TableReader::GetDescription() const { return impl_->GetDescription(); } - -int TableReader::version() const { return impl_->version(); } - -int64_t TableReader::num_rows() const { return impl_->num_rows(); } - -int64_t TableReader::num_columns() const { return impl_->num_columns(); } - -std::string TableReader::GetColumnName(int i) const { return impl_->GetColumnName(i); } - -Status TableReader::GetColumn(int i, std::shared_ptr* out) { - return impl_->GetColumn(i, out); -} - -Status TableReader::Read(std::shared_ptr
* out) { return impl_->Read(out); }
-
-Status TableReader::Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) {
-  return impl_->Read(indices, out);
-}
-
-Status TableReader::Read(const std::vector<std::string>& names,
-                         std::shared_ptr<Table>
* out) {
-  return impl_->Read(names, out);
-}
+// ----------------------------------------------------------------------
+// WriterV1
+
+struct ArrayMetadata {
+  fbs::Type type;
+  int64_t offset;
+  int64_t length;
+  int64_t null_count;
+  int64_t total_bytes;
+};

-// ----------------------------------------------------------------------
-// writer.cc
-
-fbs::Type ToFlatbufferType(Type::type type) {
-  switch (type) {
-    case Type::BOOL:
-      return fbs::Type::BOOL;
-    case Type::INT8:
-      return fbs::Type::INT8;
-    case Type::INT16:
-      return fbs::Type::INT16;
-    case Type::INT32:
-      return fbs::Type::INT32;
-    case Type::INT64:
-      return fbs::Type::INT64;
-    case Type::UINT8:
-      return fbs::Type::UINT8;
-    case Type::UINT16:
-      return fbs::Type::UINT16;
-    case Type::UINT32:
-      return fbs::Type::UINT32;
-    case Type::UINT64:
-      return fbs::Type::UINT64;
-    case Type::FLOAT:
-      return fbs::Type::FLOAT;
-    case Type::DOUBLE:
-      return fbs::Type::DOUBLE;
+#define TO_FLATBUFFER_CASE(TYPE) \
+  case Type::TYPE:               \
+    return fbs::Type::TYPE;
+
+Result<fbs::Type> ToFlatbufferType(const DataType& type) {
+  switch (type.id()) {
+    TO_FLATBUFFER_CASE(BOOL);
+    TO_FLATBUFFER_CASE(INT8);
+    TO_FLATBUFFER_CASE(INT16);
+    TO_FLATBUFFER_CASE(INT32);
+    TO_FLATBUFFER_CASE(INT64);
+    TO_FLATBUFFER_CASE(UINT8);
+    TO_FLATBUFFER_CASE(UINT16);
+    TO_FLATBUFFER_CASE(UINT32);
+    TO_FLATBUFFER_CASE(UINT64);
+    TO_FLATBUFFER_CASE(FLOAT);
+    TO_FLATBUFFER_CASE(DOUBLE);
+    TO_FLATBUFFER_CASE(LARGE_BINARY);
+    TO_FLATBUFFER_CASE(BINARY);
    case Type::STRING:
      return fbs::Type::UTF8;
-    case Type::BINARY:
-      return fbs::Type::BINARY;
    case Type::LARGE_STRING:
      return fbs::Type::LARGE_UTF8;
-    case Type::LARGE_BINARY:
-      return fbs::Type::LARGE_BINARY;
    case Type::DATE32:
      return fbs::Type::INT32;
    case Type::TIMESTAMP:
@@ -604,291 +433,393 @@ fbs::Type ToFlatbufferType(Type::type type) {
    case Type::TIME64:
      return fbs::Type::INT64;
    default:
-      DCHECK(false) << "Cannot reach this code";
+      return Status::TypeError("Unsupported Feather V1 type: ", type.ToString(),
+                               ". Use V2 format to serialize all Arrow types.");
  }
-  // prevent compiler warning
-  return fbs::Type::MIN;
}
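Since the V1 type mapping is now fallible rather than DCHECK-terminated, callers consume it through Result. A minimal sketch of the calling pattern (the wrapper function is hypothetical; ARROW_ASSIGN_OR_RAISE is the same macro used elsewhere in this patch):

// Hypothetical caller: an unsupported V1 type now propagates as a
// Status::TypeError instead of tripping a debug-only assertion.
arrow::Status MapToV1Type(const arrow::DataType& type, fbs::Type* out) {
  ARROW_ASSIGN_OR_RAISE(*out, ToFlatbufferType(type));
  return arrow::Status::OK();
}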
-static Status SanitizeUnsupportedTypes(const Array& values, std::shared_ptr<Array>* out) {
-  if (values.type_id() == Type::NA) {
-    // As long as R doesn't support NA, we write this as a StringColumn
-    // to ensure stable roundtrips.
-    *out = std::make_shared<StringArray>(values.length(), nullptr, nullptr,
-                                         values.null_bitmap(), values.null_count());
-    return Status::OK();
-  } else {
-    *out = MakeArray(values.data());
-    return Status::OK();
-  }
+static inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
+    FBB& fbb, const ArrayMetadata& array) {
+  return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding::PLAIN, array.offset,
+                                   array.length, array.null_count, array.total_bytes);
}

-class TableWriter::TableWriterImpl : public ArrayVisitor {
- public:
-  TableWriterImpl() : initialized_stream_(false), metadata_(0) {}
-
-  Status Open(const std::shared_ptr<io::OutputStream>& stream) {
-    stream_ = stream;
-    return Status::OK();
-  }
-
-  void SetDescription(const std::string& desc) { metadata_.SetDescription(desc); }
-
-  void SetNumRows(int64_t num_rows) { metadata_.SetNumRows(num_rows); }
-
-  Status Finalize() {
-    RETURN_NOT_OK(CheckStarted());
-    RETURN_NOT_OK(metadata_.Finish());
+// Convert Feather enums to Flatbuffer enums
+static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
+  return static_cast<fbs::TimeUnit>(static_cast<int>(unit));
+}

-    auto buffer = metadata_.GetBuffer();
+const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = {
+    fbs::TypeMetadata::NONE,               // PRIMITIVE
+    fbs::TypeMetadata::CategoryMetadata,   // CATEGORY
+    fbs::TypeMetadata::TimestampMetadata,  // TIMESTAMP
+    fbs::TypeMetadata::DateMetadata,       // DATE
+    fbs::TypeMetadata::TimeMetadata        // TIME
+};

-    // Writer metadata
-    int64_t bytes_written;
-    RETURN_NOT_OK(
-        WritePadded(stream_.get(), buffer->data(), buffer->size(), &bytes_written));
-    uint32_t buffer_size = static_cast<uint32_t>(bytes_written);
+static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
+  return COLUMN_TYPE_ENUM_MAPPING[column_type];
+}

-    // Footer: metadata length, magic bytes
-    RETURN_NOT_OK(stream_->Write(&buffer_size, sizeof(uint32_t)));
-    return stream_->Write(kFeatherMagicBytes, strlen(kFeatherMagicBytes));
-  }
+struct ColumnMetadata {
+  flatbuffers::Offset<void> WriteMetadata(FBB& fbb) {  // NOLINT
+    switch (this->meta_type) {
+      case ColumnType::PRIMITIVE:
+        // flatbuffer void
+        return 0;
+      case ColumnType::CATEGORY: {
+        auto cat_meta = fbs::CreateCategoryMetadata(
+            fbb, GetPrimitiveArray(fbb, this->category_levels), this->category_ordered);
+        return cat_meta.Union();
+      }
+      case ColumnType::TIMESTAMP: {
+        // flatbuffer void
+        flatbuffers::Offset<flatbuffers::String> tz = 0;
+        if (!this->timezone.empty()) {
+          tz = fbb.CreateString(this->timezone);
+        }

-  Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) {
-    if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()) ||
-          is_large_binary_like(values.type_id()))) {
-      return Status::Invalid("Array is not primitive type: ", values.type()->ToString());
+        auto ts_meta =
+            fbs::CreateTimestampMetadata(fbb, ToFlatbufferEnum(this->temporal_unit), tz);
+        return ts_meta.Union();
+      }
+      case ColumnType::DATE: {
+        auto date_meta = fbs::CreateDateMetadata(fbb);
+        return date_meta.Union();
+      }
+      case ColumnType::TIME: {
+        auto time_meta =
+            fbs::CreateTimeMetadata(fbb, ToFlatbufferEnum(this->temporal_unit));
+        return time_meta.Union();
+      }
+      default:
+        // null
+        DCHECK(false);
+        return 0;
    }
+  }

-    meta->type = ToFlatbufferType(values.type_id());
+  ArrayMetadata values;
+  ColumnType::type meta_type;

-    ARROW_ASSIGN_OR_RAISE(meta->offset, stream_->Tell());
+  ArrayMetadata category_levels;
+  bool category_ordered;

-    meta->length = values.length();
-    meta->null_count = values.null_count();
-    meta->total_bytes = 0;
+  TimeUnit::type temporal_unit;

-    return Status::OK();
-  }
- template <typename ArrayType>
-  Status WriteBinaryArray(const ArrayType& values, ArrayMetadata* meta,
-                          const uint8_t** values_buffer, int64_t* values_bytes,
-                          int64_t* bytes_written) {
-    using offset_type = typename ArrayType::offset_type;
+  // A timezone name known to the Olson timezone database. For display purposes
+  // because the actual data is all UTC
+  std::string timezone;
+};

-    int64_t offset_bytes = sizeof(offset_type) * (values.length() + 1);
+Status WriteArrayV1(const Array& values, io::OutputStream* dst, ArrayMetadata* meta);

-    if (values.value_offsets()) {
-      *values_bytes = values.raw_value_offsets()[values.length()];
+struct ArrayWriterV1 {
+  const Array& values;
+  io::OutputStream* dst;
+  ArrayMetadata* meta;

-      // Write the variable-length offsets
-      RETURN_NOT_OK(WritePadded(
-          stream_.get(), reinterpret_cast<const uint8_t*>(values.raw_value_offsets()),
-          offset_bytes, bytes_written));
+  Status WriteBuffer(const uint8_t* buffer, int64_t length, int64_t bit_offset) {
+    int64_t bytes_written = 0;
+    if (buffer) {
+      RETURN_NOT_OK(
+          WritePaddedWithOffset(dst, buffer, bit_offset, length, &bytes_written));
    } else {
-      RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, bytes_written));
+      RETURN_NOT_OK(WritePaddedBlank(dst, length, &bytes_written));
    }
-    meta->total_bytes += *bytes_written;
+    meta->total_bytes += bytes_written;
+    return Status::OK();
+  }

-    if (values.value_data()) {
-      *values_buffer = values.value_data()->data();
+  template <typename T>
+  typename std::enable_if<
+      is_nested_type<T>::value || is_null_type<T>::value || is_decimal_type<T>::value ||
+          std::is_same<DictionaryType, T>::value || is_duration_type<T>::value ||
+          is_interval_type<T>::value || is_fixed_size_binary_type<T>::value ||
+          std::is_same<Date64Type, T>::value || std::is_same<Time64Type, T>::value ||
+          std::is_same<ExtensionType, T>::value,
+      Status>::type
+  Visit(const T& type) {
+    return Status::NotImplemented(type.ToString());
+  }
+
+  template <typename T>
+  typename std::enable_if<is_number_type<T>::value ||
+                              std::is_same<Date32Type, T>::value ||
+                              std::is_same<Time32Type, T>::value ||
+                              is_timestamp_type<T>::value || is_boolean_type<T>::value,
+                          Status>::type
+  Visit(const T&) {
+    const auto& prim_values = checked_cast<const PrimitiveArray&>(values);
+    const auto& fw_type = checked_cast<const FixedWidthType&>(*values.type());
+
+    if (prim_values.values()) {
+      const uint8_t* buffer =
+          prim_values.values()->data() + (prim_values.offset() * fw_type.bit_width() / 8);
+      int64_t bit_offset = (prim_values.offset() * fw_type.bit_width()) % 8;
+      return WriteBuffer(buffer,
+                         BitUtil::BytesForBits(values.length() * fw_type.bit_width()),
+                         bit_offset);
+    } else {
+      return Status::OK();
    }
    return Status::OK();
  }

-  Status WriteArray(const Array& values, ArrayMetadata* meta) {
-    RETURN_NOT_OK(CheckStarted());
-    RETURN_NOT_OK(LoadArrayMetadata(values, meta));
-
-    int64_t bytes_written;
-
-    // Write the null bitmask
-    if (values.null_count() > 0) {
-      // We assume there is one bit for each value in values.nulls,
-      // starting at the zero offset.
-      int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(values.length()));
-      if (values.null_bitmap()) {
-        auto null_bitmap = values.null_bitmap();
-        RETURN_NOT_OK(WritePaddedWithOffset(stream_.get(), null_bitmap->data(),
-                                            values.offset(), null_bitmap_size,
-                                            &bytes_written));
-      } else {
-        RETURN_NOT_OK(WritePaddedBlank(stream_.get(), null_bitmap_size, &bytes_written));
-      }
-      meta->total_bytes += bytes_written;
-    }
+  template <typename T>
+  enable_if_base_binary<T, Status> Visit(const T&) {
+    using ArrayType = typename TypeTraits<T>::ArrayType;
+    const auto& ty_values = checked_cast<const ArrayType&>(values);
+    using offset_type = typename T::offset_type;

+    const offset_type* offsets_data = nullptr;
    int64_t values_bytes = 0;
-    int64_t bit_offset = 0;
+    if (ty_values.value_offsets()) {
+      offsets_data = ty_values.raw_value_offsets();
+      // All of the data has to be written because we don't have offset
+      // shifting implemented here as with the IPC format
+      values_bytes = offsets_data[values.length()];
+    }
+    RETURN_NOT_OK(WriteBuffer(reinterpret_cast<const uint8_t*>(offsets_data),
+                              sizeof(offset_type) * (values.length() + 1),
+                              /*bit_offset=*/0));

    const uint8_t* values_buffer = nullptr;
+    if (ty_values.value_data()) {
+      values_buffer = ty_values.value_data()->data();
+    }
+    return WriteBuffer(values_buffer, values_bytes, /*bit_offset=*/0);
+  }

-    if (is_binary_like(values.type_id())) {
-      RETURN_NOT_OK(WriteBinaryArray(checked_cast<const BinaryArray&>(values), meta,
-                                     &values_buffer, &values_bytes, &bytes_written));
-    } else if (is_large_binary_like(values.type_id())) {
-      RETURN_NOT_OK(WriteBinaryArray(checked_cast<const LargeBinaryArray&>(values), meta,
-                                     &values_buffer, &values_bytes, &bytes_written));
-    } else {
-      const auto& prim_values = checked_cast<const PrimitiveArray&>(values);
-      const auto& fw_type = checked_cast<const FixedWidthType&>(*values.type());
+  Status Write() {
+    if (values.type_id() == Type::DICTIONARY) {
+      return WriteArrayV1(*(checked_cast<const DictionaryArray&>(values).indices()), dst,
+                          meta);
+    }

-      values_bytes = BitUtil::BytesForBits(values.length() * fw_type.bit_width());
+    ARROW_ASSIGN_OR_RAISE(meta->type, ToFlatbufferType(*values.type()));
+    ARROW_ASSIGN_OR_RAISE(meta->offset, dst->Tell());
+    meta->length = values.length();
+    meta->null_count = values.null_count();
+    meta->total_bytes = 0;

-      if (prim_values.values()) {
-        values_buffer = prim_values.values()->data() +
-                        (prim_values.offset() * fw_type.bit_width() / 8);
-        bit_offset = (prim_values.offset() * fw_type.bit_width()) % 8;
-      }
-    }
-    if (values_buffer) {
-      RETURN_NOT_OK(WritePaddedWithOffset(stream_.get(), values_buffer, bit_offset,
-                                          values_bytes, &bytes_written));
-    } else {
-      RETURN_NOT_OK(WritePaddedBlank(stream_.get(), values_bytes, &bytes_written));
+    // Write the null bitmask
+    if (values.null_count() > 0) {
+      RETURN_NOT_OK(WriteBuffer(values.null_bitmap_data(),
+                                BitUtil::BytesForBits(values.length()), values.offset()));
    }
-    meta->total_bytes += bytes_written;
-
-    return Status::OK();
+    // Write data buffer(s)
+    return VisitTypeInline(*values.type(), this);
  }
+};
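The slice arithmetic in the fixed-width Visit() overload above deserves a worked example; assuming a boolean array (bit_width = 1) sliced at offset 10:

// Illustration of the offset math used in Visit() above:
//   byte offset = 10 * 1 / 8 = 1   -> start reading at buffer byte 1
//   bit_offset  = 10 * 1 % 8 = 2   -> 2 residual bits within that byte
// WriteBuffer() forwards this residual bit offset to WritePaddedWithOffset().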
-  Status WritePrimitiveValues(const Array& values) {
-    // Prepare metadata payload
-    ArrayMetadata meta;
-    RETURN_NOT_OK(WriteArray(values, &meta));
-    current_column_->SetValues(meta);
-    return Status::OK();
-  }
+Status WriteArrayV1(const Array& values, io::OutputStream* dst, ArrayMetadata* meta) {
+  std::shared_ptr<Array> sanitized;
+  if (values.type_id() == Type::NA) {
+    // As long as R doesn't support NA, we write this as a StringColumn
+    // to ensure stable roundtrips.
+    sanitized = std::make_shared<StringArray>(values.length(), nullptr, nullptr,
+                                              values.null_bitmap(), values.null_count());
+  } else {
+    sanitized = MakeArray(values.data());
  }
+  ArrayWriterV1 visitor{*sanitized, dst, meta};
+  return visitor.Write();
+}

-  Status Visit(const NullArray& values) override {
-    std::shared_ptr<Array> sanitized_nulls;
-    RETURN_NOT_OK(SanitizeUnsupportedTypes(values, &sanitized_nulls));
-    return WritePrimitiveValues(*sanitized_nulls);
+Status WriteColumnV1(const ChunkedArray& values, io::OutputStream* dst,
+                     ColumnMetadata* out) {
+  if (values.num_chunks() > 1) {
+    return Status::Invalid("Writing chunked arrays not supported in Feather V1");
+  }
+  const Array& chunk = *values.chunk(0);
+  RETURN_NOT_OK(WriteArrayV1(chunk, dst, &out->values));
+  switch (chunk.type_id()) {
+    case Type::DICTIONARY: {
+      out->meta_type = ColumnType::CATEGORY;
+      auto dictionary = checked_cast<const DictionaryArray&>(chunk).dictionary();
+      RETURN_NOT_OK(WriteArrayV1(*dictionary, dst, &out->category_levels));
+      out->category_ordered =
+          checked_cast<const DictionaryType&>(*chunk.type()).ordered();
+    } break;
+    case Type::DATE32:
+      out->meta_type = ColumnType::DATE;
+      break;
+    case Type::TIME32: {
+      out->meta_type = ColumnType::TIME;
+      out->temporal_unit = checked_cast<const Time32Type&>(*chunk.type()).unit();
+    } break;
+    case Type::TIMESTAMP: {
+      const auto& ts_type = checked_cast<const TimestampType&>(*chunk.type());
+      out->meta_type = ColumnType::TIMESTAMP;
+      out->temporal_unit = ts_type.unit();
+      out->timezone = ts_type.timezone();
+    } break;
+    default:
+      out->meta_type = ColumnType::PRIMITIVE;
+      break;
  }
+  return Status::OK();
+}

-#define VISIT_PRIMITIVE(TYPE) \
-  Status Visit(const TYPE& values) override { return WritePrimitiveValues(values); }
-
-  VISIT_PRIMITIVE(BooleanArray)
-  VISIT_PRIMITIVE(Int8Array)
-  VISIT_PRIMITIVE(Int16Array)
-  VISIT_PRIMITIVE(Int32Array)
-  VISIT_PRIMITIVE(Int64Array)
-  VISIT_PRIMITIVE(UInt8Array)
-  VISIT_PRIMITIVE(UInt16Array)
-  VISIT_PRIMITIVE(UInt32Array)
-  VISIT_PRIMITIVE(UInt64Array)
-  VISIT_PRIMITIVE(FloatArray)
-  VISIT_PRIMITIVE(DoubleArray)
-  VISIT_PRIMITIVE(BinaryArray)
-  VISIT_PRIMITIVE(StringArray)
-  VISIT_PRIMITIVE(LargeBinaryArray)
-  VISIT_PRIMITIVE(LargeStringArray)
-
-#undef VISIT_PRIMITIVE
-
-  Status Visit(const DictionaryArray& values) override {
-    const auto& dict_type = checked_cast<const DictionaryType&>(*values.type());
-
-    if (!is_integer(values.indices()->type_id())) {
-      return Status::Invalid("Category values must be integers");
-    }
+Status WriteFeatherV1(const Table& table, io::OutputStream* dst) {
+  // Preamble
+  int64_t bytes_written;
+  RETURN_NOT_OK(WritePadded(dst, reinterpret_cast<const uint8_t*>(kFeatherV1MagicBytes),
+                            strlen(kFeatherV1MagicBytes), &bytes_written));
+
+  // Write columns
+  flatbuffers::FlatBufferBuilder fbb;
+  std::vector<flatbuffers::Offset<fbs::Column>> fb_columns;
+  for (int i = 0; i < table.num_columns(); ++i) {
+    ColumnMetadata col;
+    RETURN_NOT_OK(WriteColumnV1(*table.column(i), dst, &col));
+    auto fb_column = fbs::CreateColumn(
+        fbb, fbb.CreateString(table.field(i)->name()), GetPrimitiveArray(fbb, col.values),
+        ToFlatbufferEnum(col.meta_type), col.WriteMetadata(fbb),
+        /*user_metadata=*/0);
+    fb_columns.push_back(fb_column);
+  }

-    RETURN_NOT_OK(WritePrimitiveValues(*values.indices()));
+  // Finalize file footer
+  auto root = fbs::CreateCTable(fbb, /*description=*/0, table.num_rows(),
+                                fbb.CreateVector(fb_columns), kFeatherV1Version,
+                                /*metadata=*/0);
+  fbb.Finish(root);
+  auto buffer = std::make_shared<Buffer>(fbb.GetBufferPointer(),
+                                         static_cast<int64_t>(fbb.GetSize()));

-    ArrayMetadata levels_meta;
-    std::shared_ptr<Array> sanitized_dictionary;
-    RETURN_NOT_OK(SanitizeUnsupportedTypes(*values.dictionary(), &sanitized_dictionary));
-    RETURN_NOT_OK(WriteArray(*sanitized_dictionary, &levels_meta));
-    current_column_->SetCategory(levels_meta, dict_type.ordered());
+  // Writer metadata
+  RETURN_NOT_OK(WritePadded(dst, buffer->data(), buffer->size(), &bytes_written));
+  uint32_t metadata_size = static_cast<uint32_t>(bytes_written);
+  // Footer: metadata length, magic bytes
+  RETURN_NOT_OK(dst->Write(&metadata_size, sizeof(uint32_t)));
+  return dst->Write(kFeatherV1MagicBytes, strlen(kFeatherV1MagicBytes));
+}
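For orientation, the byte layout that WriteFeatherV1 produces, reconstructed from the writer code above:

//   "FEA1"                  leading magic, padded to 8 bytes
//   <column value buffers>  each padded via WritePadded*/WritePaddedBlank
//   <CTable flatbuffer>     file metadata, padded
//   uint32_t metadata_size  byte length of the metadata block
//   "FEA1"                  trailing magic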
+// ----------------------------------------------------------------------
+// Reader V2
+
+class ReaderV2 : public Reader {
+ public:
+  Status Open(const std::shared_ptr<io::RandomAccessFile>& source) {
+    source_ = source;
+    ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(source_));
+    schema_ = reader->schema();
    return Status::OK();
  }

-  Status Visit(const TimestampArray& values) override {
-    RETURN_NOT_OK(WritePrimitiveValues(values));
-    const auto& ts_type = checked_cast<const TimestampType&>(*values.type());
-    current_column_->SetTimestamp(ts_type.unit(), ts_type.timezone());
-    return Status::OK();
-  }
+  int version() const override { return kFeatherV2Version; }

-  Status Visit(const Date32Array& values) override {
-    RETURN_NOT_OK(WritePrimitiveValues(values));
-    current_column_->SetDate();
-    return Status::OK();
-  }
+  std::shared_ptr<Schema> schema() const override { return schema_; }

-  Status Visit(const Time32Array& values) override {
-    RETURN_NOT_OK(WritePrimitiveValues(values));
-    auto unit = checked_cast<const Time32Type&>(*values.type()).unit();
-    current_column_->SetTime(unit);
-    return Status::OK();
+  Status Read(const IpcReadOptions& options, std::shared_ptr<Table>* out) {
+    ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(source_, options));
+    std::vector<std::shared_ptr<RecordBatch>> batches;
+    for (int i = 0; i < reader->num_record_batches(); ++i) {
+      std::shared_ptr<RecordBatch> batch;
+      RETURN_NOT_OK(reader->ReadRecordBatch(i, &batch));
+      batches.emplace_back(batch);
+    }
+
+    // XXX: Handle included_fields in RecordBatchFileReader::schema
+    auto out_schema = reader->schema();
+    if (options.included_fields) {
+      const auto& indices = *options.included_fields;
+      std::vector<std::shared_ptr<Field>> fields;
+      for (int i = 0; i < out_schema->num_fields(); ++i) {
+        if (std::find(indices.begin(), indices.end(), i) != indices.end()) {
+          fields.push_back(out_schema->field(i));
+        }
+      }
+      out_schema = ::arrow::schema(fields, out_schema->metadata());
+    }
+    return Table::FromRecordBatches(out_schema, batches, out);
  }

-  Status Visit(const Time64Array& values) override {
-    return Status::NotImplemented("time64");
+  Status Read(std::shared_ptr<Table>
* out) override {
+    return Read(IpcReadOptions::Defaults(), out);
  }

-  Status Append(const std::string& name, const Array& values) {
-    current_column_ = metadata_.AddColumn(name);
-    RETURN_NOT_OK(values.Accept(this));
-    return current_column_->Finish();
+  Status Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) override {
+    auto options = IpcReadOptions::Defaults();
+    options.included_fields = indices;
+    return Read(options, out);
  }

-  Status Write(const Table& table) {
-    for (int i = 0; i < table.num_columns(); ++i) {
-      auto column = table.column(i);
-      current_column_ = metadata_.AddColumn(table.field(i)->name());
-      for (const auto chunk : column->chunks()) {
-        RETURN_NOT_OK(chunk->Accept(this));
+  Status Read(const std::vector<std::string>& names,
+              std::shared_ptr<Table>
* out) override {
+    std::vector<int> indices;
+    std::shared_ptr<Schema> sch = this->schema();
+    for (auto name : names) {
+      int field_index = sch->GetFieldIndex(name);
+      if (field_index == -1) {
+        return Status::Invalid("Field named ", name, " is not found");
      }
-      RETURN_NOT_OK(current_column_->Finish());
+      indices.push_back(field_index);
    }
-    return Status::OK();
+    return Read(indices, out);
  }

 private:
-  Status CheckStarted() {
-    if (!initialized_stream_) {
-      int64_t bytes_written_unused;
-      RETURN_NOT_OK(WritePadded(stream_.get(),
-                                reinterpret_cast<const uint8_t*>(kFeatherMagicBytes),
-                                strlen(kFeatherMagicBytes), &bytes_written_unused));
-      initialized_stream_ = true;
-    }
-    return Status::OK();
-  }
-
-  std::shared_ptr<io::OutputStream> stream_;
-
-  bool initialized_stream_;
-  TableBuilder metadata_;
-
-  std::unique_ptr<ColumnBuilder> current_column_;
-
-  Status AppendPrimitive(const PrimitiveArray& values, ArrayMetadata* out);
+  std::shared_ptr<io::RandomAccessFile> source_;
+  std::shared_ptr<Schema> schema_;
};

-TableWriter::TableWriter() { impl_.reset(new TableWriterImpl()); }
-
-TableWriter::~TableWriter() {}
-
-Status TableWriter::Open(const std::shared_ptr<io::OutputStream>& stream,
-                         std::unique_ptr<TableWriter>* out) {
-  out->reset(new TableWriter());
-  return (*out)->impl_->Open(stream);
+Result<std::shared_ptr<Reader>> Reader::Open(
+    const std::shared_ptr<io::RandomAccessFile>& source) {
+  // Pathological issue where the file is smaller than header and footer
+  // combined
+  ARROW_ASSIGN_OR_RAISE(int64_t size, source->GetSize());
+  if (size < /* 2 * 4 + 4 */ 12) {
+    return Status::Invalid("File is too small to be a well-formed file");
+  }
+
+  // Determine what kind of file we have. 6 is the max of len(FEA1) and
+  // len(ARROW1)
+  constexpr int magic_size = 6;
+  ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(0, magic_size));
+
+  if (memcmp(buffer->data(), kFeatherV1MagicBytes, strlen(kFeatherV1MagicBytes)) == 0) {
+    std::shared_ptr<ReaderV1> result = std::make_shared<ReaderV1>();
+    RETURN_NOT_OK(result->Open(source));
+    return result;
+  } else if (memcmp(buffer->data(), internal::kArrowMagicBytes,
+                    strlen(internal::kArrowMagicBytes)) == 0) {
+    std::shared_ptr<ReaderV2> result = std::make_shared<ReaderV2>();
+    RETURN_NOT_OK(result->Open(source));
+    return result;
+  } else {
+    return Status::Invalid("Not a Feather V1 or Arrow IPC file");
+  }
}

-void TableWriter::SetDescription(const std::string& desc) { impl_->SetDescription(desc); }
-
-void TableWriter::SetNumRows(int64_t num_rows) { impl_->SetNumRows(num_rows); }
-
-Status TableWriter::Append(const std::string& name, const Array& values) {
-  return impl_->Append(name, values);
+WriteProperties WriteProperties::Defaults() {
+  WriteProperties result;
+#ifdef ARROW_WITH_LZ4
+  result.compression = Compression::LZ4;
+#else
+  result.compression = Compression::UNCOMPRESSED;
+#endif
+  return result;
}

-Status TableWriter::Write(const Table& table) { return impl_->Write(table); }
+Status WriteTable(const Table& table, io::OutputStream* dst,
+                  const WriteProperties& properties) {
+  if (properties.version == kFeatherV1Version) {
+    return WriteFeatherV1(table, dst);
+  } else {
+    IpcWriteOptions ipc_options = IpcWriteOptions::Defaults();
+    ipc_options.compression = properties.compression;
+    ipc_options.compression_level = properties.compression_level;

-Status TableWriter::Finalize() { return impl_->Finalize(); }
+    std::shared_ptr<RecordBatchWriter> writer;
+    ARROW_ASSIGN_OR_RAISE(writer, NewFileWriter(dst, table.schema(), ipc_options));
+    RETURN_NOT_OK(writer->WriteTable(table, properties.chunksize));
+    return writer->Close();
+  }
}

} // namespace feather
} // namespace ipc
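A minimal end-to-end sketch of the resulting public API, writing a table with default properties and reading one column back by name (the file path and column name are hypothetical; assumes the usual arrow/io/file.h and arrow/ipc/feather.h headers):

arrow::Status RoundTrip(const std::shared_ptr<arrow::Table>& table,
                        std::shared_ptr<arrow::Table>* out) {
  namespace feather = arrow::ipc::feather;
  ARROW_ASSIGN_OR_RAISE(auto sink,
                        arrow::io::FileOutputStream::Open("example.feather"));
  RETURN_NOT_OK(feather::WriteTable(*table, sink.get(),
                                    feather::WriteProperties::Defaults()));
  RETURN_NOT_OK(sink->Close());
  ARROW_ASSIGN_OR_RAISE(auto source,
                        arrow::io::ReadableFile::Open("example.feather"));
  ARROW_ASSIGN_OR_RAISE(auto reader, feather::Reader::Open(source));
  return reader->Read(std::vector<std::string>{"f0"}, out);  // "f0" assumed
}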
diff --git a/cpp/src/arrow/ipc/feather.fbs b/cpp/src/arrow/ipc/feather.fbs
index 5ec06299864..b4076be8757 100644
--- a/cpp/src/arrow/ipc/feather.fbs
+++ b/cpp/src/arrow/ipc/feather.fbs
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.

+/// DEPRECATED: Feather V2 is available starting in version 0.17.0 and does not
+/// use this file at all.
+
 namespace arrow.ipc.feather.fbs;

 /// Feather is an experimental serialization format implemented using
@@ -141,6 +144,9 @@ table CTable {
   columns: [Column];

   /// Version number of the Feather format
+  ///
+  /// Internal versions 0, 1, and 2: Implemented in Apache Arrow <= 0.16.0 and
+  /// wesm/feather. Uses "custom" metadata defined in this file.
   version: int;

   /// Table metadata (likely JSON), not yet used
diff --git a/cpp/src/arrow/ipc/feather.h b/cpp/src/arrow/ipc/feather.h
index 28b4b43c671..991f55f5bb2 100644
--- a/cpp/src/arrow/ipc/feather.h
+++ b/cpp/src/arrow/ipc/feather.h
@@ -25,12 +25,13 @@
 #include
 #include

+#include "arrow/result.h"
+#include "arrow/util/compression.h"
 #include "arrow/util/visibility.h"

 namespace arrow {

-class Array;
-class ChunkedArray;
+class Schema;
 class Status;
 class Table;
@@ -44,53 +45,29 @@ class RandomAccessFile;
 namespace ipc {
 namespace feather {

-static constexpr const int kFeatherVersion = 2;
+static constexpr const int kFeatherV1Version = 2;
+static constexpr const int kFeatherV2Version = 3;

 // ----------------------------------------------------------------------
 // Metadata accessor classes

-/// \class TableReader
+/// \class Reader
 /// \brief An interface for reading columns from Feather files
-class ARROW_EXPORT TableReader {
+class ARROW_EXPORT Reader {
  public:
-  TableReader();
-  ~TableReader();
+  virtual ~Reader() = default;

   /// \brief Open a Feather file from a RandomAccessFile interface
   ///
   /// \param[in] source a RandomAccessFile instance
-  /// \param[out] out the table reader
-  static Status Open(const std::shared_ptr<io::RandomAccessFile>& source,
-                     std::unique_ptr<TableReader>* out);
-
-  /// \brief Optional table description
-  ///
-  /// This does not return a const std::string& because a string has to be
-  /// copied from the flatbuffer to be able to return a non-flatbuffer type
-  std::string GetDescription() const;
-
-  /// \brief Return true if the table has a description field populated
-  bool HasDescription() const;
+  /// \return the table reader
+  static Result<std::shared_ptr<Reader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& source);

   /// \brief Return the version number of the Feather file
-  int version() const;
-
-  /// \brief Return the number of rows in the file
-  int64_t num_rows() const;
-
-  /// \brief Return the number of columns in the file
-  int64_t num_columns() const;
-
-  std::string GetColumnName(int i) const;
+  virtual int version() const = 0;

-  /// \brief Read a column from the file as an arrow::ChunkedArray.
-  ///
-  /// \param[in] i the column index to read
-  /// \param[out] out the returned column
-  /// \return Status
-  ///
-  /// This function is zero-copy if the file source supports zero-copy reads
-  Status GetColumn(int i, std::shared_ptr<ChunkedArray>* out);
+  virtual std::shared_ptr<Schema> schema() const = 0;

   /// \brief Read all columns from the file as an arrow::Table.
   ///
@@ -98,7 +75,7 @@ class ARROW_EXPORT TableReader {
   /// \return Status
   ///
   /// This function is zero-copy if the file source supports zero-copy reads
-  Status Read(std::shared_ptr<Table>
* out);
+  virtual Status Read(std::shared_ptr<Table>
* out) = 0;

   /// \brief Read only the specified columns from the file as an arrow::Table.
   ///
@@ -107,7 +84,7 @@
   /// \return Status
   ///
   /// This function is zero-copy if the file source supports zero-copy reads
-  Status Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out);
+  virtual Status Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) = 0;

   /// \brief Read only the specified columns from the file as an arrow::Table.
   ///
@@ -116,55 +93,48 @@
   /// \return Status
   ///
   /// This function is zero-copy if the file source supports zero-copy reads
-  Status Read(const std::vector<std::string>& names, std::shared_ptr<Table>
* out);
-
- private:
-  class ARROW_NO_EXPORT TableReaderImpl;
-  std::unique_ptr<TableReaderImpl> impl_;
+  virtual Status Read(const std::vector<std::string>& names,
+                      std::shared_ptr<Table>
* out) = 0;
};

-/// \class TableWriter
-/// \brief Interface for writing Feather files
-class ARROW_EXPORT TableWriter {
- public:
-  ~TableWriter();
+struct ARROW_EXPORT WriteProperties {
+  static WriteProperties Defaults();

-  /// \brief Create a new TableWriter that writes to an OutputStream
-  /// \param[in] stream an output stream
-  /// \param[out] out the returned table writer
-  /// \return Status
-  static Status Open(const std::shared_ptr<io::OutputStream>& stream,
-                     std::unique_ptr<TableWriter>* out);
-
-  /// \brief Set the description field in the file metadata
-  void SetDescription(const std::string& desc);
-
-  /// \brief Set the number of rows in the file
-  void SetNumRows(int64_t num_rows);
-
-  /// \brief Append a column to the file
-  ///
-  /// \param[in] name the column name
-  /// \param[in] values the column values as a contiguous arrow::Array
-  /// \return Status
-  Status Append(const std::string& name, const Array& values);
+  static WriteProperties DefaultsV1() {
+    WriteProperties props = Defaults();
+    props.version = kFeatherV1Version;
+    return props;
+  }

-  /// \brief Write a table to the file
+  /// Feather file version number
   ///
-  /// \param[in] table the table to be written
-  /// \return Status
-  Status Write(const Table& table);
-
-  /// \brief Finalize the file by writing the file metadata and footer
-  /// \return Status
-  Status Finalize();
-
- private:
-  TableWriter();
-  class ARROW_NO_EXPORT TableWriterImpl;
-  std::unique_ptr<TableWriterImpl> impl_;
+  /// version 2: "Feather V1" Apache Arrow <= 0.16.0
+  /// version 3: "Feather V2" Apache Arrow > 0.16.0
+  int version = kFeatherV2Version;
+
+  // Parameters for Feather V2 only
+
+  /// Number of rows per intra-file chunk. Use smaller chunksize when you need
+  /// faster random row access
+  int64_t chunksize = 1LL << 16;
+
+  /// Compression type to use. Only UNCOMPRESSED, LZ4, and ZSTD are
+  /// supported. The default compression returned by Defaults() is LZ4 if the
+  /// project is built with support for it, otherwise
+  /// UNCOMPRESSED. UNCOMPRESSED is set as the object default here so that if
+  /// WriteProperties::Defaults() is not used, the default constructor for
+  /// WriteProperties will work regardless of the options used to build the C++
+  /// project.
+  Compression::type compression = Compression::UNCOMPRESSED;
+
+  /// Compressor-specific compression level
+  int compression_level = Compression::kUseDefaultCompressionLevel;
};

+ARROW_EXPORT
+Status WriteTable(const Table& table, io::OutputStream* dst,
+                  const WriteProperties& properties = WriteProperties::Defaults());
+
} // namespace feather
} // namespace ipc
} // namespace arrow
diff --git a/cpp/src/arrow/ipc/feather_internal.h b/cpp/src/arrow/ipc/feather_internal.h
deleted file mode 100644
index 3c2463057d5..00000000000
--- a/cpp/src/arrow/ipc/feather_internal.h
+++ /dev/null
@@ -1,233 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.
See the License for the -// specific language governing permissions and limitations -// under the License. - -// Public API for the "Feather" file format, originally created at -// http://github.com/wesm/feather - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#include "arrow/buffer.h" -#include "arrow/ipc/feather.h" -#include "arrow/type.h" - -#include "generated/feather_generated.h" - -namespace arrow { -namespace ipc { -namespace feather { - -typedef std::vector> ColumnVector; -typedef flatbuffers::FlatBufferBuilder FBB; -typedef flatbuffers::Offset FBString; - -struct ARROW_EXPORT ColumnType { - enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; -}; - -struct ARROW_EXPORT ArrayMetadata { - ArrayMetadata() {} - - ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, - int64_t total_bytes) - : type(type), - offset(offset), - length(length), - null_count(null_count), - total_bytes(total_bytes) {} - - bool Equals(const ArrayMetadata& other) const { - return this->type == other.type && this->offset == other.offset && - this->length == other.length && this->null_count == other.null_count && - this->total_bytes == other.total_bytes; - } - - fbs::Type type; - int64_t offset; - int64_t length; - int64_t null_count; - int64_t total_bytes; -}; - -struct ARROW_EXPORT CategoryMetadata { - ArrayMetadata levels; - bool ordered; -}; - -struct ARROW_EXPORT TimestampMetadata { - TimeUnit::type unit; - - // A timezone name known to the Olson timezone database. For display purposes - // because the actual data is all UTC - std::string timezone; -}; - -struct ARROW_EXPORT TimeMetadata { - TimeUnit::type unit; -}; - -static constexpr const char* kFeatherMagicBytes = "FEA1"; -static constexpr const int kFeatherDefaultAlignment = 8; - -class ColumnBuilder; - -class ARROW_EXPORT TableBuilder { - public: - explicit TableBuilder(int64_t num_rows); - ~TableBuilder() = default; - - FBB& fbb(); - Status Finish(); - std::shared_ptr GetBuffer() const; - - std::unique_ptr AddColumn(const std::string& name); - void SetDescription(const std::string& description); - void SetNumRows(int64_t num_rows); - void add_column(const flatbuffers::Offset& col); - - private: - flatbuffers::FlatBufferBuilder fbb_; - ColumnVector columns_; - - friend class ColumnBuilder; - - bool finished_; - std::string description_; - int64_t num_rows_; -}; - -class ARROW_EXPORT TableMetadata { - public: - TableMetadata() : table_(NULLPTR) {} - ~TableMetadata() = default; - - Status Open(const std::shared_ptr& buffer) { - metadata_buffer_ = buffer; - table_ = fbs::GetCTable(buffer->data()); - - if (table_->version() < kFeatherVersion) { - std::cout << "This Feather file is old" - << " and will not be readable beyond the 0.3.0 release" << std::endl; - } - return Status::OK(); - } - - bool HasDescription() const { return table_->description() != 0; } - - std::string GetDescription() const { - if (!HasDescription()) { - return std::string(""); - } - return table_->description()->str(); - } - - int version() const { return table_->version(); } - int64_t num_rows() const { return table_->num_rows(); } - int64_t num_columns() const { return table_->columns()->size(); } - - const fbs::Column* column(int i) { return table_->columns()->Get(i); } - - private: - std::shared_ptr metadata_buffer_; - const fbs::CTable* table_; -}; - -static inline flatbuffers::Offset GetPrimitiveArray( - FBB& fbb, const ArrayMetadata& array) { - return fbs::CreatePrimitiveArray(fbb, array.type, 
fbs::Encoding::PLAIN, array.offset, - array.length, array.null_count, array.total_bytes); -} - -static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) { - return static_cast(static_cast(unit)); -} - -static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { - return static_cast(static_cast(unit)); -} - -// Convert Feather enums to Flatbuffer enums - -const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = { - fbs::TypeMetadata::NONE, // PRIMITIVE - fbs::TypeMetadata::CategoryMetadata, // CATEGORY - fbs::TypeMetadata::TimestampMetadata, // TIMESTAMP - fbs::TypeMetadata::DateMetadata, // DATE - fbs::TypeMetadata::TimeMetadata // TIME -}; - -static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) { - return COLUMN_TYPE_ENUM_MAPPING[column_type]; -} - -static inline void FromFlatbuffer(const fbs::PrimitiveArray* values, ArrayMetadata* out) { - out->type = values->type(); - out->offset = values->offset(); - out->length = values->length(); - out->null_count = values->null_count(); - out->total_bytes = values->total_bytes(); -} - -class ARROW_EXPORT ColumnBuilder { - public: - ColumnBuilder(TableBuilder* parent, const std::string& name); - ~ColumnBuilder() = default; - - flatbuffers::Offset CreateColumnMetadata(); - - Status Finish(); - void SetValues(const ArrayMetadata& values); - void SetUserMetadata(const std::string& data); - void SetCategory(const ArrayMetadata& levels, bool ordered = false); - void SetTimestamp(TimeUnit::type unit); - void SetTimestamp(TimeUnit::type unit, const std::string& timezone); - void SetDate(); - void SetTime(TimeUnit::type unit); - FBB& fbb(); - - private: - TableBuilder* parent_; - - std::string name_; - ArrayMetadata values_; - std::string user_metadata_; - - // Column metadata - - // Is this a primitive type, or one of the types having metadata? Default is - // primitive - ColumnType::type type_; - - // Type-specific metadata union - CategoryMetadata meta_category_; - TimeMetadata meta_time_; - - TimestampMetadata meta_timestamp_; - - FBB* fbb_; -}; - -} // namespace feather -} // namespace ipc -} // namespace arrow diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 40198e08cd8..6b7f82051b6 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -15,434 +15,189 @@ // specific language governing permissions and limitations // under the License. 
+#include #include -#include #include +#include +#include #include #include "arrow/array.h" +#include "arrow/buffer.h" #include "arrow/io/memory.h" -#include "arrow/ipc/feather_internal.h" +#include "arrow/ipc/feather.h" #include "arrow/ipc/test_common.h" -#include "arrow/memory_pool.h" -#include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" - -#include "generated/feather_generated.h" +#include "arrow/util/compression.h" namespace arrow { -class Buffer; - using internal::checked_cast; namespace ipc { namespace feather { -template -inline void assert_vector_equal(const std::vector& left, const std::vector& right) { - ASSERT_EQ(left.size(), right.size()); - - for (size_t i = 0; i < left.size(); ++i) { - ASSERT_EQ(left[i], right[i]) << i; - } -} - -class TestTableBuilder : public ::testing::Test { - public: - void SetUp() { tb_.reset(new TableBuilder(1000)); } - - virtual void Finish() { - ASSERT_OK(tb_->Finish()); +struct TestParam { + TestParam(int arg_version, + Compression::type arg_compression = Compression::UNCOMPRESSED) + : version(arg_version), compression(arg_compression) {} - table_.reset(new TableMetadata()); - ASSERT_OK(table_->Open(tb_->GetBuffer())); - } - - protected: - std::unique_ptr tb_; - std::unique_ptr table_; + int version; + Compression::type compression; }; -TEST_F(TestTableBuilder, Version) { - Finish(); - ASSERT_EQ(kFeatherVersion, table_->version()); -} - -TEST_F(TestTableBuilder, EmptyTable) { - Finish(); - - ASSERT_FALSE(table_->HasDescription()); - ASSERT_EQ("", table_->GetDescription()); - ASSERT_EQ(1000, table_->num_rows()); - ASSERT_EQ(0, table_->num_columns()); -} - -TEST_F(TestTableBuilder, SetDescription) { - std::string desc("this is some good data"); - tb_->SetDescription(desc); - Finish(); - ASSERT_TRUE(table_->HasDescription()); - ASSERT_EQ(desc, table_->GetDescription()); -} - -void AssertArrayEquals(const ArrayMetadata& left, const ArrayMetadata& right) { - EXPECT_EQ(left.type, right.type); - EXPECT_EQ(left.offset, right.offset); - EXPECT_EQ(left.length, right.length); - EXPECT_EQ(left.null_count, right.null_count); - EXPECT_EQ(left.total_bytes, right.total_bytes); -} - -TEST_F(TestTableBuilder, AddPrimitiveColumn) { - std::unique_ptr cb = tb_->AddColumn("f0"); - - ArrayMetadata values1; - ArrayMetadata values2; - values1.type = fbs::Type::INT32; - values1.offset = 10000; - values1.length = 1000; - values1.null_count = 100; - values1.total_bytes = 4000; - - cb->SetValues(values1); - - std::string user_meta = "as you wish"; - cb->SetUserMetadata(user_meta); - - ASSERT_OK(cb->Finish()); - - cb = tb_->AddColumn("f1"); - - values2.type = fbs::Type::UTF8; - values2.offset = 14000; - values2.length = 1000; - values2.null_count = 100; - values2.total_bytes = 10000; - - cb->SetValues(values2); - ASSERT_OK(cb->Finish()); - - Finish(); - - ASSERT_EQ(2, table_->num_columns()); - - auto col = table_->column(0); - - ASSERT_EQ("f0", col->name()->str()); - ASSERT_EQ(user_meta, col->user_metadata()->str()); - - ArrayMetadata values3; - FromFlatbuffer(col->values(), &values3); - AssertArrayEquals(values3, values1); - - col = table_->column(1); - ASSERT_EQ("f1", col->name()->str()); - - ArrayMetadata values4; - FromFlatbuffer(col->values(), &values4); - AssertArrayEquals(values4, values2); -} - -TEST_F(TestTableBuilder, AddCategoryColumn) { - ArrayMetadata values1(fbs::Type::UINT8, 10000, 1000, 100, 4000); - 
ArrayMetadata levels(fbs::Type::UTF8, 14000, 10, 0, 300); - - std::unique_ptr cb = tb_->AddColumn("c0"); - cb->SetValues(values1); - cb->SetCategory(levels); - ASSERT_OK(cb->Finish()); - - cb = tb_->AddColumn("c1"); - cb->SetValues(values1); - cb->SetCategory(levels, true); - ASSERT_OK(cb->Finish()); - - Finish(); - - auto col = table_->column(0); - ASSERT_EQ(fbs::TypeMetadata::CategoryMetadata, col->metadata_type()); - - ArrayMetadata result; - FromFlatbuffer(col->values(), &result); - AssertArrayEquals(result, values1); - - auto cat_ptr = static_cast(col->metadata()); - ASSERT_FALSE(cat_ptr->ordered()); - - FromFlatbuffer(cat_ptr->levels(), &result); - AssertArrayEquals(result, levels); - - col = table_->column(1); - cat_ptr = static_cast(col->metadata()); - ASSERT_TRUE(cat_ptr->ordered()); - FromFlatbuffer(cat_ptr->levels(), &result); - AssertArrayEquals(result, levels); -} - -TEST_F(TestTableBuilder, AddTimestampColumn) { - ArrayMetadata values1(fbs::Type::INT64, 10000, 1000, 100, 4000); - std::unique_ptr cb = tb_->AddColumn("c0"); - cb->SetValues(values1); - cb->SetTimestamp(TimeUnit::MILLI); - ASSERT_OK(cb->Finish()); - - cb = tb_->AddColumn("c1"); - - std::string tz("America/Los_Angeles"); - - cb->SetValues(values1); - cb->SetTimestamp(TimeUnit::SECOND, tz); - ASSERT_OK(cb->Finish()); - - Finish(); - - auto col = table_->column(0); - - ASSERT_EQ(fbs::TypeMetadata::TimestampMetadata, col->metadata_type()); - - ArrayMetadata result; - FromFlatbuffer(col->values(), &result); - AssertArrayEquals(result, values1); - - auto ts_ptr = static_cast(col->metadata()); - ASSERT_EQ(fbs::TimeUnit::MILLISECOND, ts_ptr->unit()); - - col = table_->column(1); - ts_ptr = static_cast(col->metadata()); - ASSERT_EQ(fbs::TimeUnit::SECOND, ts_ptr->unit()); - ASSERT_EQ(tz, ts_ptr->timezone()->str()); -} - -TEST_F(TestTableBuilder, AddDateColumn) { - ArrayMetadata values1(fbs::Type::INT64, 10000, 1000, 100, 4000); - std::unique_ptr cb = tb_->AddColumn("d0"); - cb->SetValues(values1); - cb->SetDate(); - ASSERT_OK(cb->Finish()); - - Finish(); - - auto col = table_->column(0); - - ASSERT_EQ(fbs::TypeMetadata::DateMetadata, col->metadata_type()); - ArrayMetadata result; - FromFlatbuffer(col->values(), &result); - AssertArrayEquals(result, values1); -} - -TEST_F(TestTableBuilder, AddTimeColumn) { - ArrayMetadata values1(fbs::Type::INT64, 10000, 1000, 100, 4000); - std::unique_ptr cb = tb_->AddColumn("c0"); - cb->SetValues(values1); - cb->SetTime(TimeUnit::SECOND); - ASSERT_OK(cb->Finish()); - Finish(); +class TestFeather : public ::testing::TestWithParam { + public: + void SetUp() { Initialize(); } - auto col = table_->column(0); + void Initialize() { ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); } - ASSERT_EQ(fbs::TypeMetadata::TimeMetadata, col->metadata_type()); - ArrayMetadata result; - FromFlatbuffer(col->values(), &result); - AssertArrayEquals(result, values1); + WriteProperties GetProperties() { + auto param = GetParam(); - auto t_ptr = static_cast(col->metadata()); - ASSERT_EQ(fbs::TimeUnit::SECOND, t_ptr->unit()); -} + auto props = WriteProperties::Defaults(); + props.version = param.version; -void CheckArrays(const Array& expected, const Array& result) { - if (!result.Equals(expected)) { - std::stringstream pp_result; - std::stringstream pp_expected; - - ARROW_EXPECT_OK(PrettyPrint(result, 0, &pp_result)); - ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected)); - FAIL() << "Got: " << pp_result.str() << "\nExpected: " << pp_expected.str(); + // Don't fail if the build doesn't have 
LZ4 or ZSTD enabled
+    if (util::Codec::IsAvailable(param.compression)) {
+      props.compression = param.compression;
+    } else {
+      props.compression = Compression::UNCOMPRESSED;
+    }
+    return props;
+  }

-void CheckBatches(const RecordBatch& expected, const RecordBatch& result) {
-  if (!result.Equals(expected)) {
-    std::stringstream pp_result;
-    std::stringstream pp_expected;
+  void DoWrite(const Table& table) {
+    Initialize();
+    ASSERT_OK(WriteTable(table, stream_.get(), GetProperties()));
+    ASSERT_OK_AND_ASSIGN(output_, stream_->Finish());
+    auto buffer = std::make_shared<io::BufferReader>(output_);
+    ASSERT_OK_AND_ASSIGN(reader_, Reader::Open(buffer));
+  }

-    ARROW_EXPECT_OK(PrettyPrint(result, 0, &pp_result));
-    ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected));
-    FAIL() << "Got: " << pp_result.str() << "\nExpected: " << pp_expected.str();
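Outside the test suite the same guard applies, since LZ4 and ZSTD support are build-time options; a sketch of requesting ZSTD safely (not part of the patch):

auto props = arrow::ipc::feather::WriteProperties::Defaults();
if (arrow::util::Codec::IsAvailable(arrow::Compression::ZSTD)) {
  props.compression = arrow::Compression::ZSTD;
}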
+  void CheckSlice(std::shared_ptr<RecordBatch> batch, int start, int size) {
+    batch = batch->Slice(start, size);
+    std::shared_ptr<Table> table;
+    ASSERT_OK(Table::FromRecordBatches({batch}, &table));
+
+    DoWrite(*table);
+    std::shared_ptr<Table>
result;
+    ASSERT_OK(reader_->Read(&result));
+    if (table->num_rows() > 0) {
+      AssertTablesEqual(*table, *result);
+    } else {
+      ASSERT_EQ(0, result->num_rows());
+      ASSERT_TRUE(result->schema()->Equals(*table->schema()));
+    }
  }

-class TestTableReader : public ::testing::Test {
- public:
-  void SetUp() {
-    ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create());
-    ASSERT_OK(TableWriter::Open(stream_, &writer_));
+  void CheckSlices(std::shared_ptr<RecordBatch> batch) {
+    std::vector<int> starts = {0, 1, 300, 301, 302, 303, 304, 305, 306, 307};
+    std::vector<int> sizes = {0, 1, 7, 8, 30, 32, 100};
+    for (auto start : starts) {
+      for (auto size : sizes) {
+        CheckSlice(batch, start, size);
+      }
+    }
  }

-  void Finish() {
-    // Write table footer
-    ASSERT_OK(writer_->Finalize());
+  void CheckRoundtrip(std::shared_ptr<RecordBatch> batch) {
+    std::shared_ptr<Table>
table;
+    std::vector<std::shared_ptr<RecordBatch>> batches = {batch};
+    ASSERT_OK(Table::FromRecordBatches(batches, &table));

-    ASSERT_OK_AND_ASSIGN(output_, stream_->Finish());
+    DoWrite(*table);

-    auto buffer = std::make_shared<io::BufferReader>(output_);
-    ASSERT_OK(TableReader::Open(buffer, &reader_));
+    std::shared_ptr<Table>
read_table;
+    ASSERT_OK(reader_->Read(&read_table));
+    AssertTablesEqual(*table, *read_table);
  }

 protected:
  std::shared_ptr<io::BufferOutputStream> stream_;
-  std::unique_ptr<TableWriter> writer_;
-  std::unique_ptr<TableReader> reader_;
-
+  std::shared_ptr<Reader> reader_;
  std::shared_ptr<Buffer> output_;
};

-TEST_F(TestTableReader, ReadIndices) {
-  std::shared_ptr<RecordBatch> batch1;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch1));
-  std::shared_ptr<RecordBatch> batch2;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch2));
-
-  ASSERT_OK(writer_->Append("f0", *batch1->column(0)));
-  ASSERT_OK(writer_->Append("f1", *batch1->column(1)));
-  ASSERT_OK(writer_->Append("f2", *batch2->column(0)));
-  ASSERT_OK(writer_->Append("f3", *batch2->column(1)));
-  Finish();
+TEST(TestFeatherWriteProperties, Defaults) {
+  auto props = WriteProperties::Defaults();

-  std::vector<int> indices({3, 0, 5});
-  std::shared_ptr<Table>
result;
-  ASSERT_OK(reader_->Read(indices, &result));
-  std::vector<std::shared_ptr<Field>> fields;
-  std::vector<std::shared_ptr<Array>> arrays;
-  fields.push_back(std::make_shared<Field>("f0", int32()));
-  arrays.push_back(batch1->column(0));
-  fields.push_back(std::make_shared<Field>("f3", int32()));
-  arrays.push_back(batch2->column(1));
-  auto expected = Table::Make(std::make_shared<Schema>(fields), arrays);
-  AssertTablesEqual(*expected, *result);
+#ifdef ARROW_WITH_LZ4
+  ASSERT_EQ(Compression::LZ4, props.compression);
+#else
+  ASSERT_EQ(Compression::UNCOMPRESSED, props.compression);
+#endif
}

-TEST_F(TestTableReader, ReadNames) {
+TEST_P(TestFeather, ReadIndicesOrNames) {
  std::shared_ptr<RecordBatch> batch1;
  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch1));
-  std::shared_ptr<RecordBatch> batch2;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch2));

-  ASSERT_OK(writer_->Append("f0", *batch1->column(0)));
-  ASSERT_OK(writer_->Append("f1", *batch1->column(1)));
-  ASSERT_OK(writer_->Append("f2", *batch2->column(0)));
-  ASSERT_OK(writer_->Append("f3", *batch2->column(1)));
-  Finish();
-
-  std::vector<std::string> names({"f3", "f0", "f5"});
-  std::shared_ptr<Table>
result;
-  ASSERT_OK(reader_->Read(names, &result));
-  std::vector<std::shared_ptr<Field>> fields;
-  std::vector<std::shared_ptr<Array>> arrays;
-  fields.push_back(std::make_shared<Field>("f0", int32()));
-  arrays.push_back(batch1->column(0));
-  fields.push_back(std::make_shared<Field>("f3", int32()));
-  arrays.push_back(batch2->column(1));
-  auto expected = Table::Make(std::make_shared<Schema>(fields), arrays);
-  AssertTablesEqual(*expected, *result);
-}
+  std::shared_ptr<Table>
table;
+  ASSERT_OK(Table::FromRecordBatches({batch1}, &table));

-class TestTableWriter : public ::testing::Test {
- public:
-  void SetUp() {
-    ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create());
-    ASSERT_OK(TableWriter::Open(stream_, &writer_));
-  }
+  DoWrite(*table);

-  void Finish() {
-    // Write table footer
-    ASSERT_OK(writer_->Finalize());
+  auto expected = Table::Make(schema({field("f1", int32())}), {batch1->column(1)});

-    ASSERT_OK_AND_ASSIGN(output_, stream_->Finish());
+  std::shared_ptr<Table>
result1, result2;

-    auto buffer = std::make_shared<io::BufferReader>(output_);
-    ASSERT_OK(TableReader::Open(buffer, &reader_));
-  }
+  std::vector<int> indices = {1};
+  ASSERT_OK(reader_->Read(indices, &result1));
+  AssertTablesEqual(*expected, *result1);

-  void CheckBatch(std::shared_ptr<RecordBatch> batch) {
-    std::shared_ptr<Table>
table;
-    std::vector<std::shared_ptr<RecordBatch>> batches = {batch};
-    ASSERT_OK(Table::FromRecordBatches(batches, &table));
-    ASSERT_OK(writer_->Write(*table));
-    Finish();
+  std::vector<std::string> names = {"f1"};
+  ASSERT_OK(reader_->Read(names, &result2));
+  AssertTablesEqual(*expected, *result2);
}

-    std::shared_ptr<Table>
read_table;
-    ASSERT_OK(reader_->Read(&read_table));
-    AssertTablesEqual(*table, *read_table);
-  }
-
- protected:
-  std::shared_ptr<io::BufferOutputStream> stream_;
-  std::unique_ptr<TableWriter> writer_;
-  std::unique_ptr<TableReader> reader_;
-
-  std::shared_ptr<Buffer> output_;
-};

-TEST_F(TestTableWriter, EmptyTable) {
-  Finish();
-
-  ASSERT_FALSE(reader_->HasDescription());
-  ASSERT_EQ("", reader_->GetDescription());
-
-  ASSERT_EQ(0, reader_->num_rows());
-  ASSERT_EQ(0, reader_->num_columns());
+TEST_P(TestFeather, EmptyTable) {
+  std::vector<std::shared_ptr<ChunkedArray>> columns;
+  auto table = Table::Make(schema({}), columns, 0);
+
+  DoWrite(*table);
+
+  std::shared_ptr<Table>
result;
+  ASSERT_OK(reader_->Read(&result));
+  AssertTablesEqual(*table, *result);
+}

-TEST_F(TestTableWriter, SetNumRows) {
-  writer_->SetNumRows(1000);
-  Finish();
-  ASSERT_EQ(1000, reader_->num_rows());
-}

-TEST_F(TestTableWriter, SetDescription) {
-  std::string desc("contents of the file");
-  writer_->SetDescription(desc);
-  Finish();
-
-  ASSERT_TRUE(reader_->HasDescription());
-  ASSERT_EQ(desc, reader_->GetDescription());
-
-  ASSERT_EQ(0, reader_->num_rows());
-  ASSERT_EQ(0, reader_->num_columns());
+TEST_P(TestFeather, SetNumRows) {
+  std::vector<std::shared_ptr<ChunkedArray>> columns;
+  auto table = Table::Make(schema({}), columns, 1000);
+  DoWrite(*table);
+  std::shared_ptr<Table>
result;
+  ASSERT_OK(reader_->Read(&result));
+  ASSERT_EQ(1000, result->num_rows());
}

-TEST_F(TestTableWriter, PrimitiveRoundTrip) {
+TEST_P(TestFeather, PrimitiveRoundTrip) {
  std::shared_ptr<RecordBatch> batch;
  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch));
-  ASSERT_OK(writer_->Append("f0", *batch->column(0)));
-  ASSERT_OK(writer_->Append("f1", *batch->column(1)));
-  Finish();
+  std::shared_ptr<Table>
table;
+  ASSERT_OK(Table::FromRecordBatches({batch}, &table));

-  std::shared_ptr<ChunkedArray> col;
-  ASSERT_OK(reader_->GetColumn(0, &col));
-  ASSERT_TRUE(col->chunk(0)->Equals(batch->column(0)));
-  ASSERT_EQ("f0", reader_->GetColumnName(0));
+  DoWrite(*table);

-  ASSERT_OK(reader_->GetColumn(1, &col));
-  ASSERT_TRUE(col->chunk(0)->Equals(batch->column(1)));
-  ASSERT_EQ("f1", reader_->GetColumnName(1));
+  std::shared_ptr<Table>
result;
+  ASSERT_OK(reader_->Read(&result));
+  AssertTablesEqual(*table, *result);
}

-TEST_F(TestTableWriter, CategoryRoundtrip) {
+TEST_P(TestFeather, CategoryRoundtrip) {
  std::shared_ptr<RecordBatch> batch;
  ASSERT_OK(ipc::test::MakeDictionaryFlat(&batch));
-  CheckBatch(batch);
+  CheckRoundtrip(batch);
}

-TEST_F(TestTableWriter, TimeTypes) {
+TEST_P(TestFeather, TimeTypes) {
  std::vector<bool> is_valid = {true, true, true, false, true, true, true};
  auto f0 = field("f0", date32());
  auto f1 = field("f1", time32(TimeUnit::MILLI));
@@ -485,85 +240,64 @@ TEST_F(TestTableWriter, TimeTypes) {
  }

  auto batch = RecordBatch::Make(schema, 7, std::move(arrays));
-  CheckBatch(batch);
+  CheckRoundtrip(batch);
}

-TEST_F(TestTableWriter, VLenPrimitiveRoundTrip) {
+TEST_P(TestFeather, VLenPrimitiveRoundTrip) {
  std::shared_ptr<RecordBatch> batch;
  ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch));
-  CheckBatch(batch);
+  CheckRoundtrip(batch);
}

-TEST_F(TestTableWriter, PrimitiveNullRoundTrip) {
+TEST_P(TestFeather, PrimitiveNullRoundTrip) {
  std::shared_ptr<RecordBatch> batch;
  ASSERT_OK(ipc::test::MakeNullRecordBatch(&batch));

-  for (int i = 0; i < batch->num_columns(); ++i) {
-    ASSERT_OK(writer_->Append(batch->column_name(i), *batch->column(i)));
-  }
-  Finish();
-
-  std::shared_ptr<ChunkedArray> col;
-  for (int i = 0; i < batch->num_columns(); ++i) {
-    ASSERT_OK(reader_->GetColumn(i, &col));
-    ASSERT_EQ(batch->column_name(i), reader_->GetColumnName(i));
-    StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                           batch->column(i)->null_bitmap(),
-                           batch->column(i)->null_count());
-    CheckArrays(str_values, *col->chunk(0));
-  }
-}
+  std::shared_ptr<Table>
table;
+  ASSERT_OK(Table::FromRecordBatches({batch}, &table));

-class TestTableWriterSlice : public TestTableWriter,
-                             public ::testing::WithParamInterface<std::tuple<int, int>> {
- public:
-  void CheckSlice(std::shared_ptr<RecordBatch> batch) {
-    auto p = GetParam();
-    auto start = std::get<0>(p);
-    auto size = std::get<1>(p);
+  DoWrite(*table);

-    batch = batch->Slice(start, size);
-
-    ASSERT_OK(writer_->Append("f0", *batch->column(0)));
-    ASSERT_OK(writer_->Append("f1", *batch->column(1)));
-    Finish();
-
-    std::shared_ptr<ChunkedArray> col;
-    ASSERT_OK(reader_->GetColumn(0, &col));
-    ASSERT_TRUE(col->chunk(0)->Equals(batch->column(0)));
-    ASSERT_EQ("f0", reader_->GetColumnName(0));
-
-    ASSERT_OK(reader_->GetColumn(1, &col));
-    ASSERT_TRUE(col->chunk(0)->Equals(batch->column(1)));
-    ASSERT_EQ("f1", reader_->GetColumnName(1));
+  std::shared_ptr<Table>
result; + ASSERT_OK(reader_->Read(&result)); + + if (GetParam().version == kFeatherV1Version) { + std::vector> expected_fields; + for (int i = 0; i < batch->num_columns(); ++i) { + ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name()); + StringArray str_values(batch->column(i)->length(), nullptr, nullptr, + batch->column(i)->null_bitmap(), + batch->column(i)->null_count()); + AssertArraysEqual(str_values, *result->column(i)->chunk(0)); + } + } else { + AssertTablesEqual(*table, *result); } -}; +} -TEST_P(TestTableWriterSlice, SliceRoundTrip) { +TEST_P(TestFeather, SliceRoundTrip) { std::shared_ptr batch; ASSERT_OK(ipc::test::MakeIntBatchSized(600, &batch)); - CheckSlice(batch); + CheckSlices(batch); } -TEST_P(TestTableWriterSlice, SliceStringsRoundTrip) { - auto p = GetParam(); - auto start = std::get<0>(p); - auto with_nulls = start % 2 == 0; +TEST_P(TestFeather, SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, with_nulls)); - CheckSlice(batch); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + CheckSlices(batch); } -TEST_P(TestTableWriterSlice, SliceBooleanRoundTrip) { +TEST_P(TestFeather, SliceBooleanRoundTrip) { std::shared_ptr batch; ASSERT_OK(ipc::test::MakeBooleanBatchSized(600, &batch)); - CheckSlice(batch); + CheckSlices(batch); } -INSTANTIATE_TEST_SUITE_P(TestTableWriterSliceOffsets, TestTableWriterSlice, - ::testing::Combine(::testing::Values(0, 1, 300, 301, 302, 303, - 304, 305, 306, 307), - ::testing::Values(0, 1, 7, 8, 30, 32, 100))); +INSTANTIATE_TEST_SUITE_P( + FeatherTests, TestFeather, + ::testing::Values(TestParam(kFeatherV1Version), TestParam(kFeatherV2Version), + TestParam(kFeatherV2Version, Compression::LZ4), + TestParam(kFeatherV2Version, Compression::ZSTD))); } // namespace feather } // namespace ipc diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 129a6d13645..3ce72177155 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -479,12 +479,14 @@ Status Table::FromRecordBatches(const std::shared_ptr& schema, const int nbatches = static_cast(batches.size()); const int ncolumns = static_cast(schema->num_fields()); + int64_t num_rows = 0; for (int i = 0; i < nbatches; ++i) { if (!batches[i]->schema()->Equals(*schema, false)) { return Status::Invalid("Schema at index ", static_cast(i), " was different: \n", schema->ToString(), "\nvs\n", batches[i]->schema()->ToString()); } + num_rows += batches[i]->num_rows(); } std::vector> columns(ncolumns); @@ -497,7 +499,7 @@ Status Table::FromRecordBatches(const std::shared_ptr& schema, columns[i] = std::make_shared(column_arrays, schema->field(i)->type()); } - *table = Table::Make(schema, columns); + *table = Table::Make(schema, columns, num_rows); return Status::OK(); } diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 8700f67ae62..6c1fae31740 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -16,61 +16,45 @@ # under the License. 
# --------------------------------------------------------------------- -# Implement legacy Feather file format +# Implement Feather file format class FeatherError(Exception): pass -cdef class FeatherWriter: - cdef: - unique_ptr[CFeatherWriter] writer - - cdef public: - int64_t num_rows - - def __cinit__(self): - self.num_rows = -1 - - def open(self, object dest): - cdef shared_ptr[COutputStream] sink - get_writer(dest, &sink) - - with nogil: - check_status(CFeatherWriter.Open(sink, &self.writer)) - - def close(self): - if self.num_rows < 0: - self.num_rows = 0 - self.writer.get().SetNumRows(self.num_rows) - with nogil: - check_status(self.writer.get().Finalize()) +def write_feather(Table table, object dest, compression=None, + compression_level=None, chunksize=None, version=2): + cdef shared_ptr[COutputStream] sink + get_writer(dest, &sink) - def write_array(self, object name, object col, object mask=None): - cdef Array arr + cdef CFeatherProperties properties + if version == 2: + properties.version = kFeatherV2Version + else: + properties.version = kFeatherV1Version - if self.num_rows >= 0: - if len(col) != self.num_rows: - raise ValueError('prior column had a different number of rows') - else: - self.num_rows = len(col) + if compression == 'zstd': + properties.compression = CCompressionType_ZSTD + elif compression == 'lz4': + properties.compression = CCompressionType_LZ4 + else: + properties.compression = CCompressionType_UNCOMPRESSED - if isinstance(col, Array): - arr = col - else: - arr = Array.from_pandas(col, mask=mask) + if chunksize is not None: + properties.chunksize = chunksize - cdef c_string c_name = tobytes(name) + if compression_level is not None: + properties.compression_level = compression_level - with nogil: - check_status( - self.writer.get().Append(c_name, deref(arr.sp_array))) + with nogil: + check_status(WriteFeather(deref(table.table), sink.get(), + properties)) cdef class FeatherReader: cdef: - unique_ptr[CFeatherReader] reader + shared_ptr[CFeatherReader] reader def __cinit__(self): pass @@ -80,32 +64,9 @@ cdef class FeatherReader: get_reader(source, use_memory_map, &reader) with nogil: - check_status(CFeatherReader.Open(reader, &self.reader)) - - @property - def num_rows(self): - return self.reader.get().num_rows() - - @property - def num_columns(self): - return self.reader.get().num_columns() - - def get_column_name(self, int i): - cdef c_string name = self.reader.get().GetColumnName(i) - return frombytes(name) - - def get_column(self, int i): - if i < 0 or i >= self.num_columns: - raise IndexError(i) - - cdef shared_ptr[CChunkedArray] sp_chunked_array - with nogil: - check_status(self.reader.get() - .GetColumn(i, &sp_chunked_array)) - - return pyarrow_wrap_chunked_array(sp_chunked_array) + self.reader = GetResultValue(CFeatherReader.Open(reader)) - def _read(self): + def read(self): cdef shared_ptr[CTable] sp_table with nogil: check_status(self.reader.get() @@ -113,7 +74,7 @@ cdef class FeatherReader: return pyarrow_wrap_table(sp_table) - def _read_indices(self, indices): + def read_indices(self, indices): cdef: shared_ptr[CTable] sp_table vector[int] c_indices @@ -126,7 +87,7 @@ cdef class FeatherReader: return pyarrow_wrap_table(sp_table) - def _read_names(self, names): + def read_names(self, names): cdef: shared_ptr[CTable] sp_table vector[c_string] c_names diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 82b8ee9146b..172a9312b44 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -29,74 +29,6 @@ def 
_check_pandas_version(): raise ImportError("feather requires pandas >= 0.17.0") -class FeatherReader(ext.FeatherReader): - - def __init__(self, source): - _check_pandas_version() - self.source = source - self.open(source) - - def read_table(self, columns=None): - if columns is None: - return self._read() - column_types = [type(column) for column in columns] - if all(map(lambda t: t == int, column_types)): - return self._read_indices(columns) - elif all(map(lambda t: t == str, column_types)): - return self._read_names(columns) - - column_type_names = [t.__name__ for t in column_types] - raise TypeError("Columns must be indices or names. " - "Got columns {} of types {}" - .format(columns, column_type_names)) - - def read_pandas(self, columns=None, use_threads=True): - return self.read_table(columns=columns).to_pandas( - use_threads=use_threads) - - -def check_chunked_overflow(name, col): - if col.num_chunks == 1: - return - - if col.type in (ext.binary(), ext.string()): - raise ValueError("Column '{}' exceeds 2GB maximum capacity of " - "a Feather binary column. This restriction may be " - "lifted in the future".format(name)) - else: - # TODO(wesm): Not sure when else this might be reached - raise ValueError("Column '{}' of type {} was chunked on conversion " - "to Arrow and cannot be currently written to " - "Feather format".format(name, str(col.type))) - - -class FeatherWriter: - - def __init__(self, dest): - _check_pandas_version() - self.dest = dest - self.writer = ext.FeatherWriter() - self.writer.open(dest) - - def write(self, df): - if (_pandas_api.has_sparse - and isinstance(df, _pandas_api.pd.SparseDataFrame)): - df = df.to_dense() - - if not df.columns.is_unique: - raise ValueError("cannot serialize duplicate column names") - - # TODO(wesm): Remove this length check, see ARROW-1732 - if len(df.columns) > 0: - table = Table.from_pandas(df, preserve_index=False) - for i, name in enumerate(table.schema.names): - col = table[i] - check_chunked_overflow(name, col) - self.writer.write_array(name, col.chunk(0)) - - self.writer.close() - - class FeatherDataset: """ Encapsulates details of reading a list of Feather files. @@ -127,15 +59,15 @@ def read_table(self, columns=None): pyarrow.Table Content of the file as a table (of columns) """ - _fil = FeatherReader(self.paths[0]).read_table(columns=columns) + _fil = read_table(self.paths[0], columns=columns) self._tables = [_fil] self.schema = _fil.schema - for fil in self.paths[1:]: - fil_table = FeatherReader(fil).read_table(columns=columns) + for path in self.paths[1:]: + table = read_table(path, columns=columns) if self.validate_schema: - self.validate_schemas(fil, fil_table) - self._tables.append(fil_table) + self.validate_schemas(path, table) + self._tables.append(table) return concat_tables(self._tables) def validate_schemas(self, piece, table): @@ -165,25 +97,84 @@ def read_pandas(self, columns=None, use_threads=True): use_threads=use_threads) -def write_feather(df, dest): +def check_chunked_overflow(name, col): + if col.num_chunks == 1: + return + + if col.type in (ext.binary(), ext.string()): + raise ValueError("Column '{}' exceeds 2GB maximum capacity of " + "a Feather binary column. 
This restriction may be " + "lifted in the future".format(name)) + else: + # TODO(wesm): Not sure when else this might be reached + raise ValueError("Column '{}' of type {} was chunked on conversion " + "to Arrow and cannot be currently written to " + "Feather format".format(name, str(col.type))) + + +def write_feather(df, dest, compression=None, compression_level=None, + chunksize=None, version=2): """ Write a pandas.DataFrame to Feather format. Parameters ---------- - df : pandas.DataFrame - Dataframe to write out as feather format. + df : pandas.DataFrame or pyarrow.Table + Data to write out as Feather format. dest : str Local destination path. + compression : string, default None + Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses + LZ4 for V2 files if it is available, otherwise uncompressed. + compression_level : int, default None + Use a compression level particular to the chosen compressor. If None, + use the default compression level. + chunksize : int, default None + For V2 files, the internal maximum size of Arrow RecordBatch chunks + when writing the Arrow IPC file format. None means use the default, + which is currently 64K. + version : int, default 2 + Feather file version. Version 2 is the current version. Version 1 is + the more limited legacy format. """ - writer = FeatherWriter(dest) + _check_pandas_version() + if (_pandas_api.has_sparse + and isinstance(df, _pandas_api.pd.SparseDataFrame)): + df = df.to_dense() + + if _pandas_api.is_data_frame(df): + table = Table.from_pandas(df, preserve_index=False) + + if version == 1: + # Version 1 does not support chunking + for i, name in enumerate(table.schema.names): + col = table[i] + check_chunked_overflow(name, col) + else: + table = df + + if version == 1: + if len(table.column_names) > len(set(table.column_names)): + raise ValueError("cannot serialize duplicate column names") + + if compression is not None: + raise ValueError("Feather V1 files do not support compression " + "option") + + if chunksize is not None: + raise ValueError("Feather V1 files do not support chunksize " + "option") + + supported_compression_options = (None, 'lz4', 'zstd', 'uncompressed') + if compression not in supported_compression_options: + raise ValueError('compression="{}" not supported, must be one of {}' + .format(compression, supported_compression_options)) + try: - writer.write(df) + ext.write_feather(table, dest, compression=compression, + compression_level=compression_level, + chunksize=chunksize, version=version) except Exception: - # Try to make sure the resource is closed - import gc - writer = None - gc.collect() if isinstance(dest, str): try: os.remove(dest) @@ -194,12 +185,12 @@ def read_feather(source, columns=None, use_threads=True): """ - Read a pandas.DataFrame from Feather format. + Read a pandas.DataFrame from Feather format. To read as a pyarrow.Table, + use feather.read_table. Parameters ---------- source : str file path, or file-like object - columns : sequence, optional Only read a specific set of columns. If not provided, all columns are read.
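A short usage sketch of the validation rules implemented above (the error text matches what the tests later in this diff assert; assumes a pyarrow build with ZSTD available):

    import io

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({"a": [1, 2, 3]})

    # V2, the default, accepts the new options.
    feather.write_feather(df, io.BytesIO(), compression="zstd",
                          compression_level=1, chunksize=1024)

    # V1 rejects them before anything is written.
    try:
        feather.write_feather(df, io.BytesIO(), compression="lz4", version=1)
    except ValueError as exc:
        print(exc)  # Feather V1 files do not support compression option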
@@ -210,8 +201,7 @@ def read_feather(source, columns=None, use_threads=True): ------- df : pandas.DataFrame """ - reader = FeatherReader(source) - return reader.read_pandas(columns=columns, use_threads=use_threads) + return read_table(source, columns=columns).to_pandas(use_threads=use_threads) def read_table(source, columns=None): @@ -229,5 +219,20 @@ def read_table(source, columns=None): ------- table : pyarrow.Table """ - reader = FeatherReader(source) - return reader.read_table(columns=columns) + _check_pandas_version() + reader = ext.FeatherReader() + reader.open(source) + + if columns is None: + return reader.read() + + column_types = [type(column) for column in columns] + if all(map(lambda t: t == int, column_types)): + return reader.read_indices(columns) + elif all(map(lambda t: t == str, column_types)): + return reader.read_names(columns) + + column_type_names = [t.__name__ for t in column_types] + raise TypeError("Columns must be indices or names. " + "Got columns {} of types {}" + .format(columns, column_type_names)) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4b60e82d5e3..95b059f8648 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1320,33 +1320,26 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: const CIpcWriteOptions& options, CIpcPayload* out) - cdef cppclass CFeatherWriter" arrow::ipc::feather::TableWriter": - @staticmethod - CStatus Open(const shared_ptr[COutputStream]& stream, - unique_ptr[CFeatherWriter]* out) + int kFeatherV1Version" arrow::ipc::feather::kFeatherV1Version" + int kFeatherV2Version" arrow::ipc::feather::kFeatherV2Version" - void SetDescription(const c_string& desc) - void SetNumRows(int64_t num_rows) + cdef cppclass CFeatherProperties" arrow::ipc::feather::WriteProperties": + int version + int chunksize + CCompressionType compression + int compression_level - CStatus Append(const c_string& name, const CArray& values) - CStatus Finalize() + CStatus WriteFeather" arrow::ipc::feather::WriteTable"\ (const CTable& table, COutputStream* out, + CFeatherProperties properties) - cdef cppclass CFeatherReader" arrow::ipc::feather::TableReader": + cdef cppclass CFeatherReader" arrow::ipc::feather::Reader": @staticmethod - CStatus Open(const shared_ptr[CRandomAccessFile]& file, - unique_ptr[CFeatherReader]* out) - - c_string GetDescription() - c_bool HasDescription() - - int64_t num_rows() - int64_t num_columns() - + CResult[shared_ptr[CFeatherReader]] Open( + const shared_ptr[CRandomAccessFile]& file) + int version() shared_ptr[CSchema] schema() - CStatus GetColumn(int i, shared_ptr[CChunkedArray]* out) - c_string GetColumnName(int i) - CStatus Read(shared_ptr[CTable]* out) CStatus Read(const vector[int] indices, shared_ptr[CTable]* out) CStatus Read(const vector[c_string] names, shared_ptr[CTable]* out) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 7a0c5c2b806..a5aba443836 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -19,16 +19,13 @@ import os import sys import tempfile -import unittest import pytest -from numpy.testing import assert_array_equal import numpy as np import pyarrow as pa -from pyarrow.feather import (read_feather, write_feather, - read_table, FeatherReader, FeatherDataset) -from pyarrow.lib import FeatherWriter +from pyarrow.feather import (read_feather, write_feather, read_table, + FeatherDataset) try: @@ -49,512 +46,555 @@ def
random_path(prefix='feather_'): return tempfile.mktemp(prefix=prefix) -class TestFeatherReader(unittest.TestCase): +@pytest.fixture(scope="module", params=[1, 2]) +def version(request): + yield request.param - def setUp(self): - self.test_files = [] - def tearDown(self): - for path in self.test_files: - try: - os.remove(path) - except os.error: - pass +TEST_FILES = None - def test_file_not_exist(self): - with pytest.raises(pa.ArrowIOError): - FeatherReader('test_invalid_file') - def _get_null_counts(self, path, columns=None): - reader = FeatherReader(path) - counts = [] - for i in range(reader.num_columns): - col = reader.get_column(i) - name = reader.get_column_name(i) - if columns is None or name in columns: - counts.append(col.null_count) +def setup_module(module): + global TEST_FILES + TEST_FILES = [] - return counts - def _check_pandas_roundtrip(self, df, expected=None, path=None, - columns=None, null_counts=None, - use_threads=False): - if path is None: - path = random_path() +def teardown_module(module): + for path in TEST_FILES: + try: + os.remove(path) + except os.error: + pass - self.test_files.append(path) - write_feather(df, path) - if not os.path.exists(path): - raise Exception('file not written') - result = read_feather(path, columns, use_threads=use_threads) - if expected is None: - expected = df +def test_file_not_exist(): + with pytest.raises(pa.ArrowIOError): + read_feather('test_invalid_file') - assert_frame_equal(result, expected) - if null_counts is None: - null_counts = np.zeros(len(expected.columns)) - - np.testing.assert_array_equal(self._get_null_counts(path, columns), - null_counts) - - def _assert_error_on_write(self, df, exc, path=None): - # check that we are raising the exception - # on writing - - if path is None: - path = random_path() - - self.test_files.append(path) - - def f(): - write_feather(df, path) - - pytest.raises(exc, f) - - def test_dataset(self): - num_values = (100, 100) - num_files = 5 - paths = [random_path() for i in range(num_files)] - df = pd.DataFrame(np.random.randn(*num_values), - columns=['col_' + str(i) - for i in range(num_values[1])]) - - self.test_files.extend(paths) - for index, path in enumerate(paths): - rows = (index * (num_values[0] // num_files), - (index + 1) * (num_values[0] // num_files)) - writer = FeatherWriter() - writer.open(path) - - for col in range(num_values[1]): - writer.write_array(df.columns[col], - df.iloc[rows[0]:rows[1], col]) - - writer.close() - - data = FeatherDataset(paths).read_pandas() - assert_frame_equal(data, df) - - def test_num_columns_attr(self): - df0 = pd.DataFrame({}) - df1 = pd.DataFrame({ - 'foo': [1, 2, 3, 4, 5] - }) - df2 = pd.DataFrame({ - 'foo': [1, 2, 3, 4, 5], - 'bar': [1, 2, 3, 4, 5] - }) - for df, ncols in zip([df0, df1, df2], [0, 1, 2]): - path = random_path() - self.test_files.append(path) - write_feather(df, path) - - reader = FeatherReader(path) - assert reader.num_columns == ncols - - def test_num_rows_attr(self): - df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]}) +def _check_pandas_roundtrip(df, expected=None, path=None, + columns=None, use_threads=False, + version=None, compression=None, + compression_level=None): + if path is None: path = random_path() - self.test_files.append(path) - write_feather(df, path) - reader = FeatherReader(path) - assert reader.num_rows == len(df) + TEST_FILES.append(path) + write_feather(df, path, compression=compression, + compression_level=compression_level, version=version) + if not os.path.exists(path): + raise Exception('file not written') + + result 
= read_feather(path, columns, use_threads=use_threads) + if expected is None: + expected = df + + assert_frame_equal(result, expected) - df = pd.DataFrame({}) + +def _assert_error_on_write(df, exc, path=None): + # check that we are raising the exception + # on writing + + if path is None: path = random_path() - self.test_files.append(path) + + TEST_FILES.append(path) + + def f(): write_feather(df, path) - reader = FeatherReader(path) - assert reader.num_rows == 0 + pytest.raises(exc, f) - def test_float_no_nulls(self): - data = {} - numpy_dtypes = ['f4', 'f8'] - num_values = 100 - for dtype in numpy_dtypes: - values = np.random.randn(num_values) - data[dtype] = values.astype(dtype) +def test_dataset(version): + num_values = (100, 100) + num_files = 5 + paths = [random_path() for i in range(num_files)] + df = pd.DataFrame(np.random.randn(*num_values), + columns=['col_' + str(i) + for i in range(num_values[1])]) - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) + TEST_FILES.extend(paths) + for index, path in enumerate(paths): + rows = (index * (num_values[0] // num_files), + (index + 1) * (num_values[0] // num_files)) + + write_feather(df.iloc[rows[0]:rows[1]], path, version=version) + + data = FeatherDataset(paths).read_pandas() + assert_frame_equal(data, df) + + +def test_float_no_nulls(version): + data = {} + numpy_dtypes = ['f4', 'f8'] + num_values = 100 + + for dtype in numpy_dtypes: + values = np.random.randn(num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, version=version) + + +def test_read_table(version): + num_values = (100, 100) + path = random_path() + + TEST_FILES.append(path) + + values = np.random.randint(0, 100, size=num_values) + + df = pd.DataFrame(values, columns=['col_' + str(i) + for i in range(100)]) + write_feather(df, path, version=version) + + data = pd.DataFrame(values, + columns=['col_' + str(i) for i in range(100)]) + table = pa.Table.from_pandas(data) + + result = read_table(path) + + assert_frame_equal(table.to_pandas(), result.to_pandas()) - def test_read_table(self): - num_values = (100, 100) - path = random_path() - self.test_files.append(path) - writer = FeatherWriter() - writer.open(path) +def test_float_nulls(version): + num_values = 100 + path = random_path() + TEST_FILES.append(path) + + null_mask = np.random.randint(0, 10, size=num_values) < 3 + dtypes = ['f4', 'f8'] + expected_cols = [] + + arrays = [] + for name in dtypes: + values = np.random.randn(num_values).astype(name) + arrays.append(pa.array(values, mask=null_mask)) + + values[null_mask] = np.nan + + expected_cols.append(values) + + table = pa.table(arrays, names=dtypes) + write_feather(table, path, version=version) + + ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), + columns=dtypes) + + result = read_feather(path) + assert_frame_equal(result, ex_frame) + + +def test_integer_no_nulls(version): + data = {} + + numpy_dtypes = ['i1', 'i2', 'i4', 'i8', + 'u1', 'u2', 'u4', 'u8'] + num_values = 100 + + for dtype in numpy_dtypes: values = np.random.randint(0, 100, size=num_values) + data[dtype] = values.astype(dtype) - for i in range(100): - writer.write_array('col_' + str(i), values[:, i]) + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, version=version) - writer.close() - data = pd.DataFrame(values, - columns=['col_' + str(i) for i in range(100)]) - table = pa.Table.from_pandas(data) +def test_platform_numpy_integers(version): + data = {} - result = read_table(path) + numpy_dtypes = ['longlong'] + num_values = 100 
- assert_frame_equal(table.to_pandas(), result.to_pandas()) + for dtype in numpy_dtypes: + values = np.random.randint(0, 100, size=num_values) + data[dtype] = values.astype(dtype) - def test_float_nulls(self): - num_values = 100 + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, version=version) - path = random_path() - self.test_files.append(path) - writer = FeatherWriter() - writer.open(path) - null_mask = np.random.randint(0, 10, size=num_values) < 3 - dtypes = ['f4', 'f8'] - expected_cols = [] - null_counts = [] - for name in dtypes: - values = np.random.randn(num_values).astype(name) - writer.write_array(name, values, null_mask) +def test_integer_with_nulls(version): + # pandas requires upcast to float dtype + path = random_path() + TEST_FILES.append(path) - values[null_mask] = np.nan + int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + num_values = 100 - expected_cols.append(values) - null_counts.append(null_mask.sum()) + arrays = [] + null_mask = np.random.randint(0, 10, size=num_values) < 3 + expected_cols = [] + for name in int_dtypes: + values = np.random.randint(0, 100, size=num_values) + arrays.append(pa.array(values, mask=null_mask)) - writer.close() + expected = values.astype('f8') + expected[null_mask] = np.nan - ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), - columns=dtypes) + expected_cols.append(expected) - result = read_feather(path) - assert_frame_equal(result, ex_frame) - assert_array_equal(self._get_null_counts(path), null_counts) + table = pa.table(arrays, names=int_dtypes) + write_feather(table, path, version=version) - def test_integer_no_nulls(self): - data = {} + ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), + columns=int_dtypes) - numpy_dtypes = ['i1', 'i2', 'i4', 'i8', - 'u1', 'u2', 'u4', 'u8'] - num_values = 100 + result = read_feather(path) + assert_frame_equal(result, ex_frame) - for dtype in numpy_dtypes: - values = np.random.randint(0, 100, size=num_values) - data[dtype] = values.astype(dtype) - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) +def test_boolean_no_nulls(version): + num_values = 100 - def test_platform_numpy_integers(self): - data = {} + np.random.seed(0) - numpy_dtypes = ['longlong'] - num_values = 100 + df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) + _check_pandas_roundtrip(df, version=version) - for dtype in numpy_dtypes: - values = np.random.randint(0, 100, size=num_values) - data[dtype] = values.astype(dtype) - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) +def test_boolean_nulls(version): + # pandas requires upcast to object dtype + path = random_path() + TEST_FILES.append(path) - def test_integer_with_nulls(self): - # pandas requires upcast to float dtype - path = random_path() - self.test_files.append(path) + num_values = 100 + np.random.seed(0) - int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - num_values = 100 + mask = np.random.randint(0, 10, size=num_values) < 3 + values = np.random.randint(0, 10, size=num_values) < 5 - writer = FeatherWriter() - writer.open(path) + table = pa.table([pa.array(values, mask=mask)], names=['bools']) + write_feather(table, path, version=version) - null_mask = np.random.randint(0, 10, size=num_values) < 3 - expected_cols = [] - for name in int_dtypes: - values = np.random.randint(0, 100, size=num_values) - writer.write_array(name, values, null_mask) + expected = values.astype(object) + expected[mask] = None - expected = values.astype('f8') - expected[null_mask] = np.nan + ex_frame = 
pd.DataFrame({'bools': expected}) - expected_cols.append(expected) + result = read_feather(path) + assert_frame_equal(result, ex_frame) - ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), - columns=int_dtypes) - writer.close() +def test_buffer_bounds_error(version): + # ARROW-1676 + path = random_path() + TEST_FILES.append(path) + for i in range(16, 256): + values = pa.array([None] + list(range(i)), type=pa.float64()) + + write_feather(pa.table([values], names=['arr']), path, + version=version) result = read_feather(path) - assert_frame_equal(result, ex_frame) + expected = pd.DataFrame({'arr': values.to_pandas()}) + assert_frame_equal(result, expected) - def test_boolean_no_nulls(self): - num_values = 100 + _check_pandas_roundtrip(expected, version=version) - np.random.seed(0) - df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) - self._check_pandas_roundtrip(df) +def test_boolean_object_nulls(version): + repeats = 100 + arr = np.array([False, None, True] * repeats, dtype=object) + df = pd.DataFrame({'bools': arr}) + _check_pandas_roundtrip(df, version=version) - def test_boolean_nulls(self): - # pandas requires upcast to object dtype - path = random_path() - self.test_files.append(path) - num_values = 100 - np.random.seed(0) +def test_delete_partial_file_on_error(version): + if sys.platform == 'win32': + pytest.skip('Windows hangs on to file handle for some reason') - writer = FeatherWriter() - writer.open(path) + class CustomClass: + pass - mask = np.random.randint(0, 10, size=num_values) < 3 - values = np.random.randint(0, 10, size=num_values) < 5 - writer.write_array('bools', values, mask) + # strings will fail + df = pd.DataFrame( + { + 'numbers': range(5), + 'strings': [b'foo', None, 'bar', CustomClass(), np.nan]}, + columns=['numbers', 'strings']) - expected = values.astype(object) - expected[mask] = None + path = random_path() + try: + write_feather(df, path, version=version) + except Exception: + pass - writer.close() + assert not os.path.exists(path) - ex_frame = pd.DataFrame({'bools': expected}) - result = read_feather(path) - assert_frame_equal(result, ex_frame) +def test_strings(version): + repeats = 1000 - def test_buffer_bounds_error(self): - # ARROW-1676 - path = random_path() - self.test_files.append(path) + # Mixed bytes, unicode, strings coerced to binary + values = [b'foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) - for i in range(16, 256): - values = pa.array([None] + list(range(i)), type=pa.float64()) + ex_values = [b'foo', None, b'bar', b'qux', np.nan] + expected = pd.DataFrame({'strings': ex_values * repeats}) + _check_pandas_roundtrip(df, expected, version=version) - writer = FeatherWriter() - writer.open(path) + # embedded nulls are ok + values = ['foo', None, 'bar', 'qux', None] + df = pd.DataFrame({'strings': values * repeats}) + expected = pd.DataFrame({'strings': values * repeats}) + _check_pandas_roundtrip(df, expected, version=version) - writer.write_array('arr', values) - writer.close() + values = ['foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + expected = pd.DataFrame({'strings': values * repeats}) + _check_pandas_roundtrip(df, expected, version=version) - result = read_feather(path) - expected = pd.DataFrame({'arr': values.to_pandas()}) - assert_frame_equal(result, expected) - self._check_pandas_roundtrip(expected, null_counts=[1]) +def test_empty_strings(version): + df = pd.DataFrame({'strings': [''] * 10}) + _check_pandas_roundtrip(df, version=version) - 
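The rewritten tests build nullable columns with pa.array(..., mask=...) rather than passing a mask to the removed FeatherWriter.write_array. A standalone sketch of that pattern (values and names are illustrative):

    import numpy as np
    import pyarrow as pa

    values = np.array([1.0, 2.0, 3.0])
    mask = np.array([False, True, False])  # True marks a null slot

    arr = pa.array(values, mask=mask)
    assert arr.null_count == 1
    table = pa.table([arr], names=["f8"])  # ready for write_feather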
def test_boolean_object_nulls(self): - repeats = 100 - arr = np.array([False, None, True] * repeats, dtype=object) - df = pd.DataFrame({'bools': arr}) - self._check_pandas_roundtrip(df, null_counts=[1 * repeats]) - def test_delete_partial_file_on_error(self): - if sys.platform == 'win32': - pytest.skip('Windows hangs on to file handle for some reason') +def test_all_none(version): + df = pd.DataFrame({'all_none': [None] * 10}) + _check_pandas_roundtrip(df, version=version) - class CustomClass: - pass - # strings will fail - df = pd.DataFrame( - { - 'numbers': range(5), - 'strings': [b'foo', None, 'bar', CustomClass(), np.nan]}, - columns=['numbers', 'strings']) +def test_all_null_category(version): + # ARROW-1188 + df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)}) + df = df.assign(B=df.B.astype("category")) + _check_pandas_roundtrip(df, version=version) - path = random_path() - try: - write_feather(df, path) - except Exception: - pass - assert not os.path.exists(path) - - def test_strings(self): - repeats = 1000 - - # Mixed bytes, unicode, strings coerced to binary - values = [b'foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - - ex_values = [b'foo', None, b'bar', b'qux', np.nan] - expected = pd.DataFrame({'strings': ex_values * repeats}) - self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - - # embedded nulls are ok - values = ['foo', None, 'bar', 'qux', None] - df = pd.DataFrame({'strings': values * repeats}) - expected = pd.DataFrame({'strings': values * repeats}) - self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - - values = ['foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - expected = pd.DataFrame({'strings': values * repeats}) - self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - - def test_empty_strings(self): - df = pd.DataFrame({'strings': [''] * 10}) - self._check_pandas_roundtrip(df) - - def test_all_none(self): - df = pd.DataFrame({'all_none': [None] * 10}) - self._check_pandas_roundtrip(df, null_counts=[10]) - - def test_all_null_category(self): - # ARROW-1188 - df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)}) - df = df.assign(B=df.B.astype("category")) - self._check_pandas_roundtrip(df, null_counts=[0, 3]) - - def test_multithreaded_read(self): - data = {'c{}'.format(i): [''] * 10 - for i in range(100)} - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df, use_threads=True) - - def test_nan_as_null(self): - # Create a nan that is not numpy.nan - values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10) - df = pd.DataFrame({'strings': values}) - self._check_pandas_roundtrip(df) - - def test_category(self): - repeats = 1000 - values = ['foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - df['strings'] = df['strings'].astype('category') - - values = ['foo', None, 'bar', 'qux', None] - expected = pd.DataFrame({'strings': pd.Categorical(values * repeats)}) - self._check_pandas_roundtrip(df, expected, - null_counts=[2 * repeats]) - - def test_timestamp(self): - df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)}) - df['with_tz'] = (df.naive.dt.tz_localize('utc') - .dt.tz_convert('America/Los_Angeles')) - - self._check_pandas_roundtrip(df) - - def test_timestamp_with_nulls(self): - df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1), - None, - pd.Timestamp(2016, 1, 3)]}) - df['with_tz'] = df.test.dt.tz_localize('utc') - - self._check_pandas_roundtrip(df, 
null_counts=[1, 1]) - - @pytest.mark.xfail(reason="not supported ATM", - raises=NotImplementedError) - def test_timedelta_with_nulls(self): - df = pd.DataFrame({'test': [pd.Timedelta('1 day'), - None, - pd.Timedelta('3 day')]}) - - self._check_pandas_roundtrip(df, null_counts=[1, 1]) - - def test_out_of_float64_timestamp_with_nulls(self): - df = pd.DataFrame( - {'test': pd.DatetimeIndex([1451606400000000001, - None, 14516064000030405])}) - df['with_tz'] = df.test.dt.tz_localize('utc') - self._check_pandas_roundtrip(df, null_counts=[1, 1]) - - def test_non_string_columns(self): - df = pd.DataFrame({0: [1, 2, 3, 4], - 1: [True, False, True, False]}) - - expected = df.rename(columns=str) - self._check_pandas_roundtrip(df, expected) - - @pytest.mark.skipif(not os.path.supports_unicode_filenames, - reason='unicode filenames not supported') - def test_unicode_filename(self): - # GH #209 - name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8') - df = pd.DataFrame({'foo': [1, 2, 3, 4]}) - self._check_pandas_roundtrip(df, path=random_path(prefix=name)) - - def test_read_columns(self): - data = {'foo': [1, 2, 3, 4], - 'boo': [5, 6, 7, 8], - 'woo': [1, 3, 5, 7]} - columns = list(data.keys())[1:3] - df = pd.DataFrame(data) - expected = pd.DataFrame({c: data[c] for c in columns}) - self._check_pandas_roundtrip(df, expected, columns=columns) - - def test_overwritten_file(self): - path = random_path() - self.test_files.append(path) +def test_multithreaded_read(version): + data = {'c{}'.format(i): [''] * 10 + for i in range(100)} + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, use_threads=True, version=version) - num_values = 100 - np.random.seed(0) - values = np.random.randint(0, 10, size=num_values) - write_feather(pd.DataFrame({'ints': values}), path) +def test_nan_as_null(version): + # Create a nan that is not numpy.nan + values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10) + df = pd.DataFrame({'strings': values}) + _check_pandas_roundtrip(df, version=version) - df = pd.DataFrame({'ints': values[0: num_values//2]}) - self._check_pandas_roundtrip(df, path=path) - def test_filelike_objects(self): - from io import BytesIO +def test_category(version): + repeats = 1000 + values = ['foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + df['strings'] = df['strings'].astype('category') - buf = BytesIO() + values = ['foo', None, 'bar', 'qux', None] + expected = pd.DataFrame({'strings': pd.Categorical(values * repeats)}) + _check_pandas_roundtrip(df, expected, version=version) - # the copy makes it non-strided - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=['a', 'b', 'c']).copy() - write_feather(df, buf) - buf.seek(0) +def test_timestamp(version): + df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)}) + df['with_tz'] = (df.naive.dt.tz_localize('utc') + .dt.tz_convert('America/Los_Angeles')) - result = read_feather(buf) - assert_frame_equal(result, df) + _check_pandas_roundtrip(df, version=version) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") - def test_sparse_dataframe(self): - if not pa.pandas_compat._pandas_api.has_sparse: - pytest.skip("version of pandas does not support SparseDataFrame") - # GH #221 - data = {'A': [0, 1, 2], - 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) - expected = df.to_dense() - self._check_pandas_roundtrip(df, expected) - def test_duplicate_columns(self): +def test_timestamp_with_nulls(version): + df = 
pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1), + None, + pd.Timestamp(2016, 1, 3)]}) + df['with_tz'] = df.test.dt.tz_localize('utc') - # https://github.com/wesm/feather/issues/53 - # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() - self._assert_error_on_write(df, ValueError) + _check_pandas_roundtrip(df, version=version) - def test_unsupported(self): - # https://github.com/wesm/feather/issues/240 - # serializing actual python objects - # custom python objects - class A: - pass +@pytest.mark.xfail(reason="not supported", raises=TypeError) +def test_timedelta_with_nulls_v1(): + df = pd.DataFrame({'test': [pd.Timedelta('1 day'), + None, + pd.Timedelta('3 day')]}) + _check_pandas_roundtrip(df, version=1) + + +def test_timedelta_with_nulls(): + df = pd.DataFrame({'test': [pd.Timedelta('1 day'), + None, + pd.Timedelta('3 day')]}) + _check_pandas_roundtrip(df, version=2) + + +def test_out_of_float64_timestamp_with_nulls(version): + df = pd.DataFrame( + {'test': pd.DatetimeIndex([1451606400000000001, + None, 14516064000030405])}) + df['with_tz'] = df.test.dt.tz_localize('utc') + _check_pandas_roundtrip(df, version=version) + + +def test_non_string_columns(version): + df = pd.DataFrame({0: [1, 2, 3, 4], + 1: [True, False, True, False]}) + + expected = df.rename(columns=str) + _check_pandas_roundtrip(df, expected, version=version) + + +@pytest.mark.skipif(not os.path.supports_unicode_filenames, + reason='unicode filenames not supported') +def test_unicode_filename(version): + # GH #209 + name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8') + df = pd.DataFrame({'foo': [1, 2, 3, 4]}) + _check_pandas_roundtrip(df, path=random_path(prefix=name), + version=version) + + +def test_read_columns(version): + data = {'foo': [1, 2, 3, 4], + 'boo': [5, 6, 7, 8], + 'woo': [1, 3, 5, 7]} + columns = list(data.keys())[1:3] + df = pd.DataFrame(data) + expected = pd.DataFrame({c: data[c] for c in columns}) + _check_pandas_roundtrip(df, expected, version=version, columns=columns) + + +def test_overwritten_file(version): + path = random_path() + TEST_FILES.append(path) + + num_values = 100 + np.random.seed(0) + + values = np.random.randint(0, 10, size=num_values) + write_feather(pd.DataFrame({'ints': values}), path, version=version) + + df = pd.DataFrame({'ints': values[0: num_values//2]}) + _check_pandas_roundtrip(df, path=path, version=version) + + +def test_filelike_objects(version): + buf = io.BytesIO() + + # the copy makes it non-strided + df = pd.DataFrame(np.arange(12).reshape(4, 3), + columns=['a', 'b', 'c']).copy() + write_feather(df, buf, version=version) + + buf.seek(0) + + result = read_feather(buf) + assert_frame_equal(result, df) + + +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") +def test_sparse_dataframe(version): + if not pa.pandas_compat._pandas_api.has_sparse: + pytest.skip("version of pandas does not support SparseDataFrame") + # GH #221 + data = {'A': [0, 1, 2], + 'B': [1, 0, 1]} + df = pd.DataFrame(data).to_sparse(fill_value=1) + expected = df.to_dense() + _check_pandas_roundtrip(df, expected, version=version) - df = pd.DataFrame({'a': [A(), A()]}) - self._assert_error_on_write(df, ValueError) - # non-strings - df = pd.DataFrame({'a': ['a', 1, 2.0]}) - self._assert_error_on_write(df, TypeError) +def test_duplicate_columns(): - @pytest.mark.slow - def test_large_dataframe(self): - df = pd.DataFrame({'A': np.arange(400000000)}) - 
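The pair of timedelta tests above pins down a behavior change: V2 can serialize pandas timedeltas (Arrow duration type) while V1 cannot. A hedged sketch — the V1 error type matches the test's raises=TypeError, but the exact message is assumed from the V1 type-check pattern asserted further below:

    import io

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({"t": pd.to_timedelta([1, None, 3], unit="d")})
    feather.write_feather(df, io.BytesIO(), version=2)  # works
    try:
        feather.write_feather(df, io.BytesIO(), version=1)
    except TypeError as exc:
        print(exc)  # e.g. "Unsupported Feather V1 type: duration[ns]. ..."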
self._check_pandas_roundtrip(df) + # https://github.com/wesm/feather/issues/53 + # not currently able to handle duplicate columns + df = pd.DataFrame(np.arange(12).reshape(4, 3), + columns=list('aaa')).copy() + _assert_error_on_write(df, ValueError) + + +def test_unsupported(): + # https://github.com/wesm/feather/issues/240 + # serializing actual python objects + + # custom python objects + class A: + pass + + df = pd.DataFrame({'a': [A(), A()]}) + _assert_error_on_write(df, ValueError) + + # non-strings + df = pd.DataFrame({'a': ['a', 1, 2.0]}) + _assert_error_on_write(df, TypeError) + + +def test_v2_set_chunksize(): + df = pd.DataFrame({'A': np.arange(1000)}) + table = pa.table(df) + + buf = io.BytesIO() + write_feather(table, buf, chunksize=250, version=2) + + result = buf.getvalue() + + ipc_file = pa.ipc.open_file(pa.BufferReader(result)) + assert ipc_file.num_record_batches == 4 + assert len(ipc_file.get_batch(0)) == 250 + + +def test_v2_compression_options(): + df = pd.DataFrame({'A': np.arange(1000)}) + + cases = [ + # compression, compression_level + ('uncompressed', None), + ('lz4', None), + ('zstd', 1), + ('zstd', 10) + ] + + for compression, compression_level in cases: + _check_pandas_roundtrip(df, compression=compression, + compression_level=compression_level) + + buf = io.BytesIO() + + # LZ4 doesn't support compression_level + with pytest.raises(pa.ArrowInvalid, + match="doesn't support setting a compression level"): + write_feather(df, buf, compression='lz4', compression_level=10) + + # Trying to compress with V1 + with pytest.raises( + ValueError, + match="Feather V1 files do not support compression option"): + write_feather(df, buf, compression='lz4', version=1) + + # Trying to set chunksize with V1 + with pytest.raises( + ValueError, + match="Feather V1 files do not support chunksize option"): + write_feather(df, buf, chunksize=4096, version=1) + + # Unsupported compressor + with pytest.raises(ValueError, + match='compression="snappy" not supported'): + write_feather(df, buf, compression='snappy') + + +def test_v1_unsupported_types(): + table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0']) + + buf = io.BytesIO() + with pytest.raises(TypeError, + match=("Unsupported Feather V1 type: " + "list<item: int64>. " + "Use V2 format to serialize all Arrow types.")): + write_feather(table, buf, version=1) + + +@pytest.mark.slow +def test_large_dataframe(version): + df = pd.DataFrame({'A': np.arange(400000000)}) + _check_pandas_roundtrip(df, version=version) @@ -568,7 +608,13 @@ def test_chunked_binary_error_message(): ] * 2 * (1 << 10) df = pd.DataFrame({'byte_col': values}) + # Works fine with version 2 + buf = io.BytesIO() + write_feather(df, buf, version=2) + result = read_feather(pa.BufferReader(buf.getvalue())) + assert_frame_equal(result, df) + with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " "capacity of a Feather binary column.
This restriction " "may be lifted in the future"): - write_feather(df, io.BytesIO()) + write_feather(df, io.BytesIO(), version=1) diff --git a/r/NAMESPACE b/r/NAMESPACE index 6964b91fa8c..b6320fc99fd 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -88,8 +88,7 @@ export(DictionaryArray) export(DirectoryPartitioning) export(DirectoryPartitioningFactory) export(Expression) -export(FeatherTableReader) -export(FeatherTableWriter) +export(FeatherReader) export(Field) export(FieldExpression) export(FileFormat) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 695a025602b..dd1f6d620cc 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -708,68 +708,24 @@ dataset___expr__ToString <- function(x){ .Call(`_arrow_dataset___expr__ToString` , x) } -ipc___feather___TableWriter__SetDescription <- function(writer, description){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description)) +ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level){ + invisible(.Call(`_arrow_ipc___WriteFeather__Table` , stream, table, version, chunk_size, compression, compression_level)) } -ipc___feather___TableWriter__SetNumRows <- function(writer, num_rows){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__SetNumRows` , writer, num_rows)) +ipc___feather___Reader__version <- function(reader){ + .Call(`_arrow_ipc___feather___Reader__version` , reader) } -ipc___feather___TableWriter__Append <- function(writer, name, values){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__Append` , writer, name, values)) +ipc___feather___Reader__Read <- function(reader, columns){ + .Call(`_arrow_ipc___feather___Reader__Read` , reader, columns) } -ipc___feather___TableWriter__Finalize <- function(writer){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__Finalize` , writer)) +ipc___feather___Reader__Open <- function(stream){ + .Call(`_arrow_ipc___feather___Reader__Open` , stream) } -ipc___feather___TableWriter__Open <- function(stream){ - .Call(`_arrow_ipc___feather___TableWriter__Open` , stream) -} - -ipc___TableWriter__RecordBatch__WriteFeather <- function(writer, batch){ - invisible(.Call(`_arrow_ipc___TableWriter__RecordBatch__WriteFeather` , writer, batch)) -} - -ipc___feather___TableReader__GetDescription <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__GetDescription` , reader) -} - -ipc___feather___TableReader__HasDescription <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__HasDescription` , reader) -} - -ipc___feather___TableReader__version <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__version` , reader) -} - -ipc___feather___TableReader__num_rows <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__num_rows` , reader) -} - -ipc___feather___TableReader__num_columns <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__num_columns` , reader) -} - -ipc___feather___TableReader__GetColumnName <- function(reader, i){ - .Call(`_arrow_ipc___feather___TableReader__GetColumnName` , reader, i) -} - -ipc___feather___TableReader__GetColumn <- function(reader, i){ - .Call(`_arrow_ipc___feather___TableReader__GetColumn` , reader, i) -} - -ipc___feather___TableReader__Read <- function(reader, columns){ - .Call(`_arrow_ipc___feather___TableReader__Read` , reader, columns) -} - -ipc___feather___TableReader__Open <- function(stream){ - .Call(`_arrow_ipc___feather___TableReader__Open` , stream) -} - -ipc___feather___TableReader__column_names <- function(reader){ - 
.Call(`_arrow_ipc___feather___TableReader__column_names` , reader) +ipc___feather___Reader__column_names <- function(reader){ + .Call(`_arrow_ipc___feather___Reader__column_names` , reader) } Field__initialize <- function(name, field, nullable){ diff --git a/r/R/feather.R b/r/R/feather.R index 86ef83cdb6e..9d5e0375949 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -17,11 +17,23 @@ #' Write data in the Feather format #' -#' @param x `data.frame` or RecordBatch -#' @param sink A file path or an OutputStream -#' -#' @return the input `x` invisibly. -#' +#' @param x `data.frame`, `RecordBatch`, or `Table` +#' @param sink A file path or an `OutputStream` +#' @param version integer Feather file version. Version 2 is the current +#' version. Version 1 is the more limited legacy format. +#' @param chunk_size For V2 files, the number of rows that each chunk of data +#' should have in the file. Use a smaller `chunk_size` when you need faster +#' random row access. Default is 64K. This option is not supported for V1. +#' @param compression Name of compression codec to use, if any. Default is +#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise +#' "uncompressed". "zstd" is the other available codec and generally has better +#' compression ratios in exchange for slower read and write performance. +#' See [codec_is_available()]. This option is not supported for V1. +#' @param compression_level If `compression` is "zstd", you may +#' specify an integer compression level. If omitted, the compression codec's +#' default compression level is used. +#' +#' @return The input `x`, invisibly. #' @export #' @examples #' \donttest{ @@ -30,73 +42,69 @@ #' write_feather(mtcars, tf) #' } #' @include arrow-package.R -write_feather <- function(x, sink) { - x_out <- x +write_feather <- function(x, + sink, + version = 2, + chunk_size = 65536L, + compression = c("default", "lz4", "uncompressed", "zstd"), + compression_level = NULL) { + # Handle and validate options before touching data + version <- as.integer(version) + assert_that(version %in% 1:2) + compression <- match.arg(compression) + chunk_size <- as.integer(chunk_size) + assert_that(chunk_size > 0) + if (compression == "default") { + if (version == 2 && codec_is_available("lz4")) { + compression <- "lz4" + } else { + compression <- "uncompressed" + } + } + if (is.null(compression_level)) { + # Use -1 as sentinel for "default" + compression_level <- -1L + } + compression_level <- as.integer(compression_level) + # Now make sure that options make sense together + if (version == 1) { + if (chunk_size != 65536L) { + stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE) + } + if (compression != "uncompressed") { + stop("Feather version 1 does not support the 'compression' option", call. = FALSE) + } + if (compression_level != -1L) { + stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE) + } + } + if (compression != "zstd" && compression_level != -1L) { + stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call.
= FALSE) + } + # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug: + version <- version + 1L + compression <- compression_from_name(compression) - if (is.data.frame(x)) { - x <- record_batch(x) + x_out <- x + if (is.data.frame(x) || inherits(x, "RecordBatch")) { + x <- Table$create(x) } - assert_is(x, "RecordBatch") + assert_is(x, "Table") if (is.character(sink)) { sink <- FileOutputStream$create(sink) on.exit(sink$close()) } assert_is(sink, "OutputStream") - - writer <- FeatherTableWriter$create(sink) - ipc___TableWriter__RecordBatch__WriteFeather(writer, x) + ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level) invisible(x_out) } -#' @title FeatherTableWriter class -#' @rdname FeatherTableWriter -#' @name FeatherTableWriter -#' @docType class -#' @usage NULL -#' @format NULL -#' @description This class enables you to write Feather files. See its usage in -#' [write_feather()]. -#' -#' @section Factory: -#' -#' The `FeatherTableWriter$create()` factory method instantiates the object and -#' takes the following argument: -#' -#' - `stream` An `OutputStream` -#' -#' @section Methods: -#' -#' - `$GetDescription()` -#' - `$HasDescription()` -#' - `$version()` -#' - `$num_rows()` -#' - `$num_columns()` -#' - `$GetColumnName()` -#' - `$GetColumn()` -#' - `$Read(columns)` -#' -#' @export -#' @include arrow-package.R -FeatherTableWriter <- R6Class("FeatherTableWriter", inherit = ArrowObject, - public = list( - SetDescription = function(description) ipc___feather___TableWriter__SetDescription(self, description), - SetNumRows = function(num_rows) ipc___feather___TableWriter__SetNumRows(self, num_rows), - Append = function(name, values) ipc___feather___TableWriter__Append(self, name, values), - Finalize = function() ipc___feather___TableWriter__Finalize(self) - ) -) - -FeatherTableWriter$create <- function(stream) { - assert_is(stream, "OutputStream") - unique_ptr(FeatherTableWriter, ipc___feather___TableWriter__Open(stream)) -} - #' Read a Feather file #' #' @param file A character file path, a raw vector, or `InputStream`, passed to -#' `FeatherTableReader$create()`. +#' `FeatherReader$create()`. #' @inheritParams read_delim_arrow #' @param ... additional parameters #' @@ -115,9 +123,9 @@ FeatherTableWriter$create <- function(stream) { #' df <- read_feather(tf, col_select = starts_with("Sepal")) #' } read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { - reader <- FeatherTableReader$create(file, ...) + reader <- FeatherReader$create(file, ...) - all_columns <- ipc___feather___TableReader__column_names(reader) + all_columns <- ipc___feather___Reader__column_names(reader) col_select <- enquo(col_select) columns <- if (!quo_is_null(col_select)) { vars_select(all_columns, !!col_select) @@ -130,9 +138,9 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { out } -#' @title FeatherTableReader class -#' @rdname FeatherTableReader -#' @name FeatherTableReader +#' @title FeatherReader class +#' @rdname FeatherReader +#' @name FeatherReader #' @docType class #' @usage NULL #' @format NULL @@ -142,7 +150,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) 
{ #' #' @section Factory: #' -#' The `FeatherTableReader$create()` factory method instantiates the object and +#' The `FeatherReader$create()` factory method instantiates the object and #' takes the following arguments: #' #' - `file` A character file name, raw vector, or Arrow file connection object @@ -152,33 +160,26 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { #' #' @section Methods: #' -#' - `$GetDescription()` -#' - `$HasDescription()` -#' - `$version()` -#' - `$num_rows()` -#' - `$num_columns()` -#' - `$GetColumnName()` -#' - `$GetColumn()` -#' - `$Read(columns)` +#' - `$Read(columns)`: Returns a `Table` of the selected columns; `columns` is +#' a vector of integer indices +#' - `$version`: Active binding, returns `1` or `2`, according to the Feather +#' file version #' #' @export #' @include arrow-package.R -FeatherTableReader <- R6Class("FeatherTableReader", inherit = ArrowObject, +FeatherReader <- R6Class("FeatherReader", inherit = ArrowObject, public = list( - GetDescription = function() ipc___feather___TableReader__GetDescription(self), - HasDescription = function() ipc__feather___TableReader__HasDescription(self), - version = function() ipc___feather___TableReader__version(self), - num_rows = function() ipc___feather___TableReader__num_rows(self), - num_columns = function() ipc___feather___TableReader__num_columns(self), - GetColumnName = function(i) ipc___feather___TableReader__GetColumnName(self, i), - GetColumn = function(i) shared_ptr(Array, ipc___feather___TableReader__GetColumn(self, i)), Read = function(columns) { - shared_ptr(Table, ipc___feather___TableReader__Read(self, columns)) + shared_ptr(Table, ipc___feather___Reader__Read(self, columns)) } + ), + active = list( + # versions are officially 2 for V1 and 3 for V2 :shrug: + version = function() ipc___feather___Reader__version(self) - 1L ) ) -FeatherTableReader$create <- function(file, mmap = TRUE, ...) { +FeatherReader$create <- function(file, mmap = TRUE, ...) { file <- make_readable_file(file, mmap) - unique_ptr(FeatherTableReader, ipc___feather___TableReader__Open(file)) + shared_ptr(FeatherReader, ipc___feather___Reader__Open(file)) } diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 165892a8d7d..e51487d24b1 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -83,8 +83,7 @@ reference: - ParquetReaderProperties - ParquetFileWriter - ParquetWriterProperties - - FeatherTableReader - - FeatherTableWriter + - FeatherReader - CsvTableReader - RecordBatchReader - RecordBatchWriter diff --git a/r/man/FeatherTableReader.Rd b/r/man/FeatherReader.Rd similarity index 64% rename from r/man/FeatherTableReader.Rd rename to r/man/FeatherReader.Rd index f06197a0834..1a0dba194f1 100644 --- a/r/man/FeatherTableReader.Rd +++ b/r/man/FeatherReader.Rd @@ -1,9 +1,9 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/feather.R \docType{class} -\name{FeatherTableReader} -\alias{FeatherTableReader} -\title{FeatherTableReader class} +\name{FeatherReader} +\alias{FeatherReader} +\title{FeatherReader class} \description{ This class enables you to interact with Feather files. Create one to connect to a file or other InputStream, and call \code{Read()} on it to make an \code{arrow::Table}.
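Two bookkeeping details in the R binding above are easy to miss: the on-disk metadata stores 2 for Feather V1 and 3 for V2 (hence the +1 on write and the -1 in the version active binding), and the "default" codec resolves to LZ4 only for V2 and only when the codec is available. A pure-Python sketch of the same logic (helper names are illustrative, not any library's API; lz4_available stands in for codec_is_available("lz4")):

    def resolve_compression(version, compression="default", lz4_available=True):
        if compression == "default":
            return "lz4" if (version == 2 and lz4_available) else "uncompressed"
        return compression

    def ondisk_version(user):
        return user + 1  # the file stores 2 for V1, 3 for V2

    def user_facing_version(ondisk):
        return ondisk - 1

    assert user_facing_version(ondisk_version(2)) == 2
    assert resolve_compression(1) == "uncompressed"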
See its usage in \code{\link[=read_feather]{read_feather()}}. } \section{Factory}{ -The \code{FeatherTableReader$create()} factory method instantiates the object and +The \code{FeatherReader$create()} factory method instantiates the object and takes the following arguments: \itemize{ \item \code{file} A character file name, raw vector, or Arrow file connection object @@ -25,14 +25,10 @@ takes the following arguments: \section{Methods}{ \itemize{ -\item \verb{$GetDescription()} -\item \verb{$HasDescription()} -\item \verb{$version()} -\item \verb{$num_rows()} -\item \verb{$num_columns()} -\item \verb{$GetColumnName()} -\item \verb{$GetColumn()} -\item \verb{$Read(columns)} +\item \verb{$Read(columns)}: Returns a \code{Table} of the selected columns; +\code{columns} is a vector of integer indices +\item \verb{$version}: Active binding, returns \code{1} or \code{2}, according to the Feather +file version } } diff --git a/r/man/FeatherTableWriter.Rd b/r/man/FeatherTableWriter.Rd deleted file mode 100644 index 3c6afaa134f..00000000000 --- a/r/man/FeatherTableWriter.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/feather.R -\docType{class} -\name{FeatherTableWriter} -\alias{FeatherTableWriter} -\title{FeatherTableWriter class} -\description{ -This class enables you to write Feather files. See its usage in -\code{\link[=write_feather]{write_feather()}}. -} -\section{Factory}{ - - -The \code{FeatherTableWriter$create()} factory method instantiates the object and -takes the following argument: -\itemize{ -\item \code{stream} An \code{OutputStream} -} -} - -\section{Methods}{ - -\itemize{ -\item \verb{$GetDescription()} -\item \verb{$HasDescription()} -\item \verb{$version()} -\item \verb{$num_rows()} -\item \verb{$num_columns()} -\item \verb{$GetColumnName()} -\item \verb{$GetColumn()} -\item \verb{$Read(columns)} -} -} - diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index f5d6c257070..eb842719673 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -8,7 +8,7 @@ read_feather(file, col_select = NULL, as_data_frame = TRUE, ...) } \arguments{ \item{file}{A character file path, a raw vector, or \code{InputStream}, passed to -\code{FeatherTableReader$create()}.} +\code{FeatherReader$create()}.} \item{col_select}{A character vector of column names to keep, as in the "select" argument to \code{data.table::fread()}, or a diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 09c3d42333c..daeb66b6fec 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -4,15 +4,39 @@ \alias{write_feather} \title{Write data in the Feather format} \usage{ -write_feather(x, sink) +write_feather( + x, + sink, + version = 2, + chunk_size = 65536L, + compression = c("default", "lz4", "uncompressed", "zstd"), + compression_level = NULL +) } \arguments{ -\item{x}{\code{data.frame} or RecordBatch} +\item{x}{\code{data.frame}, \code{RecordBatch}, or \code{Table}} -\item{sink}{A file path or an OutputStream} +\item{sink}{A file path or an \code{OutputStream}} + +\item{version}{integer Feather file version. Version 2 is the current +version. Version 1 is the more limited legacy format.} + +\item{chunk_size}{For V2 files, the number of rows that each chunk of data +should have in the file. Use a smaller \code{chunk_size} when you need faster +random row access. Default is 64K. This option is not supported for V1.} + +\item{compression}{Name of compression codec to use, if any.
Default is +"lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise +"uncompressed". "zstd" is the other available codec and generally has better +compression ratios in exchange for slower read and write performance +See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.} + +\item{compression_level}{If \code{compression} is "zstd", you may +specify an integer compression level. If omitted, the compression codec's +default compression level is used.} } \value{ -the input \code{x} invisibly. +The input \code{x}, invisibly. } \description{ Write data in the Feather format diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 161f0934a02..051d5a6f948 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2739,254 +2739,83 @@ RcppExport SEXP _arrow_dataset___expr__ToString(SEXP x_sexp){ // feather.cpp #if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__SetDescription(const std::unique_ptr& writer, const std::string& description); -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetDescription(SEXP writer_sexp, SEXP description_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter::type description(description_sexp); - ipc___feather___TableWriter__SetDescription(writer, description); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetDescription(SEXP writer_sexp, SEXP description_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__SetDescription(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__SetNumRows(const std::unique_ptr& writer, int64_t num_rows); -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetNumRows(SEXP writer_sexp, SEXP num_rows_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter::type num_rows(num_rows_sexp); - ipc___feather___TableWriter__SetNumRows(writer, num_rows); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetNumRows(SEXP writer_sexp, SEXP num_rows_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__SetNumRows(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__Append(const std::unique_ptr& writer, const std::string& name, const std::shared_ptr& values); -RcppExport SEXP _arrow_ipc___feather___TableWriter__Append(SEXP writer_sexp, SEXP name_sexp, SEXP values_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter::type name(name_sexp); - Rcpp::traits::input_parameter&>::type values(values_sexp); - ipc___feather___TableWriter__Append(writer, name, values); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__Append(SEXP writer_sexp, SEXP name_sexp, SEXP values_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__Append(). Please use arrow::install_arrow() to install required runtime libraries. 
"); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__Finalize(const std::unique_ptr& writer); -RcppExport SEXP _arrow_ipc___feather___TableWriter__Finalize(SEXP writer_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - ipc___feather___TableWriter__Finalize(writer); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__Finalize(SEXP writer_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__Finalize(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::unique_ptr ipc___feather___TableWriter__Open(const std::shared_ptr& stream); -RcppExport SEXP _arrow_ipc___feather___TableWriter__Open(SEXP stream_sexp){ +void ipc___WriteFeather__Table(const std::shared_ptr& stream, const std::shared_ptr& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level); +RcppExport SEXP _arrow_ipc___WriteFeather__Table(SEXP stream_sexp, SEXP table_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter&>::type stream(stream_sexp); - return Rcpp::wrap(ipc___feather___TableWriter__Open(stream)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__Open(SEXP stream_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__Open(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___TableWriter__RecordBatch__WriteFeather(const std::unique_ptr& writer, const std::shared_ptr& batch); -RcppExport SEXP _arrow_ipc___TableWriter__RecordBatch__WriteFeather(SEXP writer_sexp, SEXP batch_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter&>::type batch(batch_sexp); - ipc___TableWriter__RecordBatch__WriteFeather(writer, batch); + Rcpp::traits::input_parameter&>::type table(table_sexp); + Rcpp::traits::input_parameter::type version(version_sexp); + Rcpp::traits::input_parameter::type chunk_size(chunk_size_sexp); + Rcpp::traits::input_parameter::type compression(compression_sexp); + Rcpp::traits::input_parameter::type compression_level(compression_level_sexp); + ipc___WriteFeather__Table(stream, table, version, chunk_size, compression, compression_level); return R_NilValue; END_RCPP } #else -RcppExport SEXP _arrow_ipc___TableWriter__RecordBatch__WriteFeather(SEXP writer_sexp, SEXP batch_sexp){ - Rf_error("Cannot call ipc___TableWriter__RecordBatch__WriteFeather(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::string ipc___feather___TableReader__GetDescription(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__GetDescription(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__GetDescription(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__GetDescription(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__GetDescription(). Please use arrow::install_arrow() to install required runtime libraries. 
"); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -bool ipc___feather___TableReader__HasDescription(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__HasDescription(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__HasDescription(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__HasDescription(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__HasDescription(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -int ipc___feather___TableReader__version(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__version(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__version(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__version(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__version(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___WriteFeather__Table(SEXP stream_sexp, SEXP table_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){ + Rf_error("Cannot call ipc___WriteFeather__Table(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -int64_t ipc___feather___TableReader__num_rows(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__num_rows(SEXP reader_sexp){ +int ipc___feather___Reader__version(const std::shared_ptr& reader); +RcppExport SEXP _arrow_ipc___feather___Reader__version(SEXP reader_sexp){ BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__num_rows(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__num_rows(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__num_rows(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -int64_t ipc___feather___TableReader__num_columns(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__num_columns(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__num_columns(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__num_columns(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__num_columns(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::string ipc___feather___TableReader__GetColumnName(const std::unique_ptr& reader, int i); -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumnName(SEXP reader_sexp, SEXP i_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - Rcpp::traits::input_parameter::type i(i_sexp); - return Rcpp::wrap(ipc___feather___TableReader__GetColumnName(reader, i)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumnName(SEXP reader_sexp, SEXP i_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__GetColumnName(). 
Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ipc___feather___TableReader__GetColumn(const std::unique_ptr& reader, int i); -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumn(SEXP reader_sexp, SEXP i_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - Rcpp::traits::input_parameter::type i(i_sexp); - return Rcpp::wrap(ipc___feather___TableReader__GetColumn(reader, i)); + Rcpp::traits::input_parameter&>::type reader(reader_sexp); + return Rcpp::wrap(ipc___feather___Reader__version(reader)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumn(SEXP reader_sexp, SEXP i_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__GetColumn(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__version(SEXP reader_sexp){ + Rf_error("Cannot call ipc___feather___Reader__version(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ipc___feather___TableReader__Read(const std::unique_ptr& reader, SEXP columns); -RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP reader_sexp, SEXP columns_sexp){ +std::shared_ptr ipc___feather___Reader__Read(const std::shared_ptr& reader, SEXP columns); +RcppExport SEXP _arrow_ipc___feather___Reader__Read(SEXP reader_sexp, SEXP columns_sexp){ BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); + Rcpp::traits::input_parameter&>::type reader(reader_sexp); Rcpp::traits::input_parameter::type columns(columns_sexp); - return Rcpp::wrap(ipc___feather___TableReader__Read(reader, columns)); + return Rcpp::wrap(ipc___feather___Reader__Read(reader, columns)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP reader_sexp, SEXP columns_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__Read(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__Read(SEXP reader_sexp, SEXP columns_sexp){ + Rf_error("Cannot call ipc___feather___Reader__Read(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -std::unique_ptr ipc___feather___TableReader__Open(const std::shared_ptr& stream); -RcppExport SEXP _arrow_ipc___feather___TableReader__Open(SEXP stream_sexp){ +std::shared_ptr ipc___feather___Reader__Open(const std::shared_ptr& stream); +RcppExport SEXP _arrow_ipc___feather___Reader__Open(SEXP stream_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter&>::type stream(stream_sexp); - return Rcpp::wrap(ipc___feather___TableReader__Open(stream)); + return Rcpp::wrap(ipc___feather___Reader__Open(stream)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__Open(SEXP stream_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__Open(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__Open(SEXP stream_sexp){ + Rf_error("Cannot call ipc___feather___Reader__Open(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -Rcpp::CharacterVector ipc___feather___TableReader__column_names(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__column_names(SEXP reader_sexp){ +Rcpp::CharacterVector ipc___feather___Reader__column_names(const std::shared_ptr& reader); +RcppExport SEXP _arrow_ipc___feather___Reader__column_names(SEXP reader_sexp){ BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__column_names(reader)); + Rcpp::traits::input_parameter&>::type reader(reader_sexp); + return Rcpp::wrap(ipc___feather___Reader__column_names(reader)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__column_names(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__column_names(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__column_names(SEXP reader_sexp){ + Rf_error("Cannot call ipc___feather___Reader__column_names(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif @@ -6104,22 +5933,11 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___expr__is_valid", (DL_FUNC) &_arrow_dataset___expr__is_valid, 1}, { "_arrow_dataset___expr__scalar", (DL_FUNC) &_arrow_dataset___expr__scalar, 1}, { "_arrow_dataset___expr__ToString", (DL_FUNC) &_arrow_dataset___expr__ToString, 1}, - { "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, - { "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, - { "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3}, - { "_arrow_ipc___feather___TableWriter__Finalize", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Finalize, 1}, - { "_arrow_ipc___feather___TableWriter__Open", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Open, 1}, - { "_arrow_ipc___TableWriter__RecordBatch__WriteFeather", (DL_FUNC) &_arrow_ipc___TableWriter__RecordBatch__WriteFeather, 2}, - { "_arrow_ipc___feather___TableReader__GetDescription", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetDescription, 1}, - { "_arrow_ipc___feather___TableReader__HasDescription", (DL_FUNC) &_arrow_ipc___feather___TableReader__HasDescription, 1}, - { "_arrow_ipc___feather___TableReader__version", (DL_FUNC) &_arrow_ipc___feather___TableReader__version, 1}, - { "_arrow_ipc___feather___TableReader__num_rows", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_rows, 1}, - { "_arrow_ipc___feather___TableReader__num_columns", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_columns, 1}, - { "_arrow_ipc___feather___TableReader__GetColumnName", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumnName, 2}, - { "_arrow_ipc___feather___TableReader__GetColumn", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumn, 2}, - { "_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 2}, - { "_arrow_ipc___feather___TableReader__Open", (DL_FUNC) &_arrow_ipc___feather___TableReader__Open, 1}, - { "_arrow_ipc___feather___TableReader__column_names", (DL_FUNC) &_arrow_ipc___feather___TableReader__column_names, 1}, + { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, + { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, + { 
"_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, + { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, + { "_arrow_ipc___feather___Reader__column_names", (DL_FUNC) &_arrow_ipc___feather___Reader__column_names, 1}, { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, diff --git a/r/src/feather.cpp b/r/src/feather.cpp index 7bdfeab72b2..7ab73606f8b 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -19,103 +19,34 @@ #if defined(ARROW_R_WITH_ARROW) -// ---------- TableWriter - -// [[arrow::export]] -void ipc___feather___TableWriter__SetDescription( - const std::unique_ptr& writer, - const std::string& description) { - writer->SetDescription(description); -} - -// [[arrow::export]] -void ipc___feather___TableWriter__SetNumRows( - const std::unique_ptr& writer, int64_t num_rows) { - writer->SetNumRows(num_rows); -} - -// [[arrow::export]] -void ipc___feather___TableWriter__Append( - const std::unique_ptr& writer, - const std::string& name, const std::shared_ptr& values) { - STOP_IF_NOT_OK(writer->Append(name, *values)); -} - -// [[arrow::export]] -void ipc___feather___TableWriter__Finalize( - const std::unique_ptr& writer) { - STOP_IF_NOT_OK(writer->Finalize()); -} - -// [[arrow::export]] -std::unique_ptr ipc___feather___TableWriter__Open( - const std::shared_ptr& stream) { - std::unique_ptr writer; - STOP_IF_NOT_OK(arrow::ipc::feather::TableWriter::Open(stream, &writer)); - return writer; -} - -// [[arrow::export]] -void ipc___TableWriter__RecordBatch__WriteFeather( - const std::unique_ptr& writer, - const std::shared_ptr& batch) { - writer->SetNumRows(batch->num_rows()); - - for (int i = 0; i < batch->num_columns(); i++) { - STOP_IF_NOT_OK(writer->Append(batch->column_name(i), *batch->column(i))); +// ---------- WriteFeather + +// [[arrow::export]] +void ipc___WriteFeather__Table(const std::shared_ptr& stream, + const std::shared_ptr& table, int version, + int chunk_size, arrow::Compression::type compression, + int compression_level) { + auto properties = arrow::ipc::feather::WriteProperties::Defaults(); + properties.version = version; + properties.chunksize = chunk_size; + properties.compression = compression; + if (compression_level != -1) { + properties.compression_level = compression_level; } - STOP_IF_NOT_OK(writer->Finalize()); + STOP_IF_NOT_OK(arrow::ipc::feather::WriteTable(*table, stream.get(), properties)); } -// ----------- TableReader +// ----------- Reader // [[arrow::export]] -std::string ipc___feather___TableReader__GetDescription( - const std::unique_ptr& reader) { - return reader->GetDescription(); -} - -// [[arrow::export]] -bool ipc___feather___TableReader__HasDescription( - const std::unique_ptr& reader) { - return reader->HasDescription(); -} - -// [[arrow::export]] -int ipc___feather___TableReader__version( - const std::unique_ptr& reader) { +int ipc___feather___Reader__version( + const std::shared_ptr& reader) { return reader->version(); } // [[arrow::export]] -int64_t ipc___feather___TableReader__num_rows( - const std::unique_ptr& reader) { - return reader->num_rows(); -} - -// [[arrow::export]] -int64_t ipc___feather___TableReader__num_columns( - const std::unique_ptr& reader) { - return reader->num_columns(); -} - -// [[arrow::export]] -std::string ipc___feather___TableReader__GetColumnName( - const std::unique_ptr& reader, int i) { - 
return reader->GetColumnName(i); -} - -// [[arrow::export]] -std::shared_ptr ipc___feather___TableReader__GetColumn( - const std::unique_ptr& reader, int i) { - std::shared_ptr column; - STOP_IF_NOT_OK(reader->GetColumn(i, &column)); - return column; -} - -// [[arrow::export]] -std::shared_ptr ipc___feather___TableReader__Read( - const std::unique_ptr& reader, SEXP columns) { +std::shared_ptr ipc___feather___Reader__Read( + const std::shared_ptr& reader, SEXP columns) { std::shared_ptr table; switch (TYPEOF(columns)) { @@ -140,20 +71,19 @@ std::shared_ptr ipc___feather___TableReader__Read( } // [[arrow::export]] -std::unique_ptr ipc___feather___TableReader__Open( +std::shared_ptr ipc___feather___Reader__Open( const std::shared_ptr& stream) { - std::unique_ptr reader; - STOP_IF_NOT_OK(arrow::ipc::feather::TableReader::Open(stream, &reader)); - return reader; + return VALUE_OR_STOP(arrow::ipc::feather::Reader::Open(stream)); } // [[arrow::export]] -Rcpp::CharacterVector ipc___feather___TableReader__column_names( - const std::unique_ptr& reader) { - int64_t n = reader->num_columns(); +Rcpp::CharacterVector ipc___feather___Reader__column_names( + const std::shared_ptr& reader) { + auto sch = reader->schema(); + int64_t n = sch->num_fields(); Rcpp::CharacterVector out(n); - for (int64_t i = 0; i < n; i++) { - out[i] = reader->GetColumnName(i); + for (int i = 0; i < n; i++) { + out[i] = sch->field(i)->name(); } return out; } diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index c5153a9891f..7ae52eb2118 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -21,29 +21,30 @@ feather_file <- tempfile() tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) test_that("Write a feather file", { - write_feather(tib, feather_file) - expect_true(file.exists(feather_file)) -}) - -test_that("write_feather() returns its input", { tib_out <- write_feather(tib, feather_file) + expect_true(file.exists(feather_file)) + # Input is returned unmodified expect_identical(tib_out, tib) }) -test_that("feather read/write round trip", { +expect_feather_roundtrip <- function(write_fun) { tf2 <- normalizePath(tempfile(), mustWork = FALSE) - write_feather(tib, tf2) + tf3 <- tempfile() + on.exit({ + unlink(tf2) + unlink(tf3) + }) + + # Write two ways. 
This is what varies between runs + write_fun(tib, tf2) expect_true(file.exists(tf2)) - tf3 <- tempfile() stream <- FileOutputStream$create(tf3) - write_feather(tib, stream) + write_fun(tib, stream) stream$close() expect_true(file.exists(tf3)) - tab1 <- read_feather(feather_file) - expect_is(tab1, "data.frame") - + # Read both back tab2 <- read_feather(tf2) expect_is(tab2, "data.frame") @@ -58,17 +59,53 @@ test_that("feather read/write round trip", { tab5 <- read_feather(ReadableFile$create(tf3)) expect_is(tab5, "data.frame") - expect_equal(tib, tab1) expect_equal(tib, tab2) expect_equal(tib, tab3) expect_equal(tib, tab4) expect_equal(tib, tab5) +} - unlink(tf2) - unlink(tf3) +test_that("feather read/write round trip", { + expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 1)) + expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 2)) + expect_feather_roundtrip(function(x, f) write_feather(x, f, chunk_size = 32)) + if (codec_is_available("lz4")) { + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4")) + } + if (codec_is_available("zstd")) { + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd")) + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd", compression_level = 3)) + } + + # Write from Arrow data structures + expect_feather_roundtrip(function(x, f) write_feather(RecordBatch$create(x), f)) + expect_feather_roundtrip(function(x, f) write_feather(Table$create(x), f)) }) -test_that("feather handles col_select = ", { +test_that("write_feather option error handling", { + tf <- tempfile() + expect_false(file.exists(tf)) + expect_error( + write_feather(tib, tf, version = 1, chunk_size = 1024), + "Feather version 1 does not support the 'chunk_size' option" + ) + expect_error( + write_feather(tib, tf, version = 1, compression = "lz4"), + "Feather version 1 does not support the 'compression' option" + ) + expect_error( + write_feather(tib, tf, version = 1, compression_level = 1024), + "Feather version 1 does not support the 'compression_level' option" + ) + expect_error( + write_feather(tib, tf, compression_level = 1024), + "Can only specify a 'compression_level' when 'compression' is 'zstd'" + ) + expect_match_arg_error(write_feather(tib, tf, compression = "bz2")) + expect_false(file.exists(tf)) +}) + +test_that("read_feather supports col_select = ", { tab1 <- read_feather(feather_file, col_select = c("x", "y")) expect_is(tab1, "data.frame") @@ -111,4 +148,19 @@ test_that("Read feather from raw vector", { expect_is(df, "data.frame") }) +test_that("FeatherReader", { + v1 <- tempfile() + v2 <- tempfile() + on.exit({ + unlink(v1) + unlink(v2) + }) + write_feather(tib, v1, version = 1) + write_feather(tib, v2) + reader1 <- FeatherReader$create(v1) + expect_identical(reader1$version, 1L) + reader2 <- FeatherReader$create(v2) + expect_identical(reader2$version, 2L) +}) + unlink(feather_file)
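For contributors tracing what FeatherReader$create() and $Read() ultimately call into, here is a minimal standalone sketch of the unified C++ reader these tests exercise. Reader::Open returning an arrow::Result is confirmed by the feather.cpp hunk above; the Read(std::shared_ptr<Table>*) overload and ReadableFile::Open are assumptions taken from arrow/ipc/feather.h and arrow/io/file.h in the same build:

#include <arrow/io/file.h>
#include <arrow/ipc/feather.h>
#include <arrow/result.h>
#include <arrow/table.h>

// Open a Feather file (legacy V1 or V2/IPC) and materialize it as a Table.
arrow::Status ReadFeather(const std::string& path,
                          std::shared_ptr<arrow::Table>* out) {
  ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(path));
  ARROW_ASSIGN_OR_RAISE(auto reader, arrow::ipc::feather::Reader::Open(file));
  // reader->version() is what the R active binding $version surfaces,
  // distinguishing legacy V1 files from V2 (Arrow IPC) files.
  return reader->Read(out);
}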
diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb index f098ab6cfb6..6c44b99ed30 100644 --- a/ruby/red-arrow/lib/arrow/table-saver.rb +++ b/ruby/red-arrow/lib/arrow/table-saver.rb @@ -155,10 +155,14 @@ def save_as_tsv end def save_as_feather - open_output_stream do |output| - FeatherFileWriter.open(output) do |writer| - writer.write(@table) + open_raw_output_stream do |output| + properties = FeatherWriteProperties.new + properties.class.properties.each do |name| + value = @options[name.to_sym] + next if value.nil? + properties.__send__("#{name}=", value) end + @table.write_as_feather(output, properties) end end end diff --git a/ruby/red-arrow/test/test-feather.rb b/ruby/red-arrow/test/test-feather.rb index 75113e96fae..d36df5ed6fa 100644 --- a/ruby/red-arrow/test/test-feather.rb +++ b/ruby/red-arrow/test/test-feather.rb @@ -18,17 +18,32 @@ class FeatherTest < Test::Unit::TestCase include Helper::Fixture - def test_save_load + def setup columns = { "message" => Arrow::StringArray.new(["Start", "Crash", "Shutdown"]), "is_critical" => Arrow::BooleanArray.new([false, true, false]), } - table = Arrow::Table.new(columns) + @table = Arrow::Table.new(columns) - output = Tempfile.new(["red-arrow", ".feather"]) - table.save(output.path) - output.close + @output = Tempfile.new(["red-arrow", ".feather"]) + begin + yield(@output) + rescue + @output.close! + end + end + + def test_default + @table.save(@output.path) + @output.close + + assert_equal(@table, Arrow::Table.load(@output.path)) + end + + def test_compression + @table.save(@output.path, compression: :zstd) + @output.close - assert_equal(table, Arrow::Table.load(output.path)) + assert_equal(@table, Arrow::Table.load(@output.path)) end end
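One closing note on the codec guards used throughout these changes: R's codec_is_available() and the Ruby :zstd test both depend on which codecs the underlying Arrow C++ build enables (e.g. -DARROW_WITH_ZSTD=ON at CMake time). A hedged sketch of the underlying check, assuming the static Codec::IsAvailable helper declared in arrow/util/compression.h:

#include <arrow/util/compression.h>

// True only when the linked Arrow C++ library was built with zstd support;
// writing Feather with an unavailable codec fails at runtime.
bool zstd_available() {
  return arrow::util::Codec::IsAvailable(arrow::Compression::ZSTD);
}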