From 32d45bb98208adada01426e716275957beaa5de6 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 14:53:00 -0400 Subject: [PATCH 01/22] Add basic CSV Writer C++ Proxy class. --- matlab/src/cpp/arrow/matlab/error/error.h | 1 + .../cpp/arrow/matlab/io/csv/proxy/writer.cc | 89 +++++++++++++++++++ .../cpp/arrow/matlab/io/csv/proxy/writer.h | 38 ++++++++ matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 + 4 files changed, 130 insertions(+) create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 4ff77da8d83..6ab6b557f45 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -182,6 +182,7 @@ namespace arrow::matlab::error { static const char* TABLE_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:table:InvalidNumericColumnIndex"; static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite"; static const char* FAILED_TO_OPEN_FILE_FOR_READ = "arrow:io:FailedToOpenFileForRead"; + static const char* CSV_FAILED_TO_WRITE_TABLE = "arrow:io:csv:FailedToWriteTable"; static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable"; static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch"; static const char* FEATHER_FAILED_TO_CREATE_READER = "arrow:io:feather:FailedToCreateReader"; diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc new file mode 100644 index 00000000000..029ebcd5809 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/matlab/io/feather/proxy/writer.h" +#include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/error/error.h" + +#include "arrow/result.h" +#include "arrow/table.h" +#include "arrow/util/utf8.h" + +#include "arrow/io/file.h" +#include "arrow/csv/writer.h" +#include "arrow/csv/options.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::io::feather::proxy { + + Writer::Writer(const std::string& filename) : filename{filename} { + REGISTER_METHOD(Writer, getFilename); + REGISTER_METHOD(Writer, write); + } + + libmexclass::proxy::MakeResult Writer::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + mda::StructArray opts = constructor_arguments[0]; + const mda::StringArray filename_mda = opts[0]["Filename"]; + + const auto filename_utf16 = std::u16string(filename_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, + arrow::util::UTF16StringToUTF8(filename_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared<Writer>(filename_utf8); + } + + void Writer::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, + arrow::util::UTF8StringToUTF16(filename), + context, + error::UNICODE_CONVERSION_ERROR_ID); + mda::ArrayFactory factory; + auto str_mda = 
factory.createScalar(utf16_filename); + context.outputs[0] = str_mda; + } + + void Writer::write(libmexclass::proxy::method::Context& context) { + namespace csv = ::arrow::csv; + namespace mda = ::matlab::data; + mda::StructArray opts = context.inputs[0]; + const mda::TypedArray<uint64_t> record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"]; + const uint64_t record_batch_proxy_id = record_batch_proxy_id_mda[0]; + + auto proxy = libmexclass::proxy::ProxyManager::getProxy(record_batch_proxy_id); + auto record_batch_proxy = std::static_pointer_cast<arrow::matlab::tabular::proxy::RecordBatch>(proxy); + auto record_batch = record_batch_proxy->unwrap(); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table, + arrow::Table::FromRecordBatches({record_batch}), + context, + error::TABLE_FROM_RECORD_BATCH); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr<arrow::io::OutputStream> output_stream, + arrow::io::FileOutputStream::Open(filename), + context, + error::FAILED_TO_OPEN_FILE_FOR_WRITE); + write_props.version = arrow::ipc::csv::kFeatherV1Version; + csv::WriteOptions options; + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(csv::WriteCSV(*table, options, output_stream.get()), + context, + error::CSV_FAILED_TO_WRITE_TABLE); + } +} diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h new file mode 100644 index 00000000000..6a389f5f25f --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::csv::proxy { + + class Writer : public libmexclass::proxy::Proxy { + public: + Writer(const std::string& filename); + ~Writer() {} + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getFilename(libmexclass::proxy::method::Context& context); + void write(libmexclass::proxy::method::Context& context); + + private: + const std::string filename; + }; + +} diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index ebeb020a9e7..c9a97dc9c58 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -37,6 +37,7 @@ #include "arrow/matlab/type/proxy/field.h" #include "arrow/matlab/io/feather/proxy/writer.h" #include "arrow/matlab/io/feather/proxy/reader.h" +#include "arrow/matlab/io/csv/proxy/writer.h" #include "factory.h" @@ -85,6 +86,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.type.proxy.StructType , arrow::matlab::type::proxy::StructType); REGISTER_PROXY(arrow.io.feather.proxy.Writer , arrow::matlab::io::feather::proxy::Writer); REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader); + REGISTER_PROXY(arrow.io.csv.proxy.Writer , arrow::matlab::io::csv::proxy::Writer); return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name}; }; 
From 36bff748287ba455941b2a31f6d78af80d5de4d8 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 14:56:51 -0400 Subject: [PATCH 02/22] Add basic CSV Writer MATLAB class implementation. --- matlab/src/matlab/+arrow/+io/+csv/Writer.m | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 matlab/src/matlab/+arrow/+io/+csv/Writer.m diff --git a/matlab/src/matlab/+arrow/+io/+csv/Writer.m b/matlab/src/matlab/+arrow/+io/+csv/Writer.m new file mode 100644 index 00000000000..5d6816ba35c --- /dev/null +++ b/matlab/src/matlab/+arrow/+io/+csv/Writer.m @@ -0,0 +1,47 @@ +%WRITER Class for writing CSV files. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef Writer < matlab.mixin.Scalar + + properties(Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent) + Filename + end + + methods + function obj = Writer(filename) + arguments + filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + proxyName = "arrow.io.csv.proxy.Writer"; + obj.Proxy = arrow.internal.proxy.create(proxyName, args); + end + + function write(obj, recordBatch) + args = struct(RecordBatchProxyID=recordBatch.Proxy.ID); + obj.Proxy.write(args); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + end +end From d9f2e30599e9fa50ad9938d97f2bad2375b9da5f Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 14:59:00 -0400 Subject: [PATCH 03/22] Fix namespace and header include. --- matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc index 029ebcd5809..a1fdc0b063a 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/matlab/io/feather/proxy/writer.h" +#include "arrow/matlab/io/csv/proxy/writer.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/matlab/error/error.h" @@ -29,7 +29,7 @@ #include "libmexclass/proxy/ProxyManager.h" -namespace arrow::matlab::io::feather::proxy { +namespace arrow::matlab::io::csv::proxy { Writer::Writer(const std::string& filename) : filename{filename} { REGISTER_METHOD(Writer, getFilename); From ccd5ad954e3cd149b8b88ca143877407b4a597f8 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 15:09:07 -0400 Subject: [PATCH 04/22] Use WriterProxy type alias. 
--- matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc index a1fdc0b063a..1678d549cd4 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc @@ -40,13 +40,14 @@ namespace arrow::matlab::io::csv::proxy { namespace mda = ::matlab::data; mda::StructArray opts = constructor_arguments[0]; const mda::StringArray filename_mda = opts[0]["Filename"]; + using WriterProxy = ::arrow::matlab::io::csv::proxy::Writer; const auto filename_utf16 = std::u16string(filename_mda[0]); MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID); - return std::make_shared<Writer>(filename_utf8); + return std::make_shared<WriterProxy>(filename_utf8); } void Writer::getFilename(libmexclass::proxy::method::Context& context) { From 67d087fb272f82f230d636a4aad709b7da7a5b64 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 15:28:41 -0400 Subject: [PATCH 05/22] Add CSV writer source code to CMakeLists.txt. 
--- matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 40c6b5a51d4..65a0a2a4ea7 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -72,8 +72,6 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc") - - set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy/factory.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_INCLUDE_DIRS ${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_ROOT_INCLUDE_DIR} From a01ffa1b703c5f264da008f97463041e6b73ed8a Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 15:30:48 -0400 Subject: [PATCH 06/22] Delete old Feather code for Writer properties. 
--- matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc index 1678d549cd4..2d8e771c1f6 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc @@ -81,7 +81,6 @@ namespace arrow::matlab::io::csv::proxy { arrow::io::FileOutputStream::Open(filename), context, error::FAILED_TO_OPEN_FILE_FOR_WRITE); - write_props.version = arrow::ipc::csv::kFeatherV1Version; csv::WriteOptions options; MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(csv::WriteCSV(*table, options, output_stream.get()), context, From 7140b6d61d51fe39e2f54db58e24a71ee69fc91c Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 13 Sep 2023 16:47:01 -0400 Subject: [PATCH 07/22] 1. Rename `arrow.io.csv.Writer` to `arrow.io.csv.TableWriter`. 2. Change `write` method to take in an `arrow.tabular.Table`, rather than an `arrow.tabular.RecordBatch`. 3. Add basic implementation of `arrow.io.csv.TableReader` class. 
Co-authored-by: Sarah Gilmore --- matlab/src/cpp/arrow/matlab/error/error.h | 2 + .../arrow/matlab/io/csv/proxy/table_reader.cc | 93 +++++++++++++++++++ .../arrow/matlab/io/csv/proxy/table_reader.h | 38 ++++++++ .../csv/proxy/{writer.cc => table_writer.cc} | 37 ++++---- .../io/csv/proxy/{writer.h => table_writer.h} | 6 +- matlab/src/cpp/arrow/matlab/proxy/factory.cc | 6 +- .../src/matlab/+arrow/+io/+csv/TableReader.m | 51 ++++++++++ .../+io/+csv/{Writer.m => TableWriter.m} | 12 +-- .../cmake/BuildMatlabArrowInterface.cmake | 2 + 9 files changed, 216 insertions(+), 31 deletions(-) create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h rename matlab/src/cpp/arrow/matlab/io/csv/proxy/{writer.cc => table_writer.cc} (68%) rename matlab/src/cpp/arrow/matlab/io/csv/proxy/{writer.h => table_writer.h} (89%) create mode 100644 matlab/src/matlab/+arrow/+io/+csv/TableReader.m rename matlab/src/matlab/+arrow/+io/+csv/{Writer.m => TableWriter.m} (81%) diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 6ab6b557f45..ada9954353d 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -183,6 +183,8 @@ namespace arrow::matlab::error { static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite"; static const char* FAILED_TO_OPEN_FILE_FOR_READ = "arrow:io:FailedToOpenFileForRead"; static const char* CSV_FAILED_TO_WRITE_TABLE = "arrow:io:csv:FailedToWriteTable"; + static const char* CSV_FAILED_TO_CREATE_TABLE_READER = "arrow:io:csv:FailedToCreateTableReader"; + static const char* CSV_FAILED_TO_READ_TABLE = "arrow:io:csv:FailedToReadTable"; static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable"; static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch"; static const char* FEATHER_FAILED_TO_CREATE_READER = 
"arrow:io:feather:FailedToCreateReader"; diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc new file mode 100644 index 00000000000..42c2b8a47e6 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc @@ -0,0 +1,93 @@ +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "libmexclass/proxy/ProxyManager.h" + +#include "arrow/matlab/error/error.h" +#include "arrow/matlab/io/csv/proxy/table_reader.h" +#include "arrow/matlab/tabular/proxy/table.h" + +#include "arrow/util/utf8.h" + +#include "arrow/result.h" + +#include "arrow/io/file.h" +#include "arrow/io/interfaces.h" +#include "arrow/csv/reader.h" +#include "arrow/table.h" + +namespace arrow::matlab::io::csv::proxy { + + TableReader::TableReader(const std::string& filename) : filename{filename} { + REGISTER_METHOD(TableReader, read); + REGISTER_METHOD(TableReader, getFilename); + } + + libmexclass::proxy::MakeResult TableReader::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using TableReaderProxy = arrow::matlab::io::csv::proxy::TableReader; + + mda::StructArray args = constructor_arguments[0]; + const mda::StringArray filename_utf16_mda = args[0]["Filename"]; + const auto filename_utf16 = std::u16string(filename_utf16_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename, arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared<TableReaderProxy>(filename); + } + + void TableReader::read(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + namespace csv = ::arrow::csv; + using TableProxy = arrow::matlab::tabular::proxy::Table; + + mda::ArrayFactory factory; + + // Create a file input stream. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source, arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context, error::FAILED_TO_OPEN_FILE_FOR_READ); + + const ::arrow::io::IOContext io_context; + const csv::ReadOptions read_options; + const csv::ParseOptions parse_options; + const csv::ConvertOptions convert_options; + + // Create a TableReader from the file input stream. 
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto table_reader, + csv::TableReader::Make(io_context, source, read_options, parse_options, convert_options), + context, + error::CSV_FAILED_TO_CREATE_TABLE_READER); + + // Read a Table from the file. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table, table_reader->Read(), context, error::CSV_FAILED_TO_READ_TABLE); + + auto table_proxy = std::make_shared<TableProxy>(table); + const auto table_proxy_id = ProxyManager::manageProxy(table_proxy); + + const auto table_proxy_id_mda = factory.createScalar(table_proxy_id); + + context.outputs[0] = table_proxy_id_mda; + } + + void TableReader::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto filename_utf16, arrow::util::UTF8StringToUTF16(filename), context, error::UNICODE_CONVERSION_ERROR_ID); + auto filename_utf16_mda = factory.createScalar(filename_utf16); + context.outputs[0] = filename_utf16_mda; + } + +} diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h new file mode 100644 index 00000000000..d5dfce50e40 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::csv::proxy { + + class TableReader : public libmexclass::proxy::Proxy { + public: + TableReader(const std::string& filename); + ~TableReader() {} + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void read(libmexclass::proxy::method::Context& context); + void getFilename(libmexclass::proxy::method::Context& context); + + private: + const std::string filename; + }; + +} diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc similarity index 68% rename from matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc rename to matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc index 2d8e771c1f6..df48f1478da 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/matlab/io/csv/proxy/writer.h" -#include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/io/csv/proxy/table_writer.h" +#include "arrow/matlab/tabular/proxy/table.h" #include "arrow/matlab/error/error.h" #include "arrow/result.h" @@ -31,26 +31,26 @@ namespace arrow::matlab::io::csv::proxy { - Writer::Writer(const std::string& filename) : filename{filename} { - REGISTER_METHOD(Writer, getFilename); - REGISTER_METHOD(Writer, write); + TableWriter::TableWriter(const std::string& filename) : filename{filename} { + REGISTER_METHOD(TableWriter, getFilename); + REGISTER_METHOD(TableWriter, write); } - libmexclass::proxy::MakeResult Writer::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + libmexclass::proxy::MakeResult TableWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { namespace mda = ::matlab::data; mda::StructArray opts = constructor_arguments[0]; const mda::StringArray filename_mda = opts[0]["Filename"]; - using WriterProxy = ::arrow::matlab::io::csv::proxy::Writer; + using TableWriterProxy = ::arrow::matlab::io::csv::proxy::TableWriter; const auto filename_utf16 = std::u16string(filename_mda[0]); MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID); - return std::make_shared<WriterProxy>(filename_utf8); + return std::make_shared<TableWriterProxy>(filename_utf8); } - void Writer::getFilename(libmexclass::proxy::method::Context& context) { + void TableWriter::getFilename(libmexclass::proxy::method::Context& context) { namespace mda = ::matlab::data; MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, arrow::util::UTF8StringToUTF16(filename), @@ -61,21 +61,18 @@ namespace arrow::matlab::io::csv::proxy { context.outputs[0] = str_mda; } - void Writer::write(libmexclass::proxy::method::Context& context) { + void TableWriter::write(libmexclass::proxy::method::Context& context) { namespace csv = 
::arrow::csv; namespace 
mda = ::matlab::data; - mda::StructArray opts = context.inputs[0]; - const mda::TypedArray<uint64_t> record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"]; - const uint64_t record_batch_proxy_id = record_batch_proxy_id_mda[0]; + using TableProxy = ::arrow::matlab::tabular::proxy::Table; - auto proxy = libmexclass::proxy::ProxyManager::getProxy(record_batch_proxy_id); - auto record_batch_proxy = std::static_pointer_cast<arrow::matlab::tabular::proxy::RecordBatch>(proxy); - auto record_batch = record_batch_proxy->unwrap(); + mda::StructArray opts = context.inputs[0]; + const mda::TypedArray<uint64_t> table_proxy_id_mda = opts[0]["TableProxyID"]; + const uint64_t table_proxy_id = table_proxy_id_mda[0]; - MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table, - arrow::Table::FromRecordBatches({record_batch}), - context, - error::TABLE_FROM_RECORD_BATCH); + auto proxy = libmexclass::proxy::ProxyManager::getProxy(table_proxy_id); + auto table_proxy = std::static_pointer_cast<TableProxy>(proxy); + auto table = table_proxy->unwrap(); MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr<arrow::io::OutputStream> output_stream, arrow::io::FileOutputStream::Open(filename), diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h similarity index 89% rename from matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h rename to matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h index 6a389f5f25f..b9916bd9bdc 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/writer.h +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h @@ -21,10 +21,10 @@ namespace arrow::matlab::io::csv::proxy { - class Writer : public libmexclass::proxy::Proxy { + class TableWriter : public libmexclass::proxy::Proxy { public: - Writer(const std::string& filename); - ~Writer() {} + TableWriter(const std::string& filename); + ~TableWriter() {} static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& 
constructor_arguments); protected: diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc 
b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index c9a97dc9c58..d1f46c7e2f7 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -37,7 +37,8 @@ #include "arrow/matlab/type/proxy/field.h" #include "arrow/matlab/io/feather/proxy/writer.h" #include "arrow/matlab/io/feather/proxy/reader.h" -#include "arrow/matlab/io/csv/proxy/writer.h" +#include "arrow/matlab/io/csv/proxy/table_writer.h" +#include "arrow/matlab/io/csv/proxy/table_reader.h" #include "factory.h" @@ -86,7 +87,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.type.proxy.StructType , arrow::matlab::type::proxy::StructType); REGISTER_PROXY(arrow.io.feather.proxy.Writer , arrow::matlab::io::feather::proxy::Writer); REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader); - REGISTER_PROXY(arrow.io.csv.proxy.Writer , arrow::matlab::io::csv::proxy::Writer); + REGISTER_PROXY(arrow.io.csv.proxy.TableWriter , arrow::matlab::io::csv::proxy::TableWriter); + REGISTER_PROXY(arrow.io.csv.proxy.TableReader , arrow::matlab::io::csv::proxy::TableReader); return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name}; }; diff --git a/matlab/src/matlab/+arrow/+io/+csv/TableReader.m b/matlab/src/matlab/+arrow/+io/+csv/TableReader.m new file mode 100644 index 00000000000..6e9d925b6bc --- /dev/null +++ b/matlab/src/matlab/+arrow/+io/+csv/TableReader.m @@ -0,0 +1,51 @@ +%TABLEREADER Reads tabular data from a CSV file into an arrow.tabular.Table. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef TableReader + + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + properties (Dependent, SetAccess=private, GetAccess=public) + Filename + end + + methods + + function obj = TableReader(filename) + arguments + filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + obj.Proxy = arrow.internal.proxy.create("arrow.io.csv.proxy.TableReader", args); + end + + function table = read(obj) + tableProxyID = obj.Proxy.read(); + proxy = libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.Table", ID=tableProxyID); + table = arrow.tabular.Table(proxy); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+io/+csv/Writer.m b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m similarity index 81% rename from matlab/src/matlab/+arrow/+io/+csv/Writer.m rename to matlab/src/matlab/+arrow/+io/+csv/TableWriter.m index 5d6816ba35c..351426b1d15 100644 --- a/matlab/src/matlab/+arrow/+io/+csv/Writer.m +++ b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m @@ -1,4 +1,4 @@ -%WRITER Class for writing CSV files. +%TABLEWRITER Writes tabular data in an arrow.tabular.Table to a CSV file. % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with @@ -14,7 +14,7 @@ % WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or % implied. 
See the License for the specific language governing % permissions and limitations under the License. -classdef Writer < matlab.mixin.Scalar +classdef TableWriter < matlab.mixin.Scalar properties(Hidden, SetAccess=private, GetAccess=public) Proxy @@ -25,18 +25,18 @@ end methods - function obj = Writer(filename) + function obj = TableWriter(filename) arguments filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} end args = struct(Filename=filename); - proxyName = "arrow.io.csv.proxy.Writer"; + proxyName = "arrow.io.csv.proxy.TableWriter"; obj.Proxy = arrow.internal.proxy.create(proxyName, args); end - function write(obj, recordBatch) - args = struct(RecordBatchProxyID=recordBatch.Proxy.ID); + function write(obj, table) + args = struct(TableProxyID=table.Proxy.ID); obj.Proxy.write(args); end diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 65a0a2a4ea7..294612dda37 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -70,6 +70,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/writer.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/reader.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") From 12b4f3184f1b6994043d004a10d7e85998900b3b Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 15 Sep 2023 14:38:53 -0400 Subject: [PATCH 08/22] Add basic CSV round-trip tests. 
Co-authored-by: Sarah Gilmore --- matlab/test/arrow/io/csv/tRoundTrip.m | 86 +++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 matlab/test/arrow/io/csv/tRoundTrip.m diff --git a/matlab/test/arrow/io/csv/tRoundTrip.m b/matlab/test/arrow/io/csv/tRoundTrip.m new file mode 100644 index 00000000000..94843f6d6ea --- /dev/null +++ b/matlab/test/arrow/io/csv/tRoundTrip.m @@ -0,0 +1,86 @@ +%TROUNDTRIP Round trip tests for CSV. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tRoundTrip < matlab.unittest.TestCase + + properties + MatlabTableNumeric + MatlabTableString + ArrowTableNumeric + ArrowTableString + Filename + end + + methods (TestClassSetup) + + function initializeProperties(testCase) + % Seed the random number generator. + rng(1); + + testCase.MatlabTableNumeric = array2table(rand(10000, 5), ... + VariableNames=["😀", "🌲", "🥭", " ", "ABC"]); + testCase.ArrowTableNumeric = arrow.table(testCase.MatlabTableNumeric); + testCase.MatlabTableString = table(["A"; "B"; "C"], ... + [""; " "; " "], ... 
+ ["😀"; "🌲"; "🥭"]); + testCase.ArrowTableString = arrow.table(testCase.MatlabTableString); + end + + end + + methods (TestMethodSetup) + + function setupTempFile(testCase) + import matlab.unittest.fixtures.TemporaryFolderFixture + fixture = testCase.applyFixture(TemporaryFolderFixture); + testCase.Filename = fullfile(fixture.Folder, "temp.csv"); + end + + end + + methods(Test) + + function Numeric(testCase) + import arrow.io.csv.* + + arrowTableWrite = testCase.ArrowTableNumeric; + + writer = TableWriter(testCase.Filename); + reader = TableReader(testCase.Filename); + + writer.write(arrowTableWrite); + arrowTableRead = reader.read(); + + testCase.verifyEqual(arrowTableRead, arrowTableWrite); + end + + function String(testCase) + import arrow.io.csv.* + + arrowTableWrite = testCase.ArrowTableString; + + writer = TableWriter(testCase.Filename); + reader = TableReader(testCase.Filename); + + writer.write(arrowTableWrite); + arrowTableRead = reader.read(); + + testCase.verifyEqual(arrowTableRead, arrowTableWrite); + end + + end + +end \ No newline at end of file From 9c88a9fa6786b33db0426a462863361c0f8889a0 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 15 Sep 2023 14:45:49 -0400 Subject: [PATCH 09/22] Use default values for read, write, parse, and convert options for CSV. 
--- matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc | 6 +++--- matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc index 42c2b8a47e6..cb6a88748cd 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc @@ -60,9 +60,9 @@ namespace arrow::matlab::io::csv::proxy { MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source, arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context, error::FAILED_TO_OPEN_FILE_FOR_READ); const ::arrow::io::IOContext io_context; - const csv::ReadOptions read_options; - const csv::ParseOptions parse_options; - const csv::ConvertOptions convert_options; + const csv::ReadOptions read_options = csv::ReadOptions::Defaults(); + const csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); + const csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults(); // Create a TableReader from the file input stream. 
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto table_reader, diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc index df48f1478da..22db6118b5d 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc @@ -78,7 +78,7 @@ namespace arrow::matlab::io::csv::proxy { arrow::io::FileOutputStream::Open(filename), context, error::FAILED_TO_OPEN_FILE_FOR_WRITE); - csv::WriteOptions options; + csv::WriteOptions options = csv::WriteOptions::Defaults(); MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(csv::WriteCSV(*table, options, output_stream.get()), context, error::CSV_FAILED_TO_WRITE_TABLE); From b1a5c5ce5adf8d652fc86b5cecf57e707e99c936 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 15 Sep 2023 16:48:53 -0400 Subject: [PATCH 10/22] 1. Parameterize CSV tests. 2. Create CSV test superclass. 3. Add tError test class. Co-authored-by: Sarah Gilmore --- matlab/test/arrow/io/csv/CSVTest.m | 90 +++++++++++++++++++++++++++ matlab/test/arrow/io/csv/tError.m | 46 ++++++++++++++ matlab/test/arrow/io/csv/tRoundTrip.m | 70 +++++++++------------ 3 files changed, 167 insertions(+), 39 deletions(-) create mode 100644 matlab/test/arrow/io/csv/CSVTest.m create mode 100644 matlab/test/arrow/io/csv/tError.m diff --git a/matlab/test/arrow/io/csv/CSVTest.m b/matlab/test/arrow/io/csv/CSVTest.m new file mode 100644 index 00000000000..bfa59b0873d --- /dev/null +++ b/matlab/test/arrow/io/csv/CSVTest.m @@ -0,0 +1,90 @@ +%CSVTEST Super class for CSV related tests. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef CSVTest < matlab.unittest.TestCase + + properties + Filename + end + + methods (TestClassSetup) + + function initializeProperties(~) + % Seed the random number generator. + rng(1); + end + + end + + methods (TestMethodSetup) + + function setupTestFilename(testCase) + import matlab.unittest.fixtures.TemporaryFolderFixture + fixture = testCase.applyFixture(TemporaryFolderFixture); + testCase.Filename = fullfile(fixture.Folder, "filename.csv"); + end + + end + + methods + + function arrowTable = makeArrowTable(testCase, opts) + arguments + testCase + opts.Type + opts.ColumnNames + opts.NumRows + opts.WithNulls (1, 1) logical = false + end + + if opts.Type == "numeric" + matlabTable = array2table(rand(opts.NumRows, numel(opts.ColumnNames))); + elseif opts.Type == "string" + matlabTable = array2table("A" + rand(opts.NumRows, numel(opts.ColumnNames)) + "B"); + end + + if opts.WithNulls + matlabTable = testCase.setNullValues(matlabTable, NullPercentage=0.2); + end + + arrays = cell(1, width(matlabTable)); + for ii = 1:width(matlabTable) + arrays{ii} = arrow.array(matlabTable.(ii)); + end + arrowTable = arrow.tabular.Table.fromArrays(arrays{:}, ColumnNames=opts.ColumnNames); + end + + function tWithNulls = setNullValues(testCase, t, opts) + arguments + testCase %#ok + t table + opts.NullPercentage (1, 1) double {mustBeGreaterThanOrEqual(opts.NullPercentage, 0)} = 0.5 + end + + tWithNulls = t; + for ii = 1:width(t) + temp = tWithNulls.(ii); + numValues = numel(temp); + numNulls = uint64(opts.NullPercentage * numValues); + nullIndices = randperm(numValues, 
numNulls); + temp(nullIndices) = missing; + tWithNulls.(ii) = temp; + end + end + + end + +end diff --git a/matlab/test/arrow/io/csv/tError.m b/matlab/test/arrow/io/csv/tError.m new file mode 100644 index 00000000000..b2db0590f24 --- /dev/null +++ b/matlab/test/arrow/io/csv/tError.m @@ -0,0 +1,46 @@ +%TERROR Error tests for CSV. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef tError < CSVTest + + methods (TestMethodSetup) + + function setupTestFilename(testCase) + import matlab.unittest.fixtures.TemporaryFolderFixture + fixture = testCase.applyFixture(TemporaryFolderFixture); + testCase.Filename = fullfile(fixture.Folder, "filename.csv"); + end + + end + + methods(Test) + + function EmptyCsvFile(testCase) + import arrow.io.csv.* + + arrowTableWrite = arrow.table(); + + writer = TableWriter(testCase.Filename); + reader = TableReader(testCase.Filename); + + writer.write(arrowTableWrite); + fcn = @() reader.read(); + testCase.verifyError(fcn, "arrow:io:csv:FailedToReadTable"); + end + + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/io/csv/tRoundTrip.m b/matlab/test/arrow/io/csv/tRoundTrip.m index 94843f6d6ea..de4eb18f427 100644 --- a/matlab/test/arrow/io/csv/tRoundTrip.m +++ b/matlab/test/arrow/io/csv/tRoundTrip.m @@ -14,49 +14,36 @@ % WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or % implied. See the License for the specific language governing % permissions and limitations under the License. -classdef tRoundTrip < matlab.unittest.TestCase - - properties - MatlabTableNumeric - MatlabTableString - ArrowTableNumeric - ArrowTableString - Filename - end - - methods (TestClassSetup) - - function initializeProperties(testCase) - % Seed the random number generator. - rng(1); - - testCase.MatlabTableNumeric = array2table(rand(10000, 5), ... - VariableNames=["😀", "🌲", "🥭", " ", "ABC"]); - testCase.ArrowTableNumeric = arrow.table(testCase.MatlabTableNumeric); - testCase.MatlabTableString = table(["A"; "B"; "C"], ... - [""; " "; " "], ... 
- ["😀"; "🌲"; "🥭"]); - testCase.ArrowTableString = arrow.table(testCase.MatlabTableString); - end - - end - - methods (TestMethodSetup) - - function setupTempFile(testCase) - import matlab.unittest.fixtures.TemporaryFolderFixture - fixture = testCase.applyFixture(TemporaryFolderFixture); - testCase.Filename = fullfile(fixture.Folder, "temp.csv"); - end - +classdef tRoundTrip < CSVTest + + properties (TestParameter) + NumRows = { ... + 2, ... + 10, ... + 100 ... + } + WithNulls = { ... + true, ... + false ... + } + ColumnNames = {... + ["A", "B", "C"], ... + ["😀", "🌲", "🥭", " ", "ABC"], ... + [" ", " ", " "] + } end methods(Test) - function Numeric(testCase) + function Numeric(testCase, NumRows, WithNulls, ColumnNames) import arrow.io.csv.* - arrowTableWrite = testCase.ArrowTableNumeric; + arrowTableWrite = testCase.makeArrowTable(... + Type="numeric", ... + NumRows=NumRows, ... + WithNulls=WithNulls, ... + ColumnNames=ColumnNames ... + ); writer = TableWriter(testCase.Filename); reader = TableReader(testCase.Filename); @@ -67,10 +54,15 @@ function Numeric(testCase) testCase.verifyEqual(arrowTableRead, arrowTableWrite); end - function String(testCase) + function String(testCase, NumRows, ColumnNames) import arrow.io.csv.* - arrowTableWrite = testCase.ArrowTableString; + arrowTableWrite = testCase.makeArrowTable(... + Type="string", ... + NumRows=NumRows, ... + WithNulls=false, ... + ColumnNames=ColumnNames ... + ); writer = TableWriter(testCase.Filename); reader = TableReader(testCase.Filename); From 58a5ea3184393efcd01ccf76963c67a12505f139 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 15 Sep 2023 16:53:53 -0400 Subject: [PATCH 11/22] Add verifyRoundTrip method to CSVTest class. 
--- matlab/test/arrow/io/csv/CSVTest.m | 12 ++++++++++++ matlab/test/arrow/io/csv/tRoundTrip.m | 24 ++++-------------------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/matlab/test/arrow/io/csv/CSVTest.m b/matlab/test/arrow/io/csv/CSVTest.m index bfa59b0873d..49f77eaaa7c 100644 --- a/matlab/test/arrow/io/csv/CSVTest.m +++ b/matlab/test/arrow/io/csv/CSVTest.m @@ -41,6 +41,18 @@ function setupTestFilename(testCase) methods + function verifyRoundTrip(testCase, arrowTable) + import arrow.io.csv.* + + writer = TableWriter(testCase.Filename); + reader = TableReader(testCase.Filename); + + writer.write(arrowTable); + arrowTableRead = reader.read(); + + testCase.verifyEqual(arrowTableRead, arrowTable); + end + function arrowTable = makeArrowTable(testCase, opts) arguments testCase diff --git a/matlab/test/arrow/io/csv/tRoundTrip.m b/matlab/test/arrow/io/csv/tRoundTrip.m index de4eb18f427..cb358225801 100644 --- a/matlab/test/arrow/io/csv/tRoundTrip.m +++ b/matlab/test/arrow/io/csv/tRoundTrip.m @@ -36,41 +36,25 @@ methods(Test) function Numeric(testCase, NumRows, WithNulls, ColumnNames) - import arrow.io.csv.* - - arrowTableWrite = testCase.makeArrowTable(... + arrowTable = testCase.makeArrowTable(... Type="numeric", ... NumRows=NumRows, ... WithNulls=WithNulls, ... ColumnNames=ColumnNames ... ); - writer = TableWriter(testCase.Filename); - reader = TableReader(testCase.Filename); - - writer.write(arrowTableWrite); - arrowTableRead = reader.read(); - - testCase.verifyEqual(arrowTableRead, arrowTableWrite); + testCase.verifyRoundTrip(arrowTable); end function String(testCase, NumRows, ColumnNames) - import arrow.io.csv.* - - arrowTableWrite = testCase.makeArrowTable(... + arrowTable = testCase.makeArrowTable(... Type="string", ... NumRows=NumRows, ... WithNulls=false, ... ColumnNames=ColumnNames ... 
); - writer = TableWriter(testCase.Filename); - reader = TableReader(testCase.Filename); - - writer.write(arrowTableWrite); - arrowTableRead = reader.read(); - - testCase.verifyEqual(arrowTableRead, arrowTableWrite); + testCase.verifyRoundTrip(arrowTable); end end From 934e9ad9ef7e4785f0b10893a6a4fa238305a632 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 15 Sep 2023 16:55:27 -0400 Subject: [PATCH 12/22] Remove unused TestMethodSetup block in tError. --- matlab/test/arrow/io/csv/tError.m | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/matlab/test/arrow/io/csv/tError.m b/matlab/test/arrow/io/csv/tError.m index b2db0590f24..c904393a166 100644 --- a/matlab/test/arrow/io/csv/tError.m +++ b/matlab/test/arrow/io/csv/tError.m @@ -16,16 +16,6 @@ % permissions and limitations under the License. classdef tError < CSVTest - methods (TestMethodSetup) - - function setupTestFilename(testCase) - import matlab.unittest.fixtures.TemporaryFolderFixture - fixture = testCase.applyFixture(TemporaryFolderFixture); - testCase.Filename = fullfile(fixture.Folder, "filename.csv"); - end - - end - methods(Test) function EmptyCsvFile(testCase) From 356e2ea8e7e45ccb403eb3c074c3bbb24abd96a9 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 18 Sep 2023 11:24:58 -0400 Subject: [PATCH 13/22] 1. Update arguments block type properties for TableReader and TableWriter filename argument. 2. Add basic error tests for TableReader and TableWriter. 
Co-authored-by: Sarah Gilmore --- .../src/matlab/+arrow/+io/+csv/TableReader.m | 2 +- .../src/matlab/+arrow/+io/+csv/TableWriter.m | 2 +- matlab/test/arrow/io/csv/tError.m | 18 +++++++++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/matlab/src/matlab/+arrow/+io/+csv/TableReader.m b/matlab/src/matlab/+arrow/+io/+csv/TableReader.m index 6e9d925b6bc..1e0308bb8d4 100644 --- a/matlab/src/matlab/+arrow/+io/+csv/TableReader.m +++ b/matlab/src/matlab/+arrow/+io/+csv/TableReader.m @@ -29,7 +29,7 @@ function obj = TableReader(filename) arguments - filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + filename (1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText} end args = struct(Filename=filename); diff --git a/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m index 351426b1d15..79e3c3efe2f 100644 --- a/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m +++ b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m @@ -27,7 +27,7 @@ methods function obj = TableWriter(filename) arguments - filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + filename (1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText} end args = struct(Filename=filename); diff --git a/matlab/test/arrow/io/csv/tError.m b/matlab/test/arrow/io/csv/tError.m index c904393a166..5961616a74d 100644 --- a/matlab/test/arrow/io/csv/tError.m +++ b/matlab/test/arrow/io/csv/tError.m @@ -18,7 +18,7 @@ methods(Test) - function EmptyCsvFile(testCase) + function EmptyFile(testCase) import arrow.io.csv.* arrowTableWrite = arrow.table(); @@ -31,6 +31,22 @@ function EmptyCsvFile(testCase) testCase.verifyError(fcn, "arrow:io:csv:FailedToReadTable"); end + function InvalidWriterFilenameType(testCase) + import arrow.io.csv.* + fcn = @() TableWriter(table); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + fcn = @() TableWriter(["a", "b"]); + testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); + end + + function 
InvalidReaderFilenameType(testCase) + import arrow.io.csv.* + fcn = @() TableReader(table); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + fcn = @() TableReader(["a", "b"]); + testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); + end + end end \ No newline at end of file From 54179e295f56f5425ef4c4c7ff5395861e49f7f4 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 18 Sep 2023 11:41:25 -0400 Subject: [PATCH 14/22] 1. Set access for Filename property of TableWriter. 2. Add more error tests. Co-authored-by: Sarah Gilmore --- .../src/matlab/+arrow/+io/+csv/TableWriter.m | 6 +++++- matlab/test/arrow/io/csv/tError.m | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m index 79e3c3efe2f..eb1aafe08f5 100644 --- a/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m +++ b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m @@ -20,7 +20,7 @@ Proxy end - properties(Dependent) + properties(Dependent, SetAccess=private, GetAccess=public) Filename end @@ -36,6 +36,10 @@ end function write(obj, table) + arguments + obj (1, 1) arrow.io.csv.TableWriter + table (1, 1) arrow.tabular.Table + end args = struct(TableProxyID=table.Proxy.ID); obj.Proxy.write(args); end diff --git a/matlab/test/arrow/io/csv/tError.m b/matlab/test/arrow/io/csv/tError.m index 5961616a74d..24c420e7ba2 100644 --- a/matlab/test/arrow/io/csv/tError.m +++ b/matlab/test/arrow/io/csv/tError.m @@ -47,6 +47,27 @@ function InvalidReaderFilenameType(testCase) testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end + function InvalidWriterWriteType(testCase) + import arrow.io.csv.* + writer = TableWriter(testCase.Filename); + fcn = @() writer.write("text"); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + end + + function WriterFilenameNoSetter(testCase) + import arrow.io.csv.* + writer = TableWriter(testCase.Filename); + fcn = 
@() setfield(writer, "Filename", "filename.csv"); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + + function ReaderFilenameNoSetter(testCase) + import arrow.io.csv.* + reader = TableReader(testCase.Filename); + fcn = @() setfield(reader, "Filename", "filename.csv"); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + end end \ No newline at end of file From 6a5126cc4a9de90591f1b5c6f8661bf45e2b19d3 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 18 Sep 2023 12:17:02 -0400 Subject: [PATCH 15/22] Enable Arrow CSV functionality with `-D ARROW_CSV=ON` in MATLAB CI workflow script. Co-authored-by: Sarah Gilmore --- ci/scripts/matlab_build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/matlab_build.sh b/ci/scripts/matlab_build.sh index d3f86adbb8a..ffa373d8a3a 100755 --- a/ci/scripts/matlab_build.sh +++ b/ci/scripts/matlab_build.sh @@ -30,5 +30,6 @@ cmake \ -B ${build_dir} \ -G Ninja \ -D CMAKE_INSTALL_PREFIX=${install_dir} \ - -D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF + -D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF \ + -D ARROW_CSV=ON cmake --build ${build_dir} --config Release --target install From 35aefb4c4405328ba36ae224b57dca90abd86d20 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 18 Sep 2023 13:08:23 -0400 Subject: [PATCH 16/22] Enable ARROW_CSV in ExternalProject_Add call. 
--- ci/scripts/matlab_build.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/scripts/matlab_build.sh b/ci/scripts/matlab_build.sh index ffa373d8a3a..d3f86adbb8a 100755 --- a/ci/scripts/matlab_build.sh +++ b/ci/scripts/matlab_build.sh @@ -30,6 +30,5 @@ cmake \ -B ${build_dir} \ -G Ninja \ -D CMAKE_INSTALL_PREFIX=${install_dir} \ - -D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF \ - -D ARROW_CSV=ON + -D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF cmake --build ${build_dir} --config Release --target install From b3fb5dc5e90a964a5a89a6ad3b08ead6a466876b Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Tue, 19 Sep 2023 16:13:01 -0400 Subject: [PATCH 17/22] Enable ARROW_CSV component in ExternalProject_Add call. --- matlab/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index c8100a389ac..6af71531e58 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -35,7 +35,9 @@ function(build_arrow) set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-prefix") set(ARROW_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-build") set(ARROW_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}" - "-DCMAKE_INSTALL_LIBDIR=lib" "-DARROW_BUILD_STATIC=OFF") + "-DCMAKE_INSTALL_LIBDIR=lib" + "-DARROW_BUILD_STATIC=OFF" + "-DARROW_CSV=ON") add_library(arrow_shared SHARED IMPORTED) set(ARROW_LIBRARY_TARGET arrow_shared) From a843a8d4cc2ca961e8e27e4b5da0f6ed269e6c66 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Tue, 19 Sep 2023 16:14:13 -0400 Subject: [PATCH 18/22] Fix CMake linting errors. 
--- matlab/CMakeLists.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 6af71531e58..b7af37a2785 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -34,10 +34,9 @@ function(build_arrow) set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-prefix") set(ARROW_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-build") - set(ARROW_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}" - "-DCMAKE_INSTALL_LIBDIR=lib" - "-DARROW_BUILD_STATIC=OFF" - "-DARROW_CSV=ON") + set(ARROW_CMAKE_ARGS + "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}" "-DCMAKE_INSTALL_LIBDIR=lib" + "-DARROW_BUILD_STATIC=OFF" "-DARROW_CSV=ON") add_library(arrow_shared SHARED IMPORTED) set(ARROW_LIBRARY_TARGET arrow_shared) From 67d9ff9d5b875479df3ecc53098b789f8c53c856 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 20 Sep 2023 09:09:26 -0400 Subject: [PATCH 19/22] Use `auto` for declaring options in `TableReader::read`. --- matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc index cb6a88748cd..ab9935ce145 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc @@ -60,9 +60,9 @@ namespace arrow::matlab::io::csv::proxy { MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source, arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context, error::FAILED_TO_OPEN_FILE_FOR_READ); const ::arrow::io::IOContext io_context; - const csv::ReadOptions read_options = csv::ReadOptions::Defaults(); - const csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); - const csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults(); + const auto read_options = csv::ReadOptions::Defaults(); + const auto parse_options = 
csv::ParseOptions::Defaults(); + const auto convert_options = csv::ConvertOptions::Defaults(); // Create a TableReader from the file input stream. MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto table_reader, From 73deaa66f8a59352953525be942cff0b63987625 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 20 Sep 2023 09:10:47 -0400 Subject: [PATCH 20/22] Use `auto` keyword when initializing output stream in `TableWriter`. --- matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc index 22db6118b5d..078cf1d501d 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc @@ -74,7 +74,7 @@ namespace arrow::matlab::io::csv::proxy { auto table_proxy = std::static_pointer_cast(proxy); auto table = table_proxy->unwrap(); - MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr output_stream, + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto output_stream, arrow::io::FileOutputStream::Open(filename), context, error::FAILED_TO_OPEN_FILE_FOR_WRITE); From ecf2bee884a80a3d3addfdb843439816f52ea4fc Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 20 Sep 2023 09:12:08 -0400 Subject: [PATCH 21/22] Use `auto` keyword when declaring `WriteOptions`. 
--- matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc index 078cf1d501d..25f62358ff2 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc @@ -78,7 +78,7 @@ namespace arrow::matlab::io::csv::proxy { arrow::io::FileOutputStream::Open(filename), context, error::FAILED_TO_OPEN_FILE_FOR_WRITE); - csv::WriteOptions options = csv::WriteOptions::Defaults(); + const auto options = csv::WriteOptions::Defaults(); MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(csv::WriteCSV(*table, options, output_stream.get()), context, error::CSV_FAILED_TO_WRITE_TABLE); From aea2f3974f9eef65f445576e9c01275b186eda03 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 20 Sep 2023 11:10:34 -0400 Subject: [PATCH 22/22] Mark `output_stream` as `const`. --- matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc index 25f62358ff2..b24bd81b066 100644 --- a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc @@ -74,7 +74,7 @@ namespace arrow::matlab::io::csv::proxy { auto table_proxy = std::static_pointer_cast(proxy); auto table = table_proxy->unwrap(); - MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto output_stream, + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto output_stream, arrow::io::FileOutputStream::Open(filename), context, error::FAILED_TO_OPEN_FILE_FOR_WRITE);