diff --git a/.github/workflows/buildbot.yml b/.github/workflows/buildbot.yml new file mode 100644 index 00000000000..3bd45d477c5 --- /dev/null +++ b/.github/workflows/buildbot.yml @@ -0,0 +1,54 @@ +name: Buildbot + +on: [push] + +jobs: + build: + name: Build + runs-on: ubuntu-latest + + strategy: + matrix: + builder: + - "AMD64 Debian 9 Rust 1.35" + - "AMD64 Debian 9 Go 1.11.11" + - "AMD64 Debian 9 Go 1.12.6" + - "AMD64 Conda C++" + - "AMD64 Conda Python 2.7" + - "AMD64 Conda Python 3.6" + - "AMD64 Conda Python 3.7" + - "AMD64 Conda R" + - "AMD64 Debian 9 NodeJS 11" + - "AMD64 Java OpenJDK 11" + - "AMD64 Java OpenJDK 8" + - "AMD64 Ubuntu 18.04 C GLib" + - "AMD64 Ubuntu 18.04 C++" + - "AMD64 Ubuntu 18.04 Python 3" + - "AMD64 Ubuntu 18.04 R" + + steps: + + - name: Checkout Ursabot + uses: actions/checkout@v1 + with: + repository: ursa-labs/ursabot + ref: refs/heads/master + path: ursabot + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: '3.7' + - name: Install Ursabot + run: pip install -e ../ursabot + - name: Check Ursabot Command + run: ursabot --help + + - name: Run ${{ matrix.builder }} Builder + run: | + cd ../ursabot/projects/arrow + ursabot project build \ + --repo https://github.com/$GITHUB_REPOSITORY \ + --branch $GITHUB_REF \ + --commit $GITHUB_SHA \ + "${{ matrix.builder }}" + diff --git a/LICENSE.txt b/LICENSE.txt index cb359c05720..af3e97fc2b9 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1841,8 +1841,8 @@ This project includes code from the autobrew project. * r/tools/autobrew and dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb are based on code from the autobrew project. -Copyright: Copyright (c) 2017 - 2019, Jeroen Ooms. -All rights reserved. +Copyright (c) 2019, Jeroen Ooms +License: MIT Homepage: https://github.com/jeroen/autobrew -------------------------------------------------------------------------------- @@ -1874,3 +1874,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +cpp/src/arrow/vendored/base64.cpp has the following license + +ZLIB License + +Copyright (C) 2004-2017 René Nyffenegger + +This source code is provided 'as-is', without any express or implied +warranty. In no event will the author be held liable for any damages arising +from the use of this software. + +Permission is granted to anyone to use this software for any purpose, including +commercial applications, and to alter it and redistribute it freely, subject to +the following restrictions: + +1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + +3. This notice may not be removed or altered from any source distribution. 
+ +René Nyffenegger rene.nyffenegger@adp-gmbh.ch diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 202c24f0a58..6b095b878e5 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -226,7 +226,7 @@ if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ]; then doxygen popd cd ../docs - sphinx-build -q -b html -d _build/doctrees -W source _build/html + sphinx-build -q -b html -d _build/doctrees -W --keep-going source _build/html fi popd # $ARROW_PYTHON_DIR diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index 38ce17fb810..60551910b96 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2074,7 +2074,9 @@ INCLUDE_FILE_PATTERNS = PREDEFINED = __attribute__(x)= \ __declspec(x)= \ + PARQUET_EXPORT= \ ARROW_EXPORT= \ + ARROW_FLIGHT_EXPORT= \ ARROW_EXTERN_TEMPLATE= \ ARROW_DEPRECATED(x)= diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py index 1d1592d233e..afd933140ac 100755 --- a/cpp/build-support/run_clang_format.py +++ b/cpp/build-support/run_clang_format.py @@ -28,12 +28,10 @@ # examine the output of clang-format and if changes are # present assemble a (unified)patch of the difference -def _check_one_file(completed_processes, filename): +def _check_one_file(filename, formatted): with open(filename, "rb") as reader: original = reader.read() - returncode, stdout, stderr = completed_processes[filename] - formatted = stdout if formatted != original: # Run the equivalent of diff -u diff = list(difflib.unified_diff( @@ -106,20 +104,21 @@ def _check_one_file(completed_processes, filename): [arguments.clang_format_binary, filename] for filename in formatted_filenames ], stdout=PIPE, stderr=PIPE) - for returncode, stdout, stderr in results: + + checker_args = [] + for filename, res in zip(formatted_filenames, results): # if any clang-format reported a parse error, bubble it + returncode, stdout, stderr = res if returncode != 0: + print(stderr) sys.exit(returncode) + checker_args.append((filename, stdout)) error = False - checker = partial(_check_one_file, { - filename: result - for filename, result in zip(formatted_filenames, results) - }) pool = mp.Pool() try: # check the output from each invocation of clang-format in parallel - for filename, diff in pool.imap(checker, formatted_filenames): + for filename, diff in pool.starmap(_check_one_file, checker_args): if not arguments.quiet: print("Checking {}".format(filename)) if diff: diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index bb0a365e68f..91ff4436998 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -117,6 +117,7 @@ set(ARROW_SRCS filesystem/filesystem.cc filesystem/localfs.cc filesystem/mockfs.cc + filesystem/path_tree.cc filesystem/path_util.cc filesystem/util_internal.cc io/buffered.cc @@ -127,6 +128,7 @@ set(ARROW_SRCS io/interfaces.cc io/memory.cc io/readahead.cc + io/slow.cc testing/util.cc util/basic_decimal.cc util/bit_util.cc @@ -144,6 +146,7 @@ set(ARROW_SRCS util/thread_pool.cc util/trie.cc util/utf8.cc + vendored/base64.cpp vendored/datetime/tz.cpp) # Add dependencies for third-party allocators. 
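The Doxyfile hunk above predefines PARQUET_EXPORT and ARROW_FLIGHT_EXPORT to empty so that Doxygen, which does not expand compiler-specific attributes, can parse the exported declarations in those headers. A representative (not verbatim) header excerpt showing what the macro normally hides:

```cpp
// In real builds the export macro expands to a visibility attribute:
#if defined(_WIN32)
#define ARROW_FLIGHT_EXPORT __declspec(dllexport)
#else
#define ARROW_FLIGHT_EXPORT __attribute__((visibility("default")))
#endif

// Doxygen sees "class ARROW_FLIGHT_EXPORT FlightClient" and, without a
// PREDEFINED entry mapping the macro to nothing, may treat the macro as part
// of the declaration and mis-document the class.
class ARROW_FLIGHT_EXPORT FlightClient {
 public:
  FlightClient() = default;
};
```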
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 9ad29790377..66ab766ed7c 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -1234,6 +1234,7 @@ struct ValidateVisitor { } Status Visit(const StructArray& array) { + const auto& struct_type = checked_cast(*array.type()); if (array.num_fields() > 0) { // Validate fields int64_t array_length = array.field(0)->length(); @@ -1245,10 +1246,17 @@ struct ValidateVisitor { it->type()->ToString(), " at position [", idx, "]"); } + auto it_type = struct_type.child(i)->type(); + if (!it->type()->Equals(it_type)) { + return Status::Invalid("Child array at position [", idx, + "] does not match type field: ", it->type()->ToString(), + " vs ", it_type->ToString()); + } + const Status child_valid = it->Validate(); if (!child_valid.ok()) { return Status::Invalid("Child array invalid: ", child_valid.ToString(), - " at position [", idx, "}"); + " at position [", idx, "]"); } ++idx; } diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 4daaa9f1887..48af5a6b8ea 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -32,82 +32,91 @@ class DataType; namespace csv { +// Silly workaround for https://github.com/michaeljones/breathe/issues/453 +constexpr char kDefaultEscapeChar = '\\'; + struct ARROW_EXPORT ParseOptions { // Parsing options - // Field delimiter + /// Field delimiter char delimiter = ','; - // Whether quoting is used + /// Whether quoting is used bool quoting = true; - // Quoting character (if `quoting` is true) + /// Quoting character (if `quoting` is true) char quote_char = '"'; - // Whether a quote inside a value is double-quoted + /// Whether a quote inside a value is double-quoted bool double_quote = true; - // Whether escaping is used + /// Whether escaping is used bool escaping = false; - // Escaping character (if `escaping` is true) - char escape_char = '\\'; - // Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters + /// Escaping character (if `escaping` is true) + char escape_char = kDefaultEscapeChar; + /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters bool newlines_in_values = false; - // Whether empty lines are ignored. If false, an empty line represents - // a single empty value (assuming a one-column CSV file). + /// Whether empty lines are ignored. If false, an empty line represents + /// a single empty value (assuming a one-column CSV file). bool ignore_empty_lines = true; + /// Create parsing options with default values static ParseOptions Defaults(); }; struct ARROW_EXPORT ConvertOptions { // Conversion options - // Whether to check UTF8 validity of string columns + /// Whether to check UTF8 validity of string columns bool check_utf8 = true; - // Optional per-column types (disabling type inference on those columns) + /// Optional per-column types (disabling type inference on those columns) std::unordered_map> column_types; - // Recognized spellings for null values + /// Recognized spellings for null values std::vector null_values; - // Recognized spellings for boolean values + /// Recognized spellings for boolean true values std::vector true_values; + /// Recognized spellings for boolean false values std::vector false_values; - // Whether string / binary columns can have null values. - // If true, then strings in "null_values" are considered null for string columns. - // If false, then all strings are valid string values. + /// Whether string / binary columns can have null values. 
+ /// + /// If true, then strings in "null_values" are considered null for string columns. + /// If false, then all strings are valid string values. bool strings_can_be_null = false; // XXX Should we have a separate FilterOptions? - // If non-empty, indicates the names of columns from the CSV file that should - // be actually read and converted (in the vector's order). - // Columns not in this vector will be ignored. + /// If non-empty, indicates the names of columns from the CSV file that should + /// be actually read and converted (in the vector's order). + /// Columns not in this vector will be ignored. std::vector include_columns; - // If false, columns in `include_columns` but not in the CSV file will error out. - // If true, columns in `include_columns` but not in the CSV file will produce - // a column of nulls (whose type is selected using `column_types`, - // or null by default) - // This option is ignored if `include_columns` is empty. + /// If false, columns in `include_columns` but not in the CSV file will error out. + /// If true, columns in `include_columns` but not in the CSV file will produce + /// a column of nulls (whose type is selected using `column_types`, + /// or null by default) + /// This option is ignored if `include_columns` is empty. bool include_missing_columns = false; + /// Create conversion options with default values, including conventional + /// values for `null_values`, `true_values` and `false_values` static ConvertOptions Defaults(); }; struct ARROW_EXPORT ReadOptions { // Reader options - // Whether to use the global CPU thread pool + /// Whether to use the global CPU thread pool bool use_threads = true; - // Block size we request from the IO layer; also determines the size of - // chunks when use_threads is true + /// Block size we request from the IO layer; also determines the size of + /// chunks when use_threads is true int32_t block_size = 1 << 20; // 1 MB - // Number of header rows to skip (not including the row of column names, if any) + /// Number of header rows to skip (not including the row of column names, if any) int32_t skip_rows = 0; - // Column names for the target table. - // If empty, fall back on autogenerate_column_names. + /// Column names for the target table. + /// If empty, fall back on autogenerate_column_names. std::vector column_names; - // Whether to autogenerate column names if `column_names` is empty. - // If true, column names will be of the form "f0", "f1"... - // If false, column names will be read from the first CSV row after `skip_rows`. + /// Whether to autogenerate column names if `column_names` is empty. + /// If true, column names will be of the form "f0", "f1"... + /// If false, column names will be read from the first CSV row after `skip_rows`. 
bool autogenerate_column_names = false; + /// Create read options with default values static ReadOptions Defaults(); }; diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h index 53255f96778..8645d24d083 100644 --- a/cpp/src/arrow/csv/reader.h +++ b/cpp/src/arrow/csv/reader.h @@ -35,12 +35,15 @@ class InputStream; namespace csv { +/// A class that reads an entire CSV file into a Arrow Table class ARROW_EXPORT TableReader { public: virtual ~TableReader() = default; + /// Read the entire CSV file and convert it to a Arrow Table virtual Status Read(std::shared_ptr* out) = 0; + /// Create a TableReader instance static Status Make(MemoryPool* pool, std::shared_ptr input, const ReadOptions&, const ParseOptions&, const ConvertOptions&, std::shared_ptr* out); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 0b1e09e70f2..42ed178b012 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -138,4 +138,7 @@ std::shared_ptr GetExtensionType(const std::string& type_name) { return registry->GetType(type_name); } +extern const char kExtensionTypeKeyName[] = "ARROW:extension:name"; +extern const char kExtensionMetadataKeyName[] = "ARROW:extension:metadata"; + } // namespace arrow diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h index 32ab7c6cc45..560121eaa1f 100644 --- a/cpp/src/arrow/extension_type.h +++ b/cpp/src/arrow/extension_type.h @@ -142,4 +142,7 @@ Status UnregisterExtensionType(const std::string& type_name); ARROW_EXPORT std::shared_ptr GetExtensionType(const std::string& type_name); +ARROW_EXPORT extern const char kExtensionTypeKeyName[]; +ARROW_EXPORT extern const char kExtensionMetadataKeyName[]; + } // namespace arrow diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 148fa748403..efb78055a9a 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/filesystem") add_arrow_test(filesystem_test) add_arrow_test(localfs_test) +add_arrow_test(path_tree_test) if(ARROW_S3) add_arrow_test(s3fs_test) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 4c3d75c067f..aa9677e45f2 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -19,6 +19,7 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/path_util.h" +#include "arrow/io/slow.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -70,11 +71,19 @@ std::string FileStats::base_name() const { return internal::GetAbstractPathParent(path_).second; } +std::string FileStats::dir_name() const { + return internal::GetAbstractPathParent(path_).first; +} + // Debug helper std::ostream& operator<<(std::ostream& os, const FileStats& stats) { return os << "FileStats(" << stats.type() << ", " << stats.path() << ")"; } +std::string FileStats::extension() const { + return internal::GetAbstractPathExtension(path_); +} + ////////////////////////////////////////////////////////////////////////// // FileSystem default method implementations @@ -232,8 +241,91 @@ Status SubTreeFileSystem::OpenAppendStream(const std::string& path, return base_fs_->OpenAppendStream(s, out); } -std::string FileStats::extension() const { - return internal::GetAbstractPathExtension(path_); +////////////////////////////////////////////////////////////////////////// +// SlowFileSystem implementation + 
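The csv::TableReader and the option structs documented above are combined through the TableReader::Make factory declared in csv/reader.h. A hedged usage sketch, not part of this diff; the delimiter and column names are hypothetical:

```cpp
#include <memory>
#include "arrow/csv/options.h"
#include "arrow/csv/reader.h"
#include "arrow/io/interfaces.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"

// Read a CSV input stream into a Table using non-default options.
arrow::Status ReadCsvTable(std::shared_ptr<arrow::io::InputStream> input,
                           std::shared_ptr<arrow::Table>* out) {
  auto read_opts = arrow::csv::ReadOptions::Defaults();
  auto parse_opts = arrow::csv::ParseOptions::Defaults();
  auto convert_opts = arrow::csv::ConvertOptions::Defaults();
  parse_opts.delimiter = ';';                      // non-default field delimiter
  convert_opts.include_columns = {"id", "name"};   // only read these columns
  convert_opts.include_missing_columns = true;     // absent columns become null columns

  std::shared_ptr<arrow::csv::TableReader> reader;
  ARROW_RETURN_NOT_OK(arrow::csv::TableReader::Make(arrow::default_memory_pool(), input,
                                                    read_opts, parse_opts, convert_opts,
                                                    &reader));
  return reader->Read(out);
}
```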
+SlowFileSystem::SlowFileSystem(std::shared_ptr base_fs, + std::shared_ptr latencies) + : base_fs_(base_fs), latencies_(latencies) {} + +SlowFileSystem::SlowFileSystem(std::shared_ptr base_fs, + double average_latency) + : base_fs_(base_fs), latencies_(io::LatencyGenerator::Make(average_latency)) {} + +SlowFileSystem::SlowFileSystem(std::shared_ptr base_fs, + double average_latency, int32_t seed) + : base_fs_(base_fs), latencies_(io::LatencyGenerator::Make(average_latency, seed)) {} + +Status SlowFileSystem::GetTargetStats(const std::string& path, FileStats* out) { + latencies_->Sleep(); + return base_fs_->GetTargetStats(path, out); +} + +Status SlowFileSystem::GetTargetStats(const Selector& selector, + std::vector* out) { + latencies_->Sleep(); + return base_fs_->GetTargetStats(selector, out); +} + +Status SlowFileSystem::CreateDir(const std::string& path, bool recursive) { + latencies_->Sleep(); + return base_fs_->CreateDir(path, recursive); +} + +Status SlowFileSystem::DeleteDir(const std::string& path) { + latencies_->Sleep(); + return base_fs_->DeleteDir(path); +} + +Status SlowFileSystem::DeleteDirContents(const std::string& path) { + latencies_->Sleep(); + return base_fs_->DeleteDirContents(path); +} + +Status SlowFileSystem::DeleteFile(const std::string& path) { + latencies_->Sleep(); + return base_fs_->DeleteFile(path); +} + +Status SlowFileSystem::Move(const std::string& src, const std::string& dest) { + latencies_->Sleep(); + return base_fs_->Move(src, dest); +} + +Status SlowFileSystem::CopyFile(const std::string& src, const std::string& dest) { + latencies_->Sleep(); + return base_fs_->CopyFile(src, dest); +} + +Status SlowFileSystem::OpenInputStream(const std::string& path, + std::shared_ptr* out) { + latencies_->Sleep(); + std::shared_ptr stream; + RETURN_NOT_OK(base_fs_->OpenInputStream(path, &stream)); + *out = std::make_shared(stream, latencies_); + return Status::OK(); +} + +Status SlowFileSystem::OpenInputFile(const std::string& path, + std::shared_ptr* out) { + latencies_->Sleep(); + std::shared_ptr file; + RETURN_NOT_OK(base_fs_->OpenInputFile(path, &file)); + *out = std::make_shared(file, latencies_); + return Status::OK(); +} + +Status SlowFileSystem::OpenOutputStream(const std::string& path, + std::shared_ptr* out) { + latencies_->Sleep(); + // XXX Should we have a SlowOutputStream that waits on Flush() and Close()? + return base_fs_->OpenOutputStream(path, out); +} + +Status SlowFileSystem::OpenAppendStream(const std::string& path, + std::shared_ptr* out) { + latencies_->Sleep(); + return base_fs_->OpenAppendStream(path, out); } } // namespace fs diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 6e24db03dfe..bc17346894e 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -25,7 +25,6 @@ #include #include "arrow/status.h" -#include "arrow/util/compression.h" #include "arrow/util/visibility.h" // The Windows API defines macros from *File resolving to either @@ -44,6 +43,7 @@ namespace arrow { namespace io { class InputStream; +class LatencyGenerator; class OutputStream; class RandomAccessFile; @@ -58,14 +58,16 @@ using TimePoint = /// \brief EXPERIMENTAL: FileSystem entry type enum class ARROW_EXPORT FileType : int8_t { - // Target does not exist + /// Entry does not exist NonExistent, - // Target exists but its type is unknown (could be a special file such - // as a Unix socket or character device, or Windows NUL / CON / ...) 
+ /// Entry exists but its type is unknown + /// + /// This can designate a special file such as a Unix socket or character + /// device, or Windows NUL / CON / ... Unknown, - // Target is a regular file + /// Entry is a regular file File, - // Target is a directory + /// Entry is a directory Directory }; @@ -84,29 +86,36 @@ struct ARROW_EXPORT FileStats { FileStats(const FileStats&) = default; FileStats& operator=(const FileStats&) = default; - // The file type. + /// The file type FileType type() const { return type_; } void set_type(FileType type) { type_ = type; } - // The full file path in the filesystem. + /// The full file path in the filesystem std::string path() const { return path_; } void set_path(const std::string& path) { path_ = path; } - // The file base name (component after the last directory separator). + /// The file base name (component after the last directory separator) std::string base_name() const; - // The size in bytes, if available. Only regular files are guaranteed - // to have a size. + // The directory base name (component before the file base name). + std::string dir_name() const; + + /// The size in bytes, if available + /// + /// Only regular files are guaranteed to have a size. int64_t size() const { return size_; } void set_size(int64_t size) { size_ = size; } - // The file extension + /// The file extension (excluding the dot) std::string extension() const; - // The time of last modification, if available. + /// The time of last modification, if available TimePoint mtime() const { return mtime_; } void set_mtime(TimePoint mtime) { mtime_ = mtime; } + bool IsFile() const { return type_ == FileType::File; } + bool IsDirectory() const { return type_ == FileType::Directory; } + bool operator==(const FileStats& other) const { return type() == other.type() && path() == other.path() && size() == other.size() && mtime() == other.mtime(); @@ -228,7 +237,9 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem { std::shared_ptr base_fs); ~SubTreeFileSystem() override; + /// \cond FALSE using FileSystem::GetTargetStats; + /// \endcond Status GetTargetStats(const std::string& path, FileStats* out) override; Status GetTargetStats(const Selector& select, std::vector* out) override; @@ -265,5 +276,47 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem { Status FixStats(FileStats* st) const; }; +/// \brief EXPERIMENTAL: a FileSystem implementation that delegates to another +/// implementation but inserts latencies at various points. 
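The SlowFileSystem declared just below delegates every call to a wrapped filesystem after sleeping for a randomly drawn latency. A minimal usage sketch; the wrapped LocalFileSystem and the latency value are only illustrative:

```cpp
#include <memory>
#include <string>
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/localfs.h"
#include "arrow/status.h"

// Wrap a local filesystem so each operation pays ~50 ms of simulated latency,
// useful for exercising code paths that should tolerate slow storage.
arrow::Status StatThroughSlowFs(const std::string& path, arrow::fs::FileStats* out) {
  auto local = std::make_shared<arrow::fs::LocalFileSystem>();
  auto slow = std::make_shared<arrow::fs::SlowFileSystem>(local,
                                                          /*average_latency=*/0.05);
  return slow->GetTargetStats(path, out);
}
```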
+class ARROW_EXPORT SlowFileSystem : public FileSystem { + public: + SlowFileSystem(std::shared_ptr base_fs, + std::shared_ptr latencies); + SlowFileSystem(std::shared_ptr base_fs, double average_latency); + SlowFileSystem(std::shared_ptr base_fs, double average_latency, + int32_t seed); + + using FileSystem::GetTargetStats; + Status GetTargetStats(const std::string& path, FileStats* out) override; + Status GetTargetStats(const Selector& select, std::vector* out) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path) override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Status OpenInputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenInputFile(const std::string& path, + std::shared_ptr* out) override; + + Status OpenOutputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenAppendStream(const std::string& path, + std::shared_ptr* out) override; + + protected: + std::shared_ptr base_fs_; + std::shared_ptr latencies_; +}; + } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/filesystem_test.cc b/cpp/src/arrow/filesystem/filesystem_test.cc index 47ddc16273f..c8eba30218d 100644 --- a/cpp/src/arrow/filesystem/filesystem_test.cc +++ b/cpp/src/arrow/filesystem/filesystem_test.cc @@ -612,6 +612,27 @@ TEST_F(TestSubTreeFileSystem, GetTargetStatsSelector) { ASSERT_EQ(stats.size(), 0); } +//////////////////////////////////////////////////////////////////////////// +// Generic SlowFileSystem tests + +class TestSlowFSGeneric : public ::testing::Test, public GenericFileSystemTest { + public: + void SetUp() override { + time_ = TimePoint(TimePoint::duration(42)); + fs_ = std::make_shared(time_); + slow_fs_ = std::make_shared(fs_, 0.001); + } + + protected: + std::shared_ptr GetEmptyFileSystem() override { return slow_fs_; } + + TimePoint time_; + std::shared_ptr fs_; + std::shared_ptr slow_fs_; +}; + +GENERIC_FS_TEST_FUNCTIONS(TestSlowFSGeneric); + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/localfs.h b/cpp/src/arrow/filesystem/localfs.h index 73e35206d71..57da283b036 100644 --- a/cpp/src/arrow/filesystem/localfs.h +++ b/cpp/src/arrow/filesystem/localfs.h @@ -29,14 +29,18 @@ namespace fs { /// \brief EXPERIMENTAL: a FileSystem implementation accessing files /// on the local machine. /// -/// Details such as symlinks are abstracted away (symlinks are always followed, -/// except when deleting an entry). +/// This class handles only `/`-separated paths. If desired, conversion +/// from Windows backslash-separated paths should be done by the caller. +/// Details such as symlinks are abstracted away (symlinks are always +/// followed, except when deleting an entry). 
class ARROW_EXPORT LocalFileSystem : public FileSystem { public: LocalFileSystem(); ~LocalFileSystem() override; + /// \cond FALSE using FileSystem::GetTargetStats; + /// \endcond Status GetTargetStats(const std::string& path, FileStats* out) override; Status GetTargetStats(const Selector& select, std::vector* out) override; diff --git a/cpp/src/arrow/filesystem/path_tree.cc b/cpp/src/arrow/filesystem/path_tree.cc new file mode 100644 index 00000000000..682d3ba1021 --- /dev/null +++ b/cpp/src/arrow/filesystem/path_tree.cc @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// + +#include "arrow/filesystem/path_tree.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/filesystem/path_util.h" + +namespace arrow { +namespace fs { + +using PathTreeByPathMap = std::unordered_map>; + +std::shared_ptr FindAncestor(const PathTreeByPathMap& directories, + std::string path) { + while (path != "") { + auto parent = internal::GetAbstractPathParent(path).first; + auto found = directories.find(parent); + if (found != directories.end()) { + return found->second; + } + + path = std::move(parent); + } + + return nullptr; +} + +Status PathTree::Make(std::vector stats, PathForest* out) { + PathTreeByPathMap directories; + PathForest forest; + + auto link_parent_or_insert_root = [&directories, &forest](const FileStats& s) { + if (s.path() == "") { + return; + } + + auto ancestor = FindAncestor(directories, s.path()); + auto node = std::make_shared(s); + if (ancestor) { + ancestor->AddChild(node); + } else { + forest.push_back(node); + } + + if (s.type() == FileType::Directory) { + directories[s.path()] = node; + } + }; + + // Insert nodes by ascending path length, ensuring that nodes are always + // inserted after their ancestors. Note that this strategy does not account + // for special directories like '..'. It is expected that path are absolute. 
+ auto cmp = [](const FileStats& lhs, const FileStats& rhs) { + return lhs.path().size() < rhs.path().size(); + }; + std::stable_sort(stats.begin(), stats.end(), cmp); + std::for_each(stats.cbegin(), stats.cend(), link_parent_or_insert_root); + + *out = std::move(forest); + return Status::OK(); +} + +Status PathTree::Make(std::vector stats, std::shared_ptr* out) { + PathForest forest; + RETURN_NOT_OK(Make(stats, &forest)); + + auto size = forest.size(); + if (size > 1) { + return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1."); + } else if (size == 1) { + *out = forest[0]; + } + + return Status::OK(); +} + +std::ostream& operator<<(std::ostream& os, const PathTree& tree) { + os << "PathTree(" << tree.stats(); + + const auto& subtrees = tree.subtrees(); + if (subtrees.size()) { + os << ", ["; + for (size_t i = 0; i < subtrees.size(); i++) { + if (i != 0) os << ", "; + os << *subtrees[i]; + } + os << "]"; + } + os << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const std::shared_ptr& tree) { + if (tree != nullptr) { + return os << *tree.get(); + } + + return os; +} + +bool operator==(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == NULLPTR && rhs == NULLPTR) { + return true; + } else if (lhs != NULLPTR && rhs != NULLPTR) { + return *lhs == *rhs; + } + + return false; +} + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_tree.h b/cpp/src/arrow/filesystem/path_tree.h new file mode 100644 index 00000000000..50ec1f94704 --- /dev/null +++ b/cpp/src/arrow/filesystem/path_tree.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/filesystem/filesystem.h" + +#include +#include +#include +#include +#include + +#include "arrow/status.h" + +namespace arrow { +namespace fs { + +class ARROW_EXPORT PathTree; + +/// \brief A PathForest consists of multiples PathTree +using PathForest = std::vector>; + +/// \brief A PathTree is a utility to transform a vector of FileStats into a +/// forest representation for tree traversal purposes. Node in the graph wraps +/// a FileStats. Files are expected to be found only at leaves of the tree. +class ARROW_EXPORT PathTree { + public: + explicit PathTree(FileStats stats) : stats_(stats) {} + PathTree(FileStats stats, std::vector> subtrees) + : stats_(stats), subtrees_(std::move(subtrees)) {} + + /// \brief Transforms a FileStats vector into a forest of trees. Since there + /// is no guarantee of complete trees, it is possible to have a forest + /// (multiple roots). The caller should ensure that stats have unique path. + static Status Make(std::vector stats, PathForest* out); + + /// \brief Like MakeForest but fails if there's more than one root. 
+ static Status Make(std::vector stats, std::shared_ptr* out); + + /// \brief Returns the FileStat of this node. + FileStats stats() const { return stats_; } + /// \brief Returns the subtrees under this node. + std::vector> subtrees() const { return subtrees_; } + + /// \brief Visit with eager pruning. + template + Status Visit(Visitor&& v, Matcher&& m) const { + bool match = false; + ARROW_RETURN_NOT_OK(m(stats_, &match)); + if (!match) { + return Status::OK(); + } + + ARROW_RETURN_NOT_OK(v(stats_)); + + for (const auto& t : subtrees_) { + ARROW_RETURN_NOT_OK(t->Visit(v, m)); + } + + return Status::OK(); + } + + template + Status Visit(Visitor&& v) const { + auto always_match = [](const FileStats& t, bool* match) { + *match = true; + return Status::OK(); + }; + return Visit(v, always_match); + } + + bool operator==(const PathTree& other) const { + return stats_ == other.stats_ && subtrees_ == other.subtrees_; + } + + protected: + FileStats stats_; + std::vector> subtrees_; + + // The AddChild method is convenient to create trees in a top-down fashion, + // e.g. the Make factory constructor. + void AddChild(std::shared_ptr child) { + subtrees_.push_back(std::move(child)); + } +}; + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, + const std::shared_ptr& tree); +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree); + +ARROW_EXPORT bool operator==(const std::shared_ptr& lhs, + const std::shared_ptr& rhs); + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_tree_test.cc b/cpp/src/arrow/filesystem/path_tree_test.cc new file mode 100644 index 00000000000..fb38ad45835 --- /dev/null +++ b/cpp/src/arrow/filesystem/path_tree_test.cc @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/filesystem/path_tree.h" + +#include +#include +#include + +#include +#include + +#include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/test_util.h" +#include "arrow/testing/gtest_util.h" + +using testing::ContainerEq; + +namespace arrow { +namespace fs { + +static std::shared_ptr PT(FileStats stats) { + return std::make_shared(std::move(stats)); +} + +static std::shared_ptr PT(FileStats stats, + std::vector> subtrees) { + return std::make_shared(std::move(stats), std::move(subtrees)); +} + +void AssertMakePathTree(std::vector stats, + std::vector> expected) { + std::vector> actual; + + ASSERT_OK(PathTree::Make(stats, &actual)); + EXPECT_THAT(actual, ContainerEq(expected)); +} + +TEST(TestPathTree, Basic) { + AssertMakePathTree({}, {}); + + AssertMakePathTree({File("aa")}, {PT(File("aa"))}); + AssertMakePathTree({Dir("AA")}, {PT(Dir("AA"))}); + AssertMakePathTree({Dir("AA"), File("AA/aa")}, {PT(Dir("AA"), {PT(File("AA/aa"))})}); + + // Missing parent can still find ancestor. + AssertMakePathTree({Dir("AA"), File("AA/BB/bb")}, + {PT(Dir("AA"), {PT(File("AA/BB/bb"))})}); + + // Ancestors should link to parent irregardless of the ordering + AssertMakePathTree({File("AA/aa"), Dir("AA")}, {PT(Dir("AA"), {PT(File("AA/aa"))})}); + + // Multiple roots are supported. + AssertMakePathTree({File("aa"), File("bb")}, {PT(File("aa")), PT(File("bb"))}); + AssertMakePathTree( + {File("00"), Dir("AA"), File("AA/aa"), File("BB/bb")}, + {PT(File("00")), PT(Dir("AA"), {PT(File("AA/aa"))}), PT(File("BB/bb"))}); +} + +TEST(TestPathTree, HourlyETL) { + // This test mimics a scenario where an ETL dumps hourly files in a structure + // `$year/$month/$day/$hour/*.parquet`. + + constexpr int64_t kYears = 8; + constexpr int64_t kMonthsPerYear = 12; + constexpr int64_t kDaysPerMonth = 31; + constexpr int64_t kHoursPerDay = 24; + constexpr int64_t kFilesPerHour = 4; + + // Avoid constructing strings + std::vector numbers{kDaysPerMonth + 1}; + for (size_t i = 0; i < numbers.size(); i++) { + numbers[i] = std::to_string(i); + } + + auto join = [](const std::vector& path) { + return internal::JoinAbstractPath(path); + }; + + std::vector stats; + + PathForest forest; + for (int64_t year = 0; year < kYears; year++) { + auto year_str = std::to_string(year + 2000); + auto year_dir = Dir(year_str); + stats.push_back(year_dir); + + PathForest months; + for (int64_t month = 0; month < kMonthsPerYear; month++) { + auto month_str = join({year_str, numbers[month + 1]}); + auto month_dir = Dir(month_str); + stats.push_back(month_dir); + + PathForest days; + for (int64_t day = 0; day < kDaysPerMonth; day++) { + auto day_str = join({month_str, numbers[day + 1]}); + auto day_dir = Dir(day_str); + stats.push_back(day_dir); + + PathForest hours; + for (int64_t hour = 0; hour < kHoursPerDay; hour++) { + auto hour_str = join({day_str, numbers[hour]}); + auto hour_dir = Dir(hour_str); + stats.push_back(hour_dir); + + PathForest files; + for (int64_t file = 0; file < kFilesPerHour; file++) { + auto file_str = join({hour_str, numbers[file] + ".parquet"}); + auto file_fd = File(file_str); + stats.push_back(file_fd); + files.push_back(PT(file_fd)); + } + + auto hour_pt = PT(hour_dir, std::move(files)); + hours.push_back(hour_pt); + } + + auto day_pt = PT(day_dir, std::move(hours)); + days.push_back(day_pt); + } + + auto month_pt = PT(month_dir, std::move(days)); + months.push_back(month_pt); + } + + auto year_pt = PT(year_dir, std::move(months)); + forest.push_back(year_pt); + } + + AssertMakePathTree(stats, 
forest); +} + +TEST(TestPathTree, Visit) { + std::shared_ptr tree; + ASSERT_OK(PathTree::Make({Dir("A"), File("A/a")}, &tree)); + + // Should propagate failure + auto visit_noop = [](const FileStats&) { return Status::OK(); }; + ASSERT_OK(tree->Visit(visit_noop)); + auto visit_fail = [](const FileStats&) { return Status::Invalid(""); }; + ASSERT_RAISES(Invalid, tree->Visit(visit_fail)); + auto match_fail = [](const FileStats&, bool* match) { return Status::Invalid(""); }; + ASSERT_RAISES(Invalid, tree->Visit(visit_noop, match_fail)); + + // Ensure basic visit of all nodes + std::vector collect; + auto visit = [&collect](const FileStats& f) { + collect.push_back(f); + return Status::OK(); + }; + ASSERT_OK(tree->Visit(visit)); + EXPECT_THAT(collect, ContainerEq(std::vector{Dir("A"), File("A/a")})); + + // Matcher should be evaluated on all nodes + collect.resize(0); + auto match_dir = [](const FileStats& s, bool* m) { + *m = s.IsDirectory(); + return Status::OK(); + }; + ASSERT_OK(tree->Visit(visit, match_dir)); + EXPECT_THAT(collect, ContainerEq(std::vector{Dir("A")})); +} + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index c02c4c15402..8dd3d87f8ff 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -37,7 +37,7 @@ namespace fs { extern ARROW_EXPORT const char* kS3DefaultRegion; -/// Options for the S3 FileSystem implementation. +/// Options for the S3FileSystem implementation. struct ARROW_EXPORT S3Options { /// AWS region to connect to (default "us-east-1") std::string region = kS3DefaultRegion; @@ -79,7 +79,9 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { public: ~S3FileSystem() override; + /// \cond FALSE using FileSystem::GetTargetStats; + /// \endcond Status GetTargetStats(const std::string& path, FileStats* out) override; Status GetTargetStats(const Selector& select, std::vector* out) override; @@ -120,6 +122,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { Status OpenAppendStream(const std::string& path, std::shared_ptr* out) override; + /// Create a S3FileSystem instance from the given options. static Status Make(const S3Options& options, std::shared_ptr* out); protected: diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 47f423ca200..6786e3973a4 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -649,20 +649,6 @@ void GenericFileSystemTest::TestGetTargetStatsSelector(FileSystem* fs) { ASSERT_RAISES(IOError, fs->GetTargetStats(s, &stats)); } -FileStats File(std::string path) { - FileStats st; - st.set_type(FileType::File); - st.set_path(path); - return st; -} - -FileStats Dir(std::string path) { - FileStats st; - st.set_type(FileType::Directory); - st.set_path(path); - return st; -} - void GetSortedStats(FileSystem* fs, Selector s, std::vector& stats) { ASSERT_OK(fs->GetTargetStats(s, &stats)); // Clear mtime & size for easier testing. 
diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index e9c7f708d57..b7edc357243 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -29,6 +29,20 @@ namespace fs { static constexpr double kTimeSlack = 2.0; // In seconds +static inline FileStats File(std::string path) { + FileStats st; + st.set_type(FileType::File); + st.set_path(path); + return st; +} + +static inline FileStats Dir(std::string path) { + FileStats st; + st.set_type(FileType::Directory); + st.set_path(path); + return st; +} + ARROW_EXPORT void CreateFile(FileSystem* fs, const std::string& path, const std::string& data); diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 352f4044d75..c2a3f4411d7 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -104,8 +104,12 @@ class ARROW_FLIGHT_EXPORT FlightServerOptions { public: explicit FlightServerOptions(const Location& location_); + /// \brief The host & port (or domain socket path) to listen on. + /// Use port 0 to bind to an available port. Location location; + /// \brief The authentication handler to use. std::unique_ptr auth_handler; + /// \brief A list of TLS certificate+key pairs to use. std::vector tls_certificates; /// \brief A Flight implementation-specific callback to customize /// transport-specific options. diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 330c1231e7a..2aaa3d36877 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -74,7 +74,7 @@ enum class FlightStatusCode : int8_t { #pragma warning(disable : 4275) #endif -/// \brief Flight-specific information in a Status. +/// \brief Flight-specific error information in a Status. class ARROW_FLIGHT_EXPORT FlightStatusDetail : public arrow::StatusDetail { public: explicit FlightStatusDetail(FlightStatusCode code) : code_{code} {} @@ -101,7 +101,11 @@ class ARROW_FLIGHT_EXPORT FlightStatusDetail : public arrow::StatusDetail { #pragma warning(pop) #endif -/// \brief Make an appropriate Arrow status for the given Flight status. +/// \brief Make an appropriate Arrow status for the given +/// Flight-specific status. +/// +/// \param code The status code. +/// \param message The message for the error. ARROW_FLIGHT_EXPORT Status MakeFlightError(FlightStatusCode code, const std::string& message); @@ -114,16 +118,16 @@ struct ARROW_FLIGHT_EXPORT CertKeyPair { std::string pem_key; }; -/// \brief A type of action that can be performed with the DoAction RPC +/// \brief A type of action that can be performed with the DoAction RPC. struct ARROW_FLIGHT_EXPORT ActionType { - /// Name of action + /// \brief The name of the action. std::string type; - /// Opaque action description + /// \brief A human-readable description of the action. 
std::string description; }; -/// \brief Opaque selection critera for ListFlights RPC +/// \brief Opaque selection criteria for ListFlights RPC struct ARROW_FLIGHT_EXPORT Criteria { /// Opaque criteria expression, dependent on server implementation std::string expression; @@ -153,9 +157,6 @@ struct ARROW_FLIGHT_EXPORT BasicAuth { static Status Serialize(const BasicAuth& basic_auth, std::string* out); }; -/// \brief A message received after completing a DoPut stream -struct ARROW_FLIGHT_EXPORT PutResult {}; - /// \brief A request to retrieve or generate a dataset struct ARROW_FLIGHT_EXPORT FlightDescriptor { enum DescriptorType { @@ -406,26 +407,26 @@ class ARROW_FLIGHT_EXPORT FlightInfo { mutable bool reconstructed_schema_; }; -/// \brief An iterator to FlightInfo instances returned by ListFlights +/// \brief An iterator to FlightInfo instances returned by ListFlights. class ARROW_FLIGHT_EXPORT FlightListing { public: virtual ~FlightListing() = default; - /// \brief Retrieve the next FlightInfo from the iterator. Returns nullptr - /// when there are none left - /// \param[out] info a single FlightInfo + /// \brief Retrieve the next FlightInfo from the iterator. + /// \param[out] info A single FlightInfo. Set to \a nullptr if there + /// are none left. /// \return Status virtual Status Next(std::unique_ptr* info) = 0; }; -/// \brief An iterator to Result instances returned by DoAction +/// \brief An iterator to Result instances returned by DoAction. class ARROW_FLIGHT_EXPORT ResultStream { public: virtual ~ResultStream() = default; - /// \brief Retrieve the next Result from the iterator. Returns nullptr - /// when there are none left - /// \param[out] info a single Result + /// \brief Retrieve the next Result from the iterator. + /// \param[out] info A single result. Set to \a nullptr if there + /// are none left. /// \return Status virtual Status Next(std::unique_ptr* info) = 0; }; @@ -454,8 +455,10 @@ class ARROW_FLIGHT_EXPORT MetadataRecordBatchReader { virtual Status ReadAll(std::shared_ptr
* table); }; -// \brief Create a FlightListing from a vector of FlightInfo objects. This can -// be iterated once, then it is consumed +/// \brief A FlightListing implementation based on a vector of +/// FlightInfo objects. +/// +/// This can be iterated once, then it is consumed. class ARROW_FLIGHT_EXPORT SimpleFlightListing : public FlightListing { public: explicit SimpleFlightListing(const std::vector& flights); @@ -468,6 +471,10 @@ class ARROW_FLIGHT_EXPORT SimpleFlightListing : public FlightListing { std::vector flights_; }; +/// \brief A ResultStream implementation based on a vector of +/// Result objects. +/// +/// This can be iterated once, then it is consumed. class ARROW_FLIGHT_EXPORT SimpleResultStream : public ResultStream { public: explicit SimpleResultStream(std::vector&& results); diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index 10b6e8f6c73..59653947742 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -36,6 +36,7 @@ class Status; namespace io { +/// \brief An operating system file open in write-only mode. class ARROW_EXPORT FileOutputStream : public OutputStream { public: ~FileOutputStream() override; @@ -95,8 +96,9 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { // Write bytes to the stream. Thread-safe Status Write(const void* data, int64_t nbytes) override; - + /// \cond FALSE using Writable::Write; + /// \endcond int file_descriptor() const; @@ -107,7 +109,11 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { std::unique_ptr impl_; }; -// Operating system file +/// \brief An operating system file open in read-only mode. +/// +/// Reads through this implementation are unbuffered. If many small reads +/// need to be issued, it is recommended to use a buffering layer for good +/// performance. class ARROW_EXPORT ReadableFile : public internal::RandomAccessFileConcurrencyWrapper { public: @@ -173,12 +179,13 @@ class ARROW_EXPORT ReadableFile std::unique_ptr impl_; }; -// A file interface that uses memory-mapped files for memory interactions, -// supporting zero copy reads. The same class is used for both reading and -// writing. -// -// If opening a file in a writable mode, it is not truncated first as with -// FileOutputStream +/// \brief A file interface that uses memory-mapped files for memory interactions +/// +/// This implementation supports zero-copy reads. The same class is used +/// for both reading and writing. +/// +/// If opening a file in a writable mode, it is not truncated first as with +/// FileOutputStream. 
class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { public: ~MemoryMappedFile() override; diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 51af229f000..2346e1514f0 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -37,7 +37,7 @@ class Status; namespace io { -// \brief An output stream that writes to a resizable buffer +/// \brief An output stream that writes to a resizable buffer class ARROW_EXPORT BufferOutputStream : public OutputStream { public: explicit BufferOutputStream(const std::shared_ptr& buffer); @@ -61,7 +61,9 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { Status Tell(int64_t* position) const override; Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE using OutputStream::Write; + /// \endcond /// Close the stream and return the buffer Status Finish(std::shared_ptr* result); @@ -88,7 +90,11 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { uint8_t* mutable_data_; }; -// \brief A helper class to tracks the size of allocations +/// \brief A helper class to tracks the size of allocations +/// +/// Writes to this stream do not copy or retain any data, they just bump +/// a size counter that can be later used to know exactly which data size +/// needs to be allocated for actual writing. class ARROW_EXPORT MockOutputStream : public OutputStream { public: MockOutputStream() : extent_bytes_written_(0), is_open_(true) {} @@ -106,7 +112,7 @@ class ARROW_EXPORT MockOutputStream : public OutputStream { bool is_open_; }; -/// \brief Enables random writes into a fixed-size mutable buffer +/// \brief An output stream that writes into a fixed-size mutable buffer class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile { public: /// Input buffer must be mutable, will abort if not diff --git a/cpp/src/arrow/io/memory_test.cc b/cpp/src/arrow/io/memory_test.cc index d545520ddbc..7b0621bcc92 100644 --- a/cpp/src/arrow/io/memory_test.cc +++ b/cpp/src/arrow/io/memory_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
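The MockOutputStream documented above only counts bytes, which makes it handy for sizing an allocation before doing the real write. A hedged sketch of that pattern; the payload and the Status-returning BufferOutputStream::Create factory reflect this era of the API and are assumptions, not part of the diff:

```cpp
#include <memory>
#include <string>
#include "arrow/io/memory.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"

// First "write" into a MockOutputStream to learn the required size, then
// allocate a BufferOutputStream with exactly that capacity for the real write.
arrow::Status MeasureThenWrite(const std::string& payload) {
  arrow::io::MockOutputStream mock;
  ARROW_RETURN_NOT_OK(mock.Write(payload.data(), static_cast<int64_t>(payload.size())));
  int64_t required = 0;
  ARROW_RETURN_NOT_OK(mock.Tell(&required));  // bytes the real write will need

  std::shared_ptr<arrow::io::BufferOutputStream> sink;
  ARROW_RETURN_NOT_OK(arrow::io::BufferOutputStream::Create(
      required, arrow::default_memory_pool(), &sink));
  return sink->Write(payload.data(), static_cast<int64_t>(payload.size()));
}
```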
+#include #include #include #include @@ -26,6 +27,7 @@ #include "arrow/buffer.h" #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" +#include "arrow/io/slow.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" @@ -313,5 +315,39 @@ TEST(TestMemcopy, ParallelMemcopy) { } } +template +void TestSlowInputStream() { + using clock = std::chrono::high_resolution_clock; + + auto stream = std::make_shared(util::string_view("abcdefghijkl")); + const double latency = 0.6; + auto slow = std::make_shared(stream, latency); + + ASSERT_FALSE(slow->closed()); + std::shared_ptr buf; + auto t1 = clock::now(); + ASSERT_OK(slow->Read(6, &buf)); + auto t2 = clock::now(); + AssertBufferEqual(*buf, "abcdef"); + auto dt = std::chrono::duration_cast>(t2 - t1).count(); + ASSERT_LT(dt, latency * 3); // likely + ASSERT_GT(dt, latency / 3); // likely + + util::string_view view; + ASSERT_OK(slow->Peek(4, &view)); + ASSERT_EQ(view, util::string_view("ghij")); + + ASSERT_OK(slow->Close()); + ASSERT_TRUE(slow->closed()); + ASSERT_TRUE(stream->closed()); + ASSERT_OK(slow->Close()); + ASSERT_TRUE(slow->closed()); + ASSERT_TRUE(stream->closed()); +} + +TEST(TestSlowInputStream, Basics) { TestSlowInputStream(); } + +TEST(TestSlowRandomAccessFile, Basics) { TestSlowInputStream(); } + } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/slow.cc b/cpp/src/arrow/io/slow.cc new file mode 100644 index 00000000000..8422aaa6f5c --- /dev/null +++ b/cpp/src/arrow/io/slow.cc @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/io/slow.h" + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/io/util_internal.h" +#include "arrow/status.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace io { + +// Multiply the average by this ratio to get the intended standard deviation +static constexpr double kStandardDeviationRatio = 0.1; + +class LatencyGeneratorImpl : public LatencyGenerator { + public: + ~LatencyGeneratorImpl() override = default; + + LatencyGeneratorImpl(double average_latency, int32_t seed) + : gen_(static_cast(seed)), + latency_dist_(average_latency, average_latency * kStandardDeviationRatio) {} + + double NextLatency() override { + // std::random distributions are unlikely to be thread-safe, and + // a RandomAccessFile may be called from multiple threads + std::lock_guard lock(mutex_); + return std::max(0.0, latency_dist_(gen_)); + } + + private: + std::default_random_engine gen_; + std::normal_distribution latency_dist_; + std::mutex mutex_; +}; + +LatencyGenerator::~LatencyGenerator() {} + +void LatencyGenerator::Sleep() { + std::this_thread::sleep_for(std::chrono::duration(NextLatency())); +} + +std::shared_ptr LatencyGenerator::Make(double average_latency) { + auto seed = static_cast(std::random_device()()); + return std::make_shared(average_latency, seed); +} + +std::shared_ptr LatencyGenerator::Make(double average_latency, + int32_t seed) { + return std::make_shared(average_latency, seed); +} + +////////////////////////////////////////////////////////////////////////// +// SlowInputStream implementation + +SlowInputStream::~SlowInputStream() { internal::CloseFromDestructor(this); } + +Status SlowInputStream::Close() { return stream_->Close(); } + +Status SlowInputStream::Abort() { return stream_->Abort(); } + +bool SlowInputStream::closed() const { return stream_->closed(); } + +Status SlowInputStream::Tell(int64_t* position) const { return stream_->Tell(position); } + +Status SlowInputStream::Read(int64_t nbytes, int64_t* bytes_read, void* out) { + latencies_->Sleep(); + return stream_->Read(nbytes, bytes_read, out); +} + +Status SlowInputStream::Read(int64_t nbytes, std::shared_ptr* out) { + latencies_->Sleep(); + return stream_->Read(nbytes, out); +} + +Status SlowInputStream::Peek(int64_t nbytes, util::string_view* out) { + return stream_->Peek(nbytes, out); +} + +////////////////////////////////////////////////////////////////////////// +// SlowRandomAccessFile implementation + +SlowRandomAccessFile::~SlowRandomAccessFile() { internal::CloseFromDestructor(this); } + +Status SlowRandomAccessFile::Close() { return stream_->Close(); } + +Status SlowRandomAccessFile::Abort() { return stream_->Abort(); } + +bool SlowRandomAccessFile::closed() const { return stream_->closed(); } + +Status SlowRandomAccessFile::GetSize(int64_t* size) { return stream_->GetSize(size); } + +Status SlowRandomAccessFile::Seek(int64_t position) { return stream_->Seek(position); } + +Status SlowRandomAccessFile::Tell(int64_t* position) const { + return stream_->Tell(position); +} + +Status SlowRandomAccessFile::Read(int64_t nbytes, int64_t* bytes_read, void* out) { + latencies_->Sleep(); + return stream_->Read(nbytes, bytes_read, out); +} + +Status SlowRandomAccessFile::Read(int64_t nbytes, std::shared_ptr* out) { + latencies_->Sleep(); + return stream_->Read(nbytes, out); +} + +Status SlowRandomAccessFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* out) { + latencies_->Sleep(); + return 
stream_->ReadAt(position, nbytes, bytes_read, out); +} + +Status SlowRandomAccessFile::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { + latencies_->Sleep(); + return stream_->ReadAt(position, nbytes, out); +} + +Status SlowRandomAccessFile::Peek(int64_t nbytes, util::string_view* out) { + return stream_->Peek(nbytes, out); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/slow.h b/cpp/src/arrow/io/slow.h new file mode 100644 index 00000000000..57bddceb0bc --- /dev/null +++ b/cpp/src/arrow/io/slow.h @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Slow stream implementations, mainly for testing and benchmarking + +#pragma once + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class Status; + +namespace io { + +class ARROW_EXPORT LatencyGenerator { + public: + virtual ~LatencyGenerator(); + + void Sleep(); + + virtual double NextLatency() = 0; + + static std::shared_ptr Make(double average_latency); + static std::shared_ptr Make(double average_latency, int32_t seed); +}; + +// XXX use ConcurrencyWrapper? It could increase chances of finding a race. + +template +class ARROW_EXPORT SlowInputStreamBase : public StreamType { + public: + SlowInputStreamBase(std::shared_ptr stream, + std::shared_ptr latencies) + : stream_(std::move(stream)), latencies_(std::move(latencies)) {} + + SlowInputStreamBase(std::shared_ptr stream, double average_latency) + : stream_(std::move(stream)), latencies_(LatencyGenerator::Make(average_latency)) {} + + SlowInputStreamBase(std::shared_ptr stream, double average_latency, + int32_t seed) + : stream_(std::move(stream)), + latencies_(LatencyGenerator::Make(average_latency, seed)) {} + + protected: + std::shared_ptr stream_; + std::shared_ptr latencies_; +}; + +/// \brief An InputStream wrapper that makes reads slower. +/// +/// Read() calls are made slower by an average latency (in seconds). +/// Actual latencies form a normal distribution closely centered +/// on the average latency. +/// Other calls are forwarded directly. +class ARROW_EXPORT SlowInputStream : public SlowInputStreamBase { + public: + ~SlowInputStream() override; + + using SlowInputStreamBase::SlowInputStreamBase; + + Status Close() override; + Status Abort() override; + bool closed() const override; + + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status Peek(int64_t nbytes, util::string_view* out) override; + + Status Tell(int64_t* position) const override; +}; + +/// \brief A RandomAccessFile wrapper that makes reads slower. +/// +/// Similar to SlowInputStream, but allows random access and seeking. 
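SlowInputStream and the SlowRandomAccessFile declared just below share the LatencyGenerator machinery, so a single seeded generator can drive several wrappers and keep benchmark runs reproducible. A small sketch under that assumption; the latency and seed values are arbitrary:

```cpp
#include <memory>
#include "arrow/io/interfaces.h"
#include "arrow/io/slow.h"

// One deterministic latency source shared by a slow stream and a slow file:
// both draw from the same seeded normal distribution.
void MakeSlowWrappers(std::shared_ptr<arrow::io::InputStream> stream,
                      std::shared_ptr<arrow::io::RandomAccessFile> file,
                      std::shared_ptr<arrow::io::SlowInputStream>* slow_stream,
                      std::shared_ptr<arrow::io::SlowRandomAccessFile>* slow_file) {
  auto latencies = arrow::io::LatencyGenerator::Make(/*average_latency=*/0.01,
                                                     /*seed=*/42);
  *slow_stream = std::make_shared<arrow::io::SlowInputStream>(stream, latencies);
  *slow_file = std::make_shared<arrow::io::SlowRandomAccessFile>(file, latencies);
}
```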
+class ARROW_EXPORT SlowRandomAccessFile : public SlowInputStreamBase { + public: + ~SlowRandomAccessFile() override; + + using SlowInputStreamBase::SlowInputStreamBase; + + Status Close() override; + Status Abort() override; + bool closed() const override; + + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* out) override; + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + Status Peek(int64_t nbytes, util::string_view* out) override; + + Status GetSize(int64_t* size) override; + Status Seek(int64_t position) override; + Status Tell(int64_t* position) const override; +}; + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index dff3369a27f..d4ed8b7e0da 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -62,9 +62,6 @@ using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; using KVVector = flatbuffers::Vector; -static const char kExtensionTypeKeyName[] = "ARROW:extension:name"; -static const char kExtensionMetadataKeyName[] = "ARROW:extension:metadata"; - MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version) { switch (version) { case flatbuf::MetadataVersion_V1: diff --git a/cpp/src/arrow/json/options.h b/cpp/src/arrow/json/options.h index 8d27faabea2..f0750418803 100644 --- a/cpp/src/arrow/json/options.h +++ b/cpp/src/arrow/json/options.h @@ -29,33 +29,44 @@ class Schema; namespace json { -enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType }; +enum class UnexpectedFieldBehavior : char { + /// Unexpected JSON fields are ignored + Ignore, + /// Unexpected JSON fields error out + Error, + /// Unexpected JSON fields are type-inferred and included in the output + InferType +}; struct ARROW_EXPORT ParseOptions { // Parsing options - // Optional explicit schema (no type inference, ignores other fields) + /// Optional explicit schema (disables type inference on those fields) std::shared_ptr explicit_schema; - // Whether objects may be printed across multiple lines (for example pretty printed) - // NB: if false, input must end with an empty line + /// Whether objects may be printed across multiple lines (for example pretty-printed) + /// + /// If true, parsing may be slower + /// If false, input must end with an empty line bool newlines_in_values = false; - // How should parse handle fields outside the explicit_schema? 
+ /// How JSON fields outside of explicit_schema (if given) are treated UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType; + /// Create parsing options with default values static ParseOptions Defaults(); }; struct ARROW_EXPORT ReadOptions { // Reader options - // Whether to use the global CPU thread pool + /// Whether to use the global CPU thread pool bool use_threads = true; - // Block size we request from the IO layer; also determines the size of - // chunks when use_threads is true + /// Block size we request from the IO layer; also determines the size of + /// chunks when use_threads is true int32_t block_size = 1 << 20; // 1 MB + /// Create read options with default values static ReadOptions Defaults(); }; diff --git a/cpp/src/arrow/json/reader.h b/cpp/src/arrow/json/reader.h index 51a3473a04e..9ffe47de08c 100644 --- a/cpp/src/arrow/json/reader.h +++ b/cpp/src/arrow/json/reader.h @@ -39,12 +39,17 @@ class InputStream; namespace json { +/// A class that reads an entire JSON file into a Arrow Table +/// +/// The file is expected to consist of individual line-separated JSON objects class ARROW_EXPORT TableReader { public: virtual ~TableReader() = default; + /// Read the entire JSON file and convert it to a Arrow Table virtual Status Read(std::shared_ptr
* out) = 0; + /// Create a TableReader instance static Status Make(MemoryPool* pool, std::shared_ptr input, const ReadOptions&, const ParseOptions&, std::shared_ptr* out); diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 2b31b476334..02650a6d35e 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -31,7 +31,12 @@ namespace arrow { struct SparseTensorFormat { /// EXPERIMENTAL: The index format type of SparseTensor - enum type { COO, CSR }; + enum type { + /// Coordinate list (COO) format. + COO, + /// Compressed sparse row (CSR) format. + CSR + }; }; /// \brief EXPERIMENTAL: The base class for the index of a sparse tensor @@ -83,10 +88,15 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase& coords); /// \brief Return a tensor that has the coordinates of the non-zero values + /// + /// The returned tensor is a Nx3 tensor where N is the number of non-zero + /// values. Each 3-element column has the form `{row, column, index}`, + /// indicating that the value for the logical element at `{row, column}` + /// should be found at the given physical index. const std::shared_ptr& indices() const { return coords_; } /// \brief Return a string representation of the sparse index @@ -120,7 +130,7 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase& indptr, const std::shared_ptr& indices); @@ -231,20 +241,23 @@ class SparseTensorImpl : public SparseTensor { public: virtual ~SparseTensorImpl() = default; - // Constructor with all attributes + /// \brief Construct a sparse tensor from physical data buffer and logical index SparseTensorImpl(const std::shared_ptr& sparse_index, const std::shared_ptr& type, const std::shared_ptr& data, const std::vector& shape, const std::vector& dim_names) : SparseTensor(type, data, shape, sparse_index, dim_names) {} - // Constructor for empty sparse tensor + /// \brief Construct an empty sparse tensor SparseTensorImpl(const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names = {}) : SparseTensorImpl(NULLPTR, type, NULLPTR, shape, dim_names) {} - // Constructor with a dense tensor + /// \brief Construct a sparse tensor from a dense tensor + /// + /// The dense tensor is re-encoded as a sparse index and a physical + /// data buffer for the non-zero value. SparseTensorImpl(const Tensor& tensor, const std::shared_ptr& index_value_type) : SparseTensorImpl(NULLPTR, tensor.type(), NULLPTR, tensor.shape(), diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 6a3269f6b2f..f07d410eefa 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -105,9 +105,10 @@ class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; class ARROW_EXPORT StatusDetail { public: virtual ~StatusDetail() = default; - // Return a unique id for the type of the StatusDetail - // (effectively a poor man's substitude for RTTI). + /// \brief Return a unique id for the type of the StatusDetail + /// (effectively a poor man's substitude for RTTI). virtual const char* type_id() const = 0; + /// \brief Produce a human-readable description of this status. virtual std::string ToString() const = 0; }; diff --git a/cpp/src/arrow/util/base64.h b/cpp/src/arrow/util/base64.h new file mode 100644 index 00000000000..9ab41412ac3 --- /dev/null +++ b/cpp/src/arrow/util/base64.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +ARROW_EXPORT +std::string base64_encode(unsigned char const*, unsigned int len); + +ARROW_EXPORT +std::string base64_decode(std::string const& s); + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h index a05d43e738d..9ffd79b27f4 100644 --- a/cpp/src/arrow/util/compression.h +++ b/cpp/src/arrow/util/compression.h @@ -29,6 +29,7 @@ namespace arrow { class Status; struct Compression { + /// \brief Compression algorithm enum type { UNCOMPRESSED, SNAPPY, GZIP, BROTLI, ZSTD, LZ4, LZO, BZ2 }; }; @@ -96,6 +97,7 @@ class ARROW_EXPORT Decompressor { // XXX add methods for buffer size heuristics? }; +/// \brief Compression codec class ARROW_EXPORT Codec { public: virtual ~Codec(); @@ -107,7 +109,10 @@ class ARROW_EXPORT Codec { /// \brief Return a string name for compression type static std::string GetCodecAsString(Compression::type t); + /// \brief Create a codec for the given compression algorithm static Status Create(Compression::type codec, std::unique_ptr* out); + + /// \brief Create a codec for the given compression algorithm and level static Status Create(Compression::type codec, int compression_level, std::unique_ptr* out); diff --git a/cpp/src/arrow/vendored/base64.cpp b/cpp/src/arrow/vendored/base64.cpp new file mode 100644 index 00000000000..50ece19455e --- /dev/null +++ b/cpp/src/arrow/vendored/base64.cpp @@ -0,0 +1,128 @@ +/* + base64.cpp and base64.h + + base64 encoding and decoding with C++. + + Version: 1.01.00 + + Copyright (C) 2004-2017 René Nyffenegger + + This source code is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + + 3. This notice may not be removed or altered from any source distribution. 
+ + René Nyffenegger rene.nyffenegger@adp-gmbh.ch + +*/ + +#include "arrow/util/base64.h" +#include + +namespace arrow { +namespace util { + +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + + +static inline bool is_base64(unsigned char c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} + +std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { + std::string ret; + int i = 0; + int j = 0; + unsigned char char_array_3[3]; + unsigned char char_array_4[4]; + + while (in_len--) { + char_array_3[i++] = *(bytes_to_encode++); + if (i == 3) { + char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + char_array_4[3] = char_array_3[2] & 0x3f; + + for(i = 0; (i <4) ; i++) + ret += base64_chars[char_array_4[i]]; + i = 0; + } + } + + if (i) + { + for(j = i; j < 3; j++) + char_array_3[j] = '\0'; + + char_array_4[0] = ( char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + + for (j = 0; (j < i + 1); j++) + ret += base64_chars[char_array_4[j]]; + + while((i++ < 3)) + ret += '='; + + } + + return ret; + +} + +std::string base64_decode(std::string const& encoded_string) { + size_t in_len = encoded_string.size(); + int i = 0; + int j = 0; + int in_ = 0; + unsigned char char_array_4[4], char_array_3[3]; + std::string ret; + + while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i ==4) { + for (i = 0; i <4; i++) + char_array_4[i] = base64_chars.find(char_array_4[i]) & 0xff; + + char_array_3[0] = ( char_array_4[0] << 2 ) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + ret += char_array_3[i]; + i = 0; + } + } + + if (i) { + for (j = 0; j < i; j++) + char_array_4[j] = base64_chars.find(char_array_4[j]) & 0xff; + + char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + + for (j = 0; (j < i - 1); j++) ret += char_array_3[j]; + } + + return ret; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 17ebf925bb9..f027de1fddb 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -48,23 +48,22 @@ class ColumnChunkReader; class ColumnReader; class RowGroupReader; -// Arrow read adapter class for deserializing Parquet files as Arrow row -// batches. -// -// This interfaces caters for different use cases and thus provides different -// interfaces. In its most simplistic form, we cater for a user that wants to -// read the whole Parquet at once with the FileReader::ReadTable method. -// -// More advanced users that also want to implement parallelism on top of each -// single Parquet files should do this on the RowGroup level. For this, they can -// call FileReader::RowGroup(i)->ReadTable to receive only the specified -// RowGroup as a table. 
-// -// In the most advanced situation, where a consumer wants to independently read -// RowGroups in parallel and consume each column individually, they can call -// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column -// instance. -// +/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches. +/// +/// This interfaces caters for different use cases and thus provides different +/// interfaces. In its most simplistic form, we cater for a user that wants to +/// read the whole Parquet at once with the `FileReader::ReadTable` method. +/// +/// More advanced users that also want to implement parallelism on top of each +/// single Parquet files should do this on the RowGroup level. For this, they can +/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified +/// RowGroup as a table. +/// +/// In the most advanced situation, where a consumer wants to independently read +/// RowGroups in parallel and consume each column individually, they can call +/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column` +/// instance. +/// // TODO(wesm): nested data does not always make sense with this user // interface unless you are only reading a single leaf node from a branch of // a table. For example: @@ -106,11 +105,13 @@ class RowGroupReader; // arrays class PARQUET_EXPORT FileReader { public: + /// Factory function to create a FileReader from a ParquetFileReader and properties static ::arrow::Status Make(::arrow::MemoryPool* pool, std::unique_ptr reader, const ArrowReaderProperties& properties, std::unique_ptr* out); + /// Factory function to create a FileReader from a ParquetFileReader static ::arrow::Status Make(::arrow::MemoryPool* pool, std::unique_ptr reader, std::unique_ptr* out); @@ -127,7 +128,9 @@ class PARQUET_EXPORT FileReader { /// \brief Return arrow schema for all the columns. virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0; - // Read column as a whole into an Array. + /// \brief Read column as a whole into a chunked array. + /// + /// The indicated column index is relative to the schema virtual ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0; @@ -183,11 +186,12 @@ class PARQUET_EXPORT FileReader { return ::arrow::Status::OK(); } - // Read a table of columns into a Table + /// Read all columns into a Table virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0; - // Read a table of columns into a Table. Read only the indicated column - // indices (relative to the schema) + /// \brief Read the given columns into a Table + /// + /// The indicated column indices are relative to the schema virtual ::arrow::Status ReadTable(const std::vector& column_indices, std::shared_ptr<::arrow::Table>* out) = 0; @@ -212,6 +216,7 @@ class PARQUET_EXPORT FileReader { /// FileReader. 
virtual std::shared_ptr RowGroup(int row_group_index) = 0; + /// \brief The number of row groups in the file virtual int num_row_groups() const = 0; virtual ParquetFileReader* parquet_reader() const = 0; @@ -270,14 +275,18 @@ class PARQUET_EXPORT FileReaderBuilder { public: FileReaderBuilder(); + /// Create FileReaderBuilder from Arrow file and optional properties / metadata ::arrow::Status Open(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, const ReaderProperties& properties = default_reader_properties(), const std::shared_ptr& metadata = NULLPTR); ParquetFileReader* raw_reader() { return raw_reader_.get(); } + /// Set Arrow MemoryPool for memory allocation FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool); + /// Set Arrow reader properties FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties); + /// Build FileReader instance ::arrow::Status Build(std::unique_ptr* out); private: @@ -286,6 +295,13 @@ class PARQUET_EXPORT FileReaderBuilder { std::unique_ptr raw_reader_; }; +/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers +/// +/// @{ + +/// \brief Build FileReader from Arrow file and MemoryPool +/// +/// Advanced settings are supported through the FileReaderBuilder class. PARQUET_EXPORT ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, ::arrow::MemoryPool* allocator, @@ -306,6 +322,8 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f const ArrowReaderProperties& properties, std::unique_ptr* reader); +/// @} + PARQUET_EXPORT ::arrow::Status FromParquetSchema( const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index b234291eb84..7b803fa261f 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -32,12 +32,14 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/compute/kernel.h" +#include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/base64.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" @@ -575,7 +577,8 @@ Status GetOriginSchema(const std::shared_ptr& metadata, // The original Arrow schema was serialized using the store_schema option. 
We // deserialize it here and use it to inform read options such as // dictionary-encoded fields - auto schema_buf = std::make_shared(metadata->value(schema_index)); + auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index)); + auto schema_buf = std::make_shared(decoded); ::arrow::ipc::DictionaryMemo dict_memo; ::arrow::io::BufferReader input(schema_buf); @@ -620,6 +623,27 @@ Status ApplyOriginalMetadata(std::shared_ptr field, const Field& origin_f field = field->WithType( ::arrow::dictionary(::arrow::int32(), field->type(), dict_origin_type.ordered())); } + // restore field metadata + std::shared_ptr field_metadata = origin_field.metadata(); + if (field_metadata != nullptr) { + field = field->WithMetadata(field_metadata); + + // extension type + int name_index = field_metadata->FindKey(::arrow::kExtensionTypeKeyName); + if (name_index != -1) { + std::string type_name = field_metadata->value(name_index); + int data_index = field_metadata->FindKey(::arrow::kExtensionMetadataKeyName); + std::string type_data = data_index == -1 ? "" : field_metadata->value(data_index); + + std::shared_ptr<::arrow::ExtensionType> ext_type = + ::arrow::GetExtensionType(type_name); + if (ext_type != nullptr) { + std::shared_ptr deserialized; + RETURN_NOT_OK(ext_type->Deserialize(field->type(), type_data, &deserialized)); + field = field->WithType(deserialized); + } + } + } *out = field; return Status::OK(); } @@ -1098,6 +1122,25 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* pool, return Status::OK(); } +Status TransferExtension(RecordReader* reader, std::shared_ptr value_type, + const ColumnDescriptor* descr, MemoryPool* pool, Datum* out) { + std::shared_ptr result; + auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(value_type); + auto storage_type = ext_type->storage_type(); + RETURN_NOT_OK(TransferColumnData(reader, storage_type, descr, pool, &result)); + + ::arrow::ArrayVector out_chunks(result->num_chunks()); + for (int i = 0; i < result->num_chunks(); i++) { + auto chunk = result->chunk(i); + auto ext_data = chunk->data()->Copy(); + ext_data->type = ext_type; + auto ext_result = ext_type->MakeArray(ext_data); + out_chunks[i] = ext_result; + } + *out = std::make_shared(out_chunks); + return Status::OK(); +} + #define TRANSFER_INT32(ENUM, ArrowType) \ case ::arrow::Type::ENUM: { \ Status s = TransferInt(reader, pool, value_type, &result); \ @@ -1194,6 +1237,9 @@ Status TransferColumnData(internal::RecordReader* reader, return Status::NotImplemented("TimeUnit not supported"); } } break; + case ::arrow::Type::EXTENSION: { + RETURN_NOT_OK(TransferExtension(reader, value_type, descr, pool, &result)); + } break; default: return Status::NotImplemented("No support for reading columns of type ", value_type->ToString()); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 49d0dd98808..2a8893f271e 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/extension_type.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -308,6 +309,12 @@ Status FieldToNode(const std::shared_ptr& field, field->name(), dict_type.value_type(), field->nullable(), field->metadata()); return FieldToNode(unpacked_field, properties, arrow_properties, out); } + case ArrowTypeId::EXTENSION: { + auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type()); + std::shared_ptr<::arrow::Field> storage_field = ::arrow::field( + field->name(), 
ext_type->storage_type(), field->nullable(), field->metadata()); + return FieldToNode(storage_field, properties, arrow_properties, out); + } default: { // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR return Status::NotImplemented( diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 950e3de721f..18c01a527ea 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -26,9 +26,11 @@ #include "arrow/array.h" #include "arrow/buffer_builder.h" +#include "arrow/extension_type.h" #include "arrow/ipc/writer.h" #include "arrow/table.h" #include "arrow/type.h" +#include "arrow/util/base64.h" #include "arrow/visitor_inline.h" #include "parquet/arrow/reader_internal.h" @@ -48,6 +50,7 @@ using arrow::DictionaryArray; using arrow::Field; using arrow::FixedSizeBinaryArray; using Int16BufferBuilder = arrow::TypedBufferBuilder; +using arrow::ExtensionArray; using arrow::ListArray; using arrow::MemoryPool; using arrow::NumericArray; @@ -115,6 +118,8 @@ class LevelBuilder { return VisitInline(*array.values()); } + Status Visit(const ExtensionArray& array) { return VisitInline(*array.storage()); } + #define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \ return Status::NotImplemented("Level generation for " #ArrowTypePrefix \ @@ -126,7 +131,6 @@ class LevelBuilder { NOT_IMPLEMENTED_VISIT(FixedSizeList) NOT_IMPLEMENTED_VISIT(Struct) NOT_IMPLEMENTED_VISIT(Union) - NOT_IMPLEMENTED_VISIT(Extension) #undef NOT_IMPLEMENTED_VISIT @@ -574,7 +578,13 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo ::arrow::ipc::DictionaryMemo dict_memo; std::shared_ptr serialized; RETURN_NOT_OK(::arrow::ipc::SerializeSchema(schema, &dict_memo, pool, &serialized)); - result->Append(kArrowSchemaKey, serialized->ToString()); + + // The serialized schema is not UTF-8, which is required for Thrift + std::string schema_as_string = serialized->ToString(); + std::string schema_base64 = ::arrow::util::base64_encode( + reinterpret_cast(schema_as_string.data()), + static_cast(schema_as_string.size())); + result->Append(kArrowSchemaKey, schema_base64); *out = result; return Status::OK(); } diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index cfcb6eb0c7b..354c2ba8de0 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -42,11 +42,10 @@ class ParquetFileWriter; namespace arrow { -/** - * Iterative API: - * Start a new RowGroup/Chunk with NewRowGroup - * Write column-by-column the whole column chunk - */ +/// \brief Iterative FileWriter class +/// +/// Start a new RowGroup or Chunk with NewRowGroup. +/// Write column-by-column the whole column chunk. class PARQUET_EXPORT FileWriter { public: static ::arrow::Status Make( @@ -99,11 +98,9 @@ PARQUET_EXPORT ::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink); -/** - * Write a Table to Parquet. - * - * The table shall only consist of columns of primitive type or of primitive lists. - */ +/// \brief Write a Table to Parquet. +/// +/// The table shall only consist of columns of primitive type or of primitive lists. 
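+///
+/// A minimal calling sketch (the output stream and chunk size are
+/// illustrative, and the call assumes the trailing writer-properties
+/// arguments keep their defaults):
+///
+/// \code
+/// std::shared_ptr<::arrow::io::OutputStream> outfile = ...;
+/// ::arrow::Status st = parquet::arrow::WriteTable(
+///     *table, ::arrow::default_memory_pool(), outfile, /*chunk_size=*/65536);
+/// \endcode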
::arrow::Status PARQUET_EXPORT WriteTable( const ::arrow::Table& table, MemoryPool* pool, const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size, diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 62ce9d76b13..07c14206ae4 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -133,6 +133,7 @@ dev/tasks/linux-packages/debian/source/format dev/tasks/linux-packages/debian/watch dev/tasks/conda-recipes/* docs/requirements.txt +integration/spark/ARROW-6429.patch go/arrow/go.sum go/arrow/Gopkg.lock go/arrow/internal/cpu/* diff --git a/dev/tasks/linux-packages/azure.linux.arm64.yml b/dev/tasks/linux-packages/azure.linux.arm64.yml index 080c9dbc535..01ef6893c53 100644 --- a/dev/tasks/linux-packages/azure.linux.arm64.yml +++ b/dev/tasks/linux-packages/azure.linux.arm64.yml @@ -56,6 +56,8 @@ jobs: rake dist {{ build_command }} popd + env: + ARROW_VERSION: {{ arrow.version }} displayName: Build # Using github release tries to find a common ancestor between the diff --git a/dev/tasks/linux-packages/azure.linux.yml b/dev/tasks/linux-packages/azure.linux.yml index fbe1c9486dd..6253bf40a1b 100644 --- a/dev/tasks/linux-packages/azure.linux.yml +++ b/dev/tasks/linux-packages/azure.linux.yml @@ -40,6 +40,8 @@ jobs: rake dist {{ build_command }} popd + env: + ARROW_VERSION: {{ arrow.version }} displayName: Build # Using github release tries to find a common ancestor between the diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 1c113b7de68..9b7d356980b 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -28,6 +28,10 @@ API Reference api/array api/builder api/table + api/tensor api/utilities + api/io + api/formats api/cuda api/flight + api/filesystem diff --git a/docs/source/cpp/api/filesystem.rst b/docs/source/cpp/api/filesystem.rst new file mode 100644 index 00000000000..293dd080963 --- /dev/null +++ b/docs/source/cpp/api/filesystem.rst @@ -0,0 +1,46 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +=========== +Filesystems +=========== + +Interface +========= + +.. doxygenenum:: arrow::fs::FileType + +.. doxygenstruct:: arrow::fs::FileStats + :members: + +.. doxygenclass:: arrow::fs::FileSystem + :members: + +Concrete implementations +======================== + +.. doxygenclass:: arrow::fs::SubTreeFileSystem + :members: + +.. doxygenclass:: arrow::fs::LocalFileSystem + :members: + +.. doxygenstruct:: arrow::fs::S3Options + :members: + +.. doxygenclass:: arrow::fs::S3FileSystem + :members: diff --git a/docs/source/cpp/api/flight.rst b/docs/source/cpp/api/flight.rst index 4e56a7690ac..7801d88f04f 100644 --- a/docs/source/cpp/api/flight.rst +++ b/docs/source/cpp/api/flight.rst @@ -15,15 +15,18 @@ .. 
specific language governing permissions and limitations .. under the License. +.. default-domain:: cpp +.. highlight:: cpp + ================ Arrow Flight RPC ================ -.. warning:: Flight is currently unstable. APIs are subject to change, - though we don't expect drastic changes. +.. note:: Flight is currently unstable. APIs are subject to change, + though we don't expect drastic changes. -.. warning:: Flight is currently only available when built from source - appropriately. +.. note:: Flight is currently only available when built from source + appropriately. Common Types ============ @@ -64,7 +67,7 @@ Common Types :project: arrow_cpp :members: -.. doxygenstruct:: arrow::flight::PutResult +.. doxygenclass:: arrow::flight::MetadataRecordBatchReader :project: arrow_cpp :members: @@ -87,6 +90,10 @@ Clients :project: arrow_cpp :members: +.. doxygenclass:: arrow::flight::FlightClientOptions + :project: arrow_cpp + :members: + .. doxygenclass:: arrow::flight::FlightCallOptions :project: arrow_cpp :members: @@ -98,6 +105,14 @@ Clients .. doxygentypedef:: arrow::flight::TimeoutDuration :project: arrow_cpp +.. doxygenclass:: arrow::flight::FlightStreamReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightStreamWriter + :project: arrow_cpp + :members: + Servers ======= @@ -105,6 +120,14 @@ Servers :project: arrow_cpp :members: +.. doxygenclass:: arrow::flight::FlightServerOptions + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::CertKeyPair + :project: arrow_cpp + :members: + .. doxygenclass:: arrow::flight::FlightDataStream :project: arrow_cpp :members: @@ -113,6 +136,10 @@ Servers :project: arrow_cpp :members: +.. doxygenclass:: arrow::flight::FlightMetadataWriter + :project: arrow_cpp + :members: + .. doxygenclass:: arrow::flight::RecordBatchStream :project: arrow_cpp :members: @@ -124,3 +151,28 @@ Servers .. doxygenclass:: arrow::flight::ServerCallContext :project: arrow_cpp :members: + +.. doxygenclass:: arrow::flight::SimpleFlightListing + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::SimpleResultStream + :project: arrow_cpp + :members: + +Error Handling +============== + +Error handling uses the normal :class:`arrow::Status` class, combined +with a custom :class:`arrow::StatusDetail` object for Flight-specific +error codes. + +.. doxygenenum:: arrow::flight::FlightStatusCode + :project: arrow_cpp + +.. doxygenclass:: arrow::flight::FlightStatusDetail + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::flight::MakeFlightError + :project: arrow_cpp diff --git a/docs/source/cpp/api/formats.rst b/docs/source/cpp/api/formats.rst new file mode 100644 index 00000000000..5713b034d62 --- /dev/null +++ b/docs/source/cpp/api/formats.rst @@ -0,0 +1,86 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations +.. under the License. + +============ +File Formats +============ + +CSV +=== + +.. doxygenstruct:: arrow::csv::ReadOptions + :members: + +.. doxygenstruct:: arrow::csv::ParseOptions + :members: + +.. doxygenstruct:: arrow::csv::ConvertOptions + :members: + +.. doxygenclass:: arrow::csv::TableReader + :members: + +Line-separated JSON +=================== + +.. doxygenenum:: arrow::json::UnexpectedFieldBehavior + +.. doxygenstruct:: arrow::json::ReadOptions + :members: + +.. doxygenstruct:: arrow::json::ParseOptions + :members: + +.. doxygenclass:: arrow::json::TableReader + :members: + +Parquet reader +============== + +.. doxygenclass:: parquet::ReaderProperties + :members: + +.. doxygenclass:: parquet::ArrowReaderProperties + :members: + +.. doxygenclass:: parquet::ParquetFileReader + :members: + +.. doxygenclass:: parquet::arrow::FileReader + :members: + +.. doxygenclass:: parquet::arrow::FileReaderBuilder + :members: + +.. doxygengroup:: parquet-arrow-reader-factories + :content-only: + +Parquet writer +============== + +.. doxygenclass:: parquet::WriterProperties + :members: + +.. doxygenclass:: parquet::ArrowWriterProperties + :members: + +.. doxygenclass:: parquet::arrow::FileWriter + :members: + +.. doxygenfunction:: parquet::arrow::WriteTable + +.. TODO ORC diff --git a/docs/source/cpp/api/io.rst b/docs/source/cpp/api/io.rst new file mode 100644 index 00000000000..735136a0d47 --- /dev/null +++ b/docs/source/cpp/api/io.rst @@ -0,0 +1,95 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============== +Input / output +============== + +Interfaces +========== + +.. doxygenclass:: arrow::io::FileInterface + :members: + +.. doxygenclass:: arrow::io::Readable + :members: + +.. doxygenclass:: arrow::io::Seekable + :members: + +.. doxygenclass:: arrow::io::Writable + :members: + +.. doxygenclass:: arrow::io::InputStream + :members: + +.. doxygenclass:: arrow::io::RandomAccessFile + :members: + +.. doxygenclass:: arrow::io::OutputStream + :members: + +.. doxygenclass:: arrow::io::ReadWriteFileInterface + :members: + +Concrete implementations +======================== + +In-memory streams +----------------- + +.. doxygenclass:: arrow::io::BufferReader + :members: + +.. doxygenclass:: arrow::io::MockOutputStream + :members: + +.. doxygenclass:: arrow::io::BufferOutputStream + :members: + +.. doxygenclass:: arrow::io::FixedSizeBufferWriter + :members: + +Local files +----------- + +.. doxygenclass:: arrow::io::ReadableFile + :members: + +.. doxygenclass:: arrow::io::FileOutputStream + :members: + +.. doxygenclass:: arrow::io::MemoryMappedFile + :members: + +Buffering input / output wrappers +--------------------------------- + +.. 
doxygenclass:: arrow::io::BufferedInputStream + :members: + +.. doxygenclass:: arrow::io::BufferedOutputStream + :members: + +Compressed input / output wrappers +---------------------------------- + +.. doxygenclass:: arrow::io::CompressedInputStream + :members: + +.. doxygenclass:: arrow::io::CompressedOutputStream + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index c921229e6cb..93d8444b916 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -39,6 +39,15 @@ Memory Pools .. doxygenfunction:: arrow::default_memory_pool :project: arrow_cpp +.. doxygenfunction:: arrow::jemalloc_memory_pool + :project: arrow_cpp + +.. doxygenfunction:: arrow::mimalloc_memory_pool + :project: arrow_cpp + +.. doxygenfunction:: arrow::system_memory_pool + :project: arrow_cpp + .. doxygenclass:: arrow::MemoryPool :project: arrow_cpp :members: diff --git a/docs/source/cpp/api/support.rst b/docs/source/cpp/api/support.rst index b165a9973b4..4749229b458 100644 --- a/docs/source/cpp/api/support.rst +++ b/docs/source/cpp/api/support.rst @@ -25,5 +25,8 @@ Error return and reporting :project: arrow_cpp :members: -.. doxygendefine:: ARROW_RETURN_NOT_OK +.. doxygenclass:: arrow::StatusDetail + :project: arrow_cpp + :members: +.. doxygendefine:: ARROW_RETURN_NOT_OK diff --git a/docs/source/cpp/api/tensor.rst b/docs/source/cpp/api/tensor.rst new file mode 100644 index 00000000000..eb783f296b9 --- /dev/null +++ b/docs/source/cpp/api/tensor.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======= +Tensors +======= + +Dense Tensors +============= + +.. doxygenclass:: arrow::Tensor + :members: + +.. doxygenclass:: arrow::NumericTensor + :members: + +Sparse Tensors +============== + +.. doxygenenum:: arrow::SparseTensorFormat::type + +.. doxygenclass:: arrow::SparseIndex + :members: + +.. doxygenclass:: arrow::SparseCOOIndex + :members: + +.. doxygenclass:: arrow::SparseCSRIndex + :members: + +.. doxygenclass:: arrow::SparseTensor + :members: + +.. doxygenclass:: arrow::SparseTensorImpl + :members: + +.. doxygentypedef:: arrow::SparseTensorCOO + +.. doxygentypedef:: arrow::SparseTensorCSR diff --git a/docs/source/cpp/api/utilities.rst b/docs/source/cpp/api/utilities.rst index 0aaffb54a47..87c5a3bbe04 100644 --- a/docs/source/cpp/api/utilities.rst +++ b/docs/source/cpp/api/utilities.rst @@ -37,3 +37,16 @@ Abstract Sequences :project: arrow_cpp :members: +Compression +=========== + +.. doxygenenum:: arrow::Compression::type + +.. doxygenclass:: arrow::util::Codec + :members: + +.. doxygenclass:: arrow::util::Compressor + :members: + +.. 
doxygenclass:: arrow::util::Decompressor + :members: diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst new file mode 100644 index 00000000000..5be5483ac4c --- /dev/null +++ b/docs/source/cpp/csv.rst @@ -0,0 +1,144 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: arrow::csv + +================= +Reading CSV files +================= + +Arrow provides a fast CSV reader allowing ingestion of external data +as Arrow tables. + +Basic usage +=========== + +A CSV file is read from a :class:`~arrow::io::InputStream`. + +.. code-block:: cpp + + #include "arrow/csv/api.h" + + { + // ... + arrow::Status st; + arrow::MemoryPool* pool = default_memory_pool(); + std::shared_ptr input = ...; + + auto read_options = arrow::csv::ReadOptions::Defaults(); + auto parse_options = arrow::csv::ParseOptions::Defaults(); + auto convert_options = arrow::csv::ConvertOptions::Defaults(); + + // Instantiate TableReader from input stream and options + std::shared_ptr reader; + st = arrow::csv::TableReader::Make(pool, input, read_options, + parse_options, convert_options, + &reader); + if (!st.ok()) { + // Handle TableReader instantiation error... + } + + std::shared_ptr table; + // Read table from CSV file + st = reader->Read(&table); + if (!st.ok()) { + // Handle CSV read error + // (for example a CSV syntax error or failed type conversion) + } + } + +Column names +============ + +There are three possible ways to infer column names from the CSV file: + +* By default, the column names are read from the first row in the CSV file +* If :member:`ReadOptions::column_names` is set, it forces the column + names in the table to these values (the first row in the CSV file is + read as data) +* If :member:`ReadOptions::autogenerate_column_names` is true, column names + will be autogenerated with the pattern "f0", "f1"... (the first row in the + CSV file is read as data) + +Column selection +================ + +By default, Arrow reads all columns in the CSV file. You can narrow the +selection of columns with the :member:`ConvertOptions::include_columns` +option. If some columns in :member:`ConvertOptions::include_columns` +are missing from the CSV file, an error will be emitted unless +:member:`ConvertOptions::include_missing_columns` is true, in which case +the missing columns are assumed to contain all-null values. + +Interaction with column names +----------------------------- + +If both :member:`ReadOptions::column_names` and +:member:`ConvertOptions::include_columns` are specified, +the :member:`ReadOptions::column_names` are assumed to map to CSV columns, +and :member:`ConvertOptions::include_columns` is a subset of those column +names that will part of the Arrow Table. 
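+
+As a short illustration of the options above (the column names ``id`` and
+``total`` are purely hypothetical), the following narrows the read to two
+columns and tolerates their absence from the file:
+
+.. code-block:: cpp
+
+   auto convert_options = arrow::csv::ConvertOptions::Defaults();
+   // Only materialize these two columns in the resulting Table
+   convert_options.include_columns = {"id", "total"};
+   // Produce all-null columns instead of erroring out if one of them
+   // is missing from the CSV file
+   convert_options.include_missing_columns = true;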
+ +Data types +========== + +By default, the CSV reader infers the most appropriate data type for each +column. Type inference considers the following data types, in order: + +* Null +* Int64 +* Boolean +* Timestamp (with seconds unit) +* Float64 +* String +* Binary + +It is possible to override type inference for select columns by setting +the :member:`ConvertOptions::column_types` option. Explicit data types +can be chosen from the following list: + +* Null +* All Integer types +* Float32 and Float64 +* Decimal128 +* Boolean +* Timestamp +* Binary and Large Binary +* String and Large String (with optional UTF8 input validation) +* Fixed-Size Binary + +Other data types do not support conversion from CSV values and will error out. + +Nulls +----- + +Null values are recognized from the spellings stored in +:member:`ConvertOptions::null_values`. The :func:`ConvertOptions::Defaults` +factory method will initialize a number of conventional null spellings such +as ``N/A``. + +Performance +=========== + +By default, the CSV reader will parallelize reads in order to exploit all +CPU cores on your machine. You can change this setting in +:member:`ReadOptions::use_threads`. A reasonable expectation is at least +100 MB/s per core on a modern desktop machine (measured in source CSV bytes, +not target Arrow data bytes). diff --git a/docs/source/cpp/flight.rst b/docs/source/cpp/flight.rst new file mode 100644 index 00000000000..c1d2e43b9f4 --- /dev/null +++ b/docs/source/cpp/flight.rst @@ -0,0 +1,119 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +================ +Arrow Flight RPC +================ + +Arrow Flight is an RPC framework for efficient transfer of Flight data +over the network. See :doc:`../format/Flight` for full details on +the protocol, or :doc:`./api/flight` for API docs. + +Writing a Flight Service +======================== + +Servers are subclasses of :class:`arrow::flight::FlightServerBase`. To +implement individual RPCs, override the RPC methods on this class. + +.. code-block:: cpp + + class MyFlightServer : public FlightServerBase { + Status ListFlights(const ServerCallContext& context, const Criteria* criteria, + std::unique_ptr* listings) override { + std::vector flights = ...; + *listings = std::unique_ptr(new SimpleFlightListing(flights)); + return Status::OK(); + } + }; + +Each RPC method always takes a +:class:`arrow::flight::ServerCallContext` for common parameters and +returns a :class:`arrow::Status` to indicate success or +failure. Flight-specific error codes can be returned via +:func:`arrow::flight::MakeFlightError`. + +RPC methods that return a value in addition to a status will use an +out parameter, as shown above. 
Often, there are helper classes +providing basic implementations of these out parameters. For instance, +above, :class:`arrow::flight::SimpleFlightListing` uses a vector of +:class:`arrow::flight::FlightInfo` objects as the result of a +``ListFlights`` RPC. + +To start a server, create a :class:`arrow::flight::Location` to +specify where to listen, and call +:func:`arrow::flight::FlightServerBase::Init`. This will start the +server, but won't block the rest of the program. Use +:func:`arrow::flight::FlightServerBase::SetShutdownOnSignals` to +enable stopping the server if an interrupt signal is received, then +call :func:`arrow::flight::FlightServerBase::Serve` to block until the +server stops. + +.. code-block:: cpp + + std::unique_ptr server; + // Initialize server + arrow::flight::Location location; + // Listen to all interfaces on a free port + ARROW_CHECK_OK(arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0, &location)); + arrow::flight::FlightServerOptions options(location); + + // Start the server + ARROW_CHECK_OK(server->Init(options)); + // Exit with a clean error code (0) on SIGTERM + ARROW_CHECK_OK(server->SetShutdownOnSignals({SIGTERM})); + + std::cout << "Server listening on localhost:" << server->port() << std::endl; + ARROW_CHECK_OK(server->Serve()); + + +Enabling TLS and Authentication +------------------------------- + +TLS can be enabled by providing a certificate and key pair to +:func:`FlightServerBase::Init +`. Additionally, use +:func:`Location::ForGrpcTls ` to +construct the :class:`arrow::flight::Location` to listen on. + +Similarly, authentication can be enabled by providing an +implementation of :class:`ServerAuthHandler +`. Authentication consists of two +parts: on initial client connection, the server and client +authentication implementations can perform any negotiation needed; +then, on each RPC thereafter, the client provides a token. The server +authentication handler validates the token and provides the identity +of the client. This identity can be obtained from the +:class:`arrow::flight::ServerCallContext`. + +Using the Flight Client +======================= + +To connect to a Flight service, create an instance of +:class:`arrow::flight::FlightClient` by calling :func:`Connect +`. This takes a Location and +returns the client through an out parameter. To authenticate, call +:func:`Authenticate ` with +the desired client authentication implementation. + +Each RPC method returns :class:`arrow::Status` to indicate the +success/failure of the request. Any other return values are specified +through out parameters. They also take an optional :class:`options +` parameter that allows specifying a +timeout for the call. diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 7c55b76912d..05fea989c66 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -18,8 +18,8 @@ .. default-domain:: cpp .. highlight:: cpp -Getting Started -=============== +User Guide +========== .. toctree:: @@ -29,3 +29,8 @@ Getting Started arrays datatypes tables + io + parquet + csv + json + flight diff --git a/docs/source/cpp/io.rst b/docs/source/cpp/io.rst new file mode 100644 index 00000000000..898bc9a5592 --- /dev/null +++ b/docs/source/cpp/io.rst @@ -0,0 +1,86 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. 
to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +============================== +Input / output and filesystems +============================== + +Arrow provides a range of C++ interfaces abstracting the concrete details +of input / output operations. They operate on streams of untyped binary data. +Those abstractions are used for various purposes such as reading CSV or +Parquet data, transmitting IPC streams, and more. + +.. cpp:namespace:: arrow::io + +Reading binary data +=================== + +Interfaces for reading binary data come in two flavours: + +* Sequential reading: the :class:`InputStream` interface provides + ``Read`` methods; it is recommended to ``Read`` to a ``Buffer`` as it + may in some cases avoid a memory copy. + +* Random access reading: the :class:`RandomAccessFile` interface + provides additional facilities for positioning and, most importantly, + the ``ReadAt`` methods which allow parallel reading from multiple threads. + +Concrete implementations are available for :class:`in-memory reads `, +:class:`unbuffered file reads `, +:class:`memory-mapped file reads `, +:class:`buffered reads `, +:class:`compressed reads `. + +Writing binary data +=================== + +Writing binary data is mostly done through the :class:`OutputStream` +interface. + +Concrete implementations are available for :class:`in-memory writes `, +:class:`unbuffered file writes `, +:class:`memory-mapped file writes `, +:class:`buffered writes `, +:class:`compressed writes `. + +.. cpp:namespace:: arrow::fs + +Filesystems +=========== + +The :class:`filesystem interface ` allows abstracted access over +various data storage backends such as the local filesystem or a S3 bucket. +It provides input and output streams as well as directory operations. + +The filesystem interface exposes a simplified view of the underlying data +storage. Data paths are represented as *abstract paths*, which are +``/``-separated, even on Windows, and shouldn't include special path +components such as ``.`` and ``..``. Symbolic links, if supported by the +underlying storage, are automatically dereferenced. Only basic +:class:`metadata ` about file entries, such as the file size +and modification time, is made available. + +Concrete implementations are available for +:class:`local filesystem access ` and +:class:`Amazon S3-compatible storage `. + +.. note:: + The filesystem layer is currently experimental. API details may vary + in the future. diff --git a/docs/source/cpp/json.rst b/docs/source/cpp/json.rst new file mode 100644 index 00000000000..93dcdfa3c47 --- /dev/null +++ b/docs/source/cpp/json.rst @@ -0,0 +1,125 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. 
"License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: arrow::json + +================== +Reading JSON files +================== + +Arrow allows reading line-separated JSON files as Arrow tables. Each +independent JSON object in the input file is converted to a row in +the target Arrow table. + +Basic usage +=========== + +A JSON file is read from a :class:`~arrow::io::InputStream`. + +.. code-block:: cpp + + #include "arrow/json/api.h" + + { + // ... + arrow::Status st; + arrow::MemoryPool* pool = default_memory_pool(); + std::shared_ptr input = ...; + + auto read_options = arrow::json::ReadOptions::Defaults(); + auto parse_options = arrow::json::ParseOptions::Defaults(); + + // Instantiate TableReader from input stream and options + std::shared_ptr reader; + st = arrow::json::TableReader::Make(pool, input, read_options, + parse_options, &reader); + if (!st.ok()) { + // Handle TableReader instantiation error... + } + + std::shared_ptr table; + // Read table from JSON file + st = reader->Read(&table); + if (!st.ok()) { + // Handle JSON read error + // (for example a JSON syntax error or failed type conversion) + } + } + +Data types +========== + +Since JSON values are typed, the possible Arrow data types on output +depend on the input value types. Top-level JSON values should always be +objects. The fields of top-level objects are taken to represent columns +in the Arrow data. For each name/value pair in a JSON object, there are +two possible modes of deciding the output data type: + +* if the name is in :class:`ConvertOptions::explicit_schema`, + conversion of the JSON value to the corresponding Arrow data type is + attempted; + +* otherwise, the Arrow data type is determined via type inference on + the JSON value, trying out a number of Arrow data types in order. + +The following tables show the possible combinations for each of those +two modes. + +.. table:: Explicit conversions from JSON to Arrow + :align: center + + +-----------------+----------------------------------------------------+ + | JSON value type | Allowed Arrow data types | + +=================+====================================================+ + | Null | Any (including Null) | + +-----------------+----------------------------------------------------+ + | Number | All Integer types, Float32, Float64, | + | | Date32, Date64, Time32, Time64 | + +-----------------+----------------------------------------------------+ + | Boolean | Boolean | + +-----------------+----------------------------------------------------+ + | String | Binary, LargeBinary, String, LargeString, | + | | Timestamp | + +-----------------+----------------------------------------------------+ + | Array | List | + +-----------------+----------------------------------------------------+ + | Object (nested) | Struct | + +-----------------+----------------------------------------------------+ + +.. 
table:: Implicit type inference from JSON to Arrow + :align: center + + +-----------------+----------------------------------------------------+ + | JSON value type | Inferred Arrow data types (in order) | + +=================+====================================================+ + | Null | Null, any other | + +-----------------+----------------------------------------------------+ + | Number | Int64, Float64 | + | | | + +-----------------+----------------------------------------------------+ + | Boolean | Boolean | + +-----------------+----------------------------------------------------+ + | String | Timestamp (with seconds unit), String | + | | | + +-----------------+----------------------------------------------------+ + | Array | List | + +-----------------+----------------------------------------------------+ + | Object (nested) | Struct | + +-----------------+----------------------------------------------------+ diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst index 53fc998eae6..ccebdba45dd 100644 --- a/docs/source/cpp/overview.rst +++ b/docs/source/cpp/overview.rst @@ -89,3 +89,9 @@ The devices layer Basic **CUDA** integration is provided, allowing to describe Arrow data backed by GPU-allocated memory. + +The filesystem layer +-------------------- + +A filesystem abstraction allows reading and writing data from different storage +backends, such as the local filesystem or a S3 bucket. diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst new file mode 100644 index 00000000000..62e342e9e69 --- /dev/null +++ b/docs/source/cpp/parquet.rst @@ -0,0 +1,69 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: parquet::arrow + +================================= +Reading and writing Parquet files +================================= + +The Parquet C++ library is part of the Apache Arrow project and benefits +from tight integration with Arrow C++. + +Reading +======= + +The Parquet :class:`FileReader` requires a :class:`::arrow::io::RandomAccessFile` +instance representing the input file. + +.. code-block:: cpp + + #include "arrow/parquet/arrow/reader.h" + + { + // ... + arrow::Status st; + arrow::MemoryPool* pool = default_memory_pool(); + std::shared_ptr input = ...; + + // Open Parquet file reader + std::unique_ptr arrow_reader; + st = parquet::arrow::OpenFile(input, pool, &arrow_reader); + if (!st.ok()) { + // Handle error instantiating file reader... + } + + // Read entire file as a single Arrow table + std::shared_ptr table; + st = arrow_reader->ReadTable(&table); + if (!st.ok()) { + // Handle error reading Parquet data... + } + } + +Finer-grained options are available through the :class:`FileReaderBuilder` +helper class. + +.. 
TODO write section about performance and memory efficiency + +Writing +======= + +TODO: write this diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index e929c6eecd8..d5dbb3f73b7 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -18,9 +18,9 @@ .. default-domain:: cpp .. highlight:: cpp -======================== -Two-dimensional Datasets -======================== +============ +Tabular Data +============ While arrays and chunked arrays represent a one-dimensional sequence of homogenous values, data often comes in the form of two-dimensional sets of diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index b3476eadf33..7ffa2430128 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -78,7 +78,7 @@ See `Protocol Buffer Definitions`_ for full details on the methods and messages involved. Authentication -~~~~~~~~~~~~~~ +-------------- Flight supports application-implemented authentication methods. Authentication, if enabled, has two phases: at connection @@ -92,6 +92,50 @@ call, or they may establish trust during the handshake and not validate a token for each call. (Note that the latter is not secure if you choose to deploy a layer 7 load balancer, as is common with gRPC.) +Error Handling +-------------- + +Arrow Flight defines its own set of error codes. The implementation +differs between languages (e.g. in C++, Unimplemented is a general +Arrow error status while it's a Flight-specific exception in Java), +but the following set is exposed: + ++----------------+-------------------------------------------+ +|Error Code |Description | ++================+===========================================+ +|UNKNOWN |An unknown error. The default if no other | +| |error applies. | ++----------------+-------------------------------------------+ +|INTERNAL |An error internal to the service | +| |implementation occurred. | ++----------------+-------------------------------------------+ +|INVALID_ARGUMENT|The client passed an invalid argument to | +| |the RPC. | ++----------------+-------------------------------------------+ +|TIMED_OUT |The operation exceeded a timeout or | +| |deadline. | ++----------------+-------------------------------------------+ +|NOT_FOUND |The requested resource (action, data | +| |stream) was not found. | ++----------------+-------------------------------------------+ +|ALREADY_EXISTS |The resource already exists. | ++----------------+-------------------------------------------+ +|CANCELLED |The operation was cancelled (either by the | +| |client or the server). | ++----------------+-------------------------------------------+ +|UNAUTHENTICATED |The client is not authenticated. | ++----------------+-------------------------------------------+ +|UNAUTHORIZED |The client is authenticated, but does not | +| |have permissions for the requested | +| |operation. | ++----------------+-------------------------------------------+ +|UNIMPLEMENTED |The RPC is not implemented. | ++----------------+-------------------------------------------+ +|UNAVAILABLE |The server is not available. May be emitted| +| |by the client for connectivity reasons. 
| ++----------------+-------------------------------------------+ + + External Resources ------------------ diff --git a/integration/spark/ARROW-6429.patch b/integration/spark/ARROW-6429.patch new file mode 100644 index 00000000000..e6140847111 --- /dev/null +++ b/integration/spark/ARROW-6429.patch @@ -0,0 +1,31 @@ +diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +index 1a6f4ac..42d555b 100644 +--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala ++++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +@@ -26,7 +26,7 @@ import org.apache.arrow.flatbuf.MessageHeader + import org.apache.arrow.memory.BufferAllocator + import org.apache.arrow.vector._ + import org.apache.arrow.vector.ipc.{ArrowStreamWriter, ReadChannel, WriteChannel} +-import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, MessageSerializer} ++import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, IpcOption, MessageSerializer} + + import org.apache.spark.TaskContext + import org.apache.spark.api.java.JavaRDD +@@ -64,7 +64,7 @@ private[sql] class ArrowBatchStreamWriter( + * End the Arrow stream, does not close output stream. + */ + def end(): Unit = { +- ArrowStreamWriter.writeEndOfStream(writeChannel) ++ ArrowStreamWriter.writeEndOfStream(writeChannel, new IpcOption) + } + } + +@@ -252,7 +252,7 @@ private[sql] object ArrowConverters { + if (msgMetadata.getMessage.headerType() == MessageHeader.RecordBatch) { + + // Buffer backed output large enough to hold the complete serialized message +- val bbout = new ByteBufferOutputStream(4 + msgMetadata.getMessageLength + bodyLength) ++ val bbout = new ByteBufferOutputStream(8 + msgMetadata.getMessageLength + bodyLength) + + // Write message metadata to ByteBuffer output stream + MessageSerializer.writeMessageBuffer( diff --git a/integration/spark/Dockerfile b/integration/spark/Dockerfile index d6a29bde6bc..6b032034986 100644 --- a/integration/spark/Dockerfile +++ b/integration/spark/Dockerfile @@ -17,7 +17,7 @@ FROM arrow:python-3.6 # installing java and maven -ARG MAVEN_VERSION=3.5.4 +ARG MAVEN_VERSION=3.6.2 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ MAVEN_HOME=/usr/local/maven \ M2_HOME=/root/.m2 \ @@ -38,6 +38,11 @@ RUN wget -q -O /tmp/spark.tar.gz https://github.com/apache/spark/archive/$SPARK_ tar -xzf /tmp/spark.tar.gz -C /spark --strip-components=1 && \ rm /tmp/spark.tar.gz +# patch spark to build with current Arrow Java +COPY integration/spark/ARROW-6429.patch /tmp/ +RUN patch -d /spark -p1 -i /tmp/ARROW-6429.patch && \ + rm /tmp/ARROW-6429.patch + # build cpp with tests ENV CC=gcc \ CXX=g++ \ diff --git a/integration/spark/runtest.sh b/integration/spark/runtest.sh index 593683b8079..f8d2cbe9cb7 100755 --- a/integration/spark/runtest.sh +++ b/integration/spark/runtest.sh @@ -47,8 +47,6 @@ pushd /spark build/mvn -B -Dtest=none -DwildcardSuites=$(IFS=,; echo "${SPARK_SCALA_TESTS[*]}") test # Run pyarrow related Python tests only - echo "Testing PySpark:" - SPARK_PYTHON_TESTS=( "pyspark.sql.tests.test_arrow" "pyspark.sql.tests.test_pandas_udf" @@ -58,5 +56,5 @@ pushd /spark "pyspark.sql.tests.test_pandas_udf_window") (echo "Testing PySpark:"; IFS=$'\n'; echo "${SPARK_PYTHON_TESTS[*]}") - python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")" + python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")" --python-executables python 
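A note on the ARROW-6429 Spark patch above: the ByteBufferOutputStream sizing changes from "4 + messageLength + bodyLength" to "8 + messageLength + bodyLength" because newer Arrow IPC messages are framed by a 4-byte continuation marker followed by a 4-byte metadata length, whereas the older framing used the 4-byte length alone. Below is a minimal Python sketch of that framing difference; the constant and helper are illustrative only and are not part of any Arrow API.

    import struct

    # Marker written before the metadata length in the newer IPC framing
    CONTINUATION_MARKER = 0xFFFFFFFF

    def message_prefix(metadata_length, legacy=False):
        """Return the bytes that precede an IPC message's flatbuffer metadata."""
        if legacy:
            # older framing: just the 4-byte little-endian metadata length
            return struct.pack("<i", metadata_length)
        # newer framing: continuation marker + metadata length = 8 bytes
        return struct.pack("<Ii", CONTINUATION_MARKER, metadata_length)

    assert len(message_prefix(256, legacy=True)) == 4
    assert len(message_prefix(256)) == 8

The extra 4 bytes are exactly the difference the patch accounts for when it pre-sizes the buffer that holds the serialized message.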
popd diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BigIntConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BigIntConsumer.java index 28af49015ec..561499ab9e8 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BigIntConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BigIntConsumer.java @@ -45,8 +45,9 @@ public BigIntConsumer(BigIntVector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { long value = resultSet.getLong(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BitConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BitConsumer.java index 9a1a74c0a78..20d181f8a02 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BitConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/BitConsumer.java @@ -45,8 +45,9 @@ public BitConsumer(BitVector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { boolean value = resultSet.getBoolean(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value ? 1 : 0); + vector.setSafe(currentIndex, value ? 1 : 0); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java index f373ca839b6..856dd8da2e7 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DateConsumer.java @@ -57,8 +57,9 @@ public void consume(ResultSet resultSet) throws SQLException { Date date = calendar == null ? 
resultSet.getDate(columnIndexInResultSet) : resultSet.getDate(columnIndexInResultSet, calendar); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, date.getTime()); + vector.setSafe(currentIndex, date.getTime()); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DecimalConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DecimalConsumer.java index fbf1672a648..b93560c0a6d 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DecimalConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DecimalConsumer.java @@ -46,8 +46,9 @@ public DecimalConsumer(DecimalVector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { BigDecimal value = resultSet.getBigDecimal(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DoubleConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DoubleConsumer.java index fbaced733bd..80118a2d4c0 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DoubleConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/DoubleConsumer.java @@ -45,8 +45,9 @@ public DoubleConsumer(Float8Vector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { double value = resultSet.getDouble(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/FloatConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/FloatConsumer.java index 485616ded43..0625c6464dd 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/FloatConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/FloatConsumer.java @@ -45,8 +45,9 @@ public FloatConsumer(Float4Vector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { float value = resultSet.getFloat(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/IntConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/IntConsumer.java index fb69e620831..42bddc3c6aa 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/IntConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/IntConsumer.java @@ -45,8 +45,9 @@ public IntConsumer(IntVector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { int value = resultSet.getInt(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/SmallIntConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/SmallIntConsumer.java index 7a3af254c70..8a27a4d682e 100644 --- 
a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/SmallIntConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/SmallIntConsumer.java @@ -45,8 +45,9 @@ public SmallIntConsumer(SmallIntVector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { short value = resultSet.getShort(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimeConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimeConsumer.java index 1a593ca839d..be9caaacc7b 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimeConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimeConsumer.java @@ -57,8 +57,9 @@ public void consume(ResultSet resultSet) throws SQLException { Time time = calendar == null ? resultSet.getTime(columnIndexInResultSet) : resultSet.getTime(columnIndexInResultSet, calendar); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, (int) time.getTime()); + vector.setSafe(currentIndex, (int) time.getTime()); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimestampConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimestampConsumer.java index 7d2acae68c8..94b84c1e481 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimestampConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TimestampConsumer.java @@ -57,8 +57,9 @@ public void consume(ResultSet resultSet) throws SQLException { Timestamp timestamp = calendar == null ? 
resultSet.getTimestamp(columnIndexInResultSet) : resultSet.getTimestamp(columnIndexInResultSet, calendar); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, timestamp.getTime()); + vector.setSafe(currentIndex, timestamp.getTime()); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TinyIntConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TinyIntConsumer.java index da93949914a..8089d5e2657 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TinyIntConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/TinyIntConsumer.java @@ -45,8 +45,9 @@ public TinyIntConsumer(TinyIntVector vector, int index) { public void consume(ResultSet resultSet) throws SQLException { byte value = resultSet.getByte(columnIndexInResultSet); if (!resultSet.wasNull()) { - vector.setSafe(currentIndex++, value); + vector.setSafe(currentIndex, value); } + currentIndex++; } @Override diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/VarCharConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/VarCharConsumer.java index f13755a8d11..8387700751c 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/VarCharConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/VarCharConsumer.java @@ -48,8 +48,9 @@ public void consume(ResultSet resultSet) throws SQLException { if (!resultSet.wasNull()) { byte[] bytes = value.getBytes(StandardCharsets.UTF_8); - vector.setSafe(currentIndex++, bytes); + vector.setSafe(currentIndex, bytes); } + currentIndex++; } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java index 6e74c212116..4b4000cef8e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/ApproxEqualsVisitor.java @@ -69,8 +69,14 @@ public void setDoubleDiffFunction(DiffFunctionDouble doubleDiffFunction) { @Override public Boolean visit(BaseFixedWidthVector left, Range range) { if (left instanceof Float4Vector) { + if (!validate(left)) { + return false; + } return float4ApproxEquals(range); } else if (left instanceof Float8Vector) { + if (!validate(left)) { + return false; + } return float8ApproxEquals(range); } else { return super.visit(left, range); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 5d43031ffb5..d6c9ac7b4f3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -58,7 +58,11 @@ public RangeEqualsVisitor(ValueVector left, ValueVector right, boolean isTypeChe Preconditions.checkArgument(right != null, "right vector cannot be null"); - // types cannot change for a visitor instance. so, the check is done only once. + // type usually checks only once unless the left vector is changed. 
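The RangeEqualsVisitor change in this hunk replaces the one-time constructor type check with a cached result that is recomputed whenever the visitor is handed a different left-hand vector. A minimal sketch of that pattern, written here in Python purely for illustration (the class and the fake vectors below mirror the Java change but are not an actual Arrow API):

    from collections import namedtuple

    FakeVector = namedtuple("FakeVector", ["type"])

    class RangeEqualsVisitorSketch:
        def __init__(self, left, right):
            self.left = left
            self.right = right
            self._check_type()

        def _check_type(self):
            # types are compared once per left vector, not on every visit
            self.type_compare_result = (self.left is self.right
                                        or self.left.type == self.right.type)

        def validate(self, left):
            # if the left vector changed since construction, reset and re-check
            if left is not self.left:
                self.left = left
                self._check_type()
            return self.type_compare_result

    v1, v2, v3 = FakeVector("int32"), FakeVector("int32"), FakeVector("int64")
    visitor = RangeEqualsVisitorSketch(v1, v2)
    assert visitor.validate(v1) is True    # same left vector: cached result reused
    assert visitor.validate(v3) is False   # new left vector of another type: re-checked, fails

Each visit method then calls validate(left) first and returns false on a type mismatch instead of comparing values of incompatible vectors, which is what the added guards in the Java hunks below do.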
+ checkType(); + } + + private void checkType() { if (!isTypeCheckNeeded) { typeCompareResult = true; } else if (left == right) { @@ -68,6 +72,17 @@ public RangeEqualsVisitor(ValueVector left, ValueVector right, boolean isTypeChe } } + /** + * Validate the passed left vector, if it is changed, reset and check type. + */ + protected boolean validate(ValueVector left) { + if (left != this.left) { + this.left = left; + checkType(); + } + return typeCompareResult; + } + /** * Constructs a new instance. * @@ -79,7 +94,7 @@ public RangeEqualsVisitor(ValueVector left, ValueVector right) { } /** - * Check range equals without passing IN param in VectorVisitor. + * Check range equals. */ public boolean rangeEquals(Range range) { if (!typeCompareResult) { @@ -107,42 +122,59 @@ public ValueVector getRight() { return right; } - public boolean isTypeCheckNeeded() { - return isTypeCheckNeeded; - } - @Override public Boolean visit(BaseFixedWidthVector left, Range range) { + if (!validate(left)) { + return false; + } return compareBaseFixedWidthVectors(range); } @Override public Boolean visit(BaseVariableWidthVector left, Range range) { + if (!validate(left)) { + return false; + } return compareBaseVariableWidthVectors(range); } @Override public Boolean visit(ListVector left, Range range) { + if (!validate(left)) { + return false; + } return compareListVectors(range); } @Override public Boolean visit(FixedSizeListVector left, Range range) { + if (!validate(left)) { + return false; + } return compareFixedSizeListVectors(range); } @Override public Boolean visit(NonNullableStructVector left, Range range) { + if (!validate(left)) { + return false; + } return compareStructVectors(range); } @Override public Boolean visit(UnionVector left, Range range) { + if (!validate(left)) { + return false; + } return compareUnionVectors(range); } @Override public Boolean visit(ZeroVector left, Range range) { + if (!validate(left)) { + return false; + } return true; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 04d73e231cd..6cfd70ddc77 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -86,6 +86,31 @@ public void testIntVectorEqualsWithNull() { } } + @Test + public void testEqualsWithTypeChange() { + try (final IntVector vector1 = new IntVector("intVector1", allocator); + final IntVector vector2 = new IntVector("intVector2", allocator); + final BigIntVector vector3 = new BigIntVector("bigIntVector", allocator)) { + + vector1.allocateNew(2); + vector1.setValueCount(2); + vector2.allocateNew(2); + vector2.setValueCount(2); + + vector1.setSafe(0, 1); + vector1.setSafe(1, 2); + + vector2.setSafe(0, 1); + vector2.setSafe(1, 2); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + Range range = new Range(0, 0, 2); + assertTrue(vector1.accept(visitor, range)); + // visitor left vector changed, will reset and check type again + assertFalse(vector3.accept(visitor, range)); + } + } + @Test public void testBaseFixedWidthVectorRangeEqual() { try (final IntVector vector1 = new IntVector("int", allocator); diff --git a/js/src/compute/dataframe.ts b/js/src/compute/dataframe.ts index 2d9962d5317..e82e65cc1b9 100644 --- a/js/src/compute/dataframe.ts +++ b/js/src/compute/dataframe.ts @@ -31,6 +31,7 @@ export type NextFunc = (idx: 
number, batch: RecordBatch) => void; Table.prototype.countBy = function(this: Table, name: Col | string) { return new DataFrame(this.chunks).countBy(name); }; Table.prototype.scan = function(this: Table, next: NextFunc, bind?: BindFunc) { return new DataFrame(this.chunks).scan(next, bind); }; +Table.prototype.scanReverse = function(this: Table, next: NextFunc, bind?: BindFunc) { return new DataFrame(this.chunks).scanReverse(next, bind); }; Table.prototype.filter = function(this: Table, predicate: Predicate): FilteredDataFrame { return new DataFrame(this.chunks).filter(predicate); }; export class DataFrame extends Table { @@ -49,6 +50,18 @@ export class DataFrame extends Tabl } } } + public scanReverse(next: NextFunc, bind?: BindFunc) { + const batches = this.chunks, numBatches = batches.length; + for (let batchIndex = numBatches; --batchIndex >= 0;) { + // load batches + const batch = batches[batchIndex]; + if (bind) { bind(batch); } + // yield all indices + for (let index = batch.length; --index >= 0;) { + next(index, batch); + } + } + } public countBy(name: Col | string) { const batches = this.chunks, numBatches = batches.length; const count_by = typeof name === 'string' ? new Col(name) : name as Col; @@ -130,6 +143,23 @@ export class FilteredDataFrame exte } } } + public scanReverse(next: NextFunc, bind?: BindFunc) { + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = numBatches; --batchIndex >= 0;) { + // load batches + const batch = batches[batchIndex]; + // TODO: bind batches lazily + // If predicate doesn't match anything in the batch we don't need + // to bind the callback + if (bind) { bind(batch); } + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = batch.length; --index >= 0;) { + if (predicate(index, batch)) { next(index, batch); } + } + } + } public count(): number { // inlined version of this: // let sum = 0; diff --git a/js/src/table.ts b/js/src/table.ts index b7cdbe22127..5c41e14a9f5 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -44,6 +44,7 @@ export interface Table { clone(chunks?: RecordBatch[], offsets?: Uint32Array): Table; scan(next: import('./compute/dataframe').NextFunc, bind?: import('./compute/dataframe').BindFunc): void; + scanReverse(next: import('./compute/dataframe').NextFunc, bind?: import('./compute/dataframe').BindFunc): void; countBy(name: import('./compute/predicate').Col | string): import('./compute/dataframe').CountByResult; filter(predicate: import('./compute/predicate').Predicate): import('./compute/dataframe').FilteredDataFrame; } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 22c50a7737b..ae2f058e7da 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -326,6 +326,24 @@ describe(`Table`, () => { } }); }); + describe(`scanReverse()`, () => { + test(`yields all values`, () => { + const table = datum.table(); + let expected_idx = values.length; + table.scanReverse((idx, batch) => { + const columns = batch.schema.fields.map((_, i) => batch.getChildAt(i)!); + expect(columns.map((c) => c.get(idx))).toEqual(values[--expected_idx]); + }); + }); + test(`calls bind function with every batch`, () => { + const table = datum.table(); + let bind = jest.fn(); + table.scanReverse(() => { }, bind); + for (let batch of table.chunks) { + expect(bind).toHaveBeenCalledWith(batch); + } + }); + }); test(`count() returns the correct length`, () => { const table = datum.table(); const values = datum.values(); @@ -434,6 +452,26 @@ 
describe(`Table`, () => { } }); }); + describe(`scanReverse()`, () => { + test(`iterates over expected values in reverse`, () => { + let expected_idx = expected.length; + filtered.scanReverse((idx, batch) => { + const columns = batch.schema.fields.map((_, i) => batch.getChildAt(i)!); + expect(columns.map((c) => c.get(idx))).toEqual(expected[--expected_idx]); + }); + }); + test(`calls bind function on every batch`, () => { + // Techincally, we only need to call bind on + // batches with data that match the predicate, so + // this test may fail in the future if we change + // that - and that's ok! + let bind = jest.fn(); + filtered.scanReverse(() => { }, bind); + for (let batch of table.chunks) { + expect(bind).toHaveBeenCalledWith(batch); + } + }); + }); }); } test(`countBy on dictionary returns the correct counts`, () => { diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e47327ff2d4..843138a55b5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -903,12 +903,10 @@ cdef class Array(_PandasConvertible): self, &out)) result = pandas_api.series(wrap_array_output(out), name=self._name) - if isinstance(self.type, TimestampType): - tz = self.type.tz - if tz is not None: - tz = string_to_tzinfo(tz) - result = (result.dt.tz_localize('utc') - .dt.tz_convert(tz)) + if isinstance(self.type, TimestampType) and self.type.tz is not None: + from pyarrow.pandas_compat import make_tz_aware + + result = make_tz_aware(result, self.type.tz) return result @@ -1290,7 +1288,9 @@ cdef class UnionArray(Array): check_status(CUnionArray.MakeDense( deref(types.ap), deref(value_offsets.ap), c, c_field_names, c_type_codes, &out)) - return pyarrow_wrap_array(out) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result @staticmethod def from_sparse(Array types, list children, list field_names=None, @@ -1328,7 +1328,9 @@ cdef class UnionArray(Array): c_field_names, c_type_codes, &out)) - return pyarrow_wrap_array(out) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result cdef class StringArray(Array): @@ -1503,7 +1505,9 @@ cdef class DictionaryArray(Array): c_result.reset(new CDictionaryArray(c_type, _indices.sp_array, _dictionary.sp_array)) - return pyarrow_wrap_array(c_result) + cdef Array result = pyarrow_wrap_array(c_result) + result.validate() + return result cdef class StructArray(Array): @@ -1628,7 +1632,9 @@ cdef class StructArray(Array): else: c_result = CStructArray.MakeFromFields( c_arrays, c_fields, shared_ptr[CBuffer](), -1, 0) - return pyarrow_wrap_array(GetResultValue(c_result)) + cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) + result.validate() + return result cdef class ExtensionArray(Array): @@ -1667,7 +1673,9 @@ cdef class ExtensionArray(Array): "for extension type {1}".format(storage.type, typ)) ext_array = make_shared[CExtensionArray](typ.sp_type, storage.sp_array) - return pyarrow_wrap_array( ext_array) + cdef Array result = pyarrow_wrap_array( ext_array) + result.validate() + return result cdef dict _array_classes = { diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 4b7e2e02987..900711ab6bd 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -1046,3 +1046,17 @@ def _add_any_metadata(table, pandas_metadata): return pa.Table.from_arrays(columns, schema=pa.schema(fields)) else: return table + + +# ---------------------------------------------------------------------- +# Helper functions used in lib + + +def 
make_tz_aware(series, tz): + """ + Make a datetime64 Series timezone-aware for the given tz + """ + tz = pa.lib.string_to_tzinfo(tz) + series = (series.dt.tz_localize('utc') + .dt.tz_convert(tz)) + return series diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index a02dd56c9b1..e75930a1bc4 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -27,6 +27,7 @@ import os import re import six +import operator import pyarrow as pa import pyarrow.lib as lib @@ -142,15 +143,17 @@ def _build_nested_paths(self): result = defaultdict(list) - def _visit_piece(i, key, rest): - result[key].append(i) + for i, path in enumerate(paths): + key = path[0] + rest = path[1:] + while True: + result[key].append(i) - if len(rest) > 0: - nested_key = '.'.join((key, rest[0])) - _visit_piece(i, nested_key, rest[1:]) + if not rest: + break - for i, path in enumerate(paths): - _visit_piece(i, path[0], path[1:]) + key = '.'.join((key, rest[0])) + rest = rest[1:] return result @@ -922,6 +925,11 @@ def _path_split(path, sep): EXCLUDED_PARQUET_PATHS = {'_SUCCESS'} +class _ParquetDatasetMetadata: + __slots__ = ('fs', 'memory_map', 'read_dictionary', 'common_metadata', + 'buffer_size') + + def _open_dataset_file(dataset, path, meta=None): if dataset.fs is not None and not isinstance(dataset.fs, LocalFileSystem): path = dataset.fs.open(path, mode='rb') @@ -999,32 +1007,37 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, memory_map=False, buffer_size=0): + self._metadata = _ParquetDatasetMetadata() a_path = path_or_paths if isinstance(a_path, list): a_path = a_path[0] - self.fs, _ = _get_filesystem_and_path(filesystem, a_path) + self._metadata.fs, _ = _get_filesystem_and_path(filesystem, a_path) if isinstance(path_or_paths, list): self.paths = [_parse_uri(path) for path in path_or_paths] else: self.paths = _parse_uri(path_or_paths) - self.read_dictionary = read_dictionary - self.memory_map = memory_map - self.buffer_size = buffer_size + self._metadata.read_dictionary = read_dictionary + self._metadata.memory_map = memory_map + self._metadata.buffer_size = buffer_size (self.pieces, self.partitions, self.common_metadata_path, self.metadata_path) = _make_manifest( path_or_paths, self.fs, metadata_nthreads=metadata_nthreads, - open_file_func=partial(_open_dataset_file, self)) + open_file_func=partial(_open_dataset_file, self._metadata) + ) if self.common_metadata_path is not None: with self.fs.open(self.common_metadata_path) as f: - self.common_metadata = read_metadata(f, memory_map=memory_map) + self._metadata.common_metadata = read_metadata( + f, + memory_map=memory_map + ) else: - self.common_metadata = None + self._metadata.common_metadata = None if metadata is None and self.metadata_path is not None: with self.fs.open(self.metadata_path) as f: @@ -1171,6 +1184,16 @@ def all_filters_accept(piece): self.pieces = [p for p in self.pieces if all_filters_accept(p)] + fs = property(operator.attrgetter('_metadata.fs')) + memory_map = property(operator.attrgetter('_metadata.memory_map')) + read_dictionary = property( + operator.attrgetter('_metadata.read_dictionary') + ) + common_metadata = property( + operator.attrgetter('_metadata.common_metadata') + ) + buffer_size = property(operator.attrgetter('_metadata.buffer_size')) + def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, open_file_func=None): diff --git a/python/pyarrow/table.pxi 
b/python/pyarrow/table.pxi index a09766fd4b7..086c4f45011 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -163,7 +163,14 @@ cdef class ChunkedArray(_PandasConvertible): self.sp_chunked_array, self, &out)) - return pandas_api.series(wrap_array_output(out), name=self._name) + result = pandas_api.series(wrap_array_output(out), name=self._name) + + if isinstance(self.type, TimestampType) and self.type.tz is not None: + from pyarrow.pandas_compat import make_tz_aware + + result = make_tz_aware(result, self.type.tz) + + return result def __array__(self, dtype=None): values = self.to_pandas().values diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index a095d81bcf7..bfb1e8e5d0b 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -51,7 +51,9 @@ 'plasma', 's3', 'tensorflow', - 'flight' + 'flight', + 'slow', + 'requires_testing_data', ] @@ -70,6 +72,8 @@ 's3': False, 'tensorflow': False, 'flight': False, + 'slow': False, + 'requires_testing_data': True, } try: @@ -166,18 +170,6 @@ def bool_env(name, default=None): action='store_true', default=default, help=('Run only the {} test group'.format(group))) - parser.addoption('--runslow', action='store_true', - default=False, help='run slow tests') - - -def pytest_collection_modifyitems(config, items): - if not config.getoption('--runslow'): - skip_slow = pytest.mark.skip(reason='need --runslow option to run') - - for item in items: - if 'slow' in item.keywords: - item.add_marker(skip_slow) - def pytest_runtest_setup(item): only_set = False diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 9aa8a7c8b48..f4deb241865 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -152,15 +152,15 @@ def test_to_pandas_zero_copy(): arr = pa.array(range(10)) for i in range(10): - np_arr = arr.to_pandas() - assert sys.getrefcount(np_arr) == 2 - np_arr = None # noqa + series = arr.to_pandas() + assert sys.getrefcount(series) == 2 + series = None # noqa assert sys.getrefcount(arr) == 2 for i in range(10): arr = pa.array(range(10)) - np_arr = arr.to_pandas() + series = arr.to_pandas() arr = None gc.collect() @@ -168,9 +168,9 @@ def test_to_pandas_zero_copy(): # Because of py.test's assert inspection magic, if you put getrefcount # on the line being examined, it will be 1 higher than you expect - base_refcount = sys.getrefcount(np_arr.base) + base_refcount = sys.getrefcount(series.values.base) assert base_refcount == 2 - np_arr.sum() + series.sum() @pytest.mark.nopandas @@ -422,7 +422,7 @@ def test_struct_from_buffers(): def test_struct_from_arrays(): - a = pa.array([4, 5, 6]) + a = pa.array([4, 5, 6], type=pa.int64()) b = pa.array(["bar", None, ""]) c = pa.array([[1, 2], None, [3, None]]) expected_list = [ @@ -447,7 +447,7 @@ def test_struct_from_arrays(): # From fields fa = pa.field("a", a.type, nullable=False) fb = pa.field("b", b.type) - fc = pa.field("c", b.type) + fc = pa.field("c", c.type) arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc]) assert arr.type == pa.struct([fa, fb, fc]) assert not arr.type[0].nullable @@ -460,6 +460,11 @@ def test_struct_from_arrays(): assert arr.type == pa.struct([]) assert arr.to_pylist() == [] + # Inconsistent fields + fa2 = pa.field("a", pa.int32()) + with pytest.raises(ValueError, match="int64 vs int32"): + pa.StructArray.from_arrays([a, b, c], fields=[fa2, fb, fc]) + def test_dictionary_from_numpy(): indices = np.repeat([0, 1, 2], 2) @@ -1786,3 +1791,6 
@@ def test_to_pandas_timezone(): arr = pa.array([1, 2, 3], type=pa.timestamp('s', tz='Europe/Brussels')) s = arr.to_pandas() assert s.dt.tz is not None + arr = pa.chunked_array([arr]) + s = arr.to_pandas() + assert s.dt.tz is not None diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 3c03c4c31ec..35a40241450 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -352,3 +352,39 @@ def test_generic_ext_type_register(registered_period_type): period_type = PeriodType('D') with pytest.raises(KeyError): pa.register_extension_type(period_type) + + +@pytest.mark.parquet +def test_parquet(tmpdir, registered_period_type): + # parquet support for extension types + period_type = PeriodType('D') + storage = pa.array([1, 2, 3, 4], pa.int64()) + arr = pa.ExtensionArray.from_storage(period_type, storage) + table = pa.table([arr], names=["ext"]) + + import pyarrow.parquet as pq + + filename = tmpdir / 'extension_type.parquet' + pq.write_table(table, filename) + + # stored in parquet as storage type but with extension metadata saved + # in the serialized arrow schema + meta = pq.read_metadata(filename) + assert meta.schema.column(0).physical_type == "INT64" + assert b"ARROW:schema" in meta.metadata + + import base64 + decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) + schema = pa.read_schema(pa.BufferReader(decoded_schema)) + assert schema.field("ext").metadata == { + b'ARROW:extension:metadata': b'freq=D', + b'ARROW:extension:name': b'pandas.period'} + + # when reading in, properly create extension type if it is registered + result = pq.read_table(filename) + assert result.column("ext").type == period_type + + # when the type is not registered, read in as storage type + pa.unregister_extension_type(period_type.extension_name) + result = pq.read_table(filename) + assert result.column("ext").type == pa.int64() diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 12fad99ba84..76148e7d8b2 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -1191,7 +1191,7 @@ def test_compressed_recordbatch_stream(compression): else: raise writer = pa.RecordBatchStreamWriter(stream, table.schema) - writer.write_table(table, chunksize=3) + writer.write_table(table, max_chunksize=3) writer.close() stream.close() # Flush data buf = raw.getvalue() diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f8a3563e25f..a2c6352bdda 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2352,9 +2352,7 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): # nanosecond timestamps by default n = 9 df = pd.DataFrame({'x': range(n)}, - index=pd.DatetimeIndex(start='2017-01-01', - freq='1n', - periods=n)) + index=pd.date_range('2017-01-01', freq='1n', periods=n)) tb = pa.Table.from_pandas(df) filename = tempdir / 'written.parquet' @@ -3025,7 +3023,7 @@ def test_write_nested_zero_length_array_chunk_failure(): # Each column is a ChunkedArray with 2 elements my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten() for batch in data] - my_batches = [pa.RecordBatch.from_arrays(batch, pa.schema(cols)) + my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols)) for batch in my_arrays] tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) _check_roundtrip(tbl) @@ -3347,3 +3345,42 @@ def test_filter_before_validate_schema(tempdir): # read 
single file using filter table = pq.read_table(tempdir, filters=[[('A', '==', 0)]]) assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) + + +@pytest.mark.pandas +@pytest.mark.fastparquet +@pytest.mark.filterwarnings("ignore:RangeIndex:DeprecationWarning") +def test_fastparquet_cross_compatibility(tempdir): + fp = pytest.importorskip('fastparquet') + + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(4.0, 7.0, dtype="float64"), + "d": [True, False, True], + "e": pd.date_range("20130101", periods=3), + "f": pd.Categorical(["a", "b", "a"]), + # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip + # "g": [[1, 2], None, [1, 2, 3]], + } + ) + table = pa.table(df) + + # Arrow -> fastparquet + file_arrow = str(tempdir / "cross_compat_arrow.parquet") + pq.write_table(table, file_arrow, compression=None) + + fp_file = fp.ParquetFile(file_arrow) + df_fp = fp_file.to_pandas() + tm.assert_frame_equal(df, df_fp) + + # Fastparquet -> arrow + file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet") + fp.write(file_fastparquet, df) + + table_fp = pq.read_pandas(file_fastparquet) + # for fastparquet written file, categoricals comes back as strings + # (no arrow schema in parquet metadata) + df['f'] = df['f'].astype(object) + tm.assert_frame_equal(table_fp.to_pandas(), df) diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 1761b2e3483..5ba54d3277b 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -520,6 +520,8 @@ def deserializer(data): assert np.alltrue(new_x.view(np.ndarray) == np.zeros(3)) +@pytest.mark.filterwarnings( + "ignore:the matrix subclass:PendingDeprecationWarning") def test_numpy_matrix_serialization(tmpdir): class CustomType(object): def __init__(self, val): diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index f9b2092ce07..434ec8029c1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -187,15 +187,19 @@ def test_chunked_array_pickle(data, typ): @pytest.mark.pandas def test_chunked_array_to_pandas(): + import pandas as pd + data = [ pa.array([-10, -5, 0, 5, 10]) ] table = pa.table(data, names=['a']) col = table.column(0) assert isinstance(col, pa.ChunkedArray) - array = col.to_pandas() - assert array.shape == (5,) - assert array[0] == -10 + series = col.to_pandas() + assert isinstance(series, pd.Series) + assert series.shape == (5,) + assert series[0] == -10 + assert series.name == 'a' @pytest.mark.pandas diff --git a/r/NAMESPACE b/r/NAMESPACE index 31f056226ce..3f880fb4d04 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -3,16 +3,12 @@ S3method("!=",Object) S3method("$",RecordBatch) S3method("$",Table) -S3method("==",Array) -S3method("==",DataType) -S3method("==",Field) -S3method("==",Message) -S3method("==",RecordBatch) -S3method("==",Schema) +S3method("==",Object) S3method("[",RecordBatch) S3method("[",Table) S3method("[[",RecordBatch) S3method("[[",Table) +S3method(all,equal.Object) S3method(as.data.frame,RecordBatch) S3method(as.data.frame,Table) S3method(as.raw,Buffer) @@ -84,6 +80,7 @@ export(MessageType) export(MockOutputStream) export(ParquetFileReader) export(ParquetReaderProperties) +export(ParquetVersionType) export(RandomAccessFile) export(ReadableFile) export(RecordBatchFileReader) @@ -173,6 +170,7 @@ importFrom(rlang,dots_n) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,is_false) 
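Stepping back to the pyarrow changes above: timestamp columns that carry a timezone now come back timezone-aware from to_pandas for ChunkedArray as well as Array, via the shared make_tz_aware helper. A small usage sketch, assuming pandas is installed, mirroring the updated test_to_pandas_timezone test:

    import pyarrow as pa

    arr = pa.array([1, 2, 3], type=pa.timestamp('s', tz='Europe/Brussels'))
    assert arr.to_pandas().dt.tz is not None        # Array already behaved this way

    chunked = pa.chunked_array([arr])
    assert chunked.to_pandas().dt.tz is not None    # now also true for ChunkedArray
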
+importFrom(rlang,is_integerish) importFrom(rlang,list2) importFrom(rlang,quo_is_null) importFrom(rlang,warn) diff --git a/r/R/array.R b/r/R/array.R index bd1e161f0ba..2c50edb4b45 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -74,7 +74,10 @@ Array <- R6Class("Array", ApproxEquals = function(other) Array__ApproxEquals(self, other), data = function() shared_ptr(ArrayData, Array__data(self)), as_vector = function() Array__as_vector(self), - ToString = function() Array__ToString(self), + ToString = function() { + typ <- paste0("<", self$type$ToString(), ">") + paste(typ, Array__ToString(self), sep = "\n") + }, Slice = function(offset, length = NULL){ if (is.null(length)) { shared_ptr(Array, Array__Slice1(self, offset)) @@ -153,6 +156,3 @@ length.Array <- function(x) x$length() #' @export as.vector.Array <- function(x, mode) x$as_vector() - -#' @export -`==.Array` <- function(x, y) x$Equals(y) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 00a911bbe25..107541d64ca 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -18,7 +18,7 @@ #' @importFrom R6 R6Class #' @importFrom purrr map map_int map2 #' @importFrom assertthat assert_that -#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish #' @importFrom Rcpp sourceCpp #' @importFrom tidyselect vars_select #' @useDynLib arrow, .registration = TRUE @@ -54,15 +54,28 @@ Object <- R6Class("Object", self$`.:xp:.` <- xp }, print = function(...){ - cat(class(self)[[1]], "\n") + cat(class(self)[[1]], "\n", sep = "") if (!is.null(self$ToString)){ - cat(self$ToString(), "\n") + cat(self$ToString(), "\n", sep = "") } invisible(self) } ) ) +#' @export +`!=.Object` <- function(lhs, rhs) !(lhs == rhs) + +#' @export +`==.Object` <- function(x, y) { + x$Equals(y) +} + +#' @export +all.equal.Object <- function(target, current, ...) 
{ + target == current +} + shared_ptr <- function(class, xp) { if (!shared_ptr_is_null(xp)) class$new(xp) } diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 271fcd3c5de..40c05f9b6d0 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -180,6 +180,10 @@ Buffer__data <- function(buffer){ .Call(`_arrow_Buffer__data` , buffer) } +Buffer__Equals <- function(x, y){ + .Call(`_arrow_Buffer__Equals` , x, y) +} + ChunkedArray__length <- function(chunked_array){ .Call(`_arrow_ChunkedArray__length` , chunked_array) } @@ -220,10 +224,18 @@ ChunkedArray__Validate <- function(chunked_array){ invisible(.Call(`_arrow_ChunkedArray__Validate` , chunked_array)) } +ChunkedArray__Equals <- function(x, y){ + .Call(`_arrow_ChunkedArray__Equals` , x, y) +} + util___Codec__Create <- function(codec, compression_level){ .Call(`_arrow_util___Codec__Create` , codec, compression_level) } +util___Codec__name <- function(codec){ + .Call(`_arrow_util___Codec__name` , codec) +} + io___CompressedOutputStream__Make <- function(codec, raw){ .Call(`_arrow_io___CompressedOutputStream__Make` , codec, raw) } @@ -880,8 +892,108 @@ parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices){ .Call(`_arrow_parquet___arrow___FileReader__ReadTable2` , reader, column_indices) } -write_parquet_file <- function(table, filename){ - invisible(.Call(`_arrow_write_parquet_file` , table, filename)) +parquet___default_arrow_writer_properties <- function(){ + .Call(`_arrow_parquet___default_arrow_writer_properties` ) +} + +parquet___ArrowWriterProperties___Builder__create <- function(){ + .Call(`_arrow_parquet___ArrowWriterProperties___Builder__create` ) +} + +parquet___ArrowWriterProperties___Builder__store_schema <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__store_schema` , builder)) +} + +parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__coerce_timestamps <- function(builder, unit){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps` , builder, unit)) +} + +parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps <- function(builder){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps` , builder)) +} + +parquet___ArrowWriterProperties___Builder__build <- function(builder){ + .Call(`_arrow_parquet___ArrowWriterProperties___Builder__build` , builder) +} + +parquet___default_writer_properties <- function(){ + .Call(`_arrow_parquet___default_writer_properties` ) +} + +parquet___WriterProperties___Builder__create <- function(){ + .Call(`_arrow_parquet___WriterProperties___Builder__create` ) +} + +parquet___WriterProperties___Builder__version <- function(builder, version){ + invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version` , builder, version)) +} + +parquet___ArrowWriterProperties___Builder__default_compression <- 
function(builder, compression){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_compression` , builder, compression)) +} + +parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions` , builder, paths, types)) +} + +parquet___ArrowWriterProperties___Builder__default_compression_level <- function(builder, compression_level){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_compression_level` , builder, compression_level)) +} + +parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels` , builder, paths, levels)) +} + +parquet___ArrowWriterProperties___Builder__default_write_statistics <- function(builder, write_statistics){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics` , builder, write_statistics)) +} + +parquet___ArrowWriterProperties___Builder__default_use_dictionary <- function(builder, use_dictionary){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary` , builder, use_dictionary)) +} + +parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary` , builder, paths, use_dictionary)) +} + +parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics` , builder, paths, write_statistics)) +} + +parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size){ + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size` , builder, data_page_size)) +} + +parquet___WriterProperties___Builder__build <- function(builder){ + .Call(`_arrow_parquet___WriterProperties___Builder__build` , builder) +} + +parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties){ + .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open` , schema, sink, properties, arrow_properties) +} + +parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size){ + invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable` , writer, table, chunk_size)) +} + +parquet___arrow___FileWriter__Close <- function(writer){ + invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close` , writer)) +} + +parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties){ + invisible(.Call(`_arrow_parquet___arrow___WriteTable` , table, sink, properties, arrow_properties)) } parquet___arrow___FileReader__GetSchema <- function(reader){ @@ -1088,6 +1200,10 @@ Table__Slice2 <- function(table, offset, length){ .Call(`_arrow_Table__Slice2` , table, offset, length) } +Table__Equals <- function(lhs, rhs){ + .Call(`_arrow_Table__Equals` , lhs, rhs) +} + Table__GetColumnByName <- function(table, name){ .Call(`_arrow_Table__GetColumnByName` , table, name) } diff --git a/r/R/buffer.R b/r/R/buffer.R index d1f789175cc..0f11fdcf0a9 100644 --- a/r/R/buffer.R +++ b/r/R/buffer.R @@ -38,7 +38,8 @@ Buffer <- R6Class("Buffer", inherit = Object, public = list( ZeroPadding = function() Buffer__ZeroPadding(self), - data = function() Buffer__data(self) + data = 
function() Buffer__data(self), + Equals = function(other) Buffer__Equals(self, other) ), active = list( diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index a6e4946d6e0..58f4b4e81d1 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -69,6 +69,23 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = Object, }, Validate = function() { ChunkedArray__Validate(self) + }, + ToString = function() { + out <- self$chunk(0)$ToString() + if (self$num_chunks > 1) { + # Regardless of whether the first array prints with ellipsis, we need + # to ellipsize because there's more data than is contained in this + # chunk + if (grepl("...\n", out, fixed = TRUE)) { + out <- sub("\\.\\.\\..*$", "...\n]", out) + } else { + out <- sub("\\n\\]$", ",\n ...\n]", out) + } + } + out + }, + Equals = function(other) { + ChunkedArray__Equals(self, other) } ), active = list( diff --git a/r/R/compression.R b/r/R/compression.R index 5fbe53e0c48..15375f4399c 100644 --- a/r/R/compression.R +++ b/r/R/compression.R @@ -36,18 +36,26 @@ #' @rdname Codec #' @name Codec #' @export -Codec <- R6Class("Codec", inherit = Object) +Codec <- R6Class("Codec", inherit = Object, + active = list( + name = function() util___Codec__name(self), + level = function() abort("Codec$level() not yet implemented") + ) +) Codec$create <- function(type = "gzip", compression_level = NA) { if (is.character(type)) { type <- unique_ptr(Codec, util___Codec__Create( - CompressionType[[match.arg(toupper(type), names(CompressionType))]], - compression_level + compression_from_name(type), compression_level )) } assert_is(type, "Codec") type } +compression_from_name <- function(name) { + map_int(name, ~CompressionType[[match.arg(toupper(.x), names(CompressionType))]]) +} + #' @title Compressed stream classes #' @rdname compression #' @name compression diff --git a/r/R/dictionary.R b/r/R/dictionary.R index ab33c3e1982..6273ffc2c87 100644 --- a/r/R/dictionary.R +++ b/r/R/dictionary.R @@ -31,7 +31,11 @@ #' @name DictionaryType DictionaryType <- R6Class("DictionaryType", inherit = FixedWidthType, - + public = list( + ToString = function() { + prettier_dictionary_type(DataType__ToString(self)) + } + ), active = list( index_type = function() DataType$create(DictionaryType__index_type(self)), value_type = function() DataType$create(DictionaryType__value_type(self)), @@ -39,20 +43,27 @@ DictionaryType <- R6Class("DictionaryType", ordered = function() DictionaryType__ordered(self) ) ) +DictionaryType$create <- function(index_type = int32(), + value_type = utf8(), + ordered = FALSE) { + assert_is(index_type, "DataType") + assert_is(value_type, "DataType") + shared_ptr(DictionaryType, DictionaryType__initialize(index_type, value_type, ordered)) +} #' Create a dictionary type #' -#' @param index_type index type, e.g. [int32()] -#' @param value_type value type, probably [utf8()] -#' @param ordered Is this an ordered dictionary ? +#' @param index_type A DataType for the indices (default [int32()]) +#' @param value_type A DataType for the values (default [utf8()]) +#' @param ordered Is this an ordered dictionary (default `FALSE`)? 
#' #' @return A [DictionaryType] #' @seealso [Other Arrow data types][data-type] #' @export -dictionary <- function(index_type, value_type, ordered = FALSE) { - assert_that( - inherits(index_type, "DataType"), - inherits(index_type, "DataType") - ) - shared_ptr(DictionaryType, DictionaryType__initialize(index_type, value_type, ordered)) +dictionary <- DictionaryType$create + +prettier_dictionary_type <- function(x) { + # Prettier format the "ordered" attribute + x <- sub(", ordered=0", "", x) + sub("ordered=1", "ordered", x) } diff --git a/r/R/enums.R b/r/R/enums.R index ade6d8e94f3..cad25f882a1 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -84,3 +84,9 @@ CompressionType <- enum("Compression::type", FileType <- enum("FileType", NonExistent = 0L, Unknown = 1L, File = 2L, Directory = 3L ) + +#' @export +#' @rdname enums +ParquetVersionType <- enum("ParquetVersionType", + PARQUET_1_0 = 0L, PARQUET_2_0 = 1L +) diff --git a/r/R/feather.R b/r/R/feather.R index e1b74bdda9c..420307063ba 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -17,8 +17,8 @@ #' Write data in the Feather format #' -#' @param data `data.frame` or RecordBatch -#' @param stream A file path or an OutputStream +#' @param x `data.frame` or RecordBatch +#' @param sink A file path or an OutputStream #' #' @export #' @examples @@ -30,20 +30,20 @@ #' }) #' } #' @include arrow-package.R -write_feather <- function(data, stream) { - if (is.data.frame(data)) { - data <- record_batch(data) +write_feather <- function(x, sink) { + if (is.data.frame(x)) { + x <- record_batch(x) } - assert_is(data, "RecordBatch") + assert_is(x, "RecordBatch") - if (is.character(stream)) { - stream <- FileOutputStream$create(stream) - on.exit(stream$close()) + if (is.character(sink)) { + sink <- FileOutputStream$create(sink) + on.exit(sink$close()) } - assert_is(stream, "OutputStream") + assert_is(sink, "OutputStream") - writer <- FeatherTableWriter$create(stream) - ipc___TableWriter__RecordBatch__WriteFeather(writer, data) + writer <- FeatherTableWriter$create(sink) + ipc___TableWriter__RecordBatch__WriteFeather(writer, x) } #' @title FeatherTableWriter class diff --git a/r/R/field.R b/r/R/field.R index fc5abc879cb..18337a15659 100644 --- a/r/R/field.R +++ b/r/R/field.R @@ -34,7 +34,7 @@ Field <- R6Class("Field", inherit = Object, public = list( ToString = function() { - Field__ToString(self) + prettier_dictionary_type(Field__ToString(self)) }, Equals = function(other) { inherits(other, "Field") && Field__Equals(self, other) @@ -67,11 +67,6 @@ Field$create <- function(name, type, metadata) { shared_ptr(Field, Field__initialize(name, type, TRUE)) } -#' @export -`==.Field` <- function(lhs, rhs){ - lhs$Equals(rhs) -} - #' @param name field name #' @param type logical type, instance of [DataType] #' @param metadata currently ignored diff --git a/r/R/filesystem.R b/r/R/filesystem.R index d20edcc6c02..ce507cc3e50 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -108,7 +108,10 @@ Selector <- R6Class("Selector", ) Selector$create <- function(base_dir, allow_non_existent = FALSE, recursive = FALSE) { - shared_ptr(Selector, fs___Selector__create(base_dir, allow_non_existent, recursive)) + shared_ptr( + Selector, + fs___Selector__create(clean_path_rel(base_dir), allow_non_existent, recursive) + ) } #' @title FileSystem classes @@ -165,53 +168,61 @@ FileSystem <- R6Class("FileSystem", inherit = Object, public = list( GetTargetStats = function(x) { if (inherits(x, "Selector")) { - map(fs___FileSystem__GetTargetStats_Selector(self, x), shared_ptr, class = 
FileStats) + map( + fs___FileSystem__GetTargetStats_Selector(self, x), + shared_ptr, + class = FileStats + ) } else if (is.character(x)){ - map(fs___FileSystem__GetTargetStats_Paths(self, x), shared_ptr, class = FileStats) + map( + fs___FileSystem__GetTargetStats_Paths(self, clean_path_rel(x)), + shared_ptr, + class = FileStats + ) } else { - abort("incompatible type for FileSystem$GetTargetStarts()") + abort("incompatible type for FileSystem$GetTargetStats()") } }, CreateDir = function(path, recursive = TRUE) { - fs___FileSystem__CreateDir(self, path, isTRUE(recursive)) + fs___FileSystem__CreateDir(self, clean_path_rel(path), isTRUE(recursive)) }, DeleteDir = function(path) { - fs___FileSystem__DeleteDir(self, path) + fs___FileSystem__DeleteDir(self, clean_path_rel(path)) }, DeleteDirContents = function(path) { - fs___FileSystem__DeleteDirContents(self, path) + fs___FileSystem__DeleteDirContents(self, clean_path_rel(path)) }, DeleteFile = function(path) { - fs___FileSystem__DeleteFile(self, path) + fs___FileSystem__DeleteFile(self, clean_path_rel(path)) }, DeleteFiles = function(paths) { - fs___FileSystem__DeleteFiles(self, paths) + fs___FileSystem__DeleteFiles(self, clean_path_rel(paths)) }, Move = function(src, dest) { - fs___FileSystem__Move(self, src, dest) + fs___FileSystem__Move(self, clean_path_rel(src), clean_path_rel(dest)) }, CopyFile = function(src, dest) { - fs___FileSystem__CopyFile(self, src, dest) + fs___FileSystem__CopyFile(self, clean_path_rel(src), clean_path_rel(dest)) }, OpenInputStream = function(path) { - shared_ptr(InputStream, fs___FileSystem__OpenInputStream(self, path)) + shared_ptr(InputStream, fs___FileSystem__OpenInputStream(self, clean_path_rel(path))) }, OpenInputFile = function(path) { - shared_ptr(InputStream, fs___FileSystem__OpenInputFile(self, path)) + shared_ptr(InputStream, fs___FileSystem__OpenInputFile(self, clean_path_rel(path))) }, OpenOutputStream = function(path) { - shared_ptr(OutputStream, fs___FileSystem__OpenOutputStream(self, path)) + shared_ptr(OutputStream, fs___FileSystem__OpenOutputStream(self, clean_path_rel(path))) }, OpenAppendStream = function(path) { - shared_ptr(OutputStream, fs___FileSystem__OpenAppendStream(self, path)) + shared_ptr(OutputStream, fs___FileSystem__OpenAppendStream(self, clean_path_rel(path))) } ) ) @@ -232,6 +243,17 @@ LocalFileSystem$create <- function() { #' @export SubTreeFileSystem <- R6Class("SubTreeFileSystem", inherit = FileSystem) SubTreeFileSystem$create <- function(base_path, base_fs) { - xp <- fs___SubTreeFileSystem__create(base_path, base_fs) + xp <- fs___SubTreeFileSystem__create(clean_path_rel(base_path), base_fs) shared_ptr(SubTreeFileSystem, xp) } + +clean_path_abs <- function(path) { + # Make sure we have a valid, absolute, forward-slashed path for passing to Arrow + normalizePath(path, winslash = "/", mustWork = FALSE) +} + +clean_path_rel <- function(path) { + # Make sure all path separators are "/", not "\" as on Windows + path_sep <- ifelse(tolower(Sys.info()[["sysname"]]) == "windows", "\\\\", "/") + gsub(path_sep, "/", path) +} diff --git a/r/R/io.R b/r/R/io.R index f5390e32b25..255dc1e8241 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -74,8 +74,7 @@ OutputStream <- R6Class("OutputStream", inherit = Writable, #' @export FileOutputStream <- R6Class("FileOutputStream", inherit = OutputStream) FileOutputStream$create <- function(path) { - path <- normalizePath(path, mustWork = FALSE) - shared_ptr(FileOutputStream, io___FileOutputStream__Open(path)) + shared_ptr(FileOutputStream, 
io___FileOutputStream__Open(clean_path_abs(path))) } #' @usage NULL @@ -148,7 +147,7 @@ Readable <- R6Class("Readable", inherit = Object, #' #' @section Methods: #' -#' - `$GetSize()`: +#' - `$GetSize()`: #' - `$supports_zero_copy()`: Logical #' - `$seek(position)`: go to that position in the stream #' - `$tell()`: return the position in the stream @@ -210,7 +209,7 @@ MemoryMappedFile <- R6Class("MemoryMappedFile", inherit = RandomAccessFile, #' @export ReadableFile <- R6Class("ReadableFile", inherit = RandomAccessFile) ReadableFile$create <- function(path) { - shared_ptr(ReadableFile, io___ReadableFile__Open(normalizePath(path))) + shared_ptr(ReadableFile, io___ReadableFile__Open(clean_path_abs(path))) } #' @usage NULL @@ -232,7 +231,7 @@ BufferReader$create <- function(x) { #' #' @export mmap_create <- function(path, size) { - path <- normalizePath(path, mustWork = FALSE) + path <- clean_path_abs(path) shared_ptr(MemoryMappedFile, io___MemoryMappedFile__Create(path, size)) } @@ -244,7 +243,7 @@ mmap_create <- function(path, size) { #' @export mmap_open <- function(path, mode = c("read", "write", "readwrite")) { mode <- match(match.arg(mode), c("read", "write", "readwrite")) - 1L - path <- normalizePath(path) + path <- clean_path_abs(path) shared_ptr(MemoryMappedFile, io___MemoryMappedFile__Open(path, mode)) } diff --git a/r/R/message.R b/r/R/message.R index 701d157fd43..51e0f965e27 100644 --- a/r/R/message.R +++ b/r/R/message.R @@ -45,9 +45,6 @@ Message <- R6Class("Message", inherit = Object, ) ) -#' @export -`==.Message` <- function(x, y) x$Equals(y) - #' @title class arrow::MessageReader #' #' @usage NULL diff --git a/r/R/parquet.R b/r/R/parquet.R index d36e5c33dd7..706494ab37f 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. - #' Read a Parquet file #' #' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format. @@ -34,10 +33,10 @@ #' } #' @export read_parquet <- function(file, - col_select = NULL, - as_data_frame = TRUE, - props = ParquetReaderProperties$create(), - ...) { + col_select = NULL, + as_data_frame = TRUE, + props = ParquetReaderProperties$create(), + ...) { reader <- ParquetFileReader$create(file, props = props, ...) tab <- reader$ReadTable(!!enquo(col_select)) @@ -47,6 +46,285 @@ read_parquet <- function(file, tab } +#' Write Parquet file to disk +#' +#' [Parquet](https://parquet.apache.org/) is a columnar storage file format. +#' This function enables you to write Parquet files from R. +#' +#' @param x An [arrow::Table][Table], or an object convertible to it. +#' @param sink an [arrow::io::OutputStream][OutputStream] or a string which is interpreted as a file path +#' @param chunk_size chunk size in number of rows. If NULL, the total number of rows is used. +#' +#' @param version parquet version, "1.0" or "2.0". +#' @param compression compression algorithm. No compression by default. +#' @param compression_level compression level. +#' @param use_dictionary Specify if we should use dictionary encoding. +#' @param write_statistics Specify if we should write statistics +#' @param data_page_size Set a target threshold for the approximate encoded size of data +#' pages within a column chunk. If omitted, the default data page size (1Mb) is used.
+#' @param properties properties for parquet writer, derived from arguments +#' `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics` and `data_page_size` +#' +#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format +#' @param coerce_timestamps Cast timestamps to a particular resolution. Can be NULL, "ms" or "us". +#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a particular +#' resolution. E.g. if microsecond or nanosecond data is lost when coercing to +#' 'ms', do not raise an exception. +#' +#' @param arrow_properties arrow specific writer properties, derived from +#' arguments `use_deprecated_int96_timestamps`, `coerce_timestamps` and `allow_truncated_timestamps` +#' +#' @details The parameters `compression`, `compression_level`, `use_dictionary` and `write_statistics` support +#' various patterns: +#' - The default `NULL` leaves the parameter unspecified, and the C++ library uses an appropriate default for +#' each column +#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns +#' - An unnamed vector, of the same size as the number of columns, to specify a value for each column, in +#' positional order +#' - A named vector, to specify the value for the named columns; the default value for the setting is used +#' when not supplied. +#' +#' @return NULL, invisibly +#' +#' @examples +#' \donttest{ +#' tf1 <- tempfile(fileext = ".parquet") +#' write_parquet(data.frame(x = 1:5), tf1) +#' +#' # using compression +#' tf2 <- tempfile(fileext = ".gz.parquet") +#' write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5) +#' +#' } +#' @export +write_parquet <- function(x, + sink, + chunk_size = NULL, + + # writer properties + version = NULL, + compression = NULL, + compression_level = NULL, + use_dictionary = NULL, + write_statistics = NULL, + data_page_size = NULL, + + properties = ParquetWriterProperties$create( + x, + version = version, + compression = compression, + compression_level = compression_level, + use_dictionary = use_dictionary, + write_statistics = write_statistics, + data_page_size = data_page_size + ), + + # arrow writer properties + use_deprecated_int96_timestamps = FALSE, + coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + + arrow_properties = ParquetArrowWriterProperties$create( + use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, + coerce_timestamps = coerce_timestamps, + allow_truncated_timestamps = allow_truncated_timestamps + ) +) { + x <- to_arrow(x) + + if (is.character(sink)) { + sink <- FileOutputStream$create(sink) + on.exit(sink$close()) + } else if (!inherits(sink, "OutputStream")) { + abort("sink must be a file path or an OutputStream") + } + + schema <- x$schema + writer <- ParquetFileWriter$create(schema, sink, properties = properties, arrow_properties = arrow_properties) + writer$WriteTable(x, chunk_size = chunk_size %||% x$num_rows) + writer$Close() +} + + +ParquetArrowWriterPropertiesBuilder <- R6Class("ParquetArrowWriterPropertiesBuilder", inherit = Object, + public = list( + store_schema = function() { + parquet___ArrowWriterProperties___Builder__store_schema(self) + self + }, + set_int96_support = function(use_deprecated_int96_timestamps = FALSE) { + if (use_deprecated_int96_timestamps) { + parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(self) + } else { + parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(self) + } + self
+ }, + set_coerce_timestamps = function(coerce_timestamps = NULL) { + if (!is.null(coerce_timestamps)) { + unit <- make_valid_time_unit(coerce_timestamps, + c("ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO) + ) + parquet___ArrowWriterProperties___Builder__coerce_timestamps(self, unit) + } + self + }, + set_allow_truncated_timestamps = function(allow_truncated_timestamps = FALSE) { + if (allow_truncated_timestamps) { + parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(self) + } else { + parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(self) + } + + self + } + + ) +) +ParquetArrowWriterProperties <- R6Class("ParquetArrowWriterProperties", inherit = Object) + +ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, allow_truncated_timestamps = FALSE) { + if (!use_deprecated_int96_timestamps && is.null(coerce_timestamps) && !allow_truncated_timestamps) { + shared_ptr(ParquetArrowWriterProperties, parquet___default_arrow_writer_properties()) + } else { + builder <- shared_ptr(ParquetArrowWriterPropertiesBuilder, parquet___ArrowWriterProperties___Builder__create()) + builder$store_schema() + builder$set_int96_support(use_deprecated_int96_timestamps) + builder$set_coerce_timestamps(coerce_timestamps) + builder$set_allow_truncated_timestamps(allow_truncated_timestamps) + shared_ptr(ParquetArrowWriterProperties, parquet___ArrowWriterProperties___Builder__build(builder)) + } +} + +valid_parquet_version <- c( + "1.0" = ParquetVersionType$PARQUET_1_0, + "2.0" = ParquetVersionType$PARQUET_2_0 +) + +make_valid_version <- function(version, valid_versions = valid_parquet_version) { + if (is_integerish(version)) { + version <- as.character(version) + } + tryCatch( + valid_versions[[match.arg(version, choices = names(valid_versions))]], + error = function(cond) { + stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call.=FALSE) + } + ) +} + +ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = Object) +ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder", inherit = Object, + public = list( + set_version = function(version) { + parquet___WriterProperties___Builder__version(self, make_valid_version(version)) + }, + + set_compression = function(table, compression){ + private$.set(table, compression_from_name(compression), "compression", is.integer, + parquet___ArrowWriterProperties___Builder__default_compression, + parquet___ArrowWriterProperties___Builder__set_compressions + ) + }, + + set_compression_level = function(table, compression_level){ + private$.set(table, compression_level, "compression_level", is_integerish, + parquet___ArrowWriterProperties___Builder__default_compression_level, + parquet___ArrowWriterProperties___Builder__set_compression_levels + ) + }, + + set_dictionary = function(table, use_dictionary) { + private$.set(table, use_dictionary, "use_dictionary", is.logical, + parquet___ArrowWriterProperties___Builder__default_use_dictionary, + parquet___ArrowWriterProperties___Builder__set_use_dictionary + ) + }, + + set_write_statistics = function(table, write_statistics) { + private$.set(table, write_statistics, "write_statistics", is.logical, + parquet___ArrowWriterProperties___Builder__default_write_statistics, + parquet___ArrowWriterProperties___Builder__set_write_statistics + ) + }, + + set_data_page_size = function(data_page_size) { + parquet___ArrowWriterProperties___Builder__data_page_size(self, data_page_size) + } + ), + 
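+  # How the set_* methods above dispatch their value argument (a sketch of the +  # intended behavior; see private$.set() below): a single unnamed value such as +  # builder$set_compression(table, "snappy") sets the default for every column, +  # an unnamed vector of length ncol(table) is matched to columns by position, +  # and a named vector such as c(x = "gzip") applies only to the named columns.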
+ private = list( + .set = function(table, value, name, is, default, multiple) { + msg <- paste0("unsupported ", name, "= specification") + assert_that(is(value), msg = msg) + column_names <- names(table) + if (is.null(given_names <- names(value))) { + if (length(value) == 1L) { + default(self, value) + } else if (length(value) == length(column_names)) { + multiple(self, column_names, value) + } + } else if(all(given_names %in% column_names)) { + multiple(self, given_names, value) + } else { + abort(msg) + } + } + ) + +) + +ParquetWriterProperties$create <- function(table, version = NULL, compression = NULL, compression_level = NULL, use_dictionary = NULL, write_statistics = NULL, data_page_size = NULL) { + if (is.null(version) && is.null(compression) && is.null(compression_level) && is.null(use_dictionary) && is.null(write_statistics) && is.null(data_page_size)) { + shared_ptr(ParquetWriterProperties, parquet___default_writer_properties()) + } else { + builder <- shared_ptr(ParquetWriterPropertiesBuilder, parquet___WriterProperties___Builder__create()) + if (!is.null(version)) { + builder$set_version(version) + } + if (!is.null(compression)) { + builder$set_compression(table, compression = compression) + } + if (!is.null(compression_level)) { + builder$set_compression_level(table, compression_level = compression_level) + } + if (!is.null(use_dictionary)) { + builder$set_dictionary(table, use_dictionary) + } + if (!is.null(write_statistics)) { + builder$set_write_statistics(table, write_statistics) + } + if (!is.null(data_page_size)) { + builder$set_data_page_size(data_page_size) + } + shared_ptr(ParquetWriterProperties, parquet___WriterProperties___Builder__build(builder)) + } +} + +ParquetFileWriter <- R6Class("ParquetFileWriter", inherit = Object, + public = list( + WriteTable = function(table, chunk_size) { + parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) + }, + Close = function() { + parquet___arrow___FileWriter__Close(self) + } + ) + +) +ParquetFileWriter$create <- function( + schema, + sink, + properties = ParquetWriterProperties$create(), + arrow_properties = ParquetArrowWriterProperties$create() +) { + unique_ptr( + ParquetFileWriter, + parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties) + ) +} + + #' @title ParquetFileReader class #' @rdname ParquetFileReader #' @name ParquetFileReader @@ -162,23 +440,3 @@ ParquetReaderProperties$create <- function(use_threads = option_use_threads()) { parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads)) ) } - - -#' Write Parquet file to disk -#' -#' [Parquet](https://parquet.apache.org/) is a columnar storage file format. -#' This function enables you to write Parquet files from R. 
-#' -#' @param table An [arrow::Table][Table], or an object convertible to it -#' @param file a file path -#' -#' @examples -#' \donttest{ -#' tf <- tempfile(fileext = ".parquet") -#' on.exit(unlink(tf)) -#' write_parquet(tibble::tibble(x = 1:5), tf) -#' } -#' @export -write_parquet <- function(table, file) { - write_parquet_file(to_arrow(table), file) -} diff --git a/r/R/record-batch-writer.R b/r/R/record-batch-writer.R index 6f43c52d96a..a1cfd8a0040 100644 --- a/r/R/record-batch-writer.R +++ b/r/R/record-batch-writer.R @@ -58,12 +58,11 @@ RecordBatchWriter <- R6Class("RecordBatchWriter", inherit = Object, write_table = function(table) ipc___RecordBatchWriter__WriteTable(self, table), write = function(x) { + x <- to_arrow(x) if (inherits(x, "RecordBatch")) { self$write_batch(x) } else if (inherits(x, "Table")) { self$write_table(x) - } else if (inherits(x, "data.frame")) { - self$write_table(table(x)) } else { abort("unexpected type for RecordBatchWriter$write(), must be an arrow::RecordBatch or an arrow::Table") } diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 80796b90d2f..6dcb18f1c3e 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -111,6 +111,7 @@ RecordBatch <- R6Class("RecordBatch", inherit = Object, }, serialize = function() ipc___SerializeRecordBatch__Raw(self), + ToString = function() ToString_tabular(self), cast = function(target_schema, safe = TRUE, options = cast_options(safe)) { assert_is(target_schema, "Schema") @@ -162,11 +163,6 @@ names.RecordBatch <- function(x) { x$names() } -#' @export -`==.RecordBatch` <- function(x, y) { - x$Equals(y) -} - #' @importFrom methods as #' @export `[.RecordBatch` <- function(x, i, j, ..., drop = FALSE) { @@ -246,3 +242,11 @@ tail.RecordBatch <- function(x, n = 6L, ...) { } x$Slice(n) } + +ToString_tabular <- function(x, ...) { + # Generic to work with both RecordBatch and Table + sch <- unlist(strsplit(x$schema$ToString(), "\n")) + sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch) + dims <- sprintf("%s rows x %s columns", nrow(x), ncol(x)) + paste(c(dims, sch), collapse = "\n") +} diff --git a/r/R/schema.R b/r/R/schema.R index 9f28fb53d17..11230158e77 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -49,7 +49,7 @@ Schema <- R6Class("Schema", inherit = Object, public = list( - ToString = function() Schema__ToString(self), + ToString = function() prettier_dictionary_type(Schema__ToString(self)), num_fields = function() Schema__num_fields(self), field = function(i) shared_ptr(Field, Schema__field(self, i)), serialize = function() Schema__serialize(self), @@ -62,9 +62,6 @@ Schema <- R6Class("Schema", Schema$create <- function(...) shared_ptr(Schema, schema_(.fields(list2(...)))) -#' @export -`==.Schema` <- function(lhs, rhs) lhs$Equals(rhs) - #' @param ... named list of [data types][data-type] #' @export #' @rdname Schema diff --git a/r/R/table.R b/r/R/table.R index 47626b160d9..b3175e57941 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -104,6 +104,7 @@ Table <- R6Class("Table", inherit = Object, field = function(i) shared_ptr(Field, Table__field(self, i)), serialize = function(output_stream, ...) 
write_table(self, output_stream, ...), + ToString = function() ToString_tabular(self), cast = function(target_schema, safe = TRUE, options = cast_options(safe)) { assert_is(target_schema, "Schema") @@ -123,12 +124,17 @@ Table <- R6Class("Table", inherit = Object, shared_ptr(Table, Table__select(self, indices)) } }, + Slice = function(offset, length = NULL) { if (is.null(length)) { shared_ptr(Table, Table__Slice1(self, offset)) } else { shared_ptr(Table, Table__Slice2(self, offset, length)) } + }, + + Equals = function(other) { + Table__Equals(self, other) } ), diff --git a/r/R/type.R b/r/R/type.R index 36d81e293c0..51601e46790 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -16,12 +16,6 @@ # under the License. #' @include arrow-package.R - -#' @export -`!=.Object` <- function(lhs, rhs){ - !(lhs == rhs) -} - #' @title class arrow::DataType #' #' @usage NULL @@ -134,9 +128,6 @@ FixedWidthType <- R6Class("FixedWidthType", ) ) -#' @export -`==.DataType` <- function(lhs, rhs) lhs$Equals(rhs) - Int8 <- R6Class("Int8", inherit = FixedWidthType) Int16 <- R6Class("Int16", inherit = FixedWidthType) Int32 <- R6Class("Int32", inherit = FixedWidthType) diff --git a/r/R/write-arrow.R b/r/R/write-arrow.R index dbab158204f..3903b1901cf 100644 --- a/r/R/write-arrow.R +++ b/r/R/write-arrow.R @@ -30,7 +30,7 @@ to_arrow.data.frame <- function(x) Table$create(!!!x) #' #' @param x an [arrow::Table][Table], an [arrow::RecordBatch][RecordBatch] or a data.frame #' -#' @param stream where to serialize to +#' @param sink where to serialize to #' #' - A [arrow::RecordBatchWriter][RecordBatchWriter]: the `$write()` #' of `x` is used. The stream is left open. This uses the streaming format @@ -50,20 +50,20 @@ to_arrow.data.frame <- function(x) Table$create(!!!x) #' and [arrow::RecordBatchStreamWriter][RecordBatchStreamWriter] can be used for more flexibility. #' #' @export -write_arrow <- function(x, stream, ...) { - UseMethod("write_arrow", stream) +write_arrow <- function(x, sink, ...) { + UseMethod("write_arrow", sink) } #' @export -write_arrow.RecordBatchWriter <- function(x, stream, ...){ - stream$write(x) +write_arrow.RecordBatchWriter <- function(x, sink, ...){ + sink$write(x) } #' @export -write_arrow.character <- function(x, stream, ...) { - assert_that(length(stream) == 1L) +write_arrow.character <- function(x, sink, ...) { + assert_that(length(sink) == 1L) x <- to_arrow(x) - file_stream <- FileOutputStream$create(stream) + file_stream <- FileOutputStream$create(sink) on.exit(file_stream$close()) file_writer <- RecordBatchFileWriter$create(file_stream, x$schema) on.exit({ @@ -77,7 +77,7 @@ write_arrow.character <- function(x, stream, ...) { } #' @export -write_arrow.raw <- function(x, stream, ...) { +write_arrow.raw <- function(x, sink, ...) { x <- to_arrow(x) schema <- x$schema diff --git a/r/README.Rmd b/r/README.Rmd index b07d8c0bdcd..2442b83435f 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -54,9 +54,13 @@ When installing from source, if the R and C++ library versions do not match, ins library(arrow) set.seed(24) -tab <- Table$create(x = 1:10, y = rnorm(10)) -tab$schema +tab <- Table$create( + x = 1:10, + y = rnorm(10), + z = as.factor(rep(c("b", "c"), 5)) +) tab +tab$x as.data.frame(tab) ``` @@ -157,6 +161,24 @@ remotes::install_github("romainfrancois/decor") install.packages(c("dplyr", "purrr", "glue")) ``` +We use Google C++ style in our C++ code. 
Check for style errors with + +``` +./lint.sh +``` + +Fix any style issues before committing with + +``` +./lint.sh --fix +``` + +The lint script requires Python 3 and `clang-format-7`. If the command isn't +found, you can explicitly provide the path to it like +`CLANG_FORMAT=$(which clang-format-7) ./lint.sh`. On macOS, +you can get this by installing LLVM via Homebrew and running the script as +`CLANG_FORMAT=$(brew --prefix llvm@7)/bin/clang-format ./lint.sh` + ### Useful functions Within an R session, these can help with package development: diff --git a/r/README.md b/r/README.md index 9733da91a95..9d542973045 100644 --- a/r/README.md +++ b/r/README.md @@ -69,25 +69,44 @@ Arrow C++ library first. library(arrow) set.seed(24) -tab <- Table$create(x = 1:10, y = rnorm(10)) -tab$schema -#> Schema -#> x: int32 -#> y: double +tab <- Table$create( + x = 1:10, + y = rnorm(10), + z = as.factor(rep(c("b", "c"), 5)) +) tab #> Table +#> 10 rows x 3 columns +#> $x +#> $y +#> $z > +tab$x +#> ChunkedArray +#> +#> [ +#> 1, +#> 2, +#> 3, +#> 4, +#> 5, +#> 6, +#> 7, +#> 8, +#> 9, +#> 10 +#> ] as.data.frame(tab) -#> x y -#> 1 1 -0.545880758 -#> 2 2 0.536585304 -#> 3 3 0.419623149 -#> 4 4 -0.583627199 -#> 5 5 0.847460017 -#> 6 6 0.266021979 -#> 7 7 0.444585270 -#> 8 8 -0.466495124 -#> 9 9 -0.848370044 -#> 10 10 0.002311942 +#> x y z +#> 1 1 -0.545880758 b +#> 2 2 0.536585304 c +#> 3 3 0.419623149 b +#> 4 4 -0.583627199 c +#> 5 5 0.847460017 b +#> 6 6 0.266021979 c +#> 7 7 0.444585270 b +#> 8 8 -0.466495124 c +#> 9 9 -0.848370044 b +#> 10 10 0.002311942 c ``` ## Installing a development version @@ -214,6 +233,20 @@ remotes::install_github("romainfrancois/decor") install.packages(c("dplyr", "purrr", "glue")) ``` +We use Google C++ style in our C++ code. Check for style errors with + + ./lint.sh + +Fix any style issues before committing with + + ./lint.sh --fix + +The lint script requires Python 3 and `clang-format-7`. If the command +isn’t found, you can explicitly provide the path to it like +`CLANG_FORMAT=$(which clang-format-7) ./lint.sh`. On macOS, you can get +this by installing LLVM via Homebrew and running the script as +`CLANG_FORMAT=$(brew --prefix llvm@7)/bin/clang-format ./lint.sh` + ### Useful functions Within an R session, these can help with package development: diff --git a/r/lint.sh b/r/lint.sh index fed64c1b31c..bd0f7a81100 100755 --- a/r/lint.sh +++ b/r/lint.sh @@ -17,11 +17,14 @@ # specific language governing permissions and limitations # under the License. +# This script requires Python 3 and clang-format, which should already be +# on your system. See r/README.md for further guidance + SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" CPP_BUILD_SUPPORT=$SOURCE_DIR/../cpp/build-support # Run clang-format -CLANG_FORMAT=clang-format-7 +: ${CLANG_FORMAT:=clang-format-7} $CPP_BUILD_SUPPORT/run_clang_format.py \ --clang_format_binary=$CLANG_FORMAT \ --exclude_glob=$CPP_BUILD_SUPPORT/lint_exclusions.txt \ diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd index 183513e8c47..2716a173c78 100644 --- a/r/man/dictionary.Rd +++ b/r/man/dictionary.Rd @@ -4,14 +4,15 @@ \alias{dictionary} \title{Create a dictionary type} \usage{ -dictionary(index_type, value_type, ordered = FALSE) +dictionary(index_type = int32(), value_type = utf8(), + ordered = FALSE) } \arguments{ -\item{index_type}{index type, e.g. 
\code{\link[=int32]{int32()}}} +\item{index_type}{A DataType for the indices (default \code{\link[=int32]{int32()}})} -\item{value_type}{value type, probably \code{\link[=utf8]{utf8()}}} +\item{value_type}{A DataType for the values (default \code{\link[=utf8]{utf8()}})} -\item{ordered}{Is this an ordered dictionary ?} +\item{ordered}{Is this an ordered dictionary (default \code{FALSE})?} } \value{ A \link{DictionaryType} diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 3d841fa0c64..7f7358f760b 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -11,6 +11,7 @@ \alias{MessageType} \alias{CompressionType} \alias{FileType} +\alias{ParquetVersionType} \title{Arrow enums} \format{An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.} \usage{ @@ -29,6 +30,8 @@ MessageType CompressionType FileType + +ParquetVersionType } \description{ Arrow enums diff --git a/r/man/write_arrow.Rd b/r/man/write_arrow.Rd index 1820e0e1536..c4d67033fbe 100644 --- a/r/man/write_arrow.Rd +++ b/r/man/write_arrow.Rd @@ -4,12 +4,12 @@ \alias{write_arrow} \title{Write Arrow formatted data} \usage{ -write_arrow(x, stream, ...) +write_arrow(x, sink, ...) } \arguments{ \item{x}{an \link[=Table]{arrow::Table}, an \link[=RecordBatch]{arrow::RecordBatch} or a data.frame} -\item{stream}{where to serialize to +\item{sink}{where to serialize to \itemize{ \item A \link[=RecordBatchWriter]{arrow::RecordBatchWriter}: the \code{$write()} of \code{x} is used. The stream is left open. This uses the streaming format diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 24636a09cb0..9bc37975281 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -4,12 +4,12 @@ \alias{write_feather} \title{Write data in the Feather format} \usage{ -write_feather(data, stream) +write_feather(x, sink) } \arguments{ -\item{data}{\code{data.frame} or RecordBatch} +\item{x}{\code{data.frame} or RecordBatch} -\item{stream}{A file path or an OutputStream} +\item{sink}{A file path or an OutputStream} } \description{ Write data in the Feather format diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index b0fb7bc6761..b2471d7a5b7 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -4,21 +4,79 @@ \alias{write_parquet} \title{Write Parquet file to disk} \usage{ -write_parquet(table, file) +write_parquet(x, sink, chunk_size = NULL, version = NULL, + compression = NULL, compression_level = NULL, + use_dictionary = NULL, write_statistics = NULL, + data_page_size = NULL, properties = ParquetWriterProperties$create(x, + version = version, compression = compression, compression_level = + compression_level, use_dictionary = use_dictionary, write_statistics = + write_statistics, data_page_size = data_page_size), + use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps + = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, + allow_truncated_timestamps = allow_truncated_timestamps)) } \arguments{ -\item{table}{An \link[=Table]{arrow::Table}, or an object convertible to it} +\item{x}{An \link[=Table]{arrow::Table}, or an object convertible to it.} -\item{file}{a file path} +\item{sink}{an \link[=OutputStream]{arrow::io::OutputStream} or a string which is interpreted as a file path} + +\item{chunk_size}{chunk size in number of rows. 
If NULL, the total number of rows is used.} + +\item{version}{parquet version, "1.0" or "2.0".} + +\item{compression}{compression algorithm. No compression by default.} + +\item{compression_level}{compression level.} + +\item{use_dictionary}{Specify if we should use dictionary encoding.} + +\item{write_statistics}{Specify if we should write statistics} + +\item{data_page_size}{Set a target threshold for the approximate encoded size of data +pages within a column chunk. If omitted, the default data page size (1Mb) is used.} + +\item{properties}{properties for parquet writer, derived from arguments +\code{version}, \code{compression}, \code{compression_level}, \code{use_dictionary}, \code{write_statistics} and \code{data_page_size}} + +\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format} + +\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be NULL, "ms" or "us".} + +\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a particular +resolution. E.g. if microsecond or nanosecond data is lost when coercing to +'ms', do not raise an exception.} + +\item{arrow_properties}{arrow specific writer properties, derived from +arguments \code{use_deprecated_int96_timestamps}, \code{coerce_timestamps} and \code{allow_truncated_timestamps}} +} +\value{ +NULL, invisibly +} +\description{ +\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format. +This function enables you to write Parquet files from R. +} +\details{ +The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and \code{write_statistics} support +various patterns: +- The default \code{NULL} leaves the parameter unspecified, and the C++ library uses an appropriate default for +each column +- A single, unnamed, value (e.g. a single string for \code{compression}) applies to all columns +- An unnamed vector, of the same size as the number of columns, to specify a value for each column, in +positional order +- A named vector, to specify the value for the named columns; the default value for the setting is used +when not supplied. +} \examples{ \donttest{ -tf <- tempfile(fileext = ".parquet") -on.exit(unlink(tf)) -write_parquet(tibble::tibble(x = 1:5), tf) +tf1 <- tempfile(fileext = ".parquet") +write_parquet(data.frame(x = 1:5), tf1) + +# using compression +tf2 <- tempfile(fileext = ".gz.parquet") +write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5) + } } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 10a657514e7..b00eca925a4 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -701,6 +701,22 @@ RcppExport SEXP _arrow_Buffer__data(SEXP buffer_sexp){ } #endif +// buffer.cpp +#if defined(ARROW_R_WITH_ARROW) +bool Buffer__Equals(const std::shared_ptr& x, const std::shared_ptr& y); +RcppExport SEXP _arrow_Buffer__Equals(SEXP x_sexp, SEXP y_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type x(x_sexp); + Rcpp::traits::input_parameter&>::type y(y_sexp); + return Rcpp::wrap(Buffer__Equals(x, y)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Buffer__Equals(SEXP x_sexp, SEXP y_sexp){ + Rf_error("Cannot call Buffer__Equals(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // chunkedarray.cpp #if defined(ARROW_R_WITH_ARROW) int ChunkedArray__length(const std::shared_ptr& chunked_array); @@ -857,6 +873,22 @@ RcppExport SEXP _arrow_ChunkedArray__Validate(SEXP chunked_array_sexp){ } #endif +// chunkedarray.cpp +#if defined(ARROW_R_WITH_ARROW) +bool ChunkedArray__Equals(const std::shared_ptr& x, const std::shared_ptr& y); +RcppExport SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type x(x_sexp); + Rcpp::traits::input_parameter&>::type y(y_sexp); + return Rcpp::wrap(ChunkedArray__Equals(x, y)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){ + Rf_error("Cannot call ChunkedArray__Equals(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // compression.cpp #if defined(ARROW_R_WITH_ARROW) std::unique_ptr util___Codec__Create(arrow::Compression::type codec, int compression_level); @@ -873,6 +905,21 @@ RcppExport SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_le } #endif +// compression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::string util___Codec__name(const std::unique_ptr& codec); +RcppExport SEXP _arrow_util___Codec__name(SEXP codec_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type codec(codec_sexp); + return Rcpp::wrap(util___Codec__name(codec)); +END_RCPP +} +#else +RcppExport SEXP _arrow_util___Codec__name(SEXP codec_sexp){ + Rf_error("Cannot call util___Codec__name(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // compression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr io___CompressedOutputStream__Make(const std::unique_ptr& codec, const std::shared_ptr& raw); @@ -3416,18 +3463,429 @@ RcppExport SEXP _arrow_parquet___arrow___FileReader__ReadTable2(SEXP reader_sexp // parquet.cpp #if defined(ARROW_R_WITH_ARROW) -void write_parquet_file(const std::shared_ptr& table, std::string filename); -RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){ +std::shared_ptr parquet___default_arrow_writer_properties(); +RcppExport SEXP _arrow_parquet___default_arrow_writer_properties(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___default_arrow_writer_properties()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___default_arrow_writer_properties(){ + Rf_error("Cannot call parquet___default_arrow_writer_properties(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___ArrowWriterProperties___Builder__create(); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__create(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___ArrowWriterProperties___Builder__create()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__create(){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__create(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__store_schema(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__store_schema(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__store_schema(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__store_schema(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__store_schema(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__coerce_timestamps(const std::shared_ptr& builder, arrow::TimeUnit::type unit); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps(SEXP builder_sexp, SEXP unit_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type unit(unit_sexp); + parquet___ArrowWriterProperties___Builder__coerce_timestamps(builder, unit); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps(SEXP builder_sexp, SEXP unit_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__coerce_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(builder); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___ArrowWriterProperties___Builder__build(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__build(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + return Rcpp::wrap(parquet___ArrowWriterProperties___Builder__build(builder)); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__build(SEXP builder_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__build(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___default_writer_properties(); +RcppExport SEXP _arrow_parquet___default_writer_properties(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___default_writer_properties()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___default_writer_properties(){ + Rf_error("Cannot call parquet___default_writer_properties(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___WriterProperties___Builder__create(); +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__create(){ +BEGIN_RCPP + return Rcpp::wrap(parquet___WriterProperties___Builder__create()); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__create(){ + Rf_error("Cannot call parquet___WriterProperties___Builder__create(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___WriterProperties___Builder__version(const std::shared_ptr& builder, const parquet::ParquetVersion::type& version); +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__version(SEXP builder_sexp, SEXP version_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type version(version_sexp); + parquet___WriterProperties___Builder__version(builder, version); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__version(SEXP builder_sexp, SEXP version_sexp){ + Rf_error("Cannot call parquet___WriterProperties___Builder__version(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_compression(const std::shared_ptr& builder, const arrow::Compression::type& compression); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression(SEXP builder_sexp, SEXP compression_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type compression(compression_sexp); + parquet___ArrowWriterProperties___Builder__default_compression(builder, compression); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression(SEXP builder_sexp, SEXP compression_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_compression(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_compressions(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::IntegerVector& types); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compressions(SEXP builder_sexp, SEXP paths_sexp, SEXP types_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type types(types_sexp); + parquet___ArrowWriterProperties___Builder__set_compressions(builder, paths, types); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compressions(SEXP builder_sexp, SEXP paths_sexp, SEXP types_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_compressions(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_compression_level(const std::shared_ptr& builder, int compression_level); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression_level(SEXP builder_sexp, SEXP compression_level_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type compression_level(compression_level_sexp); + parquet___ArrowWriterProperties___Builder__default_compression_level(builder, compression_level); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_compression_level(SEXP builder_sexp, SEXP compression_level_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_compression_level(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_compression_levels(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::IntegerVector& levels); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels(SEXP builder_sexp, SEXP paths_sexp, SEXP levels_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type levels(levels_sexp); + parquet___ArrowWriterProperties___Builder__set_compression_levels(builder, paths, levels); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels(SEXP builder_sexp, SEXP paths_sexp, SEXP levels_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_compression_levels(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_write_statistics(const std::shared_ptr& builder, bool write_statistics); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics(SEXP builder_sexp, SEXP write_statistics_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type write_statistics(write_statistics_sexp); + parquet___ArrowWriterProperties___Builder__default_write_statistics(builder, write_statistics); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics(SEXP builder_sexp, SEXP write_statistics_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_write_statistics(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__default_use_dictionary(const std::shared_ptr& builder, bool use_dictionary); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary(SEXP builder_sexp, SEXP use_dictionary_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type use_dictionary(use_dictionary_sexp); + parquet___ArrowWriterProperties___Builder__default_use_dictionary(builder, use_dictionary); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary(SEXP builder_sexp, SEXP use_dictionary_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__default_use_dictionary(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_use_dictionary(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::LogicalVector& use_dictionary); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary(SEXP builder_sexp, SEXP paths_sexp, SEXP use_dictionary_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type use_dictionary(use_dictionary_sexp); + parquet___ArrowWriterProperties___Builder__set_use_dictionary(builder, paths, use_dictionary); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary(SEXP builder_sexp, SEXP paths_sexp, SEXP use_dictionary_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_use_dictionary(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__set_write_statistics(const std::shared_ptr& builder, const std::vector& paths, const Rcpp::LogicalVector& write_statistics); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics(SEXP builder_sexp, SEXP paths_sexp, SEXP write_statistics_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter&>::type paths(paths_sexp); + Rcpp::traits::input_parameter::type write_statistics(write_statistics_sexp); + parquet___ArrowWriterProperties___Builder__set_write_statistics(builder, paths, write_statistics); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics(SEXP builder_sexp, SEXP paths_sexp, SEXP write_statistics_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_write_statistics(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___ArrowWriterProperties___Builder__data_page_size(const std::shared_ptr& builder, int64_t data_page_size); +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__data_page_size(SEXP builder_sexp, SEXP data_page_size_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + Rcpp::traits::input_parameter::type data_page_size(data_page_size_sexp); + parquet___ArrowWriterProperties___Builder__data_page_size(builder, data_page_size); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___ArrowWriterProperties___Builder__data_page_size(SEXP builder_sexp, SEXP data_page_size_sexp){ + Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__data_page_size(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr parquet___WriterProperties___Builder__build(const std::shared_ptr& builder); +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__build(SEXP builder_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type builder(builder_sexp); + return Rcpp::wrap(parquet___WriterProperties___Builder__build(builder)); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___WriterProperties___Builder__build(SEXP builder_sexp){ + Rf_error("Cannot call parquet___WriterProperties___Builder__build(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +std::unique_ptr parquet___arrow___ParquetFileWriter__Open(const std::shared_ptr& schema, const std::shared_ptr& sink, const std::shared_ptr& properties, const std::shared_ptr& arrow_properties); +RcppExport SEXP _arrow_parquet___arrow___ParquetFileWriter__Open(SEXP schema_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type schema(schema_sexp); + Rcpp::traits::input_parameter&>::type sink(sink_sexp); + Rcpp::traits::input_parameter&>::type properties(properties_sexp); + Rcpp::traits::input_parameter&>::type arrow_properties(arrow_properties_sexp); + return Rcpp::wrap(parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties)); +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___arrow___ParquetFileWriter__Open(SEXP schema_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ + Rf_error("Cannot call parquet___arrow___ParquetFileWriter__Open(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___arrow___FileWriter__WriteTable(const std::unique_ptr& writer, const std::shared_ptr& table, int64_t chunk_size); +RcppExport SEXP _arrow_parquet___arrow___FileWriter__WriteTable(SEXP writer_sexp, SEXP table_sexp, SEXP chunk_size_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type writer(writer_sexp); + Rcpp::traits::input_parameter&>::type table(table_sexp); + Rcpp::traits::input_parameter::type chunk_size(chunk_size_sexp); + parquet___arrow___FileWriter__WriteTable(writer, table, chunk_size); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___arrow___FileWriter__WriteTable(SEXP writer_sexp, SEXP table_sexp, SEXP chunk_size_sexp){ + Rf_error("Cannot call parquet___arrow___FileWriter__WriteTable(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___arrow___FileWriter__Close(const std::unique_ptr& writer); +RcppExport SEXP _arrow_parquet___arrow___FileWriter__Close(SEXP writer_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type writer(writer_sexp); + parquet___arrow___FileWriter__Close(writer); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_parquet___arrow___FileWriter__Close(SEXP writer_sexp){ + Rf_error("Cannot call parquet___arrow___FileWriter__Close(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// parquet.cpp +#if defined(ARROW_R_WITH_ARROW) +void parquet___arrow___WriteTable(const std::shared_ptr& table, const std::shared_ptr& sink, const std::shared_ptr& properties, const std::shared_ptr& arrow_properties); +RcppExport SEXP _arrow_parquet___arrow___WriteTable(SEXP table_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter&>::type table(table_sexp); - Rcpp::traits::input_parameter::type filename(filename_sexp); - write_parquet_file(table, filename); + Rcpp::traits::input_parameter&>::type sink(sink_sexp); + Rcpp::traits::input_parameter&>::type properties(properties_sexp); + Rcpp::traits::input_parameter&>::type arrow_properties(arrow_properties_sexp); + parquet___arrow___WriteTable(table, sink, properties, arrow_properties); return R_NilValue; END_RCPP } #else -RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){ - Rf_error("Cannot call write_parquet_file(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_parquet___arrow___WriteTable(SEXP table_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){ + Rf_error("Cannot call parquet___arrow___WriteTable(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif @@ -4225,6 +4683,22 @@ RcppExport SEXP _arrow_Table__Slice2(SEXP table_sexp, SEXP offset_sexp, SEXP len } #endif +// table.cpp +#if defined(ARROW_R_WITH_ARROW) +bool Table__Equals(const std::shared_ptr& lhs, const std::shared_ptr& rhs); +RcppExport SEXP _arrow_Table__Equals(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter&>::type rhs(rhs_sexp); + return Rcpp::wrap(Table__Equals(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Table__Equals(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call Table__Equals(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // table.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Table__GetColumnByName(const std::shared_ptr& table, const std::string& name); @@ -4361,6 +4835,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, + { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, @@ -4371,7 +4846,9 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ChunkArray__Slice2", (DL_FUNC) &_arrow_ChunkArray__Slice2, 3}, { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, + { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, + { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, { "_arrow_compute___CastOptions__initialize", (DL_FUNC) &_arrow_compute___CastOptions__initialize, 3}, @@ -4536,7 +5013,32 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, - { "_arrow_write_parquet_file", (DL_FUNC) &_arrow_write_parquet_file, 2}, + { "_arrow_parquet___default_arrow_writer_properties", (DL_FUNC) &_arrow_parquet___default_arrow_writer_properties, 0}, + { "_arrow_parquet___ArrowWriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__create, 0}, + { "_arrow_parquet___ArrowWriterProperties___Builder__store_schema", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__store_schema, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__coerce_timestamps, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps, 1}, + { "_arrow_parquet___ArrowWriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__build, 1}, + { 
"_arrow_parquet___default_writer_properties", (DL_FUNC) &_arrow_parquet___default_writer_properties, 0}, + { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, + { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_compression", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_compression, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_compression_level", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_compression_level, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_write_statistics, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__default_use_dictionary, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, + { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, + { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, + { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, + { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, + { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, @@ -4588,6 +5090,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, + { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 2}, { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, { "_arrow_Table__select", (DL_FUNC) &_arrow_Table__select, 2}, { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 2}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index f8c52fa0716..69f5cb39e62 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -207,6 +207,7 @@ RCPP_EXPOSED_ENUM_NODECL(arrow::io::FileMode::type) RCPP_EXPOSED_ENUM_NODECL(arrow::ipc::Message::Type) 
RCPP_EXPOSED_ENUM_NODECL(arrow::Compression::type) RCPP_EXPOSED_ENUM_NODECL(arrow::fs::FileType) +RCPP_EXPOSED_ENUM_NODECL(parquet::ParquetVersion::type) SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array); SEXP Array__as_vector(const std::shared_ptr& array); diff --git a/r/src/buffer.cpp b/r/src/buffer.cpp index 00df28d12ea..09ab39a5f98 100644 --- a/r/src/buffer.cpp +++ b/r/src/buffer.cpp @@ -62,4 +62,10 @@ Rcpp::RawVector Buffer__data(const std::shared_ptr& buffer) { return Rcpp::RawVector(buffer->data(), buffer->data() + buffer->size()); } +// [[arrow::export]] +bool Buffer__Equals(const std::shared_ptr& x, + const std::shared_ptr& y) { + return x->Equals(*y.get()); +} + #endif diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index 317728757a7..aef2a0eca21 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -80,4 +80,10 @@ void ChunkedArray__Validate(const std::shared_ptr& chunked_ STOP_IF_NOT_OK(chunked_array->Validate()); } +// [[arrow::export]] +bool ChunkedArray__Equals(const std::shared_ptr& x, + const std::shared_ptr& y) { + return x->Equals(y); +} + #endif diff --git a/r/src/compression.cpp b/r/src/compression.cpp index 4f9bc1772f4..4e6ec3105d7 100644 --- a/r/src/compression.cpp +++ b/r/src/compression.cpp @@ -27,6 +27,11 @@ std::unique_ptr util___Codec__Create(arrow::Compression::typ return out; } +// [[arrow::export]] +std::string util___Codec__name(const std::unique_ptr& codec) { + return codec->name(); +} + // [[arrow::export]] std::shared_ptr io___CompressedOutputStream__Make( const std::unique_ptr& codec, diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp index 9f9216a9522..add820b7dee 100644 --- a/r/src/parquet.cpp +++ b/r/src/parquet.cpp @@ -83,12 +83,213 @@ std::shared_ptr parquet___arrow___FileReader__ReadTable2( } // [[arrow::export]] -void write_parquet_file(const std::shared_ptr& table, - std::string filename) { - std::shared_ptr sink; - PARQUET_THROW_NOT_OK(arrow::io::FileOutputStream::Open(filename, &sink)); +std::shared_ptr +parquet___default_arrow_writer_properties() { + return parquet::default_arrow_writer_properties(); +} + +// [[arrow::export]] +std::shared_ptr +parquet___ArrowWriterProperties___Builder__create() { + return std::make_shared(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__store_schema( + const std::shared_ptr& builder) { + builder->store_schema(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__enable_deprecated_int96_timestamps( + const std::shared_ptr& builder) { + builder->enable_deprecated_int96_timestamps(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__disable_deprecated_int96_timestamps( + const std::shared_ptr& builder) { + builder->disable_deprecated_int96_timestamps(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__coerce_timestamps( + const std::shared_ptr& builder, + arrow::TimeUnit::type unit) { + builder->coerce_timestamps(unit); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__allow_truncated_timestamps( + const std::shared_ptr& builder) { + builder->allow_truncated_timestamps(); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__disallow_truncated_timestamps( + const std::shared_ptr& builder) { + builder->disallow_truncated_timestamps(); +} + +// [[arrow::export]] +std::shared_ptr +parquet___ArrowWriterProperties___Builder__build( + const std::shared_ptr& builder) { + return builder->build(); +} + +// [[arrow::export]] 
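// (illustrative note, not part of the original patch) Functions tagged with
// `// [[arrow::export]]` in this file are assumed to be picked up by the package's
// code-generation step, which emits the matching `_arrow_*` wrappers, the
// ARROW_R_WITH_ARROW fallback stubs, and the CallEntries registrations shown earlier
// in RcppExports.cpp; on the R side each one is then reachable via something like
//   .Call(`_arrow_parquet___default_writer_properties`)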
+std::shared_ptr parquet___default_writer_properties() { + return parquet::default_writer_properties(); +} + +// [[arrow::export]] +std::shared_ptr +parquet___WriterProperties___Builder__create() { + return std::make_shared(); +} + +// [[arrow::export]] +void parquet___WriterProperties___Builder__version( + const std::shared_ptr& builder, + const parquet::ParquetVersion::type& version) { + builder->version(version); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_compression( + const std::shared_ptr& builder, + const arrow::Compression::type& compression) { + builder->compression(compression); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_compressions( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::IntegerVector& types) { + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + builder->compression(paths[i], static_cast(types[i])); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_compression_level( + const std::shared_ptr& builder, + int compression_level) { + builder->compression_level(compression_level); +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_compression_levels( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::IntegerVector& levels) { + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + builder->compression_level(paths[i], levels[i]); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_write_statistics( + const std::shared_ptr& builder, + bool write_statistics) { + if (write_statistics) { + builder->enable_statistics(); + } else { + builder->disable_statistics(); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__default_use_dictionary( + const std::shared_ptr& builder, + bool use_dictionary) { + if (use_dictionary) { + builder->enable_dictionary(); + } else { + builder->disable_dictionary(); + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_use_dictionary( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::LogicalVector& use_dictionary) { + builder->disable_dictionary(); + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + if (use_dictionary[i]) { + builder->enable_dictionary(paths[i]); + } else { + builder->disable_dictionary(paths[i]); + } + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__set_write_statistics( + const std::shared_ptr& builder, + const std::vector& paths, const Rcpp::LogicalVector& write_statistics) { + builder->disable_statistics(); + auto n = paths.size(); + for (decltype(n) i = 0; i < n; i++) { + if (write_statistics[i]) { + builder->enable_statistics(paths[i]); + } else { + builder->disable_statistics(paths[i]); + } + } +} + +// [[arrow::export]] +void parquet___ArrowWriterProperties___Builder__data_page_size( + const std::shared_ptr& builder, + int64_t data_page_size) { + builder->data_pagesize(data_page_size); +} + +// [[arrow::export]] +std::shared_ptr parquet___WriterProperties___Builder__build( + const std::shared_ptr& builder) { + return builder->build(); +} + +// [[arrow::export]] +std::unique_ptr parquet___arrow___ParquetFileWriter__Open( + const std::shared_ptr& schema, + const std::shared_ptr& sink, + const std::shared_ptr& properties, + const std::shared_ptr& arrow_properties) { + std::unique_ptr writer; + PARQUET_THROW_NOT_OK( + 
parquet::arrow::FileWriter::Open(*schema, arrow::default_memory_pool(), sink, + properties, arrow_properties, &writer)); + return writer; +} + +// [[arrow::export]] +void parquet___arrow___FileWriter__WriteTable( + const std::unique_ptr& writer, + const std::shared_ptr& table, int64_t chunk_size) { + PARQUET_THROW_NOT_OK(writer->WriteTable(*table, chunk_size)); +} + +// [[arrow::export]] +void parquet___arrow___FileWriter__Close( + const std::unique_ptr& writer) { + PARQUET_THROW_NOT_OK(writer->Close()); +} + +// [[arrow::export]] +void parquet___arrow___WriteTable( + const std::shared_ptr& table, + const std::shared_ptr& sink, + const std::shared_ptr& properties, + const std::shared_ptr& arrow_properties) { PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), - sink, table->num_rows())); + sink, table->num_rows(), properties, + arrow_properties)); } // [[arrow::export]] diff --git a/r/src/table.cpp b/r/src/table.cpp index a78f1196294..e17db49ffb5 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -75,17 +75,23 @@ std::vector Table__ColumnNames(const std::shared_ptr& } // [[arrow::export]] -std::shared_ptr Table__Slice1( - const std::shared_ptr& table, int offset) { +std::shared_ptr Table__Slice1(const std::shared_ptr& table, + int offset) { return table->Slice(offset); } // [[arrow::export]] -std::shared_ptr Table__Slice2( - const std::shared_ptr& table, int offset, int length) { +std::shared_ptr Table__Slice2(const std::shared_ptr& table, + int offset, int length) { return table->Slice(offset, length); } +// [[arrow::export]] +bool Table__Equals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + return lhs->Equals(*rhs.get()); +} + // [[arrow::export]] std::shared_ptr Table__GetColumnByName( const std::shared_ptr& table, const std::string& name) { diff --git a/r/tests/testthat/helper-parquet.R b/r/tests/testthat/helper-parquet.R new file mode 100644 index 00000000000..1eec5b08862 --- /dev/null +++ b/r/tests/testthat/helper-parquet.R @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_parquet_roundtrip <- function(tab, ...) { + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(tab, tf, ...) 
+ expect_equal(read_parquet(tf, as_data_frame = FALSE), tab) +} diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 95ef12d4c86..016b137f04c 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -52,6 +52,11 @@ test_that("Array", { expect_equal(z_dbl$as_vector(), as.numeric(4:5)) }) +test_that("Array print method includes type", { + x <- Array$create(c(1:10, 1:10, 1:5)) + expect_output(print(x), "Array\n\n[\n", fixed = TRUE) +}) + test_that("Array supports NA", { x_int <- Array$create(as.integer(c(1:10, NA))) x_dbl <- Array$create(as.numeric(c(1:10, NA))) @@ -257,7 +262,7 @@ test_that("array supports integer64", { expect_true(a$IsNull(3L)) }) -test_that("array$as_vector() correctly handles all NA inte64 (ARROW-3795)", { +test_that("array$as_vector() correctly handles all NA int64 (ARROW-3795)", { x <- bit64::as.integer64(NA) a <- Array$create(x) expect_true(is.na(a$as_vector())) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 83959527f9c..6ee630061df 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -28,13 +28,13 @@ test_that("RecordBatch", { ) batch <- record_batch(tbl) - expect_true(batch == batch) + expect_equal(batch, batch) expect_equal( batch$schema, schema( int = int32(), dbl = float64(), lgl = boolean(), chr = utf8(), - fct = dictionary(int32(), Array$create(letters[1:10])) + fct = dictionary(int8(), utf8()) ) ) expect_equal(batch$num_columns, 5L) @@ -69,12 +69,12 @@ test_that("RecordBatch", { col_fct <- batch$column(4) expect_true(inherits(col_fct, 'Array')) expect_equal(col_fct$as_vector(), tbl$fct) - expect_equal(col_fct$type, dictionary(int32(), Array$create(letters[1:10]))) + expect_equal(col_fct$type, dictionary(int8(), utf8())) batch2 <- batch$RemoveColumn(0) expect_equal( batch2$schema, - schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int32(), Array$create(letters[1:10]))) + schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) ) expect_equal(batch2$column(0), batch$column(1)) expect_identical(as.data.frame(batch2), tbl[,-1]) @@ -120,6 +120,23 @@ test_that("head and tail on RecordBatch", { expect_identical(as.data.frame(tail(batch, -4)), tail(tbl, -4)) }) +test_that("RecordBatch print method", { + expect_output( + print(batch), + paste( + "RecordBatch", + "10 rows x 5 columns", + "$int ", + "$dbl ", + "$lgl ", + "$chr ", + "$fct >", + sep = "\n" + ), + fixed = TRUE + ) +}) + test_that("RecordBatch with 0 rows are supported", { tbl <- tibble::tibble( int = integer(), @@ -139,7 +156,7 @@ test_that("RecordBatch with 0 rows are supported", { dbl = float64(), lgl = boolean(), chr = utf8(), - fct = dictionary(int32(), Array$create(c("a", "b"))) + fct = dictionary(int8(), utf8()) ) ) }) @@ -191,10 +208,11 @@ test_that("record_batch() handles data frame columns", { tib <- tibble::tibble(x = 1:10, y = 1:10) # because tib is named here, this becomes a struct array batch <- record_batch(a = 1:10, b = tib) - expect_equal(batch$schema, + expect_equal( + batch$schema, schema( a = int32(), - struct(x = int32(), y = int32()) + b = struct(x = int32(), y = int32()) ) ) out <- as.data.frame(batch) @@ -202,7 +220,8 @@ test_that("record_batch() handles data frame columns", { # if not named, columns from tib are auto spliced batch2 <- record_batch(a = 1:10, tib) - expect_equal(batch$schema, + expect_equal( + batch2$schema, schema(a = int32(), x = int32(), y = int32()) ) out <- 
as.data.frame(batch2) @@ -256,3 +275,4 @@ test_that("record_batch() only auto splice data frames", { regexp = "only data frames are allowed as unnamed arguments to be auto spliced" ) }) + diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 372af7d07bc..61fb5465423 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -111,6 +111,15 @@ test_that("[, [[, $ for Table", { }) test_that("head and tail on Table", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + expect_identical(as.data.frame(head(tab)), head(tbl)) expect_identical(as.data.frame(head(tab, 4)), head(tbl, 4)) expect_identical(as.data.frame(head(tab, -4)), head(tbl, -4)) @@ -119,7 +128,33 @@ test_that("head and tail on Table", { expect_identical(as.data.frame(tail(tab, -4)), tail(tbl, -4)) }) +test_that("Table print method", { + expect_output( + print(tab), + paste( + "Table", + "10 rows x 5 columns", + "$int ", + "$dbl ", + "$lgl ", + "$chr ", + "$fct >", + sep = "\n" + ), + fixed = TRUE + ) +}) + test_that("table active bindings", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + expect_identical(dim(tbl), dim(tab)) expect_is(tab$columns, "list") expect_equal(tab$columns[[1]], tab[[1]]) @@ -179,3 +214,23 @@ test_that("table() auto splices (ARROW-5718)", { expect_equal(tab3$schema, s) expect_equivalent(as.data.frame(tab3), df) }) + +test_that("==.Table", { + tab1 <- Table$create(x = 1:2, y = c("a", "b")) + tab2 <- Table$create(x = 1:2, y = c("a", "b")) + tab3 <- Table$create(x = 1:2) + tab4 <- Table$create(x = 1:2, y = c("a", "b"), z = 3:4) + + expect_true(tab1 == tab2) + expect_true(tab2 == tab1) + + expect_false(tab1 == tab3) + expect_false(tab3 == tab1) + + expect_false(tab1 == tab4) + expect_false(tab4 == tab1) + + expect_true(all.equal(tab1, tab2)) + expect_equal(tab1, tab2) +}) + diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index ff3daef7936..0e2a21ba5e6 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -27,8 +27,8 @@ test_that("ChunkedArray", { y <- x$Slice(8) expect_equal(y$type, int32()) expect_equal(y$num_chunks, 3L) - expect_equal(y$length(), 17L) - expect_equal(y$as_vector(), c(9:10, 1:10, 1:5)) + expect_equal(length(y), 17L) + expect_equal(as.vector(y), c(9:10, 1:10, 1:5)) z <- x$Slice(8, 5) expect_equal(z$type, int32()) @@ -55,6 +55,55 @@ test_that("ChunkedArray", { expect_equal(z_dbl$as_vector(), as.numeric(3:4)) }) +test_that("print ChunkedArray", { + x1 <- chunked_array(c(1,2,3), c(4,5,6)) + expect_output( + print(x1), + paste( + "ChunkedArray", + "", + "[", + " 1,", + " 2,", + " 3,", + " ...", + "]", + sep = "\n" + ), + fixed = TRUE + ) + x2 <- chunked_array(1:30, c(4,5,6)) + expect_output( + print(x2), + paste( + "ChunkedArray", + "", + "[", + " 1,", + " 2,", + " 3,", + " 4,", + " 5,", + " 6,", + " 7,", + " 8,", + " 9,", + " 10,", + " ...", + "]", + sep = "\n" + ), + fixed = TRUE + ) + # If there's only one chunk, it should look like a regular Array + x3 <- chunked_array(1:30) + expect_output( + print(x3), + paste0("Chunked", paste(capture.output(print(Array$create(1:30))), collapse = "\n")), + fixed = TRUE + ) +}) + test_that("ChunkedArray handles !!! 
splicing", { data <- list(1, 2, 3) x <- chunked_array(!!!data) diff --git a/r/tests/testthat/test-compressed.R b/r/tests/testthat/test-compressed.R index 8bf1092616e..dedb1a8c84a 100644 --- a/r/tests/testthat/test-compressed.R +++ b/r/tests/testthat/test-compressed.R @@ -36,7 +36,6 @@ test_that("can write Buffer to CompressedOutputStream and read back in Compresse stream2$close() sink2$close() - input1 <- CompressedInputStream$create(tf1) buf1 <- input1$Read(1024L) diff --git a/r/tests/testthat/test-data-type.R b/r/tests/testthat/test-data-type.R index fd8bef1fc79..5a408ed3f6c 100644 --- a/r/tests/testthat/test-data-type.R +++ b/r/tests/testthat/test-data-type.R @@ -367,7 +367,9 @@ test_that("DictionaryType works as expected (ARROW-3355)", { expect_false(d == int32()) expect_equal(d$id, Type$DICTIONARY) expect_equal(d$bit_width, 32L) - expect_equal(d$ToString(), "dictionary") + expect_equal(d$ToString(), "dictionary") expect_equal(d$index_type, int32()) expect_equal(d$value_type, utf8()) + ord <- dictionary(ordered = TRUE) + expect_equal(ord$ToString(), "dictionary") }) diff --git a/r/tests/testthat/test-field.R b/r/tests/testthat/test-field.R index d7de087d12f..8a4c88f46dd 100644 --- a/r/tests/testthat/test-field.R +++ b/r/tests/testthat/test-field.R @@ -28,3 +28,11 @@ test_that("field() factory", { test_that("Field validation", { expect_error(schema(b = 32), "b must be arrow::DataType, not numeric") }) + +test_that("Print method for field", { + expect_output(print(field("x", int32())), "Field\nx: int32") + expect_output( + print(field("zz", dictionary())), + "Field\nzz: dictionary" + ) +}) diff --git a/r/tests/testthat/test-filesystem.R b/r/tests/testthat/test-filesystem.R index 1c198383066..c0565adcfc0 100644 --- a/r/tests/testthat/test-filesystem.R +++ b/r/tests/testthat/test-filesystem.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-context("test-type") +context("File system") test_that("LocalFilesystem", { fs <- LocalFileSystem$create() @@ -83,7 +83,6 @@ test_that("SubTreeFilesystem", { expect_is(st_fs, "FileSystem") st_fs$CreateDir("test") st_fs$CopyFile("DESCRIPTION", "DESC.txt") - skip_on_os("windows") # See ARROW-6622 stats <- st_fs$GetTargetStats(c("DESCRIPTION", "test", "nope", "DESC.txt")) expect_equal(stats[[1L]]$type, FileType$File) expect_equal(stats[[2L]]$type, FileType$Directory) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index fd6f40fcd56..18aa4298d46 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -49,3 +49,46 @@ test_that("read_parquet() with raw data", { df <- read_parquet(test_raw) expect_identical(dim(df), c(10L, 11L)) }) + +test_that("write_parquet() handles various compression= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "snappy") + expect_parquet_roundtrip(tab, compression = rep("snappy", 3L)) + expect_parquet_roundtrip(tab, compression = c(x1 = "snappy", x2 = "snappy")) +}) + +test_that("write_parquet() handles various compression_level= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = 4) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = rep(4L, 3L)) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L)) +}) + +test_that("write_parquet() handles various use_dictionary= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, use_dictionary = TRUE) + expect_parquet_roundtrip(tab, use_dictionary = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, use_dictionary = c(x1 = TRUE, x2 = TRUE)) +}) + +test_that("write_parquet() handles various write_statistics= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, write_statistics = TRUE) + expect_parquet_roundtrip(tab, write_statistics = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, write_statistics = c(x1 = TRUE, x2 = TRUE)) +}) + +test_that("make_valid_version()", { + expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0) +}) diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index ec56d6a783b..dcda9ce4277 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -97,7 +97,7 @@ test_that("table round trip handles NA in integer and numeric", { expect_equal(tab$column(0)$type, int32()) expect_equal(tab$column(1)$type, float64()) - expect_equal(tab$column(2)$type, int8()) + expect_equal(tab$column(2)$type, uint8()) tf <- tempfile() write_arrow(tbl, tf) diff --git a/r/tests/testthat/test-record-batch-reader.R b/r/tests/testthat/test-record-batch-reader.R index b557f0669ac..2851538010b 100644 --- a/r/tests/testthat/test-record-batch-reader.R +++ b/r/tests/testthat/test-record-batch-reader.R @@ -18,16 +18,20 @@ context("RecordBatch.*(Reader|Writer)") test_that("RecordBatchStreamReader / Writer", { - batch <- record_batch( + tbl <- 
tibble::tibble( x = 1:10, y = letters[1:10] ) + batch <- record_batch(tbl) + tab <- Table$create(tbl) sink <- BufferOutputStream$create() expect_equal(sink$tell(), 0) writer <- RecordBatchStreamWriter$create(sink, batch$schema) expect_is(writer, "RecordBatchStreamWriter") - writer$write_batch(batch) + writer$write(batch) + writer$write(tab) + writer$write(tbl) expect_true(sink$tell() > 0) writer$close() @@ -40,20 +44,22 @@ test_that("RecordBatchStreamReader / Writer", { batch1 <- reader$read_next_batch() expect_is(batch1, "RecordBatch") expect_equal(batch, batch1) - + batch2 <- reader$read_next_batch() + expect_is(batch2, "RecordBatch") + expect_equal(batch, batch2) + batch3 <- reader$read_next_batch() + expect_is(batch3, "RecordBatch") + expect_equal(batch, batch3) expect_null(reader$read_next_batch()) }) test_that("RecordBatchFileReader / Writer", { - batch <- record_batch( - x = 1:10, - y = letters[1:10] - ) - sink <- BufferOutputStream$create() writer <- RecordBatchFileWriter$create(sink, batch$schema) expect_is(writer, "RecordBatchFileWriter") - writer$write_batch(batch) + writer$write(batch) + writer$write(tab) + writer$write(tbl) writer$close() buf <- sink$getvalue() @@ -66,5 +72,5 @@ test_that("RecordBatchFileReader / Writer", { expect_is(batch1, "RecordBatch") expect_equal(batch, batch1) - expect_equal(reader$num_record_batches, 1) + expect_equal(reader$num_record_batches, 3) }) diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 19934c6e472..f50b0783db6 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -35,7 +35,7 @@ test_that("type() infers from R type", { expect_equal(type(""), utf8()) expect_equal( type(iris$Species), - dictionary(int8(), Array$create(levels(iris$Species)), FALSE) + dictionary(int8(), utf8(), FALSE) ) expect_equal( type(lubridate::ymd_hms("2019-02-14 13:55:05")), diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index cd7f1073cbc..636cd767120 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1458,5 +1458,4 @@ mod tests { let mut builder = StructBuilder::new(fields, field_builders); assert!(builder.field_builder::(0).is_none()); } - } diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs index cd05b595531..76568ae33b5 100644 --- a/rust/arrow/src/bitmap.rs +++ b/rust/arrow/src/bitmap.rs @@ -55,6 +55,10 @@ impl Bitmap { assert!(i < (self.bits.len() << 3)); unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i) } } + + pub fn to_buffer(self) -> Buffer { + self.bits + } } impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { @@ -122,5 +126,4 @@ mod tests { assert_eq!(true, bitmap.is_set(6)); assert_eq!(false, bitmap.is_set(7)); } - } diff --git a/rust/datafusion/README.md b/rust/datafusion/README.md index 44463eb4105..7058a3b1b43 100644 --- a/rust/datafusion/README.md +++ b/rust/datafusion/README.md @@ -41,7 +41,7 @@ cargo run --bin datafusion-cli ``` ##### Use Dockerfile ```sh -git clone https://github/apache/arrow +git clone https://github.com/apache/arrow cd arrow docker build -f rust/datafusion/Dockerfile . 
--tag datafusion-cli docker run -it -v $(your_data_location):/data datafusion-cli diff --git a/rust/datafusion/src/datasource/parquet.rs b/rust/datafusion/src/datasource/parquet.rs index d5da6ad68c4..6447d8b48ca 100644 --- a/rust/datafusion/src/datasource/parquet.rs +++ b/rust/datafusion/src/datasource/parquet.rs @@ -37,23 +37,27 @@ use parquet::file::reader::*; use crate::datasource::{ScanResult, TableProvider}; use crate::error::{ExecutionError, Result}; +use crate::execution::physical_plan::common; use crate::execution::physical_plan::BatchIterator; /// Table-based representation of a `ParquetFile` pub struct ParquetTable { - filename: String, + filenames: Vec, schema: Arc, } impl ParquetTable { /// Attempt to initialize a new `ParquetTable` from a file path - pub fn try_new(filename: &str) -> Result { - let parquet_file = ParquetFile::open(filename, None, 0)?; - let schema = parquet_file.projection_schema.clone(); - Ok(Self { - filename: filename.to_string(), - schema, - }) + pub fn try_new(path: &str) -> Result { + let mut filenames: Vec = vec![]; + common::build_file_list(path, &mut filenames, ".parquet")?; + if filenames.is_empty() { + Err(ExecutionError::General("No files found".to_string())) + } else { + let parquet_file = ParquetFile::open(&filenames[0], None, 0)?; + let schema = parquet_file.projection_schema.clone(); + Ok(Self { filenames, schema }) + } } } @@ -70,17 +74,16 @@ impl TableProvider for ParquetTable { projection: &Option>, batch_size: usize, ) -> Result> { - // note that this code currently assumes the filename is a file rather than a directory - // and therefore only returns a single partition - let parquet_file = match projection { - Some(p) => ParquetScanPartition::try_new( - &self.filename, - Some(p.clone()), - batch_size, - )?, - None => ParquetScanPartition::try_new(&self.filename, None, batch_size)?, - }; - Ok(vec![Arc::new(Mutex::new(parquet_file))]) + Ok(self + .filenames + .iter() + .map(|filename| { + ParquetScanPartition::try_new(filename, projection.clone(), batch_size) + .and_then(|part| { + Ok(Arc::new(Mutex::new(part)) as Arc>) + }) + }) + .collect::>>()?) 
} } @@ -241,7 +244,7 @@ where builder.append_slice(&converted_buffer[0..values_read])?; } else { let mut value_index = 0; - for i in 0..def_levels.len() { + for i in 0..levels_read { if def_levels[i] != 0 { builder.append_value(converted_buffer[value_index].into())?; value_index += 1; diff --git a/rust/datafusion/src/execution/aggregate.rs b/rust/datafusion/src/execution/aggregate.rs index e96b12f6d9e..1795f492c5a 100644 --- a/rust/datafusion/src/execution/aggregate.rs +++ b/rust/datafusion/src/execution/aggregate.rs @@ -1471,5 +1471,4 @@ mod tests { ds, ))))) } - } diff --git a/rust/datafusion/src/execution/context.rs b/rust/datafusion/src/execution/context.rs index e9c368d3a41..dc54b9978ec 100644 --- a/rust/datafusion/src/execution/context.rs +++ b/rust/datafusion/src/execution/context.rs @@ -37,10 +37,13 @@ use crate::execution::filter::FilterRelation; use crate::execution::limit::LimitRelation; use crate::execution::physical_plan::common; use crate::execution::physical_plan::datasource::DatasourceExec; -use crate::execution::physical_plan::expressions::{Column, Sum}; +use crate::execution::physical_plan::expressions::{ + BinaryExpr, CastExpr, Column, Literal, Sum, +}; use crate::execution::physical_plan::hash_aggregate::HashAggregateExec; use crate::execution::physical_plan::merge::MergeExec; use crate::execution::physical_plan::projection::ProjectionExec; +use crate::execution::physical_plan::selection::SelectionExec; use crate::execution::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr}; use crate::execution::projection::ProjectRelation; use crate::execution::relation::{DataSourceRelation, Relation}; @@ -280,6 +283,12 @@ impl ExecutionContext { schema.clone(), )?)) } + LogicalPlan::Selection { input, expr, .. } => { + let input = self.create_physical_plan(input, batch_size)?; + let input_schema = input.as_ref().schema().clone(); + let runtime_expr = self.create_physical_expr(expr, &input_schema)?; + Ok(Arc::new(SelectionExec::try_new(runtime_expr, input)?)) + } _ => Err(ExecutionError::General( "Unsupported logical plan variant".to_string(), )), @@ -290,13 +299,25 @@ impl ExecutionContext { pub fn create_physical_expr( &self, e: &Expr, - _input_schema: &Schema, + input_schema: &Schema, ) -> Result> { match e { Expr::Column(i) => Ok(Arc::new(Column::new(*i))), - _ => Err(ExecutionError::NotImplemented( - "Unsupported expression".to_string(), - )), + Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), + Expr::BinaryExpr { left, op, right } => Ok(Arc::new(BinaryExpr::new( + self.create_physical_expr(left, input_schema)?, + op.clone(), + self.create_physical_expr(right, input_schema)?, + ))), + Expr::Cast { expr, data_type } => Ok(Arc::new(CastExpr::try_new( + self.create_physical_expr(expr, input_schema)?, + input_schema, + data_type.clone(), + )?)), + other => Err(ExecutionError::NotImplemented(format!( + "Physical plan does not support logical expression {:?}", + other + ))), } } @@ -569,6 +590,29 @@ mod tests { Ok(()) } + #[test] + fn parallel_selection() -> Result<()> { + let tmp_dir = TempDir::new("parallel_selection")?; + let partition_count = 4; + let mut ctx = create_ctx(&tmp_dir, partition_count)?; + + let logical_plan = + ctx.create_logical_plan("SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3")?; + let logical_plan = ctx.optimize(&logical_plan)?; + + let physical_plan = ctx.create_physical_plan(&logical_plan, 1024)?; + + let results = ctx.collect(physical_plan.as_ref())?; + + // there should be one batch per partition + assert_eq!(results.len(), 
partition_count); + + let row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + assert_eq!(row_count, 20); + + Ok(()) + } + #[test] fn aggregate() -> Result<()> { let results = execute("SELECT SUM(c1), SUM(c2) FROM test", 4)?; @@ -638,5 +682,4 @@ mod tests { Ok(ctx) } - } diff --git a/rust/datafusion/src/execution/physical_plan/common.rs b/rust/datafusion/src/execution/physical_plan/common.rs index e6ba826f0e9..60872b06e47 100644 --- a/rust/datafusion/src/execution/physical_plan/common.rs +++ b/rust/datafusion/src/execution/physical_plan/common.rs @@ -17,9 +17,11 @@ //! Defines common code used in execution plans +use std::fs; +use std::fs::metadata; use std::sync::{Arc, Mutex}; -use crate::error::Result; +use crate::error::{ExecutionError, Result}; use crate::execution::physical_plan::BatchIterator; use arrow::datatypes::Schema; @@ -75,3 +77,30 @@ pub fn collect(it: Arc>) -> Result> { } } } + +/// Recursively build a list of files in a directory with a given extension +pub fn build_file_list(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { + let metadata = metadata(dir)?; + if metadata.is_file() { + if dir.ends_with(ext) { + filenames.push(dir.to_string()); + } + } else { + for entry in fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + if let Some(path_name) = path.to_str() { + if path.is_dir() { + build_file_list(path_name, filenames, ext)?; + } else { + if path_name.ends_with(ext) { + filenames.push(path_name.to_string()); + } + } + } else { + return Err(ExecutionError::General("Invalid path".to_string())); + } + } + } + Ok(()) +} diff --git a/rust/datafusion/src/execution/physical_plan/csv.rs b/rust/datafusion/src/execution/physical_plan/csv.rs index 306718fe5c4..a07417cf22e 100644 --- a/rust/datafusion/src/execution/physical_plan/csv.rs +++ b/rust/datafusion/src/execution/physical_plan/csv.rs @@ -17,12 +17,11 @@ //! Execution plan for reading CSV files -use std::fs; -use std::fs::metadata; use std::fs::File; use std::sync::{Arc, Mutex}; -use crate::error::{ExecutionError, Result}; +use crate::error::Result; +use crate::execution::physical_plan::common; use crate::execution::physical_plan::{BatchIterator, ExecutionPlan, Partition}; use arrow::csv; use arrow::datatypes::Schema; @@ -51,7 +50,7 @@ impl ExecutionPlan for CsvExec { /// Get the partitions for this execution plan. Each partition can be executed in parallel. fn partitions(&self) -> Result>> { let mut filenames: Vec = vec![]; - self.build_file_list(&self.path, &mut filenames)?; + common::build_file_list(&self.path, &mut filenames, ".csv")?; let partitions = filenames .iter() .map(|filename| { @@ -85,33 +84,6 @@ impl CsvExec { batch_size, }) } - - /// Recursively build a list of csv files in a directory - fn build_file_list(&self, dir: &str, filenames: &mut Vec) -> Result<()> { - let metadata = metadata(dir)?; - if metadata.is_file() { - if dir.ends_with(".csv") { - filenames.push(dir.to_string()); - } - } else { - for entry in fs::read_dir(dir)? 
{ - let entry = entry?; - let path = entry.path(); - if let Some(path_name) = path.to_str() { - if path.is_dir() { - self.build_file_list(path_name, filenames)?; - } else { - if path_name.ends_with(".csv") { - filenames.push(path_name.to_string()); - } - } - } else { - return Err(ExecutionError::General("Invalid path".to_string())); - } - } - } - Ok(()) - } } /// CSV Partition diff --git a/rust/datafusion/src/execution/physical_plan/expressions.rs b/rust/datafusion/src/execution/physical_plan/expressions.rs index fe1a7de6d8f..9e0741c263c 100644 --- a/rust/datafusion/src/execution/physical_plan/expressions.rs +++ b/rust/datafusion/src/execution/physical_plan/expressions.rs @@ -23,16 +23,18 @@ use std::sync::Arc; use crate::error::{ExecutionError, Result}; use crate::execution::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; -use crate::logicalplan::ScalarValue; +use crate::logicalplan::{Operator, ScalarValue}; use arrow::array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::array::{ Float32Builder, Float64Builder, Int16Builder, Int32Builder, Int64Builder, Int8Builder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, }; +use arrow::compute::kernels::boolean::{and, or}; use arrow::compute::kernels::cast::cast; +use arrow::compute::kernels::comparison::{eq, gt, gt_eq, lt, lt_eq, neq}; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; @@ -197,6 +199,140 @@ pub fn sum(expr: Arc) -> Arc { Arc::new(Sum::new(expr)) } +/// Invoke a compute kernel on a pair of arrays +macro_rules! compute_op { + ($LEFT:expr, $RIGHT:expr, $OP:ident, $DT:ident) => {{ + let ll = $LEFT + .as_any() + .downcast_ref::<$DT>() + .expect("compute_op failed to downcast array"); + let rr = $RIGHT + .as_any() + .downcast_ref::<$DT>() + .expect("compute_op failed to downcast array"); + Ok(Arc::new($OP(&ll, &rr)?)) + }}; +} + +/// Invoke a compute kernel on a pair of arrays +macro_rules! comparison_op { + ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ + match $LEFT.data_type() { + DataType::Int8 => compute_op!($LEFT, $RIGHT, $OP, Int8Array), + DataType::Int16 => compute_op!($LEFT, $RIGHT, $OP, Int16Array), + DataType::Int32 => compute_op!($LEFT, $RIGHT, $OP, Int32Array), + DataType::Int64 => compute_op!($LEFT, $RIGHT, $OP, Int64Array), + DataType::UInt8 => compute_op!($LEFT, $RIGHT, $OP, UInt8Array), + DataType::UInt16 => compute_op!($LEFT, $RIGHT, $OP, UInt16Array), + DataType::UInt32 => compute_op!($LEFT, $RIGHT, $OP, UInt32Array), + DataType::UInt64 => compute_op!($LEFT, $RIGHT, $OP, UInt64Array), + DataType::Float32 => compute_op!($LEFT, $RIGHT, $OP, Float32Array), + DataType::Float64 => compute_op!($LEFT, $RIGHT, $OP, Float64Array), + other => Err(ExecutionError::General(format!( + "Unsupported data type {:?}", + other + ))), + } + }}; +} + +/// Invoke a boolean kernel on a pair of arrays +macro_rules! 
boolean_op { + ($LEFT:expr, $RIGHT:expr, $OP:ident) => {{ + let ll = $LEFT + .as_any() + .downcast_ref::() + .expect("boolean_op failed to downcast array"); + let rr = $RIGHT + .as_any() + .downcast_ref::() + .expect("boolean_op failed to downcast array"); + Ok(Arc::new($OP(&ll, &rr)?)) + }}; +} +/// Binary expression +pub struct BinaryExpr { + left: Arc, + op: Operator, + right: Arc, +} + +impl BinaryExpr { + /// Create new binary expression + pub fn new( + left: Arc, + op: Operator, + right: Arc, + ) -> Self { + Self { left, op, right } + } +} + +impl PhysicalExpr for BinaryExpr { + fn name(&self) -> String { + format!("{:?}", self.op) + } + + fn data_type(&self, input_schema: &Schema) -> Result { + self.left.data_type(input_schema) + } + + fn evaluate(&self, batch: &RecordBatch) -> Result { + let left = self.left.evaluate(batch)?; + let right = self.right.evaluate(batch)?; + if left.data_type() != right.data_type() { + return Err(ExecutionError::General(format!( + "Cannot evaluate binary expression {:?} with types {:?} and {:?}", + self.op, + left.data_type(), + right.data_type() + ))); + } + match &self.op { + Operator::Lt => comparison_op!(left, right, lt), + Operator::LtEq => comparison_op!(left, right, lt_eq), + Operator::Gt => comparison_op!(left, right, gt), + Operator::GtEq => comparison_op!(left, right, gt_eq), + Operator::Eq => comparison_op!(left, right, eq), + Operator::NotEq => comparison_op!(left, right, neq), + Operator::And => { + if left.data_type() == &DataType::Boolean { + boolean_op!(left, right, and) + } else { + return Err(ExecutionError::General(format!( + "Cannot evaluate binary expression {:?} with types {:?} and {:?}", + self.op, + left.data_type(), + right.data_type() + ))); + } + } + Operator::Or => { + if left.data_type() == &DataType::Boolean { + boolean_op!(left, right, or) + } else { + return Err(ExecutionError::General(format!( + "Cannot evaluate binary expression {:?} with types {:?} and {:?}", + self.op, + left.data_type(), + right.data_type() + ))); + } + } + _ => Err(ExecutionError::General("Unsupported operator".to_string())), + } + } +} + +/// Create a binary expression +pub fn binary( + l: Arc, + op: Operator, + r: Arc, +) -> Arc { + Arc::new(BinaryExpr::new(l, op, r)) +} + /// CAST expression casts an expression to a specific data type pub struct CastExpr { /// The expression to cast @@ -335,6 +471,71 @@ mod tests { use arrow::array::BinaryArray; use arrow::datatypes::*; + #[test] + fn binary_comparison() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ]); + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + let b = Int32Array::from(vec![1, 2, 4, 8, 16]); + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(a), Arc::new(b)], + )?; + + // expression: "a < b" + let lt = binary(col(0), Operator::Lt, col(1)); + let result = lt.evaluate(&batch)?; + assert_eq!(result.len(), 5); + + let expected = vec![false, false, true, true, true]; + let result = result + .as_any() + .downcast_ref::() + .expect("failed to downcast to BooleanArray"); + for i in 0..5 { + assert_eq!(result.value(i), expected[i]); + } + + Ok(()) + } + + #[test] + fn binary_nested() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ]); + let a = Int32Array::from(vec![2, 4, 6, 8, 10]); + let b = Int32Array::from(vec![2, 5, 4, 8, 8]); + let batch = RecordBatch::try_new( + 
Arc::new(schema.clone()), + vec![Arc::new(a), Arc::new(b)], + )?; + + // expression: "a < b OR a == b" + let expr = binary( + binary(col(0), Operator::Lt, col(1)), + Operator::Or, + binary(col(0), Operator::Eq, col(1)), + ); + let result = expr.evaluate(&batch)?; + assert_eq!(result.len(), 5); + + let expected = vec![true, true, false, true, false]; + let result = result + .as_any() + .downcast_ref::() + .expect("failed to downcast to BooleanArray"); + for i in 0..5 { + print!("{}", i); + assert_eq!(result.value(i), expected[i]); + } + + Ok(()) + } + #[test] fn literal_i32() -> Result<()> { // create an arbitrary record bacth diff --git a/rust/datafusion/src/execution/physical_plan/hash_aggregate.rs b/rust/datafusion/src/execution/physical_plan/hash_aggregate.rs index 9c50b9f0825..491a81af855 100644 --- a/rust/datafusion/src/execution/physical_plan/hash_aggregate.rs +++ b/rust/datafusion/src/execution/physical_plan/hash_aggregate.rs @@ -720,5 +720,4 @@ mod tests { Ok(()) } - } diff --git a/rust/datafusion/src/execution/physical_plan/merge.rs b/rust/datafusion/src/execution/physical_plan/merge.rs index 9d1de2fd464..0ef8a39e8e6 100644 --- a/rust/datafusion/src/execution/physical_plan/merge.rs +++ b/rust/datafusion/src/execution/physical_plan/merge.rs @@ -134,5 +134,4 @@ mod tests { Ok(()) } - } diff --git a/rust/datafusion/src/execution/physical_plan/mod.rs b/rust/datafusion/src/execution/physical_plan/mod.rs index eb53392ad4d..f0c34c228db 100644 --- a/rust/datafusion/src/execution/physical_plan/mod.rs +++ b/rust/datafusion/src/execution/physical_plan/mod.rs @@ -88,3 +88,4 @@ pub mod expressions; pub mod hash_aggregate; pub mod merge; pub mod projection; +pub mod selection; diff --git a/rust/datafusion/src/execution/physical_plan/projection.rs b/rust/datafusion/src/execution/physical_plan/projection.rs index c1ca743002d..3cbf3d3887a 100644 --- a/rust/datafusion/src/execution/physical_plan/projection.rs +++ b/rust/datafusion/src/execution/physical_plan/projection.rs @@ -171,5 +171,4 @@ mod tests { Ok(()) } - } diff --git a/rust/datafusion/src/execution/physical_plan/selection.rs b/rust/datafusion/src/execution/physical_plan/selection.rs new file mode 100644 index 00000000000..7bea172c394 --- /dev/null +++ b/rust/datafusion/src/execution/physical_plan/selection.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the selection execution plan. 
A selection filters rows based on a predicate + +use std::sync::{Arc, Mutex}; + +use crate::error::{ExecutionError, Result}; +use crate::execution::physical_plan::{ + BatchIterator, ExecutionPlan, Partition, PhysicalExpr, +}; +use arrow::array::BooleanArray; +use arrow::compute::filter; +use arrow::datatypes::Schema; +use arrow::record_batch::RecordBatch; + +/// Execution plan for a Selection +pub struct SelectionExec { + /// The selection predicate expression + expr: Arc, + /// The input plan + input: Arc, +} + +impl SelectionExec { + /// Create a selection on an input + pub fn try_new( + expr: Arc, + input: Arc, + ) -> Result { + Ok(Self { + expr: expr.clone(), + input: input.clone(), + }) + } +} + +impl ExecutionPlan for SelectionExec { + /// Get the schema for this execution plan + fn schema(&self) -> Arc { + // The selection operator does not make any changes to the schema of its input + self.input.schema() + } + + /// Get the partitions for this execution plan + fn partitions(&self) -> Result>> { + let partitions: Vec> = self + .input + .partitions()? + .iter() + .map(|p| { + let expr = self.expr.clone(); + let partition: Arc = Arc::new(SelectionPartition { + schema: self.input.schema(), + expr, + input: p.clone() as Arc, + }); + + partition + }) + .collect(); + + Ok(partitions) + } +} + +/// Represents a single partition of a Selection execution plan +struct SelectionPartition { + schema: Arc, + expr: Arc, + input: Arc, +} + +impl Partition for SelectionPartition { + /// Execute the Selection + fn execute(&self) -> Result>> { + Ok(Arc::new(Mutex::new(SelectionIterator { + schema: self.schema.clone(), + expr: self.expr.clone(), + input: self.input.execute()?, + }))) + } +} + +/// Selection iterator +struct SelectionIterator { + schema: Arc, + expr: Arc, + input: Arc>, +} + +impl BatchIterator for SelectionIterator { + /// Get the schema + fn schema(&self) -> Arc { + self.schema.clone() + } + + /// Get the next batch + fn next(&mut self) -> Result> { + let mut input = self.input.lock().unwrap(); + match input.next()? 
{ + Some(batch) => { + // evaluate the selection predicate to get a boolean array + let predicate_result = self.expr.evaluate(&batch)?; + + if let Some(f) = predicate_result.as_any().downcast_ref::() + { + // filter each array + let mut filtered_arrays = vec![]; + for i in 0..batch.num_columns() { + let array = batch.column(i); + let filtered_array = filter(array.as_ref(), f)?; + filtered_arrays.push(filtered_array); + } + Ok(Some(RecordBatch::try_new( + batch.schema().clone(), + filtered_arrays, + )?)) + } else { + Err(ExecutionError::InternalError( + "Predicate evaluated to non-boolean value".to_string(), + )) + } + } + None => Ok(None), + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::execution::physical_plan::csv::CsvExec; + use crate::execution::physical_plan::expressions::*; + use crate::execution::physical_plan::ExecutionPlan; + use crate::logicalplan::{Operator, ScalarValue}; + use crate::test; + use std::iter::Iterator; + + #[test] + fn simple_predicate() -> Result<()> { + let schema = test::aggr_test_schema(); + + let partitions = 4; + let path = test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; + + let csv = CsvExec::try_new(&path, schema, true, None, 1024)?; + + let predicate: Arc = binary( + binary(col(1), Operator::Gt, lit(ScalarValue::UInt32(1))), + Operator::And, + binary(col(1), Operator::Lt, lit(ScalarValue::UInt32(4))), + ); + + let selection: Arc = + Arc::new(SelectionExec::try_new(predicate, Arc::new(csv))?); + + let results = test::execute(selection.as_ref())?; + + results + .iter() + .for_each(|batch| assert_eq!(13, batch.num_columns())); + let row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + assert_eq!(41, row_count); + + Ok(()) + } +} diff --git a/rust/datafusion/src/execution/projection.rs b/rust/datafusion/src/execution/projection.rs index 48752ea514c..cd4ea93a2cf 100644 --- a/rust/datafusion/src/execution/projection.rs +++ b/rust/datafusion/src/execution/projection.rs @@ -141,5 +141,4 @@ mod tests { assert_eq!("c1", batch.schema().field(0).name()); } - } diff --git a/rust/datafusion/src/execution/table_impl.rs b/rust/datafusion/src/execution/table_impl.rs index cb2cb1aa0a5..d0879bdb663 100644 --- a/rust/datafusion/src/execution/table_impl.rs +++ b/rust/datafusion/src/execution/table_impl.rs @@ -315,5 +315,4 @@ mod tests { true, ); } - } diff --git a/rust/datafusion/src/logicalplan.rs b/rust/datafusion/src/logicalplan.rs index 87c1694c4bc..c251abb4d6e 100644 --- a/rust/datafusion/src/logicalplan.rs +++ b/rust/datafusion/src/logicalplan.rs @@ -641,5 +641,4 @@ mod tests { println!("plan: {:?}", plan1); }); } - } diff --git a/rust/datafusion/src/sql/planner.rs b/rust/datafusion/src/sql/planner.rs index 7af24d7ba53..a73afdd62ea 100644 --- a/rust/datafusion/src/sql/planner.rs +++ b/rust/datafusion/src/sql/planner.rs @@ -569,5 +569,4 @@ mod tests { } } } - } diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs new file mode 100644 index 00000000000..3a4a7864cbf --- /dev/null +++ b/rust/parquet/src/arrow/array_reader.rs @@ -0,0 +1,1044 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::cmp::min; +use std::collections::{HashMap, HashSet}; +use std::marker::PhantomData; +use std::mem::size_of; +use std::mem::transmute; +use std::rc::Rc; +use std::result::Result::Ok; +use std::slice::from_raw_parts_mut; +use std::sync::Arc; +use std::vec::Vec; + +use arrow::array::{ + ArrayDataBuilder, ArrayDataRef, ArrayRef, BooleanBufferBuilder, BufferBuilderTrait, + Int16BufferBuilder, StructArray, +}; +use arrow::buffer::{Buffer, MutableBuffer}; +use arrow::datatypes::{DataType as ArrowType, Field}; + +use crate::arrow::converter::{ + BooleanConverter, Converter, Float32Converter, Float64Converter, Int16Converter, + Int32Converter, Int64Converter, Int8Converter, UInt16Converter, UInt32Converter, + UInt64Converter, UInt8Converter, +}; +use crate::arrow::record_reader::RecordReader; +use crate::arrow::schema::parquet_to_arrow_field; +use crate::basic::{Repetition, Type as PhysicalType}; +use crate::column::page::PageIterator; +use crate::data_type::{ + BoolType, ByteArrayType, DataType, DoubleType, FloatType, Int32Type, Int64Type, + Int96Type, +}; +use crate::errors::{ParquetError, ParquetError::ArrowError, Result}; +use crate::file::reader::{FilePageIterator, FileReader}; +use crate::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, Type, TypePtr, +}; +use crate::schema::visitor::TypeVisitor; + +/// Array reader reads parquet data into arrow array. +pub trait ArrayReader { + /// Returns the arrow type of this array reader. + fn get_data_type(&self) -> &ArrowType; + + /// Reads at most `batch_size` records into an arrow array and return it. + fn next_batch(&mut self, batch_size: usize) -> Result; + + /// Returns the definition levels of data from last call of `next_batch`. + /// The result is used by parent array reader to calculate its own definition + /// levels and repetition levels, so that its parent can calculate null bitmap. + fn get_def_levels(&self) -> Option<&[i16]>; + + /// Return the repetition levels of data from last call of `next_batch`. + /// The result is used by parent array reader to calculate its own definition + /// levels and repetition levels, so that its parent can calculate null bitmap. + fn get_rep_levels(&self) -> Option<&[i16]>; +} + +/// Primitive array readers are leaves of array reader tree. They accept page iterator +/// and read them into primitive arrays. +pub struct PrimitiveArrayReader { + data_type: ArrowType, + pages: Box, + def_levels_buffer: Option, + rep_levels_buffer: Option, + column_desc: ColumnDescPtr, + record_reader: RecordReader, + _type_marker: PhantomData, +} + +impl PrimitiveArrayReader { + /// Construct primitive array reader. + pub fn new( + mut pages: Box, + column_desc: ColumnDescPtr, + ) -> Result { + let data_type = parquet_to_arrow_field(column_desc.clone())? 
+ .data_type() + .clone(); + + let mut record_reader = RecordReader::::new(column_desc.clone()); + record_reader.set_page_reader( + pages + .next() + .ok_or_else(|| general_err!("Can't build array without pages!"))??, + )?; + + Ok(Self { + data_type, + pages, + def_levels_buffer: None, + rep_levels_buffer: None, + column_desc, + record_reader, + _type_marker: PhantomData, + }) + } +} + +/// Implementation of primitive array reader. +impl ArrayReader for PrimitiveArrayReader { + /// Returns data type of primitive array. + fn get_data_type(&self) -> &ArrowType { + &self.data_type + } + + /// Reads at most `batch_size` records into array. + fn next_batch(&mut self, batch_size: usize) -> Result { + let mut records_read = 0usize; + while records_read < batch_size { + let records_to_read = batch_size - records_read; + + let records_read_once = self.record_reader.read_records(records_to_read)?; + records_read = records_read + records_read_once; + + // Record reader exhausted + if records_read_once < records_to_read { + if let Some(page_reader) = self.pages.next() { + // Read from new page reader + self.record_reader.set_page_reader(page_reader?)?; + } else { + // Page reader also exhausted + break; + } + } + } + + // convert to arrays + let array = match (&self.data_type, T::get_physical_type()) { + (ArrowType::Boolean, PhysicalType::BOOLEAN) => unsafe { + BooleanConverter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::Int8, PhysicalType::INT32) => unsafe { + Int8Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::Int16, PhysicalType::INT32) => unsafe { + Int16Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::Int32, PhysicalType::INT32) => unsafe { + Int32Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::UInt8, PhysicalType::INT32) => unsafe { + UInt8Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::UInt16, PhysicalType::INT32) => unsafe { + UInt16Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::UInt32, PhysicalType::INT32) => unsafe { + UInt32Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::Int64, PhysicalType::INT64) => unsafe { + Int64Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::UInt64, PhysicalType::INT64) => unsafe { + UInt64Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::Float32, PhysicalType::FLOAT) => unsafe { + Float32Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (ArrowType::Float64, PhysicalType::DOUBLE) => unsafe { + Float64Converter::convert(transmute::< + &mut RecordReader, + &mut RecordReader, + >(&mut self.record_reader)) + }, + (arrow_type, _) => Err(general_err!( + "Reading {:?} type from parquet is not supported yet.", + arrow_type + )), + }?; + + // save definition and repetition buffers + self.def_levels_buffer = self.record_reader.consume_def_levels()?; + self.rep_levels_buffer = self.record_reader.consume_rep_levels()?; + self.record_reader.reset(); + Ok(array) + } + 
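    // Note on the conversions above: each (ArrowType, PhysicalType) arm is only
    // reachable when `T` already is the matching Parquet physical type (for
    // example, Int32Type backs all of the INT32-based Arrow integer types), so
    // the `transmute` does not reinterpret any buffered data; it only renames
    // the generic parameter so the corresponding converter accepts the reader.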
+ fn get_def_levels(&self) -> Option<&[i16]> { + self.def_levels_buffer.as_ref().map(|buf| buf.typed_data()) + } + + fn get_rep_levels(&self) -> Option<&[i16]> { + self.rep_levels_buffer.as_ref().map(|buf| buf.typed_data()) + } +} + +/// Implementation of struct array reader. +struct StructArrayReader { + children: Vec>, + data_type: ArrowType, + struct_def_level: i16, + struct_rep_level: i16, + def_level_buffer: Option, + rep_level_buffer: Option, +} + +impl StructArrayReader { + /// Construct struct array reader. + pub fn new( + data_type: ArrowType, + children: Vec>, + def_level: i16, + rep_level: i16, + ) -> Self { + Self { + data_type, + children, + struct_def_level: def_level, + struct_rep_level: rep_level, + def_level_buffer: None, + rep_level_buffer: None, + } + } +} + +impl ArrayReader for StructArrayReader { + /// Returns data type. + /// This must be a struct. + fn get_data_type(&self) -> &ArrowType { + &self.data_type + } + + /// Read `batch_size` struct records. + /// + /// Definition levels of struct array is calculated as following: + /// ```ignore + /// def_levels[i] = min(child1_def_levels[i], child2_def_levels[i], ..., + /// childn_def_levels[i]); + /// ``` + /// + /// Repetition levels of struct array is calculated as following: + /// ```ignore + /// rep_levels[i] = child1_rep_levels[i]; + /// ``` + /// + /// The null bitmap of struct array is calculated from def_levels: + /// ```ignore + /// null_bitmap[i] = (def_levels[i] >= self.def_level); + /// ``` + fn next_batch(&mut self, batch_size: usize) -> Result { + if self.children.len() == 0 { + self.def_level_buffer = None; + self.rep_level_buffer = None; + return Ok(Arc::new(StructArray::from(Vec::new()))); + } + + let children_array = self + .children + .iter_mut() + .map(|reader| reader.next_batch(batch_size)) + .try_fold( + Vec::new(), + |mut result, child_array| -> Result> { + result.push(child_array?); + Ok(result) + }, + )?; + + // check that array child data has same size + let children_array_len = + children_array.first().map(|arr| arr.len()).ok_or_else(|| { + general_err!("Struct array reader should have at least one child!") + })?; + + let all_children_len_eq = children_array + .iter() + .all(|arr| arr.len() == children_array_len); + if !all_children_len_eq { + return Err(general_err!("Not all children array length are the same!")); + } + + // calculate struct def level data + let buffer_size = children_array_len * size_of::(); + let mut def_level_data_buffer = MutableBuffer::new(buffer_size); + def_level_data_buffer.resize(buffer_size)?; + + let def_level_data = unsafe { + let ptr = transmute::<*const u8, *mut i16>(def_level_data_buffer.raw_data()); + from_raw_parts_mut(ptr, children_array_len) + }; + + def_level_data + .iter_mut() + .for_each(|v| *v = self.struct_def_level); + + for child in &self.children { + if let Some(current_child_def_levels) = child.get_def_levels() { + if current_child_def_levels.len() != children_array_len { + return Err(general_err!("Child array length are not equal!")); + } else { + for i in 0..children_array_len { + def_level_data[i] = + min(def_level_data[i], current_child_def_levels[i]); + } + } + } + } + + // calculate bitmap for current array + let mut bitmap_builder = BooleanBufferBuilder::new(children_array_len); + let mut null_count = 0; + for def_level in def_level_data { + let not_null = *def_level >= self.struct_def_level; + if !not_null { + null_count += 1; + } + bitmap_builder.append(not_null)?; + } + + // Now we can build array data + let array_data = 
ArrayDataBuilder::new(self.data_type.clone()) + .len(children_array_len) + .null_count(null_count) + .null_bit_buffer(bitmap_builder.finish()) + .child_data( + children_array + .iter() + .map(|x| x.data()) + .collect::>(), + ) + .build(); + + // calculate struct rep level data, since struct doesn't add to repetition + // levels, here we just need to keep repetition levels of first array + // TODO: Verify that all children array reader has same repetition levels + let rep_level_data = self + .children + .first() + .ok_or_else(|| { + general_err!("Struct array reader should have at least one child!") + })? + .get_rep_levels() + .map(|data| -> Result { + let mut buffer = Int16BufferBuilder::new(children_array_len); + buffer.append_slice(data)?; + Ok(buffer.finish()) + }) + .transpose()?; + + self.def_level_buffer = Some(def_level_data_buffer.freeze()); + self.rep_level_buffer = rep_level_data; + Ok(Arc::new(StructArray::from(array_data))) + } + + fn get_def_levels(&self) -> Option<&[i16]> { + self.def_level_buffer.as_ref().map(|buf| buf.typed_data()) + } + + fn get_rep_levels(&self) -> Option<&[i16]> { + self.rep_level_buffer.as_ref().map(|buf| buf.typed_data()) + } +} + +/// Create array reader from parquet schema, column indices, and parquet file reader. +pub fn build_array_reader( + parquet_schema: SchemaDescPtr, + column_indices: T, + file_reader: Rc, +) -> Result> +where + T: IntoIterator, +{ + let mut base_nodes = Vec::new(); + let mut base_nodes_set = HashSet::new(); + let mut leaves = HashMap::<*const Type, usize>::new(); + + for c in column_indices { + let column = parquet_schema.column(c).self_type() as *const Type; + let root = parquet_schema.get_column_root_ptr(c); + let root_raw_ptr = root.clone().as_ref() as *const Type; + + leaves.insert(column, c); + if !base_nodes_set.contains(&root_raw_ptr) { + base_nodes.push(root); + base_nodes_set.insert(root_raw_ptr); + } + } + + if leaves.is_empty() { + return Err(general_err!("Can't build array reader without columns!")); + } + + ArrayReaderBuilder::new( + Rc::new(parquet_schema.root_schema().clone()), + Rc::new(leaves), + file_reader, + ) + .build_array_reader() +} + +/// Used to build array reader. +struct ArrayReaderBuilder { + root_schema: TypePtr, + // Key: columns that need to be included in final array builder + // Value: column index in schema + columns_included: Rc>, + file_reader: Rc, +} + +/// Used in type visitor. +#[derive(Clone)] +struct ArrayReaderBuilderContext { + def_level: i16, + rep_level: i16, + path: ColumnPath, +} + +impl Default for ArrayReaderBuilderContext { + fn default() -> Self { + Self { + def_level: 0i16, + rep_level: 0i16, + path: ColumnPath::new(Vec::new()), + } + } +} + +/// Create array reader by visiting schema. +impl<'a> TypeVisitor>, &'a ArrayReaderBuilderContext> + for ArrayReaderBuilder +{ + /// Build array reader for primitive type. + /// Currently we don't have a list reader implementation, so repeated type is not + /// supported yet. 
+ fn visit_primitive( + &mut self, + cur_type: TypePtr, + context: &'a ArrayReaderBuilderContext, + ) -> Result>> { + if self.is_included(cur_type.as_ref()) { + let mut new_context = context.clone(); + new_context.path.append(vec![cur_type.name().to_string()]); + + match cur_type.get_basic_info().repetition() { + Repetition::REPEATED => { + new_context.def_level += 1; + new_context.rep_level += 1; + } + Repetition::OPTIONAL => { + new_context.def_level += 1; + } + _ => (), + } + + let reader = + self.build_for_primitive_type_inner(cur_type.clone(), &new_context)?; + + if cur_type.get_basic_info().repetition() == Repetition::REPEATED { + Err(ArrowError( + "Reading repeated field is not supported yet!".to_string(), + )) + } else { + Ok(Some(reader)) + } + } else { + Ok(None) + } + } + + /// Build array reader for struct type. + fn visit_struct( + &mut self, + cur_type: Rc, + context: &'a ArrayReaderBuilderContext, + ) -> Result>> { + let mut new_context = context.clone(); + new_context.path.append(vec![cur_type.name().to_string()]); + + if cur_type.get_basic_info().has_repetition() { + match cur_type.get_basic_info().repetition() { + Repetition::REPEATED => { + new_context.def_level += 1; + new_context.rep_level += 1; + } + Repetition::OPTIONAL => { + new_context.def_level += 1; + } + _ => (), + } + } + + if let Some(reader) = + self.build_for_struct_type_inner(cur_type.clone(), &new_context)? + { + if cur_type.get_basic_info().has_repetition() + && cur_type.get_basic_info().repetition() == Repetition::REPEATED + { + Err(ArrowError( + "Reading repeated field is not supported yet!".to_string(), + )) + } else { + Ok(Some(reader)) + } + } else { + Ok(None) + } + } + + /// Build array reader for map type. + /// Currently this is not supported. + fn visit_map( + &mut self, + _cur_type: Rc, + _context: &'a ArrayReaderBuilderContext, + ) -> Result>> { + Err(ArrowError( + "Reading parquet map array into arrow is not supported yet!".to_string(), + )) + } + + /// Build array reader for list type. + /// Currently this is not supported. + fn visit_list_with_item( + &mut self, + _list_type: Rc, + _item_type: &Type, + _context: &'a ArrayReaderBuilderContext, + ) -> Result>> { + Err(ArrowError( + "Reading parquet list array into arrow is not supported yet!".to_string(), + )) + } +} + +impl<'a> ArrayReaderBuilder { + /// Construct array reader builder. + fn new( + root_schema: TypePtr, + columns_included: Rc>, + file_reader: Rc, + ) -> Self { + Self { + root_schema, + columns_included, + file_reader, + } + } + + /// Main entry point. + fn build_array_reader(&mut self) -> Result> { + let context = ArrayReaderBuilderContext::default(); + + self.visit_struct(self.root_schema.clone(), &context) + .and_then(|reader_opt| { + reader_opt.ok_or_else(|| general_err!("Failed to build array reader!")) + }) + } + + // Utility functions + + /// Check whether one column in included in this array reader builder. + fn is_included(&self, t: &Type) -> bool { + self.columns_included.contains_key(&(t as *const Type)) + } + + /// Creates primitive array reader for each primitive type. 
+ fn build_for_primitive_type_inner( + &self, + cur_type: TypePtr, + context: &'a ArrayReaderBuilderContext, + ) -> Result> { + let column_desc = Rc::new(ColumnDescriptor::new( + cur_type.clone(), + Some(self.root_schema.clone()), + context.def_level, + context.rep_level, + context.path.clone(), + )); + let page_iterator = Box::new(FilePageIterator::new( + self.columns_included[&(cur_type.as_ref() as *const Type)], + self.file_reader.clone(), + )?); + + match cur_type.get_physical_type() { + PhysicalType::BOOLEAN => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + )?)), + PhysicalType::INT32 => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + )?)), + PhysicalType::INT64 => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + )?)), + PhysicalType::INT96 => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + )?)), + PhysicalType::FLOAT => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + )?)), + PhysicalType::DOUBLE => Ok(Box::new( + PrimitiveArrayReader::::new(page_iterator, column_desc)?, + )), + PhysicalType::BYTE_ARRAY => Ok(Box::new(PrimitiveArrayReader::< + ByteArrayType, + >::new( + page_iterator, column_desc + )?)), + other => Err(ArrowError(format!( + "Unable to create primite array reader for parquet physical type {}", + other + ))), + } + } + + /// Constructs struct array reader without considering repetition. + fn build_for_struct_type_inner( + &mut self, + cur_type: TypePtr, + context: &'a ArrayReaderBuilderContext, + ) -> Result>> { + let mut fields = Vec::with_capacity(cur_type.get_fields().len()); + let mut children_reader = Vec::with_capacity(cur_type.get_fields().len()); + + for child in cur_type.get_fields() { + if let Some(child_reader) = self.dispatch(child.clone(), context)? 
{ + fields.push(Field::new( + child.name(), + child_reader.get_data_type().clone(), + child.is_optional(), + )); + children_reader.push(child_reader); + } + } + + if !fields.is_empty() { + let arrow_type = ArrowType::Struct(fields); + Ok(Some(Box::new(StructArrayReader::new( + arrow_type, + children_reader, + context.def_level, + context.rep_level, + )))) + } else { + Ok(None) + } + } +} + +#[cfg(test)] +mod tests { + use crate::arrow::array_reader::{ + build_array_reader, ArrayReader, PrimitiveArrayReader, StructArrayReader, + }; + use crate::basic::Encoding; + use crate::column::page::Page; + use crate::data_type::{DataType, Int32Type}; + use crate::errors::Result; + use crate::file::reader::{FileReader, SerializedFileReader}; + use crate::schema::parser::parse_message_type; + use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; + use crate::util::test_common::page_util::InMemoryPageIterator; + use crate::util::test_common::{get_test_file, make_pages}; + use arrow::array::{Array, ArrayRef, PrimitiveArray, StructArray}; + use arrow::datatypes::{DataType as ArrowType, Field, Int32Type as ArrowInt32}; + use rand::distributions::range::SampleRange; + use std::collections::VecDeque; + use std::rc::Rc; + use std::sync::Arc; + + fn make_column_chuncks( + column_desc: ColumnDescPtr, + encoding: Encoding, + num_levels: usize, + min_value: T::T, + max_value: T::T, + def_levels: &mut Vec, + rep_levels: &mut Vec, + values: &mut Vec, + page_lists: &mut Vec>, + use_v2: bool, + num_chuncks: usize, + ) where + T::T: PartialOrd + SampleRange + Copy, + { + for _i in 0..num_chuncks { + let mut pages = VecDeque::new(); + let mut data = Vec::new(); + let mut page_def_levels = Vec::new(); + let mut page_rep_levels = Vec::new(); + + make_pages::( + column_desc.clone(), + encoding, + 1, + num_levels, + min_value, + max_value, + &mut page_def_levels, + &mut page_rep_levels, + &mut data, + &mut pages, + use_v2, + ); + + def_levels.append(&mut page_def_levels); + rep_levels.append(&mut page_rep_levels); + values.append(&mut data); + page_lists.push(Vec::from(pages)); + } + } + + #[test] + fn test_primitive_array_reader_data() { + // Construct column schema + let message_type = " + message test_schema { + REQUIRED INT32 leaf; + } + "; + + let schema = parse_message_type(message_type) + .map(|t| Rc::new(SchemaDescriptor::new(Rc::new(t)))) + .unwrap(); + + let column_desc = schema.column(0); + + // Construct page iterator + { + let mut data = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chuncks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + 1, + 200, + &mut Vec::new(), + &mut Vec::new(), + &mut data, + &mut page_lists, + true, + 2, + ); + let page_iterator = InMemoryPageIterator::new( + schema.clone(), + column_desc.clone(), + page_lists, + ); + + let mut array_reader = PrimitiveArrayReader::::new( + Box::new(page_iterator), + column_desc.clone(), + ) + .unwrap(); + + // Read first 50 values, which are all from the first column chunck + let array = array_reader.next_batch(50).unwrap(); + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!( + &PrimitiveArray::::from( + data[0..50].iter().cloned().collect::>() + ), + array + ); + + // Read next 100 values, the first 50 ones are from the first column chunk, + // and the last 50 ones are from the second column chunk + let array = array_reader.next_batch(100).unwrap(); + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!( + &PrimitiveArray::::from( + 
data[50..150].iter().cloned().collect::>() + ), + array + ); + + // Try to read 100 values, however there are only 50 values + let array = array_reader.next_batch(100).unwrap(); + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!( + &PrimitiveArray::::from( + data[150..200].iter().cloned().collect::>() + ), + array + ); + } + } + + #[test] + fn test_primitive_array_reader_def_and_rep_levels() { + // Construct column schema + let message_type = " + message test_schema { + REPEATED Group test_mid { + OPTIONAL INT32 leaf; + } + } + "; + + let schema = parse_message_type(message_type) + .map(|t| Rc::new(SchemaDescriptor::new(Rc::new(t)))) + .unwrap(); + + let column_desc = schema.column(0); + + // Construct page iterator + { + let mut def_levels = Vec::new(); + let mut rep_levels = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chuncks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + 1, + 200, + &mut def_levels, + &mut rep_levels, + &mut Vec::new(), + &mut page_lists, + true, + 2, + ); + + let page_iterator = InMemoryPageIterator::new( + schema.clone(), + column_desc.clone(), + page_lists, + ); + + let mut array_reader = PrimitiveArrayReader::::new( + Box::new(page_iterator), + column_desc.clone(), + ) + .unwrap(); + + let mut accu_len: usize = 0; + + // Read first 50 values, which are all from the first column chunck + let array = array_reader.next_batch(50).unwrap(); + assert_eq!( + Some(&def_levels[accu_len..(accu_len + array.len())]), + array_reader.get_def_levels() + ); + assert_eq!( + Some(&rep_levels[accu_len..(accu_len + array.len())]), + array_reader.get_rep_levels() + ); + accu_len += array.len(); + + // Read next 100 values, the first 50 ones are from the first column chunk, + // and the last 50 ones are from the second column chunk + let array = array_reader.next_batch(100).unwrap(); + assert_eq!( + Some(&def_levels[accu_len..(accu_len + array.len())]), + array_reader.get_def_levels() + ); + assert_eq!( + Some(&rep_levels[accu_len..(accu_len + array.len())]), + array_reader.get_rep_levels() + ); + accu_len += array.len(); + + // Try to read 100 values, however there are only 50 values + let array = array_reader.next_batch(100).unwrap(); + assert_eq!( + Some(&def_levels[accu_len..(accu_len + array.len())]), + array_reader.get_def_levels() + ); + assert_eq!( + Some(&rep_levels[accu_len..(accu_len + array.len())]), + array_reader.get_rep_levels() + ); + } + } + + /// Array reader for test. 
+ struct InMemoryArrayReader { + data_type: ArrowType, + array: ArrayRef, + def_levels: Option>, + rep_levels: Option>, + } + + impl InMemoryArrayReader { + pub fn new( + data_type: ArrowType, + array: ArrayRef, + def_levels: Option>, + rep_levels: Option>, + ) -> Self { + Self { + data_type, + array, + def_levels, + rep_levels, + } + } + } + + impl ArrayReader for InMemoryArrayReader { + fn get_data_type(&self) -> &ArrowType { + &self.data_type + } + + fn next_batch(&mut self, _batch_size: usize) -> Result { + Ok(self.array.clone()) + } + + fn get_def_levels(&self) -> Option<&[i16]> { + self.def_levels.as_ref().map(|v| v.as_slice()) + } + + fn get_rep_levels(&self) -> Option<&[i16]> { + self.rep_levels.as_ref().map(|v| v.as_slice()) + } + } + + #[test] + fn test_struct_array_reader() { + let array_1 = Arc::new(PrimitiveArray::::from(vec![1, 2, 3, 4, 5])); + let array_reader_1 = InMemoryArrayReader::new( + ArrowType::Int32, + array_1.clone(), + Some(vec![0, 1, 2, 3, 1]), + Some(vec![1, 1, 1, 1, 1]), + ); + + let array_2 = Arc::new(PrimitiveArray::::from(vec![5, 4, 3, 2, 1])); + let array_reader_2 = InMemoryArrayReader::new( + ArrowType::Int32, + array_2.clone(), + Some(vec![0, 1, 3, 1, 2]), + Some(vec![1, 1, 1, 1, 1]), + ); + + let struct_type = ArrowType::Struct(vec![ + Field::new("f1", array_1.data_type().clone(), true), + Field::new("f2", array_2.data_type().clone(), true), + ]); + + let mut struct_array_reader = StructArrayReader::new( + struct_type, + vec![Box::new(array_reader_1), Box::new(array_reader_2)], + 1, + 1, + ); + + let struct_array = struct_array_reader.next_batch(5).unwrap(); + let struct_array = struct_array.as_any().downcast_ref::().unwrap(); + + assert_eq!(5, struct_array.len()); + assert_eq!( + vec![true, false, false, false, false], + (0..5) + .map(|idx| struct_array.data_ref().is_null(idx)) + .collect::>() + ); + assert_eq!( + Some(vec![0, 1, 1, 1, 1].as_slice()), + struct_array_reader.get_def_levels() + ); + assert_eq!( + Some(vec![1, 1, 1, 1, 1].as_slice()), + struct_array_reader.get_rep_levels() + ); + } + + #[test] + fn test_create_array_reader() { + let file = get_test_file("nulls.snappy.parquet"); + let file_reader = Rc::new(SerializedFileReader::new(file).unwrap()); + + let array_reader = build_array_reader( + file_reader.metadata().file_metadata().schema_descr_ptr(), + vec![0usize].into_iter(), + file_reader, + ) + .unwrap(); + + // Create arrow types + let arrow_type = ArrowType::Struct(vec![Field::new( + "b_struct", + ArrowType::Struct(vec![Field::new("b_c_int", ArrowType::Int32, true)]), + true, + )]); + + assert_eq!(array_reader.get_data_type(), &arrow_type); + } +} diff --git a/rust/parquet/src/arrow/converter.rs b/rust/parquet/src/arrow/converter.rs index 263e78a7fe6..6056271a759 100644 --- a/rust/parquet/src/arrow/converter.rs +++ b/rust/parquet/src/arrow/converter.rs @@ -67,9 +67,9 @@ where let mut array_data = ArrayDataBuilder::new(ArrowSourceType::get_data_type()) .len(record_reader.num_values()) - .add_buffer(record_data); + .add_buffer(record_data?); - if let Some(b) = record_reader.consume_bitmap_buffer() { + if let Some(b) = record_reader.consume_bitmap_buffer()? { array_data = array_data.null_bit_buffer(b); } diff --git a/rust/parquet/src/arrow/mod.rs b/rust/parquet/src/arrow/mod.rs index af1d00c91b2..a2c6031cfb8 100644 --- a/rust/parquet/src/arrow/mod.rs +++ b/rust/parquet/src/arrow/mod.rs @@ -20,6 +20,7 @@ //! //! This mod provides API for converting between arrow and parquet. 
+pub(in crate::arrow) mod array_reader; pub(in crate::arrow) mod converter; pub(in crate::arrow) mod record_reader; pub mod schema; diff --git a/rust/parquet/src/arrow/record_reader.rs b/rust/parquet/src/arrow/record_reader.rs index 803f4a0d2f5..de42ae7f953 100644 --- a/rust/parquet/src/arrow/record_reader.rs +++ b/rust/parquet/src/arrow/record_reader.rs @@ -16,9 +16,9 @@ // under the License. use std::cmp::{max, min}; -use std::mem::replace; use std::mem::size_of; use std::mem::transmute; +use std::mem::{replace, swap}; use std::slice; use crate::column::{page::PageReader, reader::ColumnReaderImpl}; @@ -187,46 +187,138 @@ impl RecordReader { } /// Returns definition level data. - pub fn consume_def_levels(&mut self) -> Option { - let empty_def_buffer = if self.column_desc.max_def_level() > 0 { - Some(MutableBuffer::new(MIN_BATCH_SIZE)) + /// The implementation has side effects. It will create a new buffer to hold those + /// definition level values that have already been read into memory but not counted + /// as record values, e.g. those from `self.num_values` to `self.values_written`. + pub fn consume_def_levels(&mut self) -> Result> { + let new_buffer = if let Some(ref mut def_levels_buf) = &mut self.def_levels { + let num_left_values = self.values_written - self.num_values; + let mut new_buffer = MutableBuffer::new( + size_of::() * max(MIN_BATCH_SIZE, num_left_values), + ); + new_buffer.resize(num_left_values * size_of::())?; + + let new_def_levels = FatPtr::::with_offset(&new_buffer, 0); + let new_def_levels = new_def_levels.to_slice_mut(); + let left_def_levels = + FatPtr::::with_offset(&def_levels_buf, self.num_values); + let left_def_levels = left_def_levels.to_slice(); + + new_def_levels[0..num_left_values] + .copy_from_slice(&left_def_levels[0..num_left_values]); + + def_levels_buf.resize(self.num_values * size_of::())?; + Some(new_buffer) } else { None }; - replace(&mut self.def_levels, empty_def_buffer).map(|x| x.freeze()) + Ok(replace(&mut self.def_levels, new_buffer).map(|x| x.freeze())) } - /// Return repetition level data - pub fn consume_rep_levels(&mut self) -> Option { - let empty_def_buffer = if self.column_desc.max_rep_level() > 0 { - Some(MutableBuffer::new(MIN_BATCH_SIZE)) + /// Return repetition level data. + /// The side effect is similar to `consume_def_levels`. + pub fn consume_rep_levels(&mut self) -> Result> { + // TODO: Optimize to reduce the copy + let new_buffer = if let Some(ref mut rep_levels_buf) = &mut self.rep_levels { + let num_left_values = self.values_written - self.num_values; + let mut new_buffer = MutableBuffer::new( + size_of::() * max(MIN_BATCH_SIZE, num_left_values), + ); + new_buffer.resize(num_left_values * size_of::())?; + + let new_rep_levels = FatPtr::::with_offset(&new_buffer, 0); + let new_rep_levels = new_rep_levels.to_slice_mut(); + let left_rep_levels = + FatPtr::::with_offset(&rep_levels_buf, self.num_values); + let left_rep_levels = left_rep_levels.to_slice(); + + new_rep_levels[0..num_left_values] + .copy_from_slice(&left_rep_levels[0..num_left_values]); + + rep_levels_buf.resize(self.num_values * size_of::())?; + + Some(new_buffer) } else { None }; - replace(&mut self.rep_levels, empty_def_buffer).map(|x| x.freeze()) + Ok(replace(&mut self.rep_levels, new_buffer).map(|x| x.freeze())) } /// Returns currently stored buffer data. - pub fn consume_record_data(&mut self) -> Buffer { - replace(&mut self.records, MutableBuffer::new(MIN_BATCH_SIZE)).freeze() + /// The side effect is similar to `consume_def_levels`. 
+ pub fn consume_record_data(&mut self) -> Result { + // TODO: Optimize to reduce the copy + let num_left_values = self.values_written - self.num_values; + let mut new_buffer = MutableBuffer::new(max(MIN_BATCH_SIZE, num_left_values)); + new_buffer.resize(num_left_values * T::get_type_size())?; + + let new_records = + FatPtr::::with_offset_and_size(&new_buffer, 0, T::get_type_size()); + let new_records = new_records.to_slice_mut(); + let left_records = FatPtr::::with_offset_and_size( + &self.records, + self.num_values, + T::get_type_size(), + ); + let left_records = left_records.to_slice_mut(); + + for idx in 0..num_left_values { + swap(&mut new_records[idx], &mut left_records[idx]); + } + + self.records.resize(self.num_values * T::get_type_size())?; + + Ok(replace(&mut self.records, new_buffer).freeze()) } - pub fn consume_bitmap_buffer(&mut self) -> Option { - let bitmap_builder = if self.column_desc.max_def_level() > 0 { - Some(BooleanBufferBuilder::new(MIN_BATCH_SIZE)) + /// Returns currently stored null bitmap data. + /// The side effect is similar to `consume_def_levels`. + pub fn consume_bitmap_buffer(&mut self) -> Result> { + // TODO: Optimize to reduce the copy + if self.column_desc.max_def_level() > 0 { + assert!(self.null_bitmap.is_some()); + let num_left_values = self.values_written - self.num_values; + let new_bitmap_builder = Some(BooleanBufferBuilder::new(max( + MIN_BATCH_SIZE, + num_left_values, + ))); + + let old_bitmap = replace(&mut self.null_bitmap, new_bitmap_builder) + .map(|mut builder| builder.finish()) + .unwrap(); + + let old_bitmap = Bitmap::from(old_bitmap); + + for i in self.num_values..self.values_written { + self.null_bitmap + .as_mut() + .unwrap() + .append(old_bitmap.is_set(i))?; + } + + Ok(Some(old_bitmap.to_buffer())) } else { - None - }; + Ok(None) + } + } - replace(&mut self.null_bitmap, bitmap_builder).map(|mut builder| builder.finish()) + /// Reset state of record reader. + /// Should be called after consuming data, e.g. `consume_rep_levels`, + /// `consume_rep_levels`, `consume_record_data` and `consume_bitmap_buffer`. + pub fn reset(&mut self) { + self.values_written = self.values_written - self.num_values; + self.num_records = 0; + self.num_values = 0; + self.values_seen = 0; + self.in_middle_of_record = false; } /// Returns bitmap data. - pub fn consume_bitmap(&mut self) -> Option { + pub fn consume_bitmap(&mut self) -> Result> { self.consume_bitmap_buffer() - .map(|buffer| Bitmap::from(buffer)) + .map(|buffer| buffer.map(|b| Bitmap::from(b))) } /// Try to read one batch of data. 
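The `consume_*` methods now return `Result` and copy out only the values belonging to complete records, keeping any trailing values buffered for the next batch; `reset()` must then be called so the internal counters restart from those carried-over values. A minimal crate-internal sketch of that call sequence follows (the helper name `drain_batch` and its caller-supplied reader are illustrative, not part of this patch):

```rust
// Illustrative only: assumes a RecordReader<Int32Type> that already has a page
// reader attached via set_page_reader. This has to live inside the crate
// because the record_reader module is pub(in crate::arrow).
use arrow::buffer::Buffer;

use crate::arrow::record_reader::RecordReader;
use crate::data_type::Int32Type;
use crate::errors::Result;

fn drain_batch(
    reader: &mut RecordReader<Int32Type>,
    batch_size: usize,
) -> Result<(usize, Buffer, Option<Buffer>)> {
    // Read up to batch_size complete records; values read past the last
    // complete record stay buffered inside the reader.
    let records_read = reader.read_records(batch_size)?;

    // Consume only the finished records' values and definition levels.
    let values = reader.consume_record_data()?;
    let def_levels = reader.consume_def_levels()?;

    // Required after consuming: rewinds the counters so the leftover values
    // become the start of the next batch.
    reader.reset();

    Ok((records_read, values, def_levels))
}
```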
@@ -488,9 +580,12 @@ mod tests { let mut bb = Int32BufferBuilder::new(7); bb.append_slice(&[4, 7, 6, 3, 2, 8, 9]).unwrap(); let expected_buffer = bb.finish(); - assert_eq!(expected_buffer, record_reader.consume_record_data()); - assert_eq!(None, record_reader.consume_def_levels()); - assert_eq!(None, record_reader.consume_bitmap()); + assert_eq!( + expected_buffer, + record_reader.consume_record_data().unwrap() + ); + assert_eq!(None, record_reader.consume_def_levels().unwrap()); + assert_eq!(None, record_reader.consume_bitmap().unwrap()); } #[test] @@ -573,7 +668,10 @@ mod tests { let mut bb = Int32BufferBuilder::new(7); bb.append_slice(&[0, 7, 0, 6, 3, 0, 8]).unwrap(); let expected_buffer = bb.finish(); - assert_eq!(expected_buffer, record_reader.consume_record_data()); + assert_eq!( + expected_buffer, + record_reader.consume_record_data().unwrap() + ); // Verify result def levels let mut bb = Int16BufferBuilder::new(7); @@ -582,7 +680,7 @@ mod tests { let expected_def_levels = bb.finish(); assert_eq!( Some(expected_def_levels), - record_reader.consume_def_levels() + record_reader.consume_def_levels().unwrap() ); // Verify bitmap @@ -590,7 +688,10 @@ mod tests { bb.append_slice(&[false, true, false, true, true, false, true]) .unwrap(); let expected_bitmap = Bitmap::from(bb.finish()); - assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + assert_eq!( + Some(expected_bitmap), + record_reader.consume_bitmap().unwrap() + ); } #[test] @@ -677,7 +778,10 @@ mod tests { let mut bb = Int32BufferBuilder::new(9); bb.append_slice(&[4, 0, 0, 7, 6, 3, 2, 8, 9]).unwrap(); let expected_buffer = bb.finish(); - assert_eq!(expected_buffer, record_reader.consume_record_data()); + assert_eq!( + expected_buffer, + record_reader.consume_record_data().unwrap() + ); // Verify result def levels let mut bb = Int16BufferBuilder::new(9); @@ -686,7 +790,7 @@ mod tests { let expected_def_levels = bb.finish(); assert_eq!( Some(expected_def_levels), - record_reader.consume_def_levels() + record_reader.consume_def_levels().unwrap() ); // Verify bitmap @@ -694,7 +798,10 @@ mod tests { bb.append_slice(&[true, false, false, true, true, true, true, true, true]) .unwrap(); let expected_bitmap = Bitmap::from(bb.finish()); - assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + assert_eq!( + Some(expected_bitmap), + record_reader.consume_bitmap().unwrap() + ); } #[test] diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index 8f7c7a3061a..cc3c26f72cd 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -517,13 +517,8 @@ mod tests { use crate::basic::Type as PhysicalType; use crate::column::page::Page; - use crate::encodings::encoding::{DictEncoder, Encoder}; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::util::{ - memory::MemTracker, - test_common::page_util::{DataPageBuilder, DataPageBuilderImpl}, - test_common::random_numbers_range, - }; + use crate::util::test_common::make_pages; const NUM_LEVELS: usize = 128; const NUM_PAGES: usize = 2; @@ -1383,94 +1378,4 @@ mod tests { Ok(self.pages.next()) } } - - fn make_pages( - desc: ColumnDescPtr, - encoding: Encoding, - num_pages: usize, - levels_per_page: usize, - min: T::T, - max: T::T, - def_levels: &mut Vec, - rep_levels: &mut Vec, - values: &mut Vec, - pages: &mut VecDeque, - use_v2: bool, - ) where - T::T: PartialOrd + SampleRange + Copy, - { - let mut num_values = 0; - let max_def_level = desc.max_def_level(); - let max_rep_level = 
desc.max_rep_level(); - - let mem_tracker = Rc::new(MemTracker::new()); - let mut dict_encoder = DictEncoder::::new(desc.clone(), mem_tracker); - - for i in 0..num_pages { - let mut num_values_cur_page = 0; - let level_range = i * levels_per_page..(i + 1) * levels_per_page; - - if max_def_level > 0 { - random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels); - for dl in &def_levels[level_range.clone()] { - if *dl == max_def_level { - num_values_cur_page += 1; - } - } - } else { - num_values_cur_page = levels_per_page; - } - if max_rep_level > 0 { - random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels); - } - random_numbers_range(num_values_cur_page, min, max, values); - - // Generate the current page - - let mut pb = DataPageBuilderImpl::new( - desc.clone(), - num_values_cur_page as u32, - use_v2, - ); - if max_rep_level > 0 { - pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]); - } - if max_def_level > 0 { - pb.add_def_levels(max_def_level, &def_levels[level_range]); - } - - let value_range = num_values..num_values + num_values_cur_page; - match encoding { - Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { - let _ = dict_encoder.put(&values[value_range.clone()]); - let indices = dict_encoder - .write_indices() - .expect("write_indices() should be OK"); - pb.add_indices(indices); - } - Encoding::PLAIN => { - pb.add_values::(encoding, &values[value_range]); - } - enc @ _ => panic!("Unexpected encoding {}", enc), - } - - let data_page = pb.consume(); - pages.push_back(data_page); - num_values += num_values_cur_page; - } - - if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY - { - let dict = dict_encoder - .write_dict() - .expect("write_dict() should be OK"); - let dict_page = Page::DictionaryPage { - buf: dict, - num_values: dict_encoder.num_entries() as u32, - encoding: Encoding::RLE_DICTIONARY, - is_sorted: false, - }; - pages.push_front(dict_page); - } - } } diff --git a/rust/parquet/src/compression.rs b/rust/parquet/src/compression.rs index bdc9729b155..d29024ed5c8 100644 --- a/rust/parquet/src/compression.rs +++ b/rust/parquet/src/compression.rs @@ -338,5 +338,4 @@ mod tests { fn test_codec_zstd() { test_codec(CodecType::ZSTD); } - } diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs index 08dd2e14e6d..4d4d54969d7 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -164,6 +164,13 @@ impl Type { _ => false, } } + + /// Returns `true` if this type is repeated or optional. + /// If this type doesn't have repetition defined, we still treat it as optional. + pub fn is_optional(&self) -> bool { + self.get_basic_info().has_repetition() + && self.get_basic_info().repetition() != Repetition::REQUIRED + } } /// A builder for primitive types. All attributes are optional @@ -527,6 +534,21 @@ impl ColumnPath { pub fn string(&self) -> String { self.parts.join(".") } + + /// Appends more components to end of column path. 
+ /// ```rust + /// use parquet::schema::types::ColumnPath; + /// + /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c" + /// .to_string()]); + /// assert_eq!(&path.string(), "a.b.c"); + /// + /// path.append(vec!["d".to_string(), "e".to_string()]); + /// assert_eq!(&path.string(), "a.b.c.d.e"); + /// ``` + pub fn append(&mut self, mut tail: Vec) -> () { + self.parts.append(&mut tail); + } } impl fmt::Display for ColumnPath { diff --git a/rust/parquet/src/util/test_common/mod.rs b/rust/parquet/src/util/test_common/mod.rs index c24afdf40ab..79a970e0a82 100644 --- a/rust/parquet/src/util/test_common/mod.rs +++ b/rust/parquet/src/util/test_common/mod.rs @@ -28,3 +28,5 @@ pub use self::rand_gen::RandGen; pub use self::file_util::get_temp_file; pub use self::file_util::get_test_file; pub use self::file_util::get_test_path; + +pub use self::page_util::make_pages; diff --git a/rust/parquet/src/util/test_common/page_util.rs b/rust/parquet/src/util/test_common/page_util.rs index d12b734f2d5..f8316d6f2c4 100644 --- a/rust/parquet/src/util/test_common/page_util.rs +++ b/rust/parquet/src/util/test_common/page_util.rs @@ -16,19 +16,23 @@ // under the License. use crate::basic::Encoding; -use crate::column::page::Page; use crate::column::page::PageReader; +use crate::column::page::{Page, PageIterator}; use crate::data_type::DataType; -use crate::encodings::encoding::{get_encoder, Encoder}; +use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::encodings::levels::max_buffer_size; use crate::encodings::levels::LevelEncoder; use crate::errors::Result; -use crate::schema::types::ColumnDescPtr; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; use crate::util::memory::MemTracker; use crate::util::memory::MemTrackerPtr; +use crate::util::test_common::random_numbers_range; +use rand::distributions::range::SampleRange; +use std::collections::VecDeque; use std::mem; use std::rc::Rc; +use std::vec::IntoIter; pub trait DataPageBuilder { fn add_rep_levels(&mut self, max_level: i16, rep_levels: &[i16]); @@ -176,3 +180,134 @@ impl PageReader for InMemoryPageReader { Ok(self.pages.next()) } } + +/// A utility page iterator which stores page readers in memory, used for tests. 
+pub struct InMemoryPageIterator { + schema: SchemaDescPtr, + column_desc: ColumnDescPtr, + page_readers: IntoIter>, +} + +impl InMemoryPageIterator { + pub fn new( + schema: SchemaDescPtr, + column_desc: ColumnDescPtr, + pages: Vec>, + ) -> Self { + let page_readers = pages + .into_iter() + .map(|pages| Box::new(InMemoryPageReader::new(pages)) as Box) + .collect::>>() + .into_iter(); + + Self { + schema, + column_desc, + page_readers, + } + } +} + +impl Iterator for InMemoryPageIterator { + type Item = Result>; + + fn next(&mut self) -> Option { + self.page_readers.next().map(|page_reader| Ok(page_reader)) + } +} + +impl PageIterator for InMemoryPageIterator { + fn schema(&mut self) -> Result { + Ok(self.schema.clone()) + } + + fn column_schema(&mut self) -> Result { + Ok(self.column_desc.clone()) + } +} + +pub fn make_pages( + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + levels_per_page: usize, + min: T::T, + max: T::T, + def_levels: &mut Vec, + rep_levels: &mut Vec, + values: &mut Vec, + pages: &mut VecDeque, + use_v2: bool, +) where + T::T: PartialOrd + SampleRange + Copy, +{ + let mut num_values = 0; + let max_def_level = desc.max_def_level(); + let max_rep_level = desc.max_rep_level(); + + let mem_tracker = Rc::new(MemTracker::new()); + let mut dict_encoder = DictEncoder::::new(desc.clone(), mem_tracker); + + for i in 0..num_pages { + let mut num_values_cur_page = 0; + let level_range = i * levels_per_page..(i + 1) * levels_per_page; + + if max_def_level > 0 { + random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels); + for dl in &def_levels[level_range.clone()] { + if *dl == max_def_level { + num_values_cur_page += 1; + } + } + } else { + num_values_cur_page = levels_per_page; + } + if max_rep_level > 0 { + random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels); + } + random_numbers_range(num_values_cur_page, min, max, values); + + // Generate the current page + + let mut pb = + DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2); + if max_rep_level > 0 { + pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]); + } + if max_def_level > 0 { + pb.add_def_levels(max_def_level, &def_levels[level_range]); + } + + let value_range = num_values..num_values + num_values_cur_page; + match encoding { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + let _ = dict_encoder.put(&values[value_range.clone()]); + let indices = dict_encoder + .write_indices() + .expect("write_indices() should be OK"); + pb.add_indices(indices); + } + Encoding::PLAIN => { + pb.add_values::(encoding, &values[value_range]); + } + enc @ _ => panic!("Unexpected encoding {}", enc), + } + + let data_page = pb.consume(); + pages.push_back(data_page); + num_values += num_values_cur_page; + } + + if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY { + let dict = dict_encoder + .write_dict() + .expect("write_dict() should be OK"); + let dict_page = Page::DictionaryPage { + buf: dict, + num_values: dict_encoder.num_entries() as u32, + encoding: Encoding::RLE_DICTIONARY, + is_sorted: false, + }; + pages.push_front(dict_page); + } +} diff --git a/rust/rust-toolchain b/rust/rust-toolchain index 781a7d6cb5f..9e495a9f73e 100644 --- a/rust/rust-toolchain +++ b/rust/rust-toolchain @@ -1 +1 @@ -nightly-2019-07-30 \ No newline at end of file +nightly-2019-09-25
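End to end, the new `array_reader` module plus the reworked `RecordReader` let a set of Parquet leaf columns be read into a single Arrow array tree. A minimal sketch of driving it, modeled on `test_create_array_reader` above (the file path and batch size are placeholders, and the code has to sit inside the crate's `arrow` module since `array_reader` is `pub(in crate::arrow)`):

```rust
use std::fs::File;
use std::rc::Rc;

use arrow::array::Array;

use crate::arrow::array_reader::{build_array_reader, ArrayReader};
use crate::errors::Result;
use crate::file::reader::{FileReader, SerializedFileReader};

fn read_first_column(path: &str) -> Result<()> {
    // `path` is a placeholder; any Parquet file with at least one leaf column works.
    let file = File::open(path).expect("parquet file should exist");
    let file_reader = Rc::new(SerializedFileReader::new(file)?);
    let parquet_schema = file_reader.metadata().file_metadata().schema_descr_ptr();

    // Build a reader over leaf column 0; the root of the returned reader tree
    // is a StructArrayReader wrapping the primitive reader for that column.
    let mut array_reader =
        build_array_reader(parquet_schema, vec![0usize].into_iter(), file_reader)?;

    // Each call reads at most `batch_size` records into an Arrow array.
    let batch = array_reader.next_batch(1024)?;
    println!("read {} rows of {:?}", batch.len(), array_reader.get_data_type());
    Ok(())
}
```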