diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 202c24f0a58..6b095b878e5 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -226,7 +226,7 @@ if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ]; then doxygen popd cd ../docs - sphinx-build -q -b html -d _build/doctrees -W source _build/html + sphinx-build -q -b html -d _build/doctrees -W --keep-going source _build/html fi popd # $ARROW_PYTHON_DIR diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 6e24db03dfe..3d42dbc3631 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -58,14 +58,16 @@ using TimePoint = /// \brief EXPERIMENTAL: FileSystem entry type enum class ARROW_EXPORT FileType : int8_t { - // Target does not exist + /// Entry does not exist NonExistent, - // Target exists but its type is unknown (could be a special file such - // as a Unix socket or character device, or Windows NUL / CON / ...) + /// Entry exists but its type is unknown + /// + /// This can designate a special file such as a Unix socket or character + /// device, or Windows NUL / CON / ... Unknown, - // Target is a regular file + /// Entry is a regular file File, - // Target is a directory + /// Entry is a directory Directory }; @@ -84,26 +86,27 @@ struct ARROW_EXPORT FileStats { FileStats(const FileStats&) = default; FileStats& operator=(const FileStats&) = default; - // The file type. + /// The file type FileType type() const { return type_; } void set_type(FileType type) { type_ = type; } - // The full file path in the filesystem. + /// The full file path in the filesystem std::string path() const { return path_; } void set_path(const std::string& path) { path_ = path; } - // The file base name (component after the last directory separator). + /// The file base name (component after the last directory separator) std::string base_name() const; - // The size in bytes, if available. 
Only regular files are guaranteed - // to have a size. + /// The size in bytes, if available + /// + /// Only regular files are guaranteed to have a size. int64_t size() const { return size_; } void set_size(int64_t size) { size_ = size; } - // The file extension + /// The file extension (excluding the dot) std::string extension() const; - // The time of last modification, if available. + /// The time of last modification, if available TimePoint mtime() const { return mtime_; } void set_mtime(TimePoint mtime) { mtime_ = mtime; } @@ -228,7 +231,9 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem { std::shared_ptr base_fs); ~SubTreeFileSystem() override; + /// \cond FALSE using FileSystem::GetTargetStats; + /// \endcond Status GetTargetStats(const std::string& path, FileStats* out) override; Status GetTargetStats(const Selector& select, std::vector* out) override; diff --git a/cpp/src/arrow/filesystem/localfs.h b/cpp/src/arrow/filesystem/localfs.h index 73e35206d71..57da283b036 100644 --- a/cpp/src/arrow/filesystem/localfs.h +++ b/cpp/src/arrow/filesystem/localfs.h @@ -29,14 +29,18 @@ namespace fs { /// \brief EXPERIMENTAL: a FileSystem implementation accessing files /// on the local machine. /// -/// Details such as symlinks are abstracted away (symlinks are always followed, -/// except when deleting an entry). +/// This class handles only `/`-separated paths. If desired, conversion +/// from Windows backslash-separated paths should be done by the caller. +/// Details such as symlinks are abstracted away (symlinks are always +/// followed, except when deleting an entry). 
class ARROW_EXPORT LocalFileSystem : public FileSystem { public: LocalFileSystem(); ~LocalFileSystem() override; + /// \cond FALSE using FileSystem::GetTargetStats; + /// \endcond Status GetTargetStats(const std::string& path, FileStats* out) override; Status GetTargetStats(const Selector& select, std::vector* out) override; diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index c02c4c15402..8dd3d87f8ff 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -37,7 +37,7 @@ namespace fs { extern ARROW_EXPORT const char* kS3DefaultRegion; -/// Options for the S3 FileSystem implementation. +/// Options for the S3FileSystem implementation. struct ARROW_EXPORT S3Options { /// AWS region to connect to (default "us-east-1") std::string region = kS3DefaultRegion; @@ -79,7 +79,9 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { public: ~S3FileSystem() override; + /// \cond FALSE using FileSystem::GetTargetStats; + /// \endcond Status GetTargetStats(const std::string& path, FileStats* out) override; Status GetTargetStats(const Selector& select, std::vector* out) override; @@ -120,6 +122,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { Status OpenAppendStream(const std::string& path, std::shared_ptr* out) override; + /// Create a S3FileSystem instance from the given options. static Status Make(const S3Options& options, std::shared_ptr* out); protected: diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index 10b6e8f6c73..59653947742 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -36,6 +36,7 @@ class Status; namespace io { +/// \brief An operating system file open in write-only mode. class ARROW_EXPORT FileOutputStream : public OutputStream { public: ~FileOutputStream() override; @@ -95,8 +96,9 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { // Write bytes to the stream. 
Thread-safe Status Write(const void* data, int64_t nbytes) override; - + /// \cond FALSE using Writable::Write; + /// \endcond int file_descriptor() const; @@ -107,7 +109,11 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { std::unique_ptr impl_; }; -// Operating system file +/// \brief An operating system file open in read-only mode. +/// +/// Reads through this implementation are unbuffered. If many small reads +/// need to be issued, it is recommended to use a buffering layer for good +/// performance. class ARROW_EXPORT ReadableFile : public internal::RandomAccessFileConcurrencyWrapper { public: @@ -173,12 +179,13 @@ class ARROW_EXPORT ReadableFile std::unique_ptr impl_; }; -// A file interface that uses memory-mapped files for memory interactions, -// supporting zero copy reads. The same class is used for both reading and -// writing. -// -// If opening a file in a writable mode, it is not truncated first as with -// FileOutputStream +/// \brief A file interface that uses memory-mapped files for memory interactions +/// +/// This implementation supports zero-copy reads. The same class is used +/// for both reading and writing. +/// +/// If opening a file in a writable mode, it is not truncated first as with +/// FileOutputStream. 
class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { public: ~MemoryMappedFile() override; diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 51af229f000..2346e1514f0 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -37,7 +37,7 @@ class Status; namespace io { -// \brief An output stream that writes to a resizable buffer +/// \brief An output stream that writes to a resizable buffer class ARROW_EXPORT BufferOutputStream : public OutputStream { public: explicit BufferOutputStream(const std::shared_ptr& buffer); @@ -61,7 +61,9 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { Status Tell(int64_t* position) const override; Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE using OutputStream::Write; + /// \endcond /// Close the stream and return the buffer Status Finish(std::shared_ptr* result); @@ -88,7 +90,11 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { uint8_t* mutable_data_; }; -// \brief A helper class to tracks the size of allocations +/// \brief A helper class to track the size of allocations +/// +/// Writes to this stream do not copy or retain any data, they just bump +/// a size counter that can be later used to know exactly which data size +/// needs to be allocated for actual writing. 
class ARROW_EXPORT MockOutputStream : public OutputStream { public: MockOutputStream() : extent_bytes_written_(0), is_open_(true) {} @@ -106,7 +112,7 @@ class ARROW_EXPORT MockOutputStream : public OutputStream { bool is_open_; }; -/// \brief Enables random writes into a fixed-size mutable buffer +/// \brief An output stream that writes into a fixed-size mutable buffer class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile { public: /// Input buffer must be mutable, will abort if not diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 2b31b476334..02650a6d35e 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -31,7 +31,12 @@ namespace arrow { struct SparseTensorFormat { /// EXPERIMENTAL: The index format type of SparseTensor - enum type { COO, CSR }; + enum type { + /// Coordinate list (COO) format. + COO, + /// Compressed sparse row (CSR) format. + CSR + }; }; /// \brief EXPERIMENTAL: The base class for the index of a sparse tensor @@ -83,10 +88,15 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase& coords); /// \brief Return a tensor that has the coordinates of the non-zero values + /// + /// The returned tensor is a Nx3 tensor where N is the number of non-zero + /// values. Each 3-element column has the form `{row, column, index}`, + /// indicating that the value for the logical element at `{row, column}` + /// should be found at the given physical index. 
const std::shared_ptr& indices() const { return coords_; } /// \brief Return a string representation of the sparse index @@ -120,7 +130,7 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase& indptr, const std::shared_ptr& indices); @@ -231,20 +241,23 @@ class SparseTensorImpl : public SparseTensor { public: virtual ~SparseTensorImpl() = default; - // Constructor with all attributes + /// \brief Construct a sparse tensor from physical data buffer and logical index SparseTensorImpl(const std::shared_ptr& sparse_index, const std::shared_ptr& type, const std::shared_ptr& data, const std::vector& shape, const std::vector& dim_names) : SparseTensor(type, data, shape, sparse_index, dim_names) {} - // Constructor for empty sparse tensor + /// \brief Construct an empty sparse tensor SparseTensorImpl(const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names = {}) : SparseTensorImpl(NULLPTR, type, NULLPTR, shape, dim_names) {} - // Constructor with a dense tensor + /// \brief Construct a sparse tensor from a dense tensor + /// + /// The dense tensor is re-encoded as a sparse index and a physical + /// data buffer for the non-zero values. SparseTensorImpl(const Tensor& tensor, const std::shared_ptr& index_value_type) : SparseTensorImpl(NULLPTR, tensor.type(), NULLPTR, tensor.shape(), diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h index a05d43e738d..9ffd79b27f4 100644 --- a/cpp/src/arrow/util/compression.h +++ b/cpp/src/arrow/util/compression.h @@ -29,6 +29,7 @@ namespace arrow { class Status; struct Compression { + /// \brief Compression algorithm enum type { UNCOMPRESSED, SNAPPY, GZIP, BROTLI, ZSTD, LZ4, LZO, BZ2 }; }; @@ -96,6 +97,7 @@ class ARROW_EXPORT Decompressor { // XXX add methods for buffer size heuristics? 
}; +/// \brief Compression codec class ARROW_EXPORT Codec { public: virtual ~Codec(); @@ -107,7 +109,10 @@ class ARROW_EXPORT Codec { /// \brief Return a string name for compression type static std::string GetCodecAsString(Compression::type t); + /// \brief Create a codec for the given compression algorithm static Status Create(Compression::type codec, std::unique_ptr* out); + + /// \brief Create a codec for the given compression algorithm and level static Status Create(Compression::type codec, int compression_level, std::unique_ptr* out); diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 1c113b7de68..f62fe52b619 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -28,6 +28,9 @@ API Reference api/array api/builder api/table + api/tensor api/utilities + api/io api/cuda api/flight + api/filesystem diff --git a/docs/source/cpp/api/filesystem.rst b/docs/source/cpp/api/filesystem.rst new file mode 100644 index 00000000000..293dd080963 --- /dev/null +++ b/docs/source/cpp/api/filesystem.rst @@ -0,0 +1,46 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +=========== +Filesystems +=========== + +Interface +========= + +.. doxygenenum:: arrow::fs::FileType + +.. 
doxygenstruct:: arrow::fs::FileStats + :members: + +.. doxygenclass:: arrow::fs::FileSystem + :members: + +Concrete implementations +======================== + +.. doxygenclass:: arrow::fs::SubTreeFileSystem + :members: + +.. doxygenclass:: arrow::fs::LocalFileSystem + :members: + +.. doxygenstruct:: arrow::fs::S3Options + :members: + +.. doxygenclass:: arrow::fs::S3FileSystem + :members: diff --git a/docs/source/cpp/api/flight.rst b/docs/source/cpp/api/flight.rst index 4e56a7690ac..d226a1e5673 100644 --- a/docs/source/cpp/api/flight.rst +++ b/docs/source/cpp/api/flight.rst @@ -19,11 +19,11 @@ Arrow Flight RPC ================ -.. warning:: Flight is currently unstable. APIs are subject to change, - though we don't expect drastic changes. +.. note:: Flight is currently unstable. APIs are subject to change, + though we don't expect drastic changes. -.. warning:: Flight is currently only available when built from source - appropriately. +.. note:: Flight is currently only available when built from source + appropriately. Common Types ============ diff --git a/docs/source/cpp/api/io.rst b/docs/source/cpp/api/io.rst new file mode 100644 index 00000000000..735136a0d47 --- /dev/null +++ b/docs/source/cpp/api/io.rst @@ -0,0 +1,95 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations +.. under the License. + +============== +Input / output +============== + +Interfaces +========== + +.. doxygenclass:: arrow::io::FileInterface + :members: + +.. doxygenclass:: arrow::io::Readable + :members: + +.. doxygenclass:: arrow::io::Seekable + :members: + +.. doxygenclass:: arrow::io::Writable + :members: + +.. doxygenclass:: arrow::io::InputStream + :members: + +.. doxygenclass:: arrow::io::RandomAccessFile + :members: + +.. doxygenclass:: arrow::io::OutputStream + :members: + +.. doxygenclass:: arrow::io::ReadWriteFileInterface + :members: + +Concrete implementations +======================== + +In-memory streams +----------------- + +.. doxygenclass:: arrow::io::BufferReader + :members: + +.. doxygenclass:: arrow::io::MockOutputStream + :members: + +.. doxygenclass:: arrow::io::BufferOutputStream + :members: + +.. doxygenclass:: arrow::io::FixedSizeBufferWriter + :members: + +Local files +----------- + +.. doxygenclass:: arrow::io::ReadableFile + :members: + +.. doxygenclass:: arrow::io::FileOutputStream + :members: + +.. doxygenclass:: arrow::io::MemoryMappedFile + :members: + +Buffering input / output wrappers +--------------------------------- + +.. doxygenclass:: arrow::io::BufferedInputStream + :members: + +.. doxygenclass:: arrow::io::BufferedOutputStream + :members: + +Compressed input / output wrappers +---------------------------------- + +.. doxygenclass:: arrow::io::CompressedInputStream + :members: + +.. doxygenclass:: arrow::io::CompressedOutputStream + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index c921229e6cb..93d8444b916 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -39,6 +39,15 @@ Memory Pools .. doxygenfunction:: arrow::default_memory_pool :project: arrow_cpp +.. doxygenfunction:: arrow::jemalloc_memory_pool + :project: arrow_cpp + +.. 
doxygenfunction:: arrow::mimalloc_memory_pool + :project: arrow_cpp + +.. doxygenfunction:: arrow::system_memory_pool + :project: arrow_cpp + .. doxygenclass:: arrow::MemoryPool :project: arrow_cpp :members: diff --git a/docs/source/cpp/api/tensor.rst b/docs/source/cpp/api/tensor.rst new file mode 100644 index 00000000000..eb783f296b9 --- /dev/null +++ b/docs/source/cpp/api/tensor.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======= +Tensors +======= + +Dense Tensors +============= + +.. doxygenclass:: arrow::Tensor + :members: + +.. doxygenclass:: arrow::NumericTensor + :members: + +Sparse Tensors +============== + +.. doxygenenum:: arrow::SparseTensorFormat::type + +.. doxygenclass:: arrow::SparseIndex + :members: + +.. doxygenclass:: arrow::SparseCOOIndex + :members: + +.. doxygenclass:: arrow::SparseCSRIndex + :members: + +.. doxygenclass:: arrow::SparseTensor + :members: + +.. doxygenclass:: arrow::SparseTensorImpl + :members: + +.. doxygentypedef:: arrow::SparseTensorCOO + +.. 
doxygentypedef:: arrow::SparseTensorCSR diff --git a/docs/source/cpp/api/utilities.rst b/docs/source/cpp/api/utilities.rst index 0aaffb54a47..87c5a3bbe04 100644 --- a/docs/source/cpp/api/utilities.rst +++ b/docs/source/cpp/api/utilities.rst @@ -37,3 +37,16 @@ Abstract Sequences :project: arrow_cpp :members: +Compression +=========== + +.. doxygenenum:: arrow::Compression::type + +.. doxygenclass:: arrow::util::Codec + :members: + +.. doxygenclass:: arrow::util::Compressor + :members: + +.. doxygenclass:: arrow::util::Decompressor + :members: diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 7c55b76912d..0f43a61c5c8 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -29,3 +29,4 @@ Getting Started arrays datatypes tables + io diff --git a/docs/source/cpp/io.rst b/docs/source/cpp/io.rst new file mode 100644 index 00000000000..898bc9a5592 --- /dev/null +++ b/docs/source/cpp/io.rst @@ -0,0 +1,86 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. 
highlight:: cpp + +============================== +Input / output and filesystems +============================== + +Arrow provides a range of C++ interfaces abstracting the concrete details +of input / output operations. They operate on streams of untyped binary data. +Those abstractions are used for various purposes such as reading CSV or +Parquet data, transmitting IPC streams, and more. + +.. cpp:namespace:: arrow::io + +Reading binary data +=================== + +Interfaces for reading binary data come in two flavours: + +* Sequential reading: the :class:`InputStream` interface provides + ``Read`` methods; it is recommended to ``Read`` to a ``Buffer`` as it + may in some cases avoid a memory copy. + +* Random access reading: the :class:`RandomAccessFile` interface + provides additional facilities for positioning and, most importantly, + the ``ReadAt`` methods which allow parallel reading from multiple threads. + +Concrete implementations are available for :class:`in-memory reads <BufferReader>`, +:class:`unbuffered file reads <ReadableFile>`, +:class:`memory-mapped file reads <MemoryMappedFile>`, +:class:`buffered reads <BufferedInputStream>`, +:class:`compressed reads <CompressedInputStream>`. + +Writing binary data +=================== + +Writing binary data is mostly done through the :class:`OutputStream` +interface. + +Concrete implementations are available for :class:`in-memory writes <BufferOutputStream>`, +:class:`unbuffered file writes <FileOutputStream>`, +:class:`memory-mapped file writes <MemoryMappedFile>`, +:class:`buffered writes <BufferedOutputStream>`, +:class:`compressed writes <CompressedOutputStream>`. + +.. cpp:namespace:: arrow::fs + +Filesystems +=========== + +The :class:`filesystem interface <FileSystem>` allows abstracted access over +various data storage backends such as the local filesystem or an S3 bucket. +It provides input and output streams as well as directory operations. + +The filesystem interface exposes a simplified view of the underlying data +storage. Data paths are represented as *abstract paths*, which are +``/``-separated, even on Windows, and shouldn't include special path +components such as ``.`` and ``..``. 
Symbolic links, if supported by the +underlying storage, are automatically dereferenced. Only basic +:class:`metadata <FileStats>` about file entries, such as the file size +and modification time, is made available. + +Concrete implementations are available for +:class:`local filesystem access <LocalFileSystem>` and +:class:`Amazon S3-compatible storage <S3FileSystem>`. + +.. note:: + The filesystem layer is currently experimental. API details may vary + in the future. diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst index 53fc998eae6..ccebdba45dd 100644 --- a/docs/source/cpp/overview.rst +++ b/docs/source/cpp/overview.rst @@ -89,3 +89,9 @@ The devices layer Basic **CUDA** integration is provided, allowing to describe Arrow data backed by GPU-allocated memory. + +The filesystem layer +-------------------- + +A filesystem abstraction allows reading and writing data from different storage +backends, such as the local filesystem or an S3 bucket. diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index e929c6eecd8..d5dbb3f73b7 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -18,9 +18,9 @@ .. default-domain:: cpp .. highlight:: cpp -======================== -Two-dimensional Datasets -======================== +============ +Tabular Data +============ While arrays and chunked arrays represent a one-dimensional sequence of homogenous values, data often comes in the form of two-dimensional sets of