From 4f2da1fe3d377a37ef96b70fb46d91de0a0a4e41 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 00:53:20 +0000 Subject: [PATCH 01/28] Add hybrid scan reader stubs --- cpp/CMakeLists.txt | 1 + .../cudf/io/experimental/hybrid_scan.hpp | 241 ++++++++ .../io/parquet/experimental/hybrid_scan.cpp | 193 +++++++ .../experimental/hybrid_scan_helpers.hpp | 367 ++++++++++++ .../parquet/experimental/hybrid_scan_impl.hpp | 528 ++++++++++++++++++ cpp/src/io/parquet/reader_impl_helpers.hpp | 6 +- 6 files changed, 1333 insertions(+), 3 deletions(-) create mode 100644 cpp/include/cudf/io/experimental/hybrid_scan.hpp create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan.cpp create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 726c4de222e..7f0d2afe5c7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -521,6 +521,7 @@ add_library( src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu + src/io/parquet/experimental/hybrid_scan.cpp src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp new file mode 100644 index 00000000000..7e33888ab63 --- /dev/null +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hybrid_scan.hpp + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace experimental::io::parquet { + +namespace detail { + +/** + * @brief Internal experimental Parquet reader optimized for highly selective filters (Hybrid Scan + * operation). + */ +class impl; + +} // namespace detail + +/** + * @brief The experimental parquet reader class to optimally read parquet files subject to + * highly selective filters (Hybrid Scan operation) + * + * This class is designed to best exploit reductive optimization techniques to speed up reading + * Parquet files subject to highly selective filters (Hybrid scan operation). This class reads file + * contents in two passes, where the first pass optimally reads the `filter` columns (i.e. columns + * that appear in the filter expression) and the second pass optimally reads the `payload` columns + * (i.e. 
columns that do not appear in the filter expression) + */ +class hybrid_scan_reader { + public: + /** + * @brief Constructor for the experimental parquet reader class to optimally read Parquet files + * subject to highly selective filters + * + * @param footer_bytes Host span of parquet file footer bytes + * @param options Parquet reader options + */ + explicit hybrid_scan_reader(cudf::host_span footer_bytes, + cudf::io::parquet_reader_options const& options); + + /** + * @brief Destructor for the experimental parquet reader class + */ + ~hybrid_scan_reader(); + + /** + * @brief Get the Parquet file footer metadata + * + * @return Parquet file footer metadata + */ + [[nodiscard]] cudf::io::parquet::FileMetaData const& get_parquet_metadata() const; + + /** + * @brief Get the byte range of the `PageIndex` in the Parquet file + * + * @return Byte range of the `PageIndex` + */ + [[nodiscard]] cudf::io::text::byte_range_info get_page_index_bytes() const; + + /** + * @brief Setup the PageIndex + * + * @param page_index_bytes Host span of Parquet `PageIndex` buffer bytes + */ + void setup_page_index(cudf::host_span page_index_bytes) const; + + /** + * @brief Get all available row groups from the parquet file + * + * @param options Parquet reader options + * @return Vector of row group indices + */ + [[nodiscard]] std::vector get_all_row_groups( + cudf::io::parquet_reader_options const& options) const; + + /** + * @brief Filter the row groups with statistics + * + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Filtered row group indices + */ + [[nodiscard]] std::vector filter_row_groups_with_stats( + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const; + + /** + * @brief Fetches byte ranges of bloom filters and dictionary pages (secondary filters) for + * 
further row group pruning + * + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @return Pair of vectors of byte ranges to per-column-chunk bloom filters and dictionary pages + */ + [[nodiscard]] std::pair, + std::vector> + get_secondary_filters(cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options) const; + + /** + * @brief Filter the row groups with dictionary pages + * + * @param dictionary_page_data Device buffers containing per-column-chunk dictionary page data + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Filtered row group indices + */ + [[nodiscard]] std::vector filter_row_groups_with_dictionary_pages( + std::vector& dictionary_page_data, + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const; + + /** + * @brief Filter the row groups with bloom filters + * + * @param bloom_filter_data Device buffers containing per-column-chunk bloom filter data + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Filtered row group indices + */ + [[nodiscard]] std::vector filter_row_groups_with_bloom_filters( + std::vector& bloom_filter_data, + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const; + + /** + * @brief Filter data pages of filter columns using statistics containing in `PageIndex` metadata + * + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * 
@return A pair of boolean column indicating rows corresponding to data pages after + * page-pruning, and a list of boolean vectors indicating which data pages are not pruned, + * one per filter column. + */ + [[nodiscard]] std::pair, std::vector>> + filter_data_pages_with_stats(cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @brief Fetches byte ranges of column chunks of filter columns + * + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @return Vector of byte ranges to column chunks of filter columns + */ + [[nodiscard]] std::vector get_filter_column_chunk_byte_ranges( + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options) const; + + /** + * @brief Materializes filter columns, and updates the input row validity mask to only the rows + * that survive the row selection predicate at row level + * + * @param page_mask Boolean vectors indicating data pages are not pruned, one per filter column + * @param row_group_indices Input row groups indices + * @param column_chunk_buffers Device buffers containing column chunk data of filter columns + * @param[in,out] row_mask Mutable boolean column indicating rows that survive page-pruning + * @param options Parquet reader options + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Table of materialized filter columns and metadata + */ + [[nodiscard]] cudf::io::table_with_metadata materialize_filter_columns( + cudf::host_span const> page_mask, + cudf::host_span row_group_indices, + std::vector column_chunk_buffers, + cudf::mutable_column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const; + + /** + * @brief Fetches byte ranges of column chunks of payload columns + * + * @param row_group_indices Input row groups indices + * 
@param options Parquet reader options + * @return Vector of byte ranges to column chunks of payload columns + */ + [[nodiscard]] std::vector get_payload_column_chunk_byte_ranges( + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options) const; + + /** + * @brief Materializes payload columns + * + * @param row_group_indices Input row groups indices + * @param column_chunk_buffers Device buffers containing column chunk data of payload columns + * @param row_mask Boolean column indicating which rows need to be read + * @param options Parquet reader options + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Table of materialized payload columns and metadata + */ + [[nodiscard]] cudf::io::table_with_metadata materialize_payload_columns( + cudf::host_span row_group_indices, + std::vector column_chunk_buffers, + cudf::column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const; + + private: + std::unique_ptr _impl; +}; + +} // namespace experimental::io::parquet +} // namespace CUDF_EXPORT cudf \ No newline at end of file diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp new file mode 100644 index 00000000000..fce6b29ba97 --- /dev/null +++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hybrid_scan_impl.hpp" + +#include +#include + +namespace cudf::experimental::io::parquet { + +hybrid_scan_reader::hybrid_scan_reader(cudf::host_span footer_bytes, + cudf::io::parquet_reader_options const& options) + : _impl{std::make_unique(footer_bytes, options)} +{ +} + +hybrid_scan_reader::~hybrid_scan_reader() = default; + +[[nodiscard]] cudf::io::text::byte_range_info hybrid_scan_reader::get_page_index_bytes() const +{ + return _impl->get_page_index_bytes(); +} + +[[nodiscard]] cudf::io::parquet::FileMetaData const& hybrid_scan_reader::get_parquet_metadata() + const +{ + return _impl->get_parquet_metadata(); +} + +void hybrid_scan_reader::setup_page_index(cudf::host_span page_index_bytes) const +{ + return _impl->setup_page_index(page_index_bytes); +} + +std::vector hybrid_scan_reader::get_all_row_groups( + cudf::io::parquet_reader_options const& options) const +{ + CUDF_EXPECTS(options.get_row_groups().size() == 0 or options.get_row_groups().size() == 1, + "Encountered invalid size of row group indices in parquet reader options"); + + // If row groups are specified in parquet reader options, return them as is + if (options.get_row_groups().size()) { return options.get_row_groups().front(); } + + return _impl->get_all_row_groups(options); +} + +std::vector hybrid_scan_reader::filter_row_groups_with_stats( + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const +{ + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->filter_row_groups_with_stats(input_row_group_indices, options, stream).front(); +} + +std::pair, + std::vector> +hybrid_scan_reader::get_secondary_filters(cudf::host_span row_group_indices, + 
cudf::io::parquet_reader_options const& options) const +{ + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->get_secondary_filters(input_row_group_indices, options); +} + +std::vector hybrid_scan_reader::filter_row_groups_with_dictionary_pages( + std::vector& dictionary_page_data, + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const +{ + CUDF_EXPECTS(row_group_indices.size() == dictionary_page_data.size(), + "Mismatch in size of input row group indices and dictionary page device buffers"); + + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl + ->filter_row_groups_with_dictionary_pages( + dictionary_page_data, input_row_group_indices, options, stream) + .front(); +} + +std::vector hybrid_scan_reader::filter_row_groups_with_bloom_filters( + std::vector& bloom_filter_data, + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const +{ + CUDF_EXPECTS(row_group_indices.size() == bloom_filter_data.size(), + "Mismatch in size of input row group indices and bloom filter device buffers"); + + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl + ->filter_row_groups_with_bloom_filters( + bloom_filter_data, input_row_group_indices, options, stream) + .front(); +} + +std::pair, std::vector>> +hybrid_scan_reader::filter_data_pages_with_stats(cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + // Temporary vector 
with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->filter_data_pages_with_stats(input_row_group_indices, options, stream, mr); +} + +[[nodiscard]] std::vector +hybrid_scan_reader::get_filter_column_chunk_byte_ranges( + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options) const +{ + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->get_filter_column_chunk_byte_ranges(input_row_group_indices, options).first; +} + +cudf::io::table_with_metadata hybrid_scan_reader::materialize_filter_columns( + cudf::host_span const> data_page_mask, + cudf::host_span row_group_indices, + std::vector column_chunk_buffers, + cudf::mutable_column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) const +{ + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->materialize_filter_columns(data_page_mask, + input_row_group_indices, + std::move(column_chunk_buffers), + row_mask, + options, + stream); +} + +[[nodiscard]] std::vector +hybrid_scan_reader::get_payload_column_chunk_byte_ranges( + cudf::host_span row_group_indices, + cudf::io::parquet_reader_options const& options) const +{ + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->get_payload_column_chunk_byte_ranges(input_row_group_indices, options).first; +} + +cudf::io::table_with_metadata hybrid_scan_reader::materialize_payload_columns( + cudf::host_span row_group_indices, + std::vector column_chunk_buffers, + cudf::column_view row_mask, + cudf::io::parquet_reader_options const& options, 
+ rmm::cuda_stream_view stream) const +{ + // Temporary vector with row group indices from the first source + auto const input_row_group_indices = + std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; + + return _impl->materialize_payload_columns( + input_row_group_indices, std::move(column_chunk_buffers), row_mask, options, stream); +} + +} // namespace cudf::experimental::io::parquet diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp new file mode 100644 index 00000000000..8f0c5c3863d --- /dev/null +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "io/parquet/parquet_gpu.hpp" +#include "io/parquet/reader_impl_helpers.hpp" + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf::experimental::io::parquet::detail { + +using aggregate_reader_metadata_base = cudf::io::parquet::detail::aggregate_reader_metadata; +using equality_literals_collector = cudf::io::parquet::detail::equality_literals_collector; +using FileMetadata = cudf::io::parquet::FileMetaData; +using inline_column_buffer = cudf::io::detail::inline_column_buffer; +using input_column_info = cudf::io::parquet::detail::input_column_info; +using metadata_base = cudf::io::parquet::detail::metadata; +using row_group_info = cudf::io::parquet::detail::row_group_info; + +/** + * @brief Class for parsing dataset metadata + */ +struct metadata : private metadata_base { + explicit metadata(cudf::host_span footer_bytes); + metadata_base get_file_metadata() && { return std::move(*this); } +}; + +class aggregate_reader_metadata : public aggregate_reader_metadata_base { + private: + /** + * @brief Materializes column chunk dictionary pages into `cuco::static_set`s + * + * @param dictionary_page_data Dictionary page data device buffers for each input row group + * @param input_row_group_indices Lists of input row groups, one per source + * @param total_row_groups Total number of row groups in `input_row_group_indices` + * @param output_dtypes Datatypes of output columns + * @param dictionary_col_schemas schema indices of dictionary columns only + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A flattened list of `cuco::static_set_ref` device buffers for each filter column + * across row groups + */ + [[nodiscard]] std::vector materialize_dictionaries( + cudf::host_span dictionary_page_data, + host_span const> input_row_group_indices, + host_span output_dtypes, + host_span dictionary_col_schemas, + rmm::cuda_stream_view stream) 
const; + + /** + * @brief Filters the row groups using dictionary pages + * + * @param dictionaries `cuco::static_set_ref` device buffers for column chunk dictionary + * @param input_row_group_indices Lists of input row groups, one per source + * @param literals Lists of literals, one per input column + * @param operators Lists of operators, one per input column + * @param total_row_groups Total number of row groups in `input_row_group_indices` + * @param output_dtypes Datatypes of output columns + * @param dictionary_col_schemas schema indices of dictionary columns only + * @param filter AST expression to filter row groups based on bloom filter membership + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A pair of filtered row group indices if any is filtered. + */ + [[nodiscard]] std::optional>> apply_dictionary_filter( + cudf::host_span dictionaries, + host_span const> input_row_group_indices, + host_span const> literals, + host_span const> operators, + size_type total_row_groups, + host_span output_dtypes, + host_span dictionary_col_schemas, + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const; + + public: + /** + * @brief Constructor for aggregate_reader_metadata + * + * @param footer_bytes Host span of Parquet file footer buffer bytes + * @param use_arrow_schema Whether to use Arrow schema + * @param has_cols_from_mismatched_srcs Whether to have columns from mismatched sources + */ + aggregate_reader_metadata(cudf::host_span footer_bytes, + bool use_arrow_schema, + bool has_cols_from_mismatched_srcs); + + /** + * @brief Fetch the byte range of the `PageIndex` in the Parquet file + */ + [[nodiscard]] cudf::io::text::byte_range_info get_page_index_bytes() const; + + /** + * @brief Get the Parquet file metadata + */ + [[nodiscard]] FileMetadata const& get_parquet_metadata() const; + + /** + * @brief Setup the PageIndex + * + * @param page_index_bytes Host span of Parquet `PageIndex` buffer bytes 
+ */ + void setup_page_index(cudf::host_span page_index_bytes); + + /** + * @brief Filters and reduces down to the selection of filter columns + * + * @param filter_columns_names List of paths of column names that are present only in filter + * @param include_index Whether to always include the PANDAS index column(s) + * @param strings_to_categorical Type conversion parameter + * @param timestamp_type_id Type conversion parameter + * + * @return input column information, output column information, list of output column schema + * indices + */ + [[nodiscard]] std:: + tuple, std::vector, std::vector> + select_filter_columns(std::optional> const& filter_columns_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id); + + /** + * @brief Filters and reduces down to the selection of payload columns + * + * @param column_names List of paths of column names that are present only in payload and filter + * @param filter_columns_names List of paths of column names that are present only in filter + * @param include_index Whether to always include the PANDAS index column(s) + * @param strings_to_categorical Type conversion parameter + * @param timestamp_type_id Type conversion parameter + * + * @return input column information, output column information, list of output column schema + * indices + */ + [[nodiscard]] std:: + tuple, std::vector, std::vector> + select_payload_columns(std::optional> const& column_names, + std::optional> const& filter_columns_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id); + + /** + * @brief Filters and reduces down to a selection of row groups + * + * The input `row_start` and `row_count` parameters will be recomputed and output as the valid + * values based on the input row group list. 
+ * + * @param row_group_indices Lists of row groups to read, one per source + * @param row_start Starting row of the selection + * @param row_count Total number of rows selected + * + * @return A tuple of corrected row_start, row_count, list of row group indexes and its + * starting row, list of number of rows per source, number of input row groups, and a + * struct containing the number of row groups surviving each predicate pushdown filter + */ + [[nodiscard]] std::tuple> select_row_groups( + host_span const> row_group_indices, + int64_t row_start, + std::optional const& row_count); + + /** + * @brief Filter the row groups with statistics based on predicate filter + * + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter row groups based on Column chunk statistics + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Filtered row group indices, if any are filtered + */ + [[nodiscard]] std::vector> filter_row_groups_with_stats( + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const; + + /** + * @brief Get the bloom filter byte ranges, one per input column chunk + * + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter row groups based on bloom filters + * + * @return Byte ranges of bloom filters, one per input column chunk + */ + [[nodiscard]] std::vector get_bloom_filter_bytes( + cudf::host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter); + + /** + * @brief Get the dictionary page byte ranges, one per input 
column chunk + * + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter row groups based on dictionary pages + * + * @return Byte ranges of dictionary pages, one per input column chunk + */ + [[nodiscard]] std::vector get_dictionary_page_bytes( + cudf::host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter); + + /** + * @brief Filter the row groups using dictionaries based on predicate filter + * + * @param dictionary_page_data Device buffers of dictionary pages, one per input column chunk + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter row groups based on dictionary pages + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Filtered row group indices, if any are filtered + */ + [[nodiscard]] std::vector> filter_row_groups_with_dictionary_pages( + std::vector& dictionary_page_data, + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const; + + /** + * @brief Filter the row groups using bloom filters based on predicate filter + * + * @param bloom_filter_data Device buffers of bloom filters, one per input column chunk + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter row groups based on bloom filters + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Filtered row group indices, if any are 
filtered + */ + [[nodiscard]] std::vector> filter_row_groups_with_bloom_filters( + std::vector& bloom_filter_data, + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const; + + /** + * @brief Filter data pages using statistics page-level statistics based on predicate filter + * + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter data pages based on `PageIndex` statistics + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @return A boolean column representing a mask of rows surviving the predicate filter at + * page-level + */ + [[nodiscard]] std::unique_ptr filter_data_pages_with_stats( + cudf::host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @brief Computes which data pages need decoding to construct input columns based on the row mask + * + * Compute a vector of boolean vectors indicating which data pages need to be decoded to + * construct each input column based on the row mask, one vector per column + * + * @param row_mask Boolean column indicating which rows need to be read after page-pruning + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A vector of boolean vectors indicating which data pages need to be decoded to produce + * the output table based on the input row mask, one 
per input column + */ + [[nodiscard]] std::vector> compute_data_page_mask( + cudf::column_view row_mask, + cudf::host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + rmm::cuda_stream_view stream) const; +}; + +/** + * @brief Collects lists of equal and not-equal predicate literals in the AST expression, one list + * per input table column. This is used in row group filtering based on dictionary pages. + */ +class dictionary_literals_and_operators_collector : public equality_literals_collector { + public: + dictionary_literals_and_operators_collector(); + + dictionary_literals_and_operators_collector(ast::expression const& expr, + cudf::size_type num_input_columns); + + using equality_literals_collector::visit; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) + */ + std::reference_wrapper visit(ast::column_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) + */ + std::reference_wrapper visit( + ast::column_name_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) + */ + std::reference_wrapper visit(ast::operation const& expr) override; + + /** + * @brief Returns the vectors of dictionary page filter literals in the AST expression, one per + * input table column + */ + [[nodiscard]] std::vector> get_literals() = delete; + + /** + * @brief Returns a pair of two vectors containing dictionary filter literals and operators + * in the AST expression respectively, one per input table column + */ + [[nodiscard]] std::pair>, + std::vector>> + get_literals_and_operators() &&; + + private: + std::vector> _operators; +}; + +} // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp new file mode 100644 index 
00000000000..073c281cee7 --- /dev/null +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file reader_impl.hpp + * @brief cuDF-IO Parquet reader class implementation header + */ + +#pragma once + +#include "hybrid_scan_helpers.hpp" +#include "io/parquet/parquet_gpu.hpp" +#include "io/parquet/reader_impl_chunking.hpp" + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf::experimental::io::parquet::detail { + +/** + * @brief Implementation for Parquet reader + */ +class impl { + public: + /** + * @brief Constructor for the experimental parquet reader implementation to optimally read + * Parquet files subject to highly selective filters + * + * @param footer_bytes Host span of parquet file footer bytes + * @param options Parquet reader options + */ + explicit impl(cudf::host_span footer_bytes, + cudf::io::parquet_reader_options const& options); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::get_parquet_metadata + */ + [[nodiscard]] cudf::io::parquet::FileMetaData const& get_parquet_metadata() const; + + /** + * @copydoc cudf::io::experimental::hybrid_scan::get_page_index_bytes + */ + [[nodiscard]] cudf::io::text::byte_range_info get_page_index_bytes() const; + + /** + * @copydoc cudf::io::experimental::hybrid_scan::setup_page_index + */ + void 
setup_page_index(cudf::host_span page_index_bytes) const; + + /** + * @copydoc cudf::io::experimental::hybrid_scan::get_all_row_groups + */ + [[nodiscard]] std::vector get_all_row_groups( + cudf::io::parquet_reader_options const& options) const; + + /** + * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_stats + */ + [[nodiscard]] std::vector> filter_row_groups_with_stats( + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::get_secondary_filters + */ + [[nodiscard]] std::pair, + std::vector> + get_secondary_filters(cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_dictionary_pages + */ + [[nodiscard]] std::vector> filter_row_groups_with_dictionary_pages( + std::vector& dictionary_page_data, + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_bloom_filters + */ + [[nodiscard]] std::vector> filter_row_groups_with_bloom_filters( + std::vector& bloom_filter_data, + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::filter_data_pages_with_stats + */ + [[nodiscard]] std::pair, std::vector>> + filter_data_pages_with_stats(cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** + * @brief Fetches byte ranges of column chunks of filter columns + * + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @return Pair of a vector of byte ranges to column chunks 
of filter columns and a vector of + * their corresponding input source file indices + */ + [[nodiscard]] std::pair, + std::vector> + get_filter_column_chunk_byte_ranges( + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::materialize_filter_columns + */ + [[nodiscard]] cudf::io::table_with_metadata materialize_filter_columns( + cudf::host_span const> data_page_mask, + cudf::host_span const> row_group_indices, + std::vector column_chunk_buffers, + cudf::mutable_column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream); + + /** + * @brief Fetches byte ranges of column chunks of payload columns + * + * @param row_group_indices Input row groups indices + * @param options Parquet reader options + * @return Pair of a vector of byte ranges to column chunks of payload columns and a vector of + * their corresponding input source file indices + */ + [[nodiscard]] std::pair, + std::vector> + get_payload_column_chunk_byte_ranges( + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options); + + /** + * @copydoc cudf::io::experimental::hybrid_scan::materialize_payload_columns + */ + [[nodiscard]] cudf::io::table_with_metadata materialize_payload_columns( + cudf::host_span const> row_group_indices, + std::vector column_chunk_buffers, + cudf::column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream); + + /** + * @brief Updates the output row mask such that out_row_mask[i] = true iff + * in_row_mask[i] is valid and true + * + * Updates the output row mask to reflect the final valid and surviving rows from the input row + * mask. 
This is inline with the masking behavior of cudf::detail::apply_boolean_mask + * + * @param in_row_mask Input row mask column + * @param out_row_mask Output row mask column + * @param stream CUDA stream + */ + static void update_row_mask(cudf::column_view in_row_mask, + cudf::mutable_column_view out_row_mask, + rmm::cuda_stream_view stream); + + private: + using table_metadata = cudf::io::table_metadata; + + /** + * @brief The enum indicating whether we are reading the filter columns or the payload columns + */ + enum class read_mode { FILTER_COLUMNS, PAYLOAD_COLUMNS }; + + /** + * @brief Initialize the necessary options related internal variables for use later on + * + * @param row_group_indices Row group indices to read + * @param options Reader options + * @param stream CUDA stream used for device memory operations and kernel launches + */ + void initialize_options(cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream); + + /** + * @brief Set the mask for pages + * + * @param data_page_mask Input data page mask from page-pruning step + */ + void set_page_mask(cudf::host_span const> data_page_mask); + + /** + * @brief Select the columns to be read based on the read mode + * + * @param read_mode Read mode indicating if we are reading filter or payload columns + * @param options Reader options + */ + void select_columns(read_mode read_mode, cudf::io::parquet_reader_options const& options); + + /** + * @brief Get the byte ranges for the input column chunks + * + * @param row_group_indices The row groups indices to read + * @return A pair of vectors containing the byte ranges and the source indices + */ + [[nodiscard]] std::pair, + std::vector> + get_input_column_chunk_byte_ranges( + cudf::host_span const> row_group_indices) const; + + /** + * @brief Invalidate output buffer nullmask for rows spanned by the pruned pages + * + * @param page_mask Boolean vector indicating if a page needs to be 
decoded or is pruned + */ + void update_output_nullmasks_for_pruned_pages(cudf::host_span page_mask); + + /** + * @brief Perform the necessary data preprocessing for parsing file later on + * + * Only ever called once for filter and payload columns. This function prepares the input row + * groups and computes the schedule of top level passes (see `pass_intermediate_data`) and the + * schedule of subpasses (see `subpass_intermediate_data`). + * + * @param row_group_indices Row group indices to read + * @param column_chunk_buffers Device buffers containing column chunk data + * @param options Parquet reader options + */ + void prepare_data(cudf::host_span const> row_group_indices, + std::vector column_chunk_buffers, + cudf::io::parquet_reader_options const& options); + + /** + * @brief Prepares the selected input row groups and associated chunk information + * + * @param row_group_indices Row group indices to read + * @param options Parquet reader options + */ + void prepare_row_groups(cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options); + + /** + * @brief Ratchet the pass/subpass/chunk process forward. + * + * @param column_chunk_buffers Device buffers containing column chunk data + * @param options Parquet reader options + */ + void handle_chunking(std::vector column_chunk_buffers, + cudf::io::parquet_reader_options const& options); + + /** + * @brief Setup step for the next input read pass. + * + * A 'pass' is defined as a subset of row groups read out of the globally + * requested set of all row groups. + * + * @param column_chunk_buffers Device buffers containing column chunk data + * @param options Parquet reader options + */ + void setup_next_pass(std::vector column_chunk_buffers, + cudf::io::parquet_reader_options const& options); + + /** + * @brief Setup step for the next decompression subpass. + * + * A 'subpass' is defined as a subset of pages within a pass that are + * decompressed and decoded as a batch. 
Subpasses may be further subdivided + * into output chunks. + * + * @param options Parquet reader options + * + */ + void setup_next_subpass(cudf::io::parquet_reader_options const& options); + + /** + * @brief Populate the output table metadata from the parquet file metadata. + * + * @param out_metadata The output table metadata to add to + */ + void populate_metadata(table_metadata& out_metadata) const; + + /** + * @brief Setup pointers to columns chunks to be processed for this pass. + * + * Does not decompress the chunk data. + * + * @return boolean indicating if compressed chunks were found + */ + bool setup_column_chunks(); + + /** + * @brief Setup compressed column chunks data and decode page headers for the current pass. + * + * @param column_chunk_buffers Device buffers containing column chunk data + */ + void setup_compressed_data(std::vector column_chunk_buffers); + + /** + * @brief Build string dictionary indices for a pass. + */ + void build_string_dict_indices(); + + /** + * @brief For list columns, generate estimated row counts for pages in the current pass. + * + * The row counts in the pages that come out of the file only reflect the number of values in + * all of the rows in the page, not the number of rows themselves. In order to do subpass reading + * more accurately, we would like to have a more accurate guess of the real number of rows per + * page. + */ + void generate_list_column_row_count_estimates(); + + /** + * @brief Perform some preprocessing for subpass page data and also compute the split locations + * {skip_rows, num_rows} for chunked reading. + * + * There are several pieces of information we can't compute directly from row counts in + * the parquet headers when dealing with nested schemas: + * - The total sizes of all output columns at all nesting levels + * - The starting output buffer offset for each page, for each nesting level + * + * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders). 
+ * + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + */ + void preprocess_subpass_pages(size_t chunk_read_limit); + + /** + * @brief Allocate nesting information storage for all pages and set pointers to it. + * + * One large contiguous buffer of PageNestingInfo structs is allocated and + * distributed among the PageInfo structs. + * + * Note that this gets called even in the flat schema case so that we have a + * consistent place to store common information such as value counts, etc. + */ + void allocate_nesting_info(); + + /** + * @brief Allocate space for use when decoding definition/repetition levels. + * + * One large contiguous buffer of data allocated and + * distributed among the PageInfo structs. + */ + void allocate_level_decode_space(); + + /** + * @brief Reset the internal state of the reader. + */ + void reset_internal_state(); + + /** + * @brief Finalize the output table by adding empty columns for the non-selected columns in + * schema. + * + * @tparam RowMaskView View type of the row mask column + * + * @param[in] read_mode Read mode indicating if we are reading filter or payload columns + * @param[in,out] out_metadata The output table metadata + * @param[in,out] out_columns The columns for building the output table + * @param[in,out] row_mask Boolean column indicating which rows need to be read after page-pruning + * for filter columns, or after materialize step for payload columns + * @return The output table along with columns' metadata + */ + template + cudf::io::table_with_metadata finalize_output(read_mode read_mode, + table_metadata& out_metadata, + std::vector>& out_columns, + RowMaskView row_mask); + + /** + * @brief Allocate data buffers for the output columns. 
+ * + * @param skip_rows Crop all rows below skip_rows + * @param num_rows Maximum number of rows to read + */ + void allocate_columns(size_t skip_rows, size_t num_rows); + + /** + * @brief Calculate per-page offsets for string data + * + * @return Vector of total string data sizes for each column + */ + cudf::detail::host_vector calculate_page_string_offsets(); + + /** + * @brief Converts the page data and outputs to columns. + * + * @param skip_rows Minimum number of rows from start + * @param num_rows Number of rows to output + */ + void decode_page_data(size_t skip_rows, size_t num_rows); + + /** + * @brief Creates file-wide parquet chunk information. + * + * Creates information about all chunks in the file, storing it in + * the file-wide _file_itm_data structure. + */ + void create_global_chunk_info(cudf::io::parquet_reader_options const& options); + + /** + * @brief Computes all of the passes we will perform over the file. + */ + void compute_input_passes(); + + /** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. + */ + void compute_output_chunks_for_subpass(); + + /** + * @brief Check if there is more work to be done. + */ + [[nodiscard]] bool has_more_work() const + { + return _file_itm_data.num_passes() > 0 && + _file_itm_data._current_input_pass < _file_itm_data.num_passes(); + } + + /** + * @brief Read a chunk of data and return an output table. + * + * This function is called internally and expects all preprocessing steps have already been done. 
+ * + * @tparam RowMaskView View type of the row mask column + * @param[in] read_mode Read mode indicating if we are reading filter or payload columns + * @param[in,out] row_mask Boolean column indicating which rows need to be read after page-pruning + * for filter columns, or after materialize step for payload columns + * @return The output table along with columns' metadata + */ + template + cudf::io::table_with_metadata read_chunk_internal(read_mode read_mode, RowMaskView row_mask); + + /** + * @brief Check if the user has specified custom row bounds + * + * @return True if the user has specified custom row bounds + */ + [[nodiscard]] constexpr bool uses_custom_row_bounds() const { return false; } + + /** + * @brief Check if this is the first output chunk + * + * @return True if this is the first output chunk + */ + [[nodiscard]] bool is_first_output_chunk() const + { + return _file_itm_data._output_chunk_count == 0; + } + + private: + using named_to_reference_converter = cudf::io::parquet::detail::named_to_reference_converter; + using input_column_info = cudf::io::parquet::detail::input_column_info; + using inline_column_buffer = cudf::io::detail::inline_column_buffer; + using reader_column_schema = cudf::io::reader_column_schema; + using file_intermediate_data = cudf::io::parquet::detail::file_intermediate_data; + using pass_intermediate_data = cudf::io::parquet::detail::pass_intermediate_data; + + rmm::cuda_stream_view _stream; + rmm::device_async_resource_ref _mr{cudf::get_current_device_resource_ref()}; + + std::unique_ptr _metadata; + + // name to reference converter to extract AST output filter + named_to_reference_converter _expr_conv{std::nullopt, cudf::io::table_metadata{}}; + + // input columns to be processed + std::vector _input_columns; + // Buffers for generating output columns + std::vector _output_buffers; + // Buffers copied from `_output_buffers` after construction for reuse + std::vector _output_buffers_template; + // _output_buffers 
associated schema indices + std::vector _output_column_schemas; + + // _output_buffers associated metadata + std::unique_ptr _output_metadata; + + std::optional> _filter_columns_names; + + bool _strings_to_categorical = false; + + // are there usable page indexes available + bool _has_page_index = false; + + size_type _num_sources{1}; + + // timestamp_type + cudf::data_type _timestamp_type{type_id::EMPTY}; + + std::optional> _reader_column_schema; + + std::vector _page_mask; + + file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; + bool _uses_custom_row_bounds{false}; + + bool _is_filter_columns_selected{false}; + bool _is_payload_columns_selected{false}; + + std::unique_ptr _pass_itm_data; +}; + +} // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index f08ba5f8b85..b5972949b71 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -113,6 +113,7 @@ struct row_group_info { * @brief Class for parsing dataset metadata */ struct metadata : public FileMetaData { + metadata() = default; explicit metadata(datasource* source); void sanitize_schema(); }; @@ -134,6 +135,7 @@ struct surviving_row_group_metrics { }; class aggregate_reader_metadata { + protected: std::vector per_file_metadata; std::vector> keyval_maps; std::vector> schema_idx_maps; @@ -219,7 +221,7 @@ class aggregate_reader_metadata { * @param stream CUDA stream used for device memory operations and kernel launches * @param aligned_mr Aligned device memory resource to allocate bloom filter buffers * - * @return A flattened list of bloom filter bitset device buffers for each predicate column across + * @return A flattened list of bloom filter bitset device buffers for each filter column across * row group */ [[nodiscard]] std::vector read_bloom_filters( @@ -582,8 +584,6 @@ class equality_literals_collector : public 
ast::detail::expression_transformer { cudf::host_span const> operands); size_type _num_input_columns; - - private: std::vector> _literals; }; From 343176fcc8a1e53ecb185e365d6087a554723506 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 01:06:56 +0000 Subject: [PATCH 02/28] Minor style and docs fix --- cpp/include/cudf/io/experimental/hybrid_scan.hpp | 2 +- cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp index 7e33888ab63..4fb7f45d8e6 100644 --- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -238,4 +238,4 @@ class hybrid_scan_reader { }; } // namespace experimental::io::parquet -} // namespace CUDF_EXPORT cudf \ No newline at end of file +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 073c281cee7..2a2a612aa03 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -15,8 +15,8 @@ */ /** - * @file reader_impl.hpp - * @brief cuDF-IO Parquet reader class implementation header + * @file hybrid_scan_impl.hpp + * @brief cuDF-IO experimental Parquet reader class implementation header */ #pragma once From 46d7d11f07665d98e78693663626bd09f2ce469f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 01:40:38 +0000 Subject: [PATCH 03/28] docstring updates --- .../cudf/io/experimental/hybrid_scan.hpp | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp index 4fb7f45d8e6..2039cba45e0 100644 --- 
a/cpp/include/cudf/io/experimental/hybrid_scan.hpp +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -72,7 +72,10 @@ class hybrid_scan_reader { ~hybrid_scan_reader(); /** - * @brief Get the Parquet file footer metadata + * @brief Get the Parquet file footer metadata. + * + * Returns the materialized Parquet file footer metadata struct. The footer will contain the + * materialized `PageIndex` if called after `setup_page_index()`. * * @return Parquet file footer metadata */ @@ -86,7 +89,11 @@ class hybrid_scan_reader { [[nodiscard]] cudf::io::text::byte_range_info get_page_index_bytes() const; /** - * @brief Setup the PageIndex + * @brief Setup the `PageIndex` within the Parquet file metadata struct for later use + * + * Materialize the `ColumnIndex` and `OffsetIndex` structs (collectively called `PageIndex`) + * within the Parquet file metadata struct. The statistics contained in `PageIndex` can be used to + * prune data pages before decoding * * @param page_index_bytes Host span of Parquet `PageIndex` buffer bytes */ @@ -102,7 +109,7 @@ class hybrid_scan_reader { cudf::io::parquet_reader_options const& options) const; /** - * @brief Filter the row groups with statistics + * @brief Filter the input row groups with statistics * * @param row_group_indices Input row groups indices * @param options Parquet reader options @@ -115,7 +122,7 @@ class hybrid_scan_reader { rmm::cuda_stream_view stream) const; /** - * @brief Fetches byte ranges of bloom filters and dictionary pages (secondary filters) for + * @brief Get byte ranges of bloom filters and dictionary pages (secondary filters) for * further row group pruning * * @param row_group_indices Input row groups indices @@ -175,7 +182,7 @@ class hybrid_scan_reader { rmm::device_async_resource_ref mr) const; /** - * @brief Fetches byte ranges of column chunks of filter columns + * @brief Get byte ranges of column chunks of filter columns * * @param row_group_indices Input row groups indices * @param options Parquet 
reader options @@ -186,7 +193,7 @@ class hybrid_scan_reader { cudf::io::parquet_reader_options const& options) const; /** - * @brief Materializes filter columns, and updates the input row validity mask to only the rows + * @brief Materialize filter columns, and updates the input row validity mask to only the rows * that survive the row selection predicate at row level * * @param page_mask Boolean vectors indicating data pages are not pruned, one per filter column @@ -206,7 +213,7 @@ class hybrid_scan_reader { rmm::cuda_stream_view stream) const; /** - * @brief Fetches byte ranges of column chunks of payload columns + * @brief Get byte ranges of column chunks of payload columns * * @param row_group_indices Input row groups indices * @param options Parquet reader options @@ -217,7 +224,7 @@ class hybrid_scan_reader { cudf::io::parquet_reader_options const& options) const; /** - * @brief Materializes payload columns + * @brief Materialize payload columns * * @param row_group_indices Input row groups indices * @param column_chunk_buffers Device buffers containing column chunk data of payload columns From 67ac400dad9cd517c213529cb7225e64b2bcf967 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:41:06 +0000 Subject: [PATCH 04/28] Add more stubs --- cpp/CMakeLists.txt | 4 + .../experimental/hybrid_scan_chunking.cu | 79 +++++++ .../experimental/hybrid_scan_helpers.cpp | 155 ++++++++++++++ .../parquet/experimental/hybrid_scan_impl.cpp | 196 ++++++++++++++++++ .../experimental/hybrid_scan_preprocess.cu | 101 +++++++++ 5 files changed, 535 insertions(+) create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp create mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 
7f0d2afe5c7..b11b6faf7b1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -522,6 +522,10 @@ add_library( src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu src/io/parquet/experimental/hybrid_scan.cpp + src/io/parquet/experimental/hybrid_scan_chunking.cu + src/io/parquet/experimental/hybrid_scan_helpers.cpp + src/io/parquet/experimental/hybrid_scan_impl.cpp + src/io/parquet/experimental/hybrid_scan_preprocess.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu new file mode 100644 index 00000000000..42d69f44f99 --- /dev/null +++ b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hybrid_scan_helpers.hpp" +#include "hybrid_scan_impl.hpp" +#include "io/comp/gpuinflate.hpp" +#include "io/comp/io_uncomp.hpp" +#include "io/comp/nvcomp_adapter.hpp" +#include "io/parquet/compact_protocol_reader.hpp" +#include "io/parquet/reader_impl_chunking.hpp" +#include "io/utilities/time_utils.cuh" + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::experimental::io::parquet::detail { + +namespace nvcomp = cudf::io::detail::nvcomp; + +using compression_result = cudf::io::detail::compression_result; +using compression_status = cudf::io::detail::compression_status; +using compression_type = cudf::io::compression_type; +using ColumnChunkDesc = cudf::io::parquet::detail::ColumnChunkDesc; +using CompactProtocolReader = cudf::io::parquet::detail::CompactProtocolReader; +using Compression = cudf::io::parquet::Compression; +using level_type = cudf::io::parquet::detail::level_type; +using LogicalType = cudf::io::parquet::LogicalType; +using PageInfo = cudf::io::parquet::detail::PageInfo; +using Type = cudf::io::parquet::Type; + +void impl::create_global_chunk_info(cudf::io::parquet_reader_options const& options) {} + +void impl::compute_input_passes() {} + +void impl::compute_output_chunks_for_subpass() {} + +void impl::handle_chunking(std::vector column_chunk_buffers, + cudf::io::parquet_reader_options const& options) +{ +} + +void impl::setup_next_pass(std::vector column_chunk_buffers, + cudf::io::parquet_reader_options const& options) +{ +} + +void impl::setup_next_subpass(cudf::io::parquet_reader_options const& options) {} + +} // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp new file mode 100644 index 00000000000..93efc641f1f --- /dev/null +++ 
b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hybrid_scan_helpers.hpp" + +#include "io/parquet/compact_protocol_reader.hpp" +#include "io/parquet/reader_impl_helpers.hpp" +#include "io/utilities/row_selection.hpp" + +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf::experimental::io::parquet::detail { + +using aggregate_reader_metadata_base = cudf::io::parquet::detail::aggregate_reader_metadata; +using ColumnIndex = cudf::io::parquet::ColumnIndex; +using column_name_info = cudf::io::column_name_info; +using CompactProtocolReader = cudf::io::parquet::detail::CompactProtocolReader; +using equality_literals_collector = cudf::io::parquet::detail::equality_literals_collector; +using FieldRepetitionType = cudf::io::parquet::FieldRepetitionType; +using inline_column_buffer = cudf::io::detail::inline_column_buffer; +using input_column_info = cudf::io::parquet::detail::input_column_info; +using metadata_base = cudf::io::parquet::detail::metadata; +using OffsetIndex = cudf::io::parquet::OffsetIndex; +using row_group_info = cudf::io::parquet::detail::row_group_info; +using SchemaElement = cudf::io::parquet::SchemaElement; + +metadata::metadata(cudf::host_span footer_bytes) {} + +aggregate_reader_metadata::aggregate_reader_metadata(cudf::host_span footer_bytes, + bool use_arrow_schema, 
+ bool has_cols_from_mismatched_srcs) + : aggregate_reader_metadata_base({}, false, false) +{ +} + +cudf::io::text::byte_range_info aggregate_reader_metadata::get_page_index_bytes() const +{ + return {}; +} + +FileMetadata const& aggregate_reader_metadata::get_parquet_metadata() const +{ + return per_file_metadata.front(); +} + +void aggregate_reader_metadata::setup_page_index(cudf::host_span page_index_bytes) {} + +std::tuple, + std::vector, + std::vector> +aggregate_reader_metadata::select_payload_columns( + std::optional> const& use_names, + std::optional> const& filter_columns_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) +{ + return {}; +} + +std::tuple, + std::vector, + std::vector> +aggregate_reader_metadata::select_filter_columns( + std::optional> const& filter_columns_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) +{ + return {}; +} + +std::tuple> +aggregate_reader_metadata::select_row_groups( + host_span const> row_group_indices, + int64_t row_start, + std::optional const& row_count) +{ + return {}; +} + +std::vector> aggregate_reader_metadata::filter_row_groups_with_stats( + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const +{ + return {}; +} + +std::vector aggregate_reader_metadata::get_bloom_filter_bytes( + cudf::host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter) +{ + return {}; +} + +std::vector aggregate_reader_metadata::get_dictionary_page_bytes( + cudf::host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter) +{ + return {}; +} + +std::vector> +aggregate_reader_metadata::filter_row_groups_with_dictionary_pages( + std::vector& dictionary_page_data, + host_span const> row_group_indices, + host_span output_dtypes, + 
host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const +{ + return {}; +} + +std::vector> +aggregate_reader_metadata::filter_row_groups_with_bloom_filters( + std::vector& bloom_filter_data, + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const +{ + return {}; +} + +} // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp new file mode 100644 index 00000000000..559a9b0daa4 --- /dev/null +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hybrid_scan_impl.hpp" + +#include "cudf/io/text/byte_range_info.hpp" +#include "hybrid_scan_helpers.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cudf::experimental::io::parquet::detail { + +using byte_range_info = cudf::io::text::byte_range_info; +using ColumnChunkDesc = cudf::io::parquet::detail::ColumnChunkDesc; +using decode_kernel_mask = cudf::io::parquet::detail::decode_kernel_mask; +using FileMetaData = cudf::io::parquet::FileMetaData; +using LogicalType = cudf::io::parquet::LogicalType; +using PageInfo = cudf::io::parquet::detail::PageInfo; +using PageNestingDecodeInfo = cudf::io::parquet::detail::PageNestingDecodeInfo; +using Type = cudf::io::parquet::Type; + +void impl::decode_page_data(size_t skip_rows, size_t num_rows) {} + +impl::impl(cudf::host_span footer_bytes, + cudf::io::parquet_reader_options const& options) +{ + // Open and parse the source dataset metadata + _metadata = std::make_unique( + footer_bytes, + options.is_enabled_use_arrow_schema(), + options.get_columns().has_value() and options.is_enabled_allow_mismatched_pq_schemas()); +} + +FileMetaData const& impl::get_parquet_metadata() const { return _metadata->get_parquet_metadata(); } + +cudf::io::text::byte_range_info impl::get_page_index_bytes() const { return {}; } + +void impl::setup_page_index(cudf::host_span page_index_bytes) const {} + +void impl::select_columns(read_mode read_mode, cudf::io::parquet_reader_options const& options) {} + +void impl::reset_internal_state() {} + +std::vector impl::get_all_row_groups( + cudf::io::parquet_reader_options const& options) const +{ + return {}; +} + +std::vector> impl::filter_row_groups_with_stats( + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) +{ + return {}; +} + +std::pair, std::vector> impl::get_secondary_filters( + cudf::host_span const> 
row_group_indices, + cudf::io::parquet_reader_options const& options) +{ + return {}; +} + +std::vector> impl::filter_row_groups_with_dictionary_pages( + std::vector& dictionary_page_data, + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) +{ + return {}; +} + +std::vector> impl::filter_row_groups_with_bloom_filters( + std::vector& bloom_filter_data, + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) +{ + return {}; +} + +std::pair, std::vector>> +impl::filter_data_pages_with_stats(cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return {}; +} + +std::pair, std::vector> +impl::get_input_column_chunk_byte_ranges( + cudf::host_span const> row_group_indices) const +{ + return {}; +} + +std::pair, std::vector> +impl::get_filter_column_chunk_byte_ranges( + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options) +{ + return {}; +} + +std::pair, std::vector> +impl::get_payload_column_chunk_byte_ranges( + cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options) +{ + return {}; +} + +cudf::io::table_with_metadata impl::materialize_filter_columns( + cudf::host_span const> data_page_mask, + cudf::host_span const> row_group_indices, + std::vector column_chunk_buffers, + cudf::mutable_column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) +{ + return {}; +} + +cudf::io::table_with_metadata impl::materialize_payload_columns( + cudf::host_span const> row_group_indices, + std::vector column_chunk_buffers, + cudf::column_view row_mask, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) +{ + return {}; +} + +void impl::initialize_options(cudf::host_span const> 
row_group_indices, + cudf::io::parquet_reader_options const& options, + rmm::cuda_stream_view stream) +{ +} + +template +cudf::io::table_with_metadata impl::read_chunk_internal(read_mode read_mode, RowMaskView row_mask) +{ + return {}; +} + +template +cudf::io::table_with_metadata impl::finalize_output( + read_mode read_mode, + table_metadata& out_metadata, + std::vector>& out_columns, + RowMaskView row_mask) +{ + return {}; +} + +void impl::populate_metadata(table_metadata& out_metadata) const {} + +void impl::prepare_data(cudf::host_span const> row_group_indices, + std::vector column_chunk_buffers, + cudf::io::parquet_reader_options const& options) +{ +} + +void impl::update_output_nullmasks_for_pruned_pages(cudf::host_span page_mask) {} + +void impl::set_page_mask(cudf::host_span const> data_page_mask) {} + +} // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu new file mode 100644 index 00000000000..3f902847094 --- /dev/null +++ b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hybrid_scan_helpers.hpp" +#include "hybrid_scan_impl.hpp" +#include "io/parquet/parquet_gpu.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::experimental::io::parquet::detail { + +using chunk_page_info = cudf::io::parquet::detail::chunk_page_info; +using ColumnChunkDesc = cudf::io::parquet::detail::ColumnChunkDesc; +using Compression = cudf::io::parquet::Compression; +using decode_error = cudf::io::parquet::detail::decode_error; +using Encoding = cudf::io::parquet::Encoding; +using kernel_error = cudf::io::parquet::kernel_error; +using level_type = cudf::io::parquet::detail::level_type; +using LogicalType = cudf::io::parquet::LogicalType; +using PageInfo = cudf::io::parquet::detail::PageInfo; +using PageNestingDecodeInfo = cudf::io::parquet::detail::PageNestingDecodeInfo; +using PageNestingInfo = cudf::io::parquet::detail::PageNestingInfo; +using pass_intermediate_data = cudf::io::parquet::detail::pass_intermediate_data; +using SchemaElement = cudf::io::parquet::SchemaElement; +using string_index_pair = cudf::io::parquet::detail::string_index_pair; +using Type = cudf::io::parquet::Type; + +void impl::prepare_row_groups(cudf::host_span const> row_group_indices, + cudf::io::parquet_reader_options const& options) +{ +} + +void impl::allocate_level_decode_space() {} + +void impl::build_string_dict_indices() {} + +bool impl::setup_column_chunks() { return {}; } + +void impl::preprocess_subpass_pages(size_t chunk_read_limit) {} + +cudf::detail::host_vector impl::calculate_page_string_offsets() +{ + return cudf::detail::make_host_vector(0, _stream); +} + +void impl::update_row_mask(cudf::column_view in_row_mask, + 
cudf::mutable_column_view out_row_mask, + rmm::cuda_stream_view stream) +{ +} + +} // namespace cudf::experimental::io::parquet::detail From 173ed0228394931c50fb468757609850b47bdbd1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:51:28 +0000 Subject: [PATCH 05/28] Remove unneeded definitions --- cpp/CMakeLists.txt | 2 - .../experimental/hybrid_scan_chunking.cu | 79 -------------- .../experimental/hybrid_scan_helpers.cpp | 88 --------------- .../parquet/experimental/hybrid_scan_impl.cpp | 40 ------- .../experimental/hybrid_scan_preprocess.cu | 101 ------------------ 5 files changed, 310 deletions(-) delete mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu delete mode 100644 cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b11b6faf7b1..b9c4d96fac5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -522,10 +522,8 @@ add_library( src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu src/io/parquet/experimental/hybrid_scan.cpp - src/io/parquet/experimental/hybrid_scan_chunking.cu src/io/parquet/experimental/hybrid_scan_helpers.cpp src/io/parquet/experimental/hybrid_scan_impl.cpp - src/io/parquet/experimental/hybrid_scan_preprocess.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu deleted file mode 100644 index 42d69f44f99..00000000000 --- a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "hybrid_scan_helpers.hpp" -#include "hybrid_scan_impl.hpp" -#include "io/comp/gpuinflate.hpp" -#include "io/comp/io_uncomp.hpp" -#include "io/comp/nvcomp_adapter.hpp" -#include "io/parquet/compact_protocol_reader.hpp" -#include "io/parquet/reader_impl_chunking.hpp" -#include "io/utilities/time_utils.cuh" - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace cudf::experimental::io::parquet::detail { - -namespace nvcomp = cudf::io::detail::nvcomp; - -using compression_result = cudf::io::detail::compression_result; -using compression_status = cudf::io::detail::compression_status; -using compression_type = cudf::io::compression_type; -using ColumnChunkDesc = cudf::io::parquet::detail::ColumnChunkDesc; -using CompactProtocolReader = cudf::io::parquet::detail::CompactProtocolReader; -using Compression = cudf::io::parquet::Compression; -using level_type = cudf::io::parquet::detail::level_type; -using LogicalType = cudf::io::parquet::LogicalType; -using PageInfo = cudf::io::parquet::detail::PageInfo; -using Type = cudf::io::parquet::Type; - -void impl::create_global_chunk_info(cudf::io::parquet_reader_options const& options) {} - -void impl::compute_input_passes() {} - -void impl::compute_output_chunks_for_subpass() {} - -void impl::handle_chunking(std::vector column_chunk_buffers, - cudf::io::parquet_reader_options const& options) -{ -} - -void impl::setup_next_pass(std::vector column_chunk_buffers, - 
cudf::io::parquet_reader_options const& options) -{ -} - -void impl::setup_next_subpass(cudf::io::parquet_reader_options const& options) {} - -} // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 93efc641f1f..0f6e5edce7e 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -64,92 +64,4 @@ FileMetadata const& aggregate_reader_metadata::get_parquet_metadata() const return per_file_metadata.front(); } -void aggregate_reader_metadata::setup_page_index(cudf::host_span page_index_bytes) {} - -std::tuple, - std::vector, - std::vector> -aggregate_reader_metadata::select_payload_columns( - std::optional> const& use_names, - std::optional> const& filter_columns_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) -{ - return {}; -} - -std::tuple, - std::vector, - std::vector> -aggregate_reader_metadata::select_filter_columns( - std::optional> const& filter_columns_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) -{ - return {}; -} - -std::tuple> -aggregate_reader_metadata::select_row_groups( - host_span const> row_group_indices, - int64_t row_start, - std::optional const& row_count) -{ - return {}; -} - -std::vector> aggregate_reader_metadata::filter_row_groups_with_stats( - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const -{ - return {}; -} - -std::vector aggregate_reader_metadata::get_bloom_filter_bytes( - cudf::host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter) -{ - return {}; -} - -std::vector aggregate_reader_metadata::get_dictionary_page_bytes( - cudf::host_span const> row_group_indices, - host_span 
output_dtypes, - host_span output_column_schemas, - std::optional> filter) -{ - return {}; -} - -std::vector> -aggregate_reader_metadata::filter_row_groups_with_dictionary_pages( - std::vector& dictionary_page_data, - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const -{ - return {}; -} - -std::vector> -aggregate_reader_metadata::filter_row_groups_with_bloom_filters( - std::vector& bloom_filter_data, - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const -{ - return {}; -} - } // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 559a9b0daa4..e30890a2d79 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -45,8 +45,6 @@ using PageInfo = cudf::io::parquet::detail::PageInfo; using PageNestingDecodeInfo = cudf::io::parquet::detail::PageNestingDecodeInfo; using Type = cudf::io::parquet::Type; -void impl::decode_page_data(size_t skip_rows, size_t num_rows) {} - impl::impl(cudf::host_span footer_bytes, cudf::io::parquet_reader_options const& options) { @@ -63,10 +61,6 @@ cudf::io::text::byte_range_info impl::get_page_index_bytes() const { return {}; void impl::setup_page_index(cudf::host_span page_index_bytes) const {} -void impl::select_columns(read_mode read_mode, cudf::io::parquet_reader_options const& options) {} - -void impl::reset_internal_state() {} - std::vector impl::get_all_row_groups( cudf::io::parquet_reader_options const& options) const { @@ -159,38 +153,4 @@ cudf::io::table_with_metadata impl::materialize_payload_columns( return {}; } -void impl::initialize_options(cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options 
const& options, - rmm::cuda_stream_view stream) -{ -} - -template -cudf::io::table_with_metadata impl::read_chunk_internal(read_mode read_mode, RowMaskView row_mask) -{ - return {}; -} - -template -cudf::io::table_with_metadata impl::finalize_output( - read_mode read_mode, - table_metadata& out_metadata, - std::vector>& out_columns, - RowMaskView row_mask) -{ - return {}; -} - -void impl::populate_metadata(table_metadata& out_metadata) const {} - -void impl::prepare_data(cudf::host_span const> row_group_indices, - std::vector column_chunk_buffers, - cudf::io::parquet_reader_options const& options) -{ -} - -void impl::update_output_nullmasks_for_pruned_pages(cudf::host_span page_mask) {} - -void impl::set_page_mask(cudf::host_span const> data_page_mask) {} - } // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu deleted file mode 100644 index 3f902847094..00000000000 --- a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "hybrid_scan_helpers.hpp" -#include "hybrid_scan_impl.hpp" -#include "io/parquet/parquet_gpu.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace cudf::experimental::io::parquet::detail { - -using chunk_page_info = cudf::io::parquet::detail::chunk_page_info; -using ColumnChunkDesc = cudf::io::parquet::detail::ColumnChunkDesc; -using Compression = cudf::io::parquet::Compression; -using decode_error = cudf::io::parquet::detail::decode_error; -using Encoding = cudf::io::parquet::Encoding; -using kernel_error = cudf::io::parquet::kernel_error; -using level_type = cudf::io::parquet::detail::level_type; -using LogicalType = cudf::io::parquet::LogicalType; -using PageInfo = cudf::io::parquet::detail::PageInfo; -using PageNestingDecodeInfo = cudf::io::parquet::detail::PageNestingDecodeInfo; -using PageNestingInfo = cudf::io::parquet::detail::PageNestingInfo; -using pass_intermediate_data = cudf::io::parquet::detail::pass_intermediate_data; -using SchemaElement = cudf::io::parquet::SchemaElement; -using string_index_pair = cudf::io::parquet::detail::string_index_pair; -using Type = cudf::io::parquet::Type; - -void impl::prepare_row_groups(cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options) -{ -} - -void impl::allocate_level_decode_space() {} - -void impl::build_string_dict_indices() {} - -bool impl::setup_column_chunks() { return {}; } - -void impl::preprocess_subpass_pages(size_t chunk_read_limit) {} - -cudf::detail::host_vector impl::calculate_page_string_offsets() -{ - return cudf::detail::make_host_vector(0, _stream); -} - -void impl::update_row_mask(cudf::column_view in_row_mask, - 
cudf::mutable_column_view out_row_mask, - rmm::cuda_stream_view stream) -{ -} - -} // namespace cudf::experimental::io::parquet::detail From 52cca2fad84f87cc0553d94d5226700bbc14cde9 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:54:17 +0000 Subject: [PATCH 06/28] Remove more declarations --- .../experimental/hybrid_scan_helpers.hpp | 244 ------------------ 1 file changed, 244 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 8f0c5c3863d..21344d6a958 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -118,250 +118,6 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @brief Get the Parquet file metadata */ [[nodiscard]] FileMetadata const& get_parquet_metadata() const; - - /** - * @brief Setup the PageIndex - * - * @param page_index_bytes Host span of Parquet `PageIndex` buffer bytes - */ - void setup_page_index(cudf::host_span page_index_bytes); - - /** - * @brief Filters and reduces down to the selection of filter columns - * - * @param filter_columns_names List of paths of column names that are present only in filter - * @param include_index Whether to always include the PANDAS index column(s) - * @param strings_to_categorical Type conversion parameter - * @param timestamp_type_id Type conversion parameter - * - * @return input column information, output column information, list of output column schema - * indices - */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_filter_columns(std::optional> const& filter_columns_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id); - - /** - * @brief Filters and reduces down to the selection of payload columns - * - * @param column_names List of paths of column names that are present only in 
payload and filter - * @param filter_columns_names List of paths of column names that are present only in filter - * @param include_index Whether to always include the PANDAS index column(s) - * @param strings_to_categorical Type conversion parameter - * @param timestamp_type_id Type conversion parameter - * - * @return input column information, output column information, list of output column schema - * indices - */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_payload_columns(std::optional> const& column_names, - std::optional> const& filter_columns_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id); - - /** - * @brief Filters and reduces down to a selection of row groups - * - * The input `row_start` and `row_count` parameters will be recomputed and output as the valid - * values based on the input row group list. - * - * @param row_group_indices Lists of row groups to read, one per source - * @param row_start Starting row of the selection - * @param row_count Total number of rows selected - * - * @return A tuple of corrected row_start, row_count, list of row group indexes and its - * starting row, list of number of rows per source, number of input row groups, and a - * struct containing the number of row groups surviving each predicate pushdown filter - */ - [[nodiscard]] std::tuple> select_row_groups( - host_span const> row_group_indices, - int64_t row_start, - std::optional const& row_count); - - /** - * @brief Filter the row groups with statistics based on predicate filter - * - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param filter Optional AST expression to filter row groups based on Column chunk statistics - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Filtered row group indices, if any are filtered - */ - 
[[nodiscard]] std::vector> filter_row_groups_with_stats( - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const; - - /** - * @brief Get the bloom filter byte ranges, one per input column chunk - * - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param filter Optional AST expression to filter row groups based on bloom filters - * - * @return Byte ranges of bloom filters, one per input column chunk - */ - [[nodiscard]] std::vector get_bloom_filter_bytes( - cudf::host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter); - - /** - * @brief Get the dictionary page byte ranges, one per input column chunk - * - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param filter Optional AST expression to filter row groups based on dictionary pages - * - * @return Byte ranges of dictionary pages, one per input column chunk - */ - [[nodiscard]] std::vector get_dictionary_page_bytes( - cudf::host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter); - - /** - * @brief Filter the row groups using dictionaries based on predicate filter - * - * @param dictionary_page_data Device buffers of dictionary pages, one per input column chunk - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param filter Optional AST expression to filter row groups based on dictionary pages - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Filtered 
row group indices, if any are filtered - */ - [[nodiscard]] std::vector> filter_row_groups_with_dictionary_pages( - std::vector& dictionary_page_data, - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const; - - /** - * @brief Filter the row groups using bloom filters based on predicate filter - * - * @param bloom_filter_data Device buffers of bloom filters, one per input column chunk - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param filter Optional AST expression to filter row groups based on bloom filters - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Filtered row group indices, if any are filtered - */ - [[nodiscard]] std::vector> filter_row_groups_with_bloom_filters( - std::vector& bloom_filter_data, - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const; - - /** - * @brief Filter data pages using statistics page-level statistics based on predicate filter - * - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param filter Optional AST expression to filter data pages based on `PageIndex` statistics - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @return A boolean column representing a mask of rows surviving the predicate filter at - * page-level - */ - [[nodiscard]] std::unique_ptr filter_data_pages_with_stats( - cudf::host_span const> row_group_indices, - host_span output_dtypes, - host_span 
output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const; - - /** - * @brief Computes which data pages need decoding to construct input columns based on the row mask - * - * Compute a vector of boolean vectors indicating which data pages need to be decoded to - * construct each input column based on the row mask, one vector per column - * - * @param row_mask Boolean column indicating which rows need to be read after page-pruning - * @param row_group_indices Input row groups indices - * @param output_dtypes Datatypes of output columns - * @param output_column_schemas schema indices of output columns - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return A vector of boolean vectors indicating which data pages need to be decoded to produce - * the output table based on the input row mask, one per input column - */ - [[nodiscard]] std::vector> compute_data_page_mask( - cudf::column_view row_mask, - cudf::host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - rmm::cuda_stream_view stream) const; -}; - -/** - * @brief Collects lists of equal and not-equal predicate literals in the AST expression, one list - * per input table column. This is used in row group filtering based on dictionary pages. 
- */ -class dictionary_literals_and_operators_collector : public equality_literals_collector { - public: - dictionary_literals_and_operators_collector(); - - dictionary_literals_and_operators_collector(ast::expression const& expr, - cudf::size_type num_input_columns); - - using equality_literals_collector::visit; - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) - */ - std::reference_wrapper visit(ast::column_reference const& expr) override; - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) - */ - std::reference_wrapper visit( - ast::column_name_reference const& expr) override; - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) - */ - std::reference_wrapper visit(ast::operation const& expr) override; - - /** - * @brief Returns the vectors of dictionary page filter literals in the AST expression, one per - * input table column - */ - [[nodiscard]] std::vector> get_literals() = delete; - - /** - * @brief Returns a pair of two vectors containing dictionary filter literals and operators - * in the AST expression respectively, one per input table column - */ - [[nodiscard]] std::pair>, - std::vector>> - get_literals_and_operators() &&; - - private: - std::vector> _operators; }; } // namespace cudf::experimental::io::parquet::detail From 6be4a4bca0d7f36a5d0054ea101c479c52bef0b4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Apr 2025 23:33:07 +0000 Subject: [PATCH 07/28] Add `setup_page_index` to `hybrid_scan_helpers` --- cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 2 ++ cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 0f6e5edce7e..cd4433e2fba 100644 --- 
a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -64,4 +64,6 @@ FileMetadata const& aggregate_reader_metadata::get_parquet_metadata() const return per_file_metadata.front(); } +void aggregate_reader_metadata::setup_page_index(cudf::host_span page_index_bytes) {} + } // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 21344d6a958..2cf0b2dea5a 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -118,6 +118,13 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @brief Get the Parquet file metadata */ [[nodiscard]] FileMetadata const& get_parquet_metadata() const; + + /** + * @brief Setup the PageIndex + * + * @param page_index_bytes Host span of Parquet `PageIndex` buffer bytes + */ + void setup_page_index(cudf::host_span page_index_bytes); }; } // namespace cudf::experimental::io::parquet::detail From 952baec5621551cff67de933d205c15ee5830641 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 11 Apr 2025 00:13:57 +0000 Subject: [PATCH 08/28] Add hybrid scan reader's metadata API implementations --- .../experimental/hybrid_scan_helpers.cpp | 81 ++++++- .../parquet/experimental/hybrid_scan_impl.cpp | 15 +- cpp/tests/CMakeLists.txt | 1 + .../io/parquet_experimental_reader_test.cpp | 215 ++++++++++++++++++ 4 files changed, 307 insertions(+), 5 deletions(-) create mode 100644 cpp/tests/io/parquet_experimental_reader_test.cpp diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index cd4433e2fba..a91f3c4c04a 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -45,17 +45,55 @@ using OffsetIndex = 
cudf::io::parquet::OffsetIndex; using row_group_info = cudf::io::parquet::detail::row_group_info; using SchemaElement = cudf::io::parquet::SchemaElement; -metadata::metadata(cudf::host_span footer_bytes) {} +metadata::metadata(cudf::host_span footer_bytes) +{ + CompactProtocolReader cp(footer_bytes.data(), footer_bytes.size()); + cp.read(this); + CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); + sanitize_schema(); +} aggregate_reader_metadata::aggregate_reader_metadata(cudf::host_span footer_bytes, bool use_arrow_schema, bool has_cols_from_mismatched_srcs) : aggregate_reader_metadata_base({}, false, false) { + // Re-initialize internal variables here as base class was initialized without a source + per_file_metadata = std::vector{metadata{footer_bytes}.get_file_metadata()}; + keyval_maps = collect_keyval_metadata(); + schema_idx_maps = init_schema_idx_maps(has_cols_from_mismatched_srcs); + num_rows = calc_num_rows(); + num_row_groups = calc_num_row_groups(); + + // Force all columns to be nullable + auto& schema = per_file_metadata.front().schema; + std::for_each(schema.begin(), schema.end(), [](auto& col) { + col.repetition_type = FieldRepetitionType::OPTIONAL; + }); + + // Collect and apply arrow:schema from Parquet's key value metadata section + if (use_arrow_schema) { + apply_arrow_schema(); + + // Erase ARROW_SCHEMA_KEY from the output pfm if exists + std::for_each(keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { + pfm.erase(cudf::io::parquet::detail::ARROW_SCHEMA_KEY); + }); + } } cudf::io::text::byte_range_info aggregate_reader_metadata::get_page_index_bytes() const { + auto& schema = per_file_metadata.front(); + auto& row_groups = schema.row_groups; + + if (row_groups.size() and row_groups.front().columns.size()) { + int64_t const min_offset = schema.row_groups.front().columns.front().column_index_offset; + auto const& last_col = schema.row_groups.back().columns.back(); + int64_t const max_offset = last_col.offset_index_offset + 
last_col.offset_index_length; + return {min_offset, (max_offset - min_offset)}; + } + return {}; } @@ -64,6 +102,45 @@ FileMetadata const& aggregate_reader_metadata::get_parquet_metadata() const return per_file_metadata.front(); } -void aggregate_reader_metadata::setup_page_index(cudf::host_span page_index_bytes) {} +void aggregate_reader_metadata::setup_page_index(cudf::host_span page_index_bytes) +{ + // Return early if empty page index buffer span + if (not page_index_bytes.size()) { + CUDF_LOG_WARN("Hybrid scan reader encountered empty `PageIndex` buffer"); + return; + } + + auto& schema = per_file_metadata.front(); + auto& row_groups = schema.row_groups; + + CUDF_EXPECTS(row_groups.size() and row_groups.front().columns.size(), + "No column chunks in Parquet schema to read PageIndex for"); + + CompactProtocolReader cp(page_index_bytes.data(), page_index_bytes.size()); + + // Set the first ColumnChunk's offset of ColumnIndex as the adjusted zero offset + int64_t const min_offset = row_groups.front().columns.front().column_index_offset; + // now loop over row groups + for (auto& rg : row_groups) { + for (auto& col : rg.columns) { + // Read the ColumnIndex for this ColumnChunk + if (col.column_index_length > 0 && col.column_index_offset > 0) { + int64_t const offset = col.column_index_offset - min_offset; + cp.init(page_index_bytes.data() + offset, col.column_index_length); + ColumnIndex ci; + cp.read(&ci); + col.column_index = std::move(ci); + } + // Read the OffsetIndex for this ColumnChunk + if (col.offset_index_length > 0 && col.offset_index_offset > 0) { + int64_t const offset = col.offset_index_offset - min_offset; + cp.init(page_index_bytes.data() + offset, col.offset_index_length); + OffsetIndex oi; + cp.read(&oi); + col.offset_index = std::move(oi); + } + } + } +} } // namespace cudf::experimental::io::parquet::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 
e30890a2d79..0f37bcd22bd 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -57,14 +57,23 @@ impl::impl(cudf::host_span footer_bytes, FileMetaData const& impl::get_parquet_metadata() const { return _metadata->get_parquet_metadata(); } -cudf::io::text::byte_range_info impl::get_page_index_bytes() const { return {}; } +cudf::io::text::byte_range_info impl::get_page_index_bytes() const +{ + return _metadata->get_page_index_bytes(); +} -void impl::setup_page_index(cudf::host_span page_index_bytes) const {} +void impl::setup_page_index(cudf::host_span page_index_bytes) const +{ + _metadata->setup_page_index(page_index_bytes); +} std::vector impl::get_all_row_groups( cudf::io::parquet_reader_options const& options) const { - return {}; + auto const num_row_groups = _metadata->get_num_row_groups(); + auto row_groups_indices = std::vector(num_row_groups); + std::iota(row_groups_indices.begin(), row_groups_indices.end(), size_type{0}); + return row_groups_indices; } std::vector> impl::filter_row_groups_with_stats( diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cea62e5360e..ea47ae748de 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,6 +313,7 @@ ConfigureTest( io/parquet_bloom_filter_test.cu io/parquet_chunked_reader_test.cu io/parquet_chunked_writer_test.cpp + io/parquet_experimental_reader_test.cpp io/parquet_common.cpp io/parquet_misc_test.cpp io/parquet_reader_test.cpp diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp new file mode 100644 index 00000000000..9c04356acb1 --- /dev/null +++ b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tests/io/parquet_common.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// Base test fixture for tests +struct ParquetExperimentalReaderTest : public cudf::test::BaseFixture {}; + +namespace { + +/** + * @brief Fetches a host span of Parquet footer bytes from the input buffer span + * + * @param buffer Input buffer span + * @return A host span of the footer bytes + */ +cudf::host_span fetch_footer_bytes(cudf::host_span buffer) +{ + using namespace cudf::io::parquet; + + constexpr auto header_len = sizeof(file_header_s); + constexpr auto ender_len = sizeof(file_ender_s); + size_t const len = buffer.size(); + + auto const header_buffer = cudf::host_span(buffer.data(), header_len); + auto const header = reinterpret_cast(header_buffer.data()); + auto const ender_buffer = + cudf::host_span(buffer.data() + len - ender_len, ender_len); + auto const ender = reinterpret_cast(ender_buffer.data()); + CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); + CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, + "Corrupted header or footer"); + CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), + "Incorrect footer length"); + + return cudf::host_span(buffer.data() + len - ender->footer_len - ender_len, + ender->footer_len); +} + 
+/**
+ * @brief Fetches a host span of Parquet PageIndex bytes from the input buffer span
+ *
+ * @param buffer Input buffer span
+ * @param page_index_bytes Byte range of `PageIndex` to fetch
+ * @return A host span of the PageIndex bytes
+ */
+cudf::host_span fetch_page_index_bytes(
+  cudf::host_span buffer, cudf::io::text::byte_range_info const page_index_bytes)
+{
+  return cudf::host_span(
+    reinterpret_cast(buffer.data()) + page_index_bytes.offset(),
+    page_index_bytes.size());
+}
+
+/**
+ * @brief Creates a table and writes it to a Parquet host buffer with column level statistics
+ *
+ * This function creates a table with three columns:
+ * - col_uint32: ascending uint32_t values
+ * - col_int64: descending int64_t values
+ * - col_str: ascending string values
+ *
+ * The function creates a table by concatenating the same set of columns NumTableConcats times.
+ * It then writes this table to a Parquet host buffer with column level statistics.
+ *
+ * @tparam NumTableConcats Number of times to concatenate the base table (must be >= 1)
+ * @return Pair of table and Parquet host buffer
+ */
+template 
+auto create_parquet_with_stats()
+{
+  static_assert(NumTableConcats >= 1, "Concatenated table must contain at least one table");
+
+  auto col0 = testdata::ascending();
+  auto col1 = testdata::descending();
+  auto col2 = testdata::ascending();
+
+  auto expected = table_view{{col0, col1, col2}};
+  auto table = cudf::concatenate(std::vector(NumTableConcats, expected));
+  expected = table->view();
+
+  cudf::io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("col_uint32");
+  expected_metadata.column_metadata[1].set_name("col_int64");
+  expected_metadata.column_metadata[2].set_name("col_str");
+
+  std::vector buffer;
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected)
+      .metadata(std::move(expected_metadata))
+      .row_group_size_rows(5000)
+      
.max_page_size_rows(1000) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN); + + if constexpr (NumTableConcats > 1) { + out_opts.set_row_group_size_rows(20000); + out_opts.set_max_page_size_rows(5000); + } + + cudf::io::write_parquet(out_opts); + + auto columns = std::vector>{}; + if constexpr (NumTableConcats == 1) { + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + } else { + columns = table->release(); + } + return std::pair{cudf::table{std::move(columns)}, buffer}; +} + +} // namespace + +TEST_F(ParquetExperimentalReaderTest, TestMetadata) +{ + srand(31337); + + // Create a table with several row groups each with a single page. + auto constexpr num_concat = 1; + auto [written_table, buffer] = create_parquet_with_stats(); + + // Filtering AST - table[0] < 100 + auto literal_value = cudf::numeric_scalar(100); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_0 = cudf::ast::column_name_reference("col_uint32"); + auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + + // Create reader options with empty source info + cudf::io::parquet_reader_options options = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(nullptr, 0)) + .filter(filter_expression); + + // Input file buffer span + auto const file_buffer_span = + cudf::host_span(reinterpret_cast(buffer.data()), buffer.size()); + + // Fetch footer and page index bytes from the buffer. 
+ auto const footer_buffer = fetch_footer_bytes(file_buffer_span); + + // Create hybrid scan reader with footer bytes + auto const reader = + std::make_unique(footer_buffer, options); + + // Get Parquet file metadata from the reader - API # 1 + auto parquet_metadata = reader->get_parquet_metadata(); + + // Check that the offset and column indices are not present + ASSERT_TRUE(not parquet_metadata.row_groups[0].columns[0].offset_index.has_value()); + ASSERT_TRUE(not parquet_metadata.row_groups[0].columns[0].column_index.has_value()); + + // Get page index byte range from the reader - API # 2 + auto const page_index_byte_range = reader->get_page_index_bytes(); + + // Fetch page index bytes from the input buffer + auto const page_index_buffer = fetch_page_index_bytes(file_buffer_span, page_index_byte_range); + + // Setup page index - API # 3 + reader->setup_page_index(page_index_buffer); + + // Get Parquet file metadata from the reader again + parquet_metadata = reader->get_parquet_metadata(); + + // Check that the offset and column indices are now present + ASSERT_TRUE(parquet_metadata.row_groups[0].columns[0].offset_index.has_value()); + ASSERT_TRUE(parquet_metadata.row_groups[0].columns[0].column_index.has_value()); + + // Get all row groups from the reader - API # 4 + auto input_row_group_indices = reader->get_all_row_groups(options); + // Expect 4 = 20000 rows / 5000 rows per row group + ASSERT_EQ(input_row_group_indices.size(), 4); + + // Explicitly set the row groups to read + options.set_row_groups({{0, 1}}); + + // Get all row groups from the reader again + input_row_group_indices = reader->get_all_row_groups(options); + // Expect only 2 row groups now + ASSERT_EQ(reader->get_all_row_groups(options).size(), 2); +} From 133ddd04ed85893a6134a8947905da0fc6c2d0bb Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Apr 2025 17:59:33 +0000 Subject: [PATCH 09/28] Suggestions from code reviews --- 
.../cudf/io/experimental/hybrid_scan.hpp | 4 +-- .../io/parquet/experimental/hybrid_scan.cpp | 9 +++-- .../experimental/hybrid_scan_helpers.cpp | 26 ++++++-------- .../experimental/hybrid_scan_helpers.hpp | 18 +++++----- .../parquet/experimental/hybrid_scan_impl.cpp | 30 ++++++++-------- .../parquet/experimental/hybrid_scan_impl.hpp | 34 ++++++------------- .../io/parquet_experimental_reader_test.cpp | 4 +-- 7 files changed, 51 insertions(+), 74 deletions(-) diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp index 2039cba45e0..8198667dfa7 100644 --- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -32,7 +32,7 @@ #include namespace CUDF_EXPORT cudf { -namespace experimental::io::parquet { +namespace io::parquet::experimental { namespace detail { @@ -244,5 +244,5 @@ class hybrid_scan_reader { std::unique_ptr _impl; }; -} // namespace experimental::io::parquet +} // namespace io::parquet::experimental } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp index fce6b29ba97..e995fc22ffd 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp @@ -19,7 +19,7 @@ #include #include -namespace cudf::experimental::io::parquet { +namespace cudf::io::parquet::experimental { hybrid_scan_reader::hybrid_scan_reader(cudf::host_span footer_bytes, cudf::io::parquet_reader_options const& options) @@ -29,13 +29,12 @@ hybrid_scan_reader::hybrid_scan_reader(cudf::host_span footer_byt hybrid_scan_reader::~hybrid_scan_reader() = default; -[[nodiscard]] cudf::io::text::byte_range_info hybrid_scan_reader::get_page_index_bytes() const +[[nodiscard]] text::byte_range_info hybrid_scan_reader::get_page_index_bytes() const { return _impl->get_page_index_bytes(); } -[[nodiscard]] cudf::io::parquet::FileMetaData const& 
hybrid_scan_reader::get_parquet_metadata() - const +[[nodiscard]] FileMetaData const& hybrid_scan_reader::get_parquet_metadata() const { return _impl->get_parquet_metadata(); } @@ -190,4 +189,4 @@ cudf::io::table_with_metadata hybrid_scan_reader::materialize_payload_columns( input_row_group_indices, std::move(column_chunk_buffers), row_mask, options, stream); } -} // namespace cudf::experimental::io::parquet +} // namespace cudf::io::parquet::experimental diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index a91f3c4c04a..6b8db2c9809 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -30,20 +30,14 @@ #include #include -namespace cudf::experimental::io::parquet::detail { - -using aggregate_reader_metadata_base = cudf::io::parquet::detail::aggregate_reader_metadata; -using ColumnIndex = cudf::io::parquet::ColumnIndex; -using column_name_info = cudf::io::column_name_info; -using CompactProtocolReader = cudf::io::parquet::detail::CompactProtocolReader; -using equality_literals_collector = cudf::io::parquet::detail::equality_literals_collector; -using FieldRepetitionType = cudf::io::parquet::FieldRepetitionType; -using inline_column_buffer = cudf::io::detail::inline_column_buffer; -using input_column_info = cudf::io::parquet::detail::input_column_info; -using metadata_base = cudf::io::parquet::detail::metadata; -using OffsetIndex = cudf::io::parquet::OffsetIndex; -using row_group_info = cudf::io::parquet::detail::row_group_info; -using SchemaElement = cudf::io::parquet::SchemaElement; +namespace cudf::io::parquet::experimental::detail { + +using aggregate_reader_metadata_base = parquet::detail::aggregate_reader_metadata; +using CompactProtocolReader = parquet::detail::CompactProtocolReader; +using equality_literals_collector = parquet::detail::equality_literals_collector; +using input_column_info = 
parquet::detail::input_column_info; +using metadata_base = parquet::detail::metadata; +using row_group_info = parquet::detail::row_group_info; metadata::metadata(cudf::host_span footer_bytes) { @@ -97,7 +91,7 @@ cudf::io::text::byte_range_info aggregate_reader_metadata::get_page_index_bytes( return {}; } -FileMetadata const& aggregate_reader_metadata::get_parquet_metadata() const +FileMetaData const& aggregate_reader_metadata::get_parquet_metadata() const { return per_file_metadata.front(); } @@ -143,4 +137,4 @@ void aggregate_reader_metadata::setup_page_index(cudf::host_span } } -} // namespace cudf::experimental::io::parquet::detail +} // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 2cf0b2dea5a..fc3f6e9978f 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -31,15 +31,13 @@ #include #include -namespace cudf::experimental::io::parquet::detail { +namespace cudf::io::parquet::experimental::detail { -using aggregate_reader_metadata_base = cudf::io::parquet::detail::aggregate_reader_metadata; -using equality_literals_collector = cudf::io::parquet::detail::equality_literals_collector; -using FileMetadata = cudf::io::parquet::FileMetaData; -using inline_column_buffer = cudf::io::detail::inline_column_buffer; -using input_column_info = cudf::io::parquet::detail::input_column_info; -using metadata_base = cudf::io::parquet::detail::metadata; -using row_group_info = cudf::io::parquet::detail::row_group_info; +using aggregate_reader_metadata_base = parquet::detail::aggregate_reader_metadata; +using equality_literals_collector = parquet::detail::equality_literals_collector; +using input_column_info = parquet::detail::input_column_info; +using metadata_base = parquet::detail::metadata; +using row_group_info = parquet::detail::row_group_info; /** * @brief Class 
for parsing dataset metadata @@ -117,7 +115,7 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { /** * @brief Get the Parquet file metadata */ - [[nodiscard]] FileMetadata const& get_parquet_metadata() const; + [[nodiscard]] FileMetaData const& get_parquet_metadata() const; /** * @brief Setup the PageIndex @@ -127,4 +125,4 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { void setup_page_index(cudf::host_span page_index_bytes); }; -} // namespace cudf::experimental::io::parquet::detail +} // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 0f37bcd22bd..5b27985baad 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -34,16 +34,13 @@ #include #include -namespace cudf::experimental::io::parquet::detail { +namespace cudf::io::parquet::experimental::detail { -using byte_range_info = cudf::io::text::byte_range_info; -using ColumnChunkDesc = cudf::io::parquet::detail::ColumnChunkDesc; -using decode_kernel_mask = cudf::io::parquet::detail::decode_kernel_mask; -using FileMetaData = cudf::io::parquet::FileMetaData; -using LogicalType = cudf::io::parquet::LogicalType; -using PageInfo = cudf::io::parquet::detail::PageInfo; -using PageNestingDecodeInfo = cudf::io::parquet::detail::PageNestingDecodeInfo; -using Type = cudf::io::parquet::Type; +using ColumnChunkDesc = parquet::detail::ColumnChunkDesc; +using decode_kernel_mask = parquet::detail::decode_kernel_mask; +using PageInfo = parquet::detail::PageInfo; +using PageNestingDecodeInfo = parquet::detail::PageNestingDecodeInfo; +using byte_range_info = text::byte_range_info; impl::impl(cudf::host_span footer_bytes, cudf::io::parquet_reader_options const& options) @@ -57,10 +54,7 @@ impl::impl(cudf::host_span footer_bytes, FileMetaData const& impl::get_parquet_metadata() const 
{ return _metadata->get_parquet_metadata(); } -cudf::io::text::byte_range_info impl::get_page_index_bytes() const -{ - return _metadata->get_page_index_bytes(); -} +byte_range_info impl::get_page_index_bytes() const { return _metadata->get_page_index_bytes(); } void impl::setup_page_index(cudf::host_span page_index_bytes) const { @@ -162,4 +156,12 @@ cudf::io::table_with_metadata impl::materialize_payload_columns( return {}; } -} // namespace cudf::experimental::io::parquet::detail +bool impl::has_more_work() const +{ + return _file_itm_data.num_passes() > 0 && + _file_itm_data._current_input_pass < _file_itm_data.num_passes(); +} + +bool impl::is_first_output_chunk() const { return _file_itm_data._output_chunk_count == 0; } + +} // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 2a2a612aa03..4d935c688dc 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -37,7 +37,7 @@ #include #include -namespace cudf::experimental::io::parquet::detail { +namespace cudf::io::parquet::experimental::detail { /** * @brief Implementation for Parquet reader @@ -434,11 +434,7 @@ class impl { /** * @brief Check if there is more work to be done. */ - [[nodiscard]] bool has_more_work() const - { - return _file_itm_data.num_passes() > 0 && - _file_itm_data._current_input_pass < _file_itm_data.num_passes(); - } + [[nodiscard]] bool has_more_work() const; /** * @brief Read a chunk of data and return an output table. 
@@ -454,30 +450,20 @@ class impl { template cudf::io::table_with_metadata read_chunk_internal(read_mode read_mode, RowMaskView row_mask); - /** - * @brief Check if the user has specified custom row bounds - * - * @return True if the user has specified custom row bounds - */ - [[nodiscard]] constexpr bool uses_custom_row_bounds() const { return false; } - /** * @brief Check if this is the first output chunk * * @return True if this is the first output chunk */ - [[nodiscard]] bool is_first_output_chunk() const - { - return _file_itm_data._output_chunk_count == 0; - } + [[nodiscard]] bool is_first_output_chunk() const; private: - using named_to_reference_converter = cudf::io::parquet::detail::named_to_reference_converter; - using input_column_info = cudf::io::parquet::detail::input_column_info; - using inline_column_buffer = cudf::io::detail::inline_column_buffer; - using reader_column_schema = cudf::io::reader_column_schema; - using file_intermediate_data = cudf::io::parquet::detail::file_intermediate_data; - using pass_intermediate_data = cudf::io::parquet::detail::pass_intermediate_data; + using named_to_reference_converter = parquet::detail::named_to_reference_converter; + using input_column_info = parquet::detail::input_column_info; + using inline_column_buffer = io::detail::inline_column_buffer; + using reader_column_schema = io::reader_column_schema; + using file_intermediate_data = parquet::detail::file_intermediate_data; + using pass_intermediate_data = parquet::detail::pass_intermediate_data; rmm::cuda_stream_view _stream; rmm::device_async_resource_ref _mr{cudf::get_current_device_resource_ref()}; @@ -525,4 +511,4 @@ class impl { std::unique_ptr _pass_itm_data; }; -} // namespace cudf::experimental::io::parquet::detail +} // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp index 9c04356acb1..5c2815cdb63 100644 --- 
a/cpp/tests/io/parquet_experimental_reader_test.cpp +++ b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -149,8 +149,6 @@ auto create_parquet_with_stats() TEST_F(ParquetExperimentalReaderTest, TestMetadata) { - srand(31337); - // Create a table with several row groups each with a single page. auto constexpr num_concat = 1; auto [written_table, buffer] = create_parquet_with_stats(); @@ -175,7 +173,7 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) // Create hybrid scan reader with footer bytes auto const reader = - std::make_unique(footer_buffer, options); + std::make_unique(footer_buffer, options); // Get Parquet file metadata from the reader - API # 1 auto parquet_metadata = reader->get_parquet_metadata(); From 32535d63c1fc22ef5691cf08de4c0d6b7221b650 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Apr 2025 18:29:07 +0000 Subject: [PATCH 10/28] style fix --- .../cudf/io/experimental/hybrid_scan.hpp | 26 ++++----- .../io/parquet/experimental/hybrid_scan.cpp | 28 +++++----- .../parquet/experimental/hybrid_scan_impl.cpp | 28 +++++----- .../parquet/experimental/hybrid_scan_impl.hpp | 53 +++++++++---------- cpp/src/io/parquet/reader_impl_helpers.hpp | 2 +- 5 files changed, 66 insertions(+), 71 deletions(-) diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp index 8198667dfa7..cee4bec36ad 100644 --- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -64,7 +64,7 @@ class hybrid_scan_reader { * @param options Parquet reader options */ explicit hybrid_scan_reader(cudf::host_span footer_bytes, - cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @brief Destructor for the experimental parquet reader class @@ -106,7 +106,7 @@ class hybrid_scan_reader { * @return Vector of row group indices */ [[nodiscard]] std::vector get_all_row_groups( - 
cudf::io::parquet_reader_options const& options) const; + parquet_reader_options const& options) const; /** * @brief Filter the input row groups with statistics @@ -118,7 +118,7 @@ class hybrid_scan_reader { */ [[nodiscard]] std::vector filter_row_groups_with_stats( cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const; /** @@ -132,7 +132,7 @@ class hybrid_scan_reader { [[nodiscard]] std::pair, std::vector> get_secondary_filters(cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options) const; + parquet_reader_options const& options) const; /** * @brief Filter the row groups with dictionary pages @@ -146,7 +146,7 @@ class hybrid_scan_reader { [[nodiscard]] std::vector filter_row_groups_with_dictionary_pages( std::vector& dictionary_page_data, cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const; /** @@ -161,7 +161,7 @@ class hybrid_scan_reader { [[nodiscard]] std::vector filter_row_groups_with_bloom_filters( std::vector& bloom_filter_data, cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const; /** @@ -177,7 +177,7 @@ class hybrid_scan_reader { */ [[nodiscard]] std::pair, std::vector>> filter_data_pages_with_stats(cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -190,7 +190,7 @@ class hybrid_scan_reader { */ [[nodiscard]] std::vector get_filter_column_chunk_byte_ranges( cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options) const; + parquet_reader_options const& options) const; /** * @brief Materialize filter columns, and updates the 
input row validity mask to only the rows @@ -204,12 +204,12 @@ class hybrid_scan_reader { * @param stream CUDA stream used for device memory operations and kernel launches * @return Table of materialized filter columns and metadata */ - [[nodiscard]] cudf::io::table_with_metadata materialize_filter_columns( + [[nodiscard]] table_with_metadata materialize_filter_columns( cudf::host_span const> page_mask, cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const; /** @@ -221,7 +221,7 @@ class hybrid_scan_reader { */ [[nodiscard]] std::vector get_payload_column_chunk_byte_ranges( cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options) const; + parquet_reader_options const& options) const; /** * @brief Materialize payload columns @@ -233,11 +233,11 @@ class hybrid_scan_reader { * @param stream CUDA stream used for device memory operations and kernel launches * @return Table of materialized payload columns and metadata */ - [[nodiscard]] cudf::io::table_with_metadata materialize_payload_columns( + [[nodiscard]] table_with_metadata materialize_payload_columns( cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const; private: diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp index e995fc22ffd..c741e657c8b 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp @@ -22,7 +22,7 @@ namespace cudf::io::parquet::experimental { hybrid_scan_reader::hybrid_scan_reader(cudf::host_span footer_bytes, - cudf::io::parquet_reader_options const& options) + parquet_reader_options const& options) : 
_impl{std::make_unique(footer_bytes, options)} { } @@ -45,7 +45,7 @@ void hybrid_scan_reader::setup_page_index(cudf::host_span page_in } std::vector hybrid_scan_reader::get_all_row_groups( - cudf::io::parquet_reader_options const& options) const + parquet_reader_options const& options) const { CUDF_EXPECTS(options.get_row_groups().size() == 0 or options.get_row_groups().size() == 1, "Encountered invalid size of row group indices in parquet reader options"); @@ -58,7 +58,7 @@ std::vector hybrid_scan_reader::get_all_row_groups( std::vector hybrid_scan_reader::filter_row_groups_with_stats( cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const { // Temporary vector with row group indices from the first source @@ -71,7 +71,7 @@ std::vector hybrid_scan_reader::filter_row_groups_with_stats( std::pair, std::vector> hybrid_scan_reader::get_secondary_filters(cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options) const + parquet_reader_options const& options) const { // Temporary vector with row group indices from the first source auto const input_row_group_indices = @@ -83,7 +83,7 @@ hybrid_scan_reader::get_secondary_filters(cudf::host_span row_g std::vector hybrid_scan_reader::filter_row_groups_with_dictionary_pages( std::vector& dictionary_page_data, cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const { CUDF_EXPECTS(row_group_indices.size() == dictionary_page_data.size(), @@ -102,7 +102,7 @@ std::vector hybrid_scan_reader::filter_row_groups_with_dictiona std::vector hybrid_scan_reader::filter_row_groups_with_bloom_filters( std::vector& bloom_filter_data, cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const { 
CUDF_EXPECTS(row_group_indices.size() == bloom_filter_data.size(), @@ -120,7 +120,7 @@ std::vector hybrid_scan_reader::filter_row_groups_with_bloom_fi std::pair, std::vector>> hybrid_scan_reader::filter_data_pages_with_stats(cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { @@ -133,8 +133,7 @@ hybrid_scan_reader::filter_data_pages_with_stats(cudf::host_span hybrid_scan_reader::get_filter_column_chunk_byte_ranges( - cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options) const + cudf::host_span row_group_indices, parquet_reader_options const& options) const { // Temporary vector with row group indices from the first source auto const input_row_group_indices = @@ -143,12 +142,12 @@ hybrid_scan_reader::get_filter_column_chunk_byte_ranges( return _impl->get_filter_column_chunk_byte_ranges(input_row_group_indices, options).first; } -cudf::io::table_with_metadata hybrid_scan_reader::materialize_filter_columns( +table_with_metadata hybrid_scan_reader::materialize_filter_columns( cudf::host_span const> data_page_mask, cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const { // Temporary vector with row group indices from the first source @@ -165,8 +164,7 @@ cudf::io::table_with_metadata hybrid_scan_reader::materialize_filter_columns( [[nodiscard]] std::vector hybrid_scan_reader::get_payload_column_chunk_byte_ranges( - cudf::host_span row_group_indices, - cudf::io::parquet_reader_options const& options) const + cudf::host_span row_group_indices, parquet_reader_options const& options) const { auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -174,11 +172,11 @@ 
hybrid_scan_reader::get_payload_column_chunk_byte_ranges( return _impl->get_payload_column_chunk_byte_ranges(input_row_group_indices, options).first; } -cudf::io::table_with_metadata hybrid_scan_reader::materialize_payload_columns( +table_with_metadata hybrid_scan_reader::materialize_payload_columns( cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) const { // Temporary vector with row group indices from the first source diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 5b27985baad..41ca4d00f90 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -42,8 +42,7 @@ using PageInfo = parquet::detail::PageInfo; using PageNestingDecodeInfo = parquet::detail::PageNestingDecodeInfo; using byte_range_info = text::byte_range_info; -impl::impl(cudf::host_span footer_bytes, - cudf::io::parquet_reader_options const& options) +impl::impl(cudf::host_span footer_bytes, parquet_reader_options const& options) { // Open and parse the source dataset metadata _metadata = std::make_unique( @@ -61,8 +60,7 @@ void impl::setup_page_index(cudf::host_span page_index_bytes) con _metadata->setup_page_index(page_index_bytes); } -std::vector impl::get_all_row_groups( - cudf::io::parquet_reader_options const& options) const +std::vector impl::get_all_row_groups(parquet_reader_options const& options) const { auto const num_row_groups = _metadata->get_num_row_groups(); auto row_groups_indices = std::vector(num_row_groups); @@ -72,7 +70,7 @@ std::vector impl::get_all_row_groups( std::vector> impl::filter_row_groups_with_stats( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view 
stream) { return {}; @@ -80,7 +78,7 @@ std::vector> impl::filter_row_groups_with_stats( std::pair, std::vector> impl::get_secondary_filters( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options) + parquet_reader_options const& options) { return {}; } @@ -88,7 +86,7 @@ std::pair, std::vector> impl::get_ std::vector> impl::filter_row_groups_with_dictionary_pages( std::vector& dictionary_page_data, cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) { return {}; @@ -97,7 +95,7 @@ std::vector> impl::filter_row_groups_with_dictionary_page std::vector> impl::filter_row_groups_with_bloom_filters( std::vector& bloom_filter_data, cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) { return {}; @@ -105,7 +103,7 @@ std::vector> impl::filter_row_groups_with_bloom_filters( std::pair, std::vector>> impl::filter_data_pages_with_stats(cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -122,7 +120,7 @@ impl::get_input_column_chunk_byte_ranges( std::pair, std::vector> impl::get_filter_column_chunk_byte_ranges( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options) + parquet_reader_options const& options) { return {}; } @@ -130,27 +128,27 @@ impl::get_filter_column_chunk_byte_ranges( std::pair, std::vector> impl::get_payload_column_chunk_byte_ranges( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options) + parquet_reader_options const& options) { return {}; } -cudf::io::table_with_metadata impl::materialize_filter_columns( +table_with_metadata impl::materialize_filter_columns( cudf::host_span const> 
data_page_mask, cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) { return {}; } -cudf::io::table_with_metadata impl::materialize_payload_columns( +table_with_metadata impl::materialize_payload_columns( cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream) { return {}; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 4d935c688dc..5f50619a467 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -51,8 +51,7 @@ class impl { * @param footer_bytes Host span of parquet file footer bytes * @param options Parquet reader options */ - explicit impl(cudf::host_span footer_bytes, - cudf::io::parquet_reader_options const& options); + explicit impl(cudf::host_span footer_bytes, parquet_reader_options const& options); /** * @copydoc cudf::io::experimental::hybrid_scan::get_parquet_metadata @@ -73,14 +72,14 @@ class impl { * @copydoc cudf::io::experimental::hybrid_scan::get_all_row_groups */ [[nodiscard]] std::vector get_all_row_groups( - cudf::io::parquet_reader_options const& options) const; + parquet_reader_options const& options) const; /** * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_stats */ [[nodiscard]] std::vector> filter_row_groups_with_stats( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream); /** @@ -89,7 +88,7 @@ class impl { [[nodiscard]] std::pair, std::vector> get_secondary_filters(cudf::host_span const> row_group_indices, - 
cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_dictionary_pages @@ -97,7 +96,7 @@ class impl { [[nodiscard]] std::vector> filter_row_groups_with_dictionary_pages( std::vector& dictionary_page_data, cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream); /** @@ -106,7 +105,7 @@ class impl { [[nodiscard]] std::vector> filter_row_groups_with_bloom_filters( std::vector& bloom_filter_data, cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream); /** @@ -114,7 +113,7 @@ class impl { */ [[nodiscard]] std::pair, std::vector>> filter_data_pages_with_stats(cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -130,17 +129,17 @@ class impl { std::vector> get_filter_column_chunk_byte_ranges( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @copydoc cudf::io::experimental::hybrid_scan::materialize_filter_columns */ - [[nodiscard]] cudf::io::table_with_metadata materialize_filter_columns( + [[nodiscard]] table_with_metadata materialize_filter_columns( cudf::host_span const> data_page_pask, cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream); /** @@ -155,16 +154,16 @@ class impl { std::vector> get_payload_column_chunk_byte_ranges( cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options); + 
parquet_reader_options const& options); /** * @copydoc cudf::io::experimental::hybrid_scan::materialize_payload_columns */ - [[nodiscard]] cudf::io::table_with_metadata materialize_payload_columns( + [[nodiscard]] table_with_metadata materialize_payload_columns( cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream); /** @@ -198,7 +197,7 @@ class impl { * @param stream CUDA stream used for device memory operations and kernel launches */ void initialize_options(cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options, + parquet_reader_options const& options, rmm::cuda_stream_view stream); /** @@ -214,7 +213,7 @@ class impl { * @param read_mode Read mode indicating if we are reading filter or payload columns * @param options Reader options */ - void select_columns(read_mode read_mode, cudf::io::parquet_reader_options const& options); + void select_columns(read_mode read_mode, parquet_reader_options const& options); /** * @brief Get the byte ranges for the input column chunks @@ -247,7 +246,7 @@ class impl { */ void prepare_data(cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, - cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @brief Prepares the select input row groups and associated chunk information @@ -256,7 +255,7 @@ class impl { * @param options Parquet reader options */ void prepare_row_groups(cudf::host_span const> row_group_indices, - cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @brief Ratchet the pass/subpass/chunk process forward. 
@@ -265,7 +264,7 @@ class impl { * @param options Parquet reader options */ void handle_chunking(std::vector column_chunk_buffers, - cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @brief Setup step for the next input read pass. @@ -277,7 +276,7 @@ class impl { * @param options Parquet reader options */ void setup_next_pass(std::vector column_chunk_buffers, - cudf::io::parquet_reader_options const& options); + parquet_reader_options const& options); /** * @brief Setup step for the next decompression subpass. @@ -289,7 +288,7 @@ class impl { * @param options Parquet reader options * */ - void setup_next_subpass(cudf::io::parquet_reader_options const& options); + void setup_next_subpass(parquet_reader_options const& options); /** * @brief Populate the output table metadata from the parquet file metadata. @@ -383,10 +382,10 @@ class impl { * @return The output table along with columns' metadata */ template - cudf::io::table_with_metadata finalize_output(read_mode read_mode, - table_metadata& out_metadata, - std::vector>& out_columns, - RowMaskView row_mask); + table_with_metadata finalize_output(read_mode read_mode, + table_metadata& out_metadata, + std::vector>& out_columns, + RowMaskView row_mask); /** * @brief Allocate data buffers for the output columns. @@ -417,7 +416,7 @@ class impl { * Creates information about all chunks in the file, storing it in * the file-wide _file_itm_data structure. */ - void create_global_chunk_info(cudf::io::parquet_reader_options const& options); + void create_global_chunk_info(parquet_reader_options const& options); /** * @brief Computes all of the passes we will perform over the file. 
@@ -448,7 +447,7 @@ class impl { * @return The output table along with columns' metadata */ template - cudf::io::table_with_metadata read_chunk_internal(read_mode read_mode, RowMaskView row_mask); + table_with_metadata read_chunk_internal(read_mode read_mode, RowMaskView row_mask); /** * @brief Check if this is the first output chunk diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index b5972949b71..b031c543efc 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -221,7 +221,7 @@ class aggregate_reader_metadata { * @param stream CUDA stream used for device memory operations and kernel launches * @param aligned_mr Aligned device memory resource to allocate bloom filter buffers * - * @return A flattened list of bloom filter bitset device buffers for each filter column across + * @return A flattened list of bloom filter bitset device buffers for each predicate column across * row group */ [[nodiscard]] std::vector read_bloom_filters( From 7e4474d42e9064ca9557e97028fc23acf2085175 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Apr 2025 20:23:33 +0000 Subject: [PATCH 11/28] Suggestions from code review --- cpp/include/cudf/io/experimental/hybrid_scan.hpp | 9 +++++---- cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp | 2 +- cpp/src/io/parquet/reader_impl.hpp | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp index cee4bec36ad..651483c8d48 100644 --- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -14,10 +14,6 @@ * limitations under the License. 
*/ -/** - * @file hybrid_scan.hpp - */ - #pragma once #include @@ -33,6 +29,11 @@ namespace CUDF_EXPORT cudf { namespace io::parquet::experimental { +/** + * @addtogroup io_readers + * @{ + * @file + */ namespace detail { diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 5f50619a467..aef89492bab 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -494,7 +494,7 @@ class impl { size_type _num_sources{1}; // timestamp_type - cudf::data_type _timestamp_type{type_id::EMPTY}; + cudf::data_type _timestamp_type; std::optional> _reader_column_schema; diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 3aa9b94ed6b..6ba2bdb5e7b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -373,7 +373,7 @@ class reader::impl { // Reader configs. struct { // timestamp_type - data_type timestamp_type{type_id::EMPTY}; + data_type timestamp_type; // User specified reading rows/stripes selection. int64_t const skip_rows; std::optional num_rows; From 2111da5604ae98dd9a7092c6e8f32ea8890f146b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Apr 2025 22:20:14 +0000 Subject: [PATCH 12/28] style fix --- cpp/src/io/parquet/reader_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 6ba2bdb5e7b..70ac443ec2b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 80c99528c990aceaf302137bcf1e6c54bf00bac5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 22 Apr 2025 00:55:25 +0000 Subject: [PATCH 13/28] Impl rg pruning with stats in expt PQ reader --- .../experimental/hybrid_scan_helpers.cpp | 129 ++++++++++++++++++ .../experimental/hybrid_scan_helpers.hpp | 39 ++++++ .../parquet/experimental/hybrid_scan_impl.cpp | 99 +++++++++++++- cpp/src/io/parquet/reader_impl_helpers.cpp | 6 +- .../io/parquet_experimental_reader_test.cpp | 48 ++++++- 5 files changed, 317 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 6b8db2c9809..64906a0de88 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -35,6 +35,7 @@ namespace cudf::io::parquet::experimental::detail { using aggregate_reader_metadata_base = parquet::detail::aggregate_reader_metadata; using CompactProtocolReader = parquet::detail::CompactProtocolReader; using equality_literals_collector = parquet::detail::equality_literals_collector; +using inline_column_buffer = detail::inline_column_buffer; using input_column_info = parquet::detail::input_column_info; using metadata_base = parquet::detail::metadata; using row_group_info = parquet::detail::row_group_info; @@ -137,4 +138,132 @@ void aggregate_reader_metadata::setup_page_index(cudf::host_span } } +std::tuple, + std::vector, + std::vector> +aggregate_reader_metadata::select_payload_columns( + std::optional> const& payload_column_names, + std::optional> const& filter_column_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) +{ + // Select all columns if no payload or filter columns are specified + if (not payload_column_names.has_value() and not filter_column_names.has_value()) { + return select_columns({}, {}, include_index, 
strings_to_categorical, timestamp_type_id); + } + + // If payload columns are specified, only select payload columns that do not appear in the filter + // expression + std::vector valid_payload_columns; + if (payload_column_names.has_value()) { + valid_payload_columns = *payload_column_names; + // Remove filter columns from the provided payload column names + if (filter_column_names.has_value()) { + valid_payload_columns.erase(std::remove_if(valid_payload_columns.begin(), + valid_payload_columns.end(), + [&](std::string const& col) { + return std::find(filter_column_names->begin(), + filter_column_names->end(), + col) != + filter_column_names->end(); + }), + valid_payload_columns.end()); + } + // Select valid payload columns using the base `select_columns` method + return select_columns( + valid_payload_columns, {}, include_index, strings_to_categorical, timestamp_type_id); + } + + // Otherwise, select all columns that do not appear in the filter expression + std::function add_column_path = [&](std::string path_till_now, + int schema_idx) { + auto const& schema_elem = get_schema(schema_idx); + std::string const curr_path = path_till_now + schema_elem.name; + // If the current path is not a filter column, then add it and its children to the list of valid + // payload columns + if (std::find(filter_column_names.value().cbegin(), + filter_column_names.value().cend(), + curr_path) == filter_column_names.value().cend()) { + valid_payload_columns.push_back(curr_path); + // Add all children as well + for (auto const& child_idx : schema_elem.children_idx) { + add_column_path(curr_path + ".", child_idx); + } + } + }; + // Add all base level columns to valid payload columns + for (auto const& child_idx : get_schema(0).children_idx) { + add_column_path("", child_idx); + } + + // Select valid payload columns using the base `select_columns` method + return select_columns( + valid_payload_columns, {}, include_index, strings_to_categorical, timestamp_type_id); +} + +std::vector> 
aggregate_reader_metadata::filter_row_groups_with_stats( + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const +{ + std::vector> all_row_group_indices; + std::transform(per_file_metadata.cbegin(), + per_file_metadata.cend(), + std::back_inserter(all_row_group_indices), + [](auto const& file_meta) { + std::vector rg_idx(file_meta.row_groups.size()); + std::iota(rg_idx.begin(), rg_idx.end(), 0); + return rg_idx; + }); + + if (not filter.has_value()) { return all_row_group_indices; } + + // Compute total number of input row groups + cudf::size_type total_row_groups = [&]() { + if (not row_group_indices.empty()) { + size_t const total_row_groups = + std::accumulate(row_group_indices.begin(), + row_group_indices.end(), + size_t{0}, + [](auto sum, auto const& pfm) { return sum + pfm.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the cudf::size_type's limit"); + return static_cast(total_row_groups); + } else { + return num_row_groups; + } + }(); + + // Span of input row group indices for predicate pushdown + host_span const> input_row_group_indices; + if (row_group_indices.empty()) { + std::transform(per_file_metadata.cbegin(), + per_file_metadata.cend(), + std::back_inserter(all_row_group_indices), + [](auto const& file_meta) { + std::vector rg_idx(file_meta.row_groups.size()); + std::iota(rg_idx.begin(), rg_idx.end(), 0); + return rg_idx; + }); + input_row_group_indices = host_span const>(all_row_group_indices); + } else { + input_row_group_indices = row_group_indices; + } + + // Filter stats table with StatsAST expression and collect filtered row group indices + auto const stats_filtered_row_group_indices = apply_stats_filters(input_row_group_indices, + total_row_groups, + output_dtypes, + output_column_schemas, + filter.value(), + stream); + + 
return stats_filtered_row_group_indices.value_or(all_row_group_indices); +} + } // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index fc3f6e9978f..74f4e75d94d 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -35,6 +35,7 @@ namespace cudf::io::parquet::experimental::detail { using aggregate_reader_metadata_base = parquet::detail::aggregate_reader_metadata; using equality_literals_collector = parquet::detail::equality_literals_collector; +using inline_column_buffer = io::detail::inline_column_buffer; using input_column_info = parquet::detail::input_column_info; using metadata_base = parquet::detail::metadata; using row_group_info = parquet::detail::row_group_info; @@ -123,6 +124,44 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @param page_index_bytes Host span of Parquet `PageIndex` buffer bytes */ void setup_page_index(cudf::host_span page_index_bytes); + + /** + * @brief Filters and reduces down to the selection of payload columns + * + * @param payload_column_names List of paths of select payload column names, if any + * @param filter_column_names List of paths of column names present only in filter, if any + * @param include_index Whether to always include the PANDAS index column(s) + * @param strings_to_categorical Type conversion parameter + * @param timestamp_type_id Type conversion parameter + * + * @return input column information, output column information, list of output column schema + * indices + */ + [[nodiscard]] std:: + tuple, std::vector, std::vector> + select_payload_columns(std::optional> const& payload_column_names, + std::optional> const& filter_column_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id); + + /** + * @brief Filter the row groups with statistics 
based on predicate filter + * + * @param row_group_indices Input row groups indices + * @param output_dtypes Datatypes of output columns + * @param output_column_schemas schema indices of output columns + * @param filter Optional AST expression to filter row groups based on Column chunk statistics + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Filtered row group indices, if any are filtered + */ + [[nodiscard]] std::vector> filter_row_groups_with_stats( + host_span const> row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const; }; } // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 41ca4d00f90..13847946a05 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -42,6 +42,27 @@ using PageInfo = parquet::detail::PageInfo; using PageNestingDecodeInfo = parquet::detail::PageNestingDecodeInfo; using byte_range_info = text::byte_range_info; +namespace { + +/** + * @brief Populate and return a vector of data types of output columns + * + * @param output_buffer_template `inline_column_buffers` to extract output column data types from + * @return A vector of output column data types + */ +[[nodiscard]] std::vector get_output_types( + cudf::host_span output_buffer_template) +{ + std::vector output_dtypes; + std::transform(output_buffer_template.begin(), + output_buffer_template.end(), + std::back_inserter(output_dtypes), + [](auto const& col) { return col.type; }); + return output_dtypes; +} + +} // namespace + impl::impl(cudf::host_span footer_bytes, parquet_reader_options const& options) { // Open and parse the source dataset metadata @@ -60,6 +81,50 @@ void impl::setup_page_index(cudf::host_span page_index_bytes) con 
_metadata->setup_page_index(page_index_bytes); } +void impl::select_columns(read_mode read_mode, parquet_reader_options const& options) +{ + // Strings may be returned as either string or categorical columns + auto const strings_to_categorical = options.is_enabled_convert_strings_to_categories(); + auto const use_pandas_metadata = options.is_enabled_use_pandas_metadata(); + auto const timestamp_type_id = options.get_timestamp_type().id(); + + // Select only columns required by the filter + if (read_mode == read_mode::FILTER_COLUMNS) { + if (_is_filter_columns_selected) { return; } + // list, struct, dictionary are not supported by AST filter yet. + _filter_columns_names = + cudf::io::parquet::detail::get_column_names_in_expression(options.get_filter(), {}); + // Select only filter columns using the base `select_columns` method + std::tie(_input_columns, _output_buffers, _output_column_schemas) = _metadata->select_columns( + _filter_columns_names, {}, use_pandas_metadata, strings_to_categorical, timestamp_type_id); + + _is_filter_columns_selected = true; + _is_payload_columns_selected = false; + } else { + if (_is_payload_columns_selected) { return; } + + std::tie(_input_columns, _output_buffers, _output_column_schemas) = + _metadata->select_payload_columns(options.get_columns(), + _filter_columns_names, + use_pandas_metadata, + strings_to_categorical, + timestamp_type_id); + + _is_payload_columns_selected = true; + _is_filter_columns_selected = false; + } + + CUDF_EXPECTS(_input_columns.size() > 0 and _output_buffers.size() > 0, "No columns selected"); + + // Clear the output buffers templates + _output_buffers_template.clear(); + + // Save the states of the output buffers for reuse. 
+ for (auto const& buff : _output_buffers) { + _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + } +} + std::vector impl::get_all_row_groups(parquet_reader_options const& options) const { auto const num_row_groups = _metadata->get_num_row_groups(); @@ -73,7 +138,22 @@ std::vector> impl::filter_row_groups_with_stats( parquet_reader_options const& options, rmm::cuda_stream_view stream) { - return {}; + CUDF_EXPECTS(options.get_filter().has_value(), "Filter expression must not be empty"); + + select_columns(read_mode::FILTER_COLUMNS, options); + + table_metadata metadata; + populate_metadata(metadata); + auto expr_conv = named_to_reference_converter(options.get_filter(), metadata); + CUDF_EXPECTS(expr_conv.get_converted_expr().has_value(), + "Columns names in filter expression must be convertible to index references"); + auto output_dtypes = get_output_types(_output_buffers_template); + + return _metadata->filter_row_groups_with_stats(row_group_indices, + output_dtypes, + _output_column_schemas, + expr_conv.get_converted_expr(), + stream); } std::pair, std::vector> impl::get_secondary_filters( @@ -154,6 +234,23 @@ table_with_metadata impl::materialize_payload_columns( return {}; } +void impl::populate_metadata(table_metadata& out_metadata) const +{ + // Return column names + out_metadata.schema_info.resize(_output_buffers.size()); + for (size_t i = 0; i < _output_column_schemas.size(); i++) { + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.schema_info[i].name = schema.name; + out_metadata.schema_info[i].is_nullable = + schema.repetition_type != cudf::io::parquet::FieldRepetitionType::REQUIRED; + } + + // Return user metadata + out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); + out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), + out_metadata.per_file_user_data[0].end()}; +} + bool impl::has_more_work() const { return _file_itm_data.num_passes() > 0 && diff 
--git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 0522fe1b780..2fa8638791e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -1473,9 +1473,11 @@ aggregate_reader_metadata::select_columns( // Find which of the selected paths are valid and get their schema index std::vector valid_selected_paths; - // vector reference pushback (*use_names). If filter names passed. + auto const empty_names = std::vector{}; + // vector reference pushback for select column and/or filter names passed. Use the empty names + // reference if no filter column names are provided. std::vector const>> const column_names{ - *use_names, *filter_columns_names}; + *use_names, filter_columns_names.has_value() ? *filter_columns_names : empty_names}; for (auto const& used_column_names : column_names) { for (auto const& selected_path : used_column_names.get()) { auto found_path = diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp index 5c2815cdb63..22eb6b81829 100644 --- a/cpp/tests/io/parquet_experimental_reader_test.cpp +++ b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -149,7 +149,7 @@ auto create_parquet_with_stats() TEST_F(ParquetExperimentalReaderTest, TestMetadata) { - // Create a table with several row groups each with a single page. + // Create a table with 4 row groups each with a single page. auto constexpr num_concat = 1; auto [written_table, buffer] = create_parquet_with_stats(); @@ -211,3 +211,49 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) // Expect only 2 row groups now ASSERT_EQ(reader->get_all_row_groups(options).size(), 2); } + +TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) +{ + // Create a table with 4 row groups each with a single page. 
+ auto constexpr num_concat = 1; + auto [written_table, buffer] = create_parquet_with_stats(); + + // Filtering AST - table[0] < 50 + auto literal_value = cudf::numeric_scalar(50); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_0 = cudf::ast::column_name_reference("col_uint32"); + auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + + // Create reader options with empty source info + cudf::io::parquet_reader_options options = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(nullptr, 0)) + .filter(filter_expression); + + // Fetch footer and page index bytes from the buffer. + auto const footer_buffer = fetch_footer_bytes( + cudf::host_span(reinterpret_cast(buffer.data()), buffer.size())); + + // Create hybrid scan reader with footer bytes + auto const reader = + std::make_unique(footer_buffer, options); + + // Get all row groups from the reader - API # 4 + auto input_row_group_indices = reader->get_all_row_groups(options); + // Expect 4 = 20000 rows / 5000 rows per row group + ASSERT_EQ(input_row_group_indices.size(), 4); + // Expect 3 row groups to be filtered out with stats + ASSERT_EQ( + reader + ->filter_row_groups_with_stats(input_row_group_indices, options, cudf::get_default_stream()) + .size(), + 1); + + // Use custom input row group indices + input_row_group_indices = {1, 2}; + // Expect all row groups to be filtered out with stats + ASSERT_EQ( + reader + ->filter_row_groups_with_stats(input_row_group_indices, options, cudf::get_default_stream()) + .size(), + 0); +} From 10d9cfad8b57a377f9e728d26daa2b9057140ca8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 30 Apr 2025 19:52:40 +0000 Subject: [PATCH 14/28] Merge changes --- .../experimental/hybrid_scan_helpers.cpp | 1 + .../experimental/hybrid_scan_helpers.hpp | 1 + .../parquet/experimental/hybrid_scan_impl.cpp | 30 ++++++++++++++++++- 3 files changed, 31 
insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index a4a885b346a..a1ae10505a5 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -35,6 +35,7 @@ namespace cudf::io::parquet::experimental::detail { using aggregate_reader_metadata_base = parquet::detail::aggregate_reader_metadata; using metadata_base = parquet::detail::metadata; +using io::detail::inline_column_buffer; using parquet::detail::CompactProtocolReader; using parquet::detail::equality_literals_collector; using parquet::detail::input_column_info; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 83f6f6d43d4..ff39c466a61 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -36,6 +36,7 @@ namespace cudf::io::parquet::experimental::detail { using aggregate_reader_metadata_base = parquet::detail::aggregate_reader_metadata; using metadata_base = parquet::detail::metadata; +using io::detail::inline_column_buffer; using parquet::detail::equality_literals_collector; using parquet::detail::input_column_info; using parquet::detail::row_group_info; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 7f136b87098..5d0df1afee6 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -37,12 +37,39 @@ namespace cudf::io::parquet::experimental::detail { +using io::detail::inline_column_buffer; using parquet::detail::ColumnChunkDesc; using parquet::detail::decode_kernel_mask; using parquet::detail::PageInfo; using parquet::detail::PageNestingDecodeInfo; using text::byte_range_info; +namespace { +// Tests the passed in logical type 
for a FIXED_LENGTH_BYTE_ARRAY column to see if it should +// be treated as a string. Currently the only logical type that has special handling is DECIMAL. +// Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which +// for now would also be treated as a string). +[[maybe_unused]] inline bool is_treat_fixed_length_as_string( + std::optional const& logical_type) +{ + if (!logical_type.has_value()) { return true; } + return logical_type->type != LogicalType::DECIMAL; +} + +[[nodiscard]] std::vector get_output_types( + cudf::host_span output_buffer_template) +{ + std::vector output_dtypes; + output_dtypes.reserve(output_buffer_template.size()); + std::transform(output_buffer_template.begin(), + output_buffer_template.end(), + std::back_inserter(output_dtypes), + [](auto const& col) { return col.type; }); + return output_dtypes; +} + +} // namespace + hybrid_scan_reader_impl::hybrid_scan_reader_impl(cudf::host_span footer_bytes, parquet_reader_options const& options) { @@ -146,7 +173,8 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w stream); } -std::pair, std::vector> impl::get_secondary_filters( +std::pair, std::vector> +hybrid_scan_reader_impl::secondary_filters_byte_ranges( cudf::host_span const> row_group_indices, parquet_reader_options const& options) { From 736c76cde14b4fa96a7d5fb4ade31299dd5f6e27 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:01:22 +0000 Subject: [PATCH 15/28] Improvements --- .../experimental/hybrid_scan_helpers.cpp | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index a1ae10505a5..2a7969f0946 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -209,17 +209,19 @@ std::vector> 
aggregate_reader_metadata::filter_row_ std::optional> filter, rmm::cuda_stream_view stream) const { - std::vector> all_row_group_indices; - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(all_row_group_indices), - [](auto const& file_meta) { - std::vector rg_idx(file_meta.row_groups.size()); - std::iota(rg_idx.begin(), rg_idx.end(), 0); - return rg_idx; - }); - - if (not filter.has_value()) { return all_row_group_indices; } + CUDF_EXPECTS(not row_group_indices.empty(), "Input row group indices must not be empty"); + + auto all_row_group_indices = [&]() { + std::vector> all_row_group_indices; + std::transform(row_group_indices.begin(), + row_group_indices.end(), + std::back_inserter(all_row_group_indices), + [](auto rg_indices) { return rg_indices; }); + return all_row_group_indices; + }; + + // No filter expression, return all row groups + if (not filter.has_value()) { return all_row_group_indices(); } // Compute total number of input row groups cudf::size_type total_row_groups = [&]() { @@ -239,31 +241,15 @@ std::vector> aggregate_reader_metadata::filter_row_ } }(); - // Span of input row group indices for predicate pushdown - host_span const> input_row_group_indices; - if (row_group_indices.empty()) { - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(all_row_group_indices), - [](auto const& file_meta) { - std::vector rg_idx(file_meta.row_groups.size()); - std::iota(rg_idx.begin(), rg_idx.end(), 0); - return rg_idx; - }); - input_row_group_indices = host_span const>(all_row_group_indices); - } else { - input_row_group_indices = row_group_indices; - } - // Filter stats table with StatsAST expression and collect filtered row group indices - auto const stats_filtered_row_group_indices = apply_stats_filters(input_row_group_indices, + auto const stats_filtered_row_group_indices = apply_stats_filters(row_group_indices, total_row_groups, output_dtypes, output_column_schemas, 
filter.value(), stream); - return stats_filtered_row_group_indices.value_or(all_row_group_indices); + return stats_filtered_row_group_indices.value_or(all_row_group_indices()); } } // namespace cudf::io::parquet::experimental::detail From 9b0cdfc1444affac1bad3f61c95855c9711faa81 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:07:59 +0000 Subject: [PATCH 16/28] Minor improvements --- .../experimental/hybrid_scan_helpers.cpp | 13 ++++++++----- .../parquet/experimental/hybrid_scan_impl.cpp | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 2a7969f0946..13a33ace9b1 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -148,14 +148,16 @@ aggregate_reader_metadata::select_payload_columns( bool strings_to_categorical, type_id timestamp_type_id) { - // Select all columns if no payload or filter columns are specified + // If neither payload nor filter columns are specified, select all columns if (not payload_column_names.has_value() and not filter_column_names.has_value()) { + // Call the base `select_columns()` method without specifying any columns return select_columns({}, {}, include_index, strings_to_categorical, timestamp_type_id); } + std::vector valid_payload_columns; + // If payload columns are specified, only select payload columns that do not appear in the filter // expression - std::vector valid_payload_columns; if (payload_column_names.has_value()) { valid_payload_columns = *payload_column_names; // Remove filter columns from the provided payload column names @@ -170,12 +172,13 @@ aggregate_reader_metadata::select_payload_columns( }), valid_payload_columns.end()); } - // Select valid payload columns using the base `select_columns` method + // Call the base 
`select_columns()` method with valid payload columns return select_columns( valid_payload_columns, {}, include_index, strings_to_categorical, timestamp_type_id); } - // Otherwise, select all columns that do not appear in the filter expression + // Else if only filter columns are specified, select all columns that do not appear in the + // filter expression std::function add_column_path = [&](std::string path_till_now, int schema_idx) { auto const& schema_elem = get_schema(schema_idx); @@ -197,7 +200,7 @@ aggregate_reader_metadata::select_payload_columns( add_column_path("", child_idx); } - // Select valid payload columns using the base `select_columns` method + // Call the base `select_columns()` method with all but filter columns return select_columns( valid_payload_columns, {}, include_index, strings_to_categorical, timestamp_type_id); } diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 5d0df1afee6..af66e1e24a6 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -254,6 +254,23 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( return {}; } +void hybrid_scan_reader_impl::populate_metadata(table_metadata& out_metadata) const +{ + // Return column names + out_metadata.schema_info.resize(_output_buffers.size()); + for (size_t i = 0; i < _output_column_schemas.size(); i++) { + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.schema_info[i].name = schema.name; + out_metadata.schema_info[i].is_nullable = + schema.repetition_type != cudf::io::parquet::FieldRepetitionType::REQUIRED; + } + + // Return user metadata + out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); + out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), + out_metadata.per_file_user_data[0].end()}; +} + bool hybrid_scan_reader_impl::has_more_work() const { 
return _file_itm_data.num_passes() > 0 && From 36973dbbf38bb1d818108ba087c9efd6792ba9bc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:17:25 +0000 Subject: [PATCH 17/28] Minor improvement --- cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 4 +--- cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 13a33ace9b1..9a9586d01d7 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -212,8 +212,6 @@ std::vector> aggregate_reader_metadata::filter_row_ std::optional> filter, rmm::cuda_stream_view stream) const { - CUDF_EXPECTS(not row_group_indices.empty(), "Input row group indices must not be empty"); - auto all_row_group_indices = [&]() { std::vector> all_row_group_indices; std::transform(row_group_indices.begin(), @@ -223,7 +221,7 @@ std::vector> aggregate_reader_metadata::filter_row_ return all_row_group_indices; }; - // No filter expression, return all row groups + // No converted filter expression, return all row groups if (not filter.has_value()) { return all_row_group_indices(); } // Compute total number of input row groups diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index af66e1e24a6..c7861e67f55 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -155,6 +155,7 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w parquet_reader_options const& options, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(not row_group_indices.empty(), "Input row group indices must not be empty"); CUDF_EXPECTS(options.get_filter().has_value(), "Filter expression must not be empty"); 
select_columns(read_mode::FILTER_COLUMNS, options); From c88e8ad12a00df90902ae81f4891fe6b9929add8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 30 Apr 2025 23:38:36 +0000 Subject: [PATCH 18/28] Minor improvements --- .../experimental/hybrid_scan_helpers.cpp | 29 +++++++++++-------- .../parquet/experimental/hybrid_scan_impl.cpp | 4 +-- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 9a9586d01d7..d50be15b356 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -41,6 +41,20 @@ using parquet::detail::equality_literals_collector; using parquet::detail::input_column_info; using parquet::detail::row_group_info; +namespace { + +auto all_row_group_indices(host_span const> row_group_indices) +{ + std::vector> all_row_group_indices; + std::transform(row_group_indices.begin(), + row_group_indices.end(), + std::back_inserter(all_row_group_indices), + [](auto rg_indices) { return rg_indices; }); + return all_row_group_indices; +} + +} // namespace + metadata::metadata(cudf::host_span footer_bytes) { CompactProtocolReader cp(footer_bytes.data(), footer_bytes.size()); @@ -212,17 +226,8 @@ std::vector> aggregate_reader_metadata::filter_row_ std::optional> filter, rmm::cuda_stream_view stream) const { - auto all_row_group_indices = [&]() { - std::vector> all_row_group_indices; - std::transform(row_group_indices.begin(), - row_group_indices.end(), - std::back_inserter(all_row_group_indices), - [](auto rg_indices) { return rg_indices; }); - return all_row_group_indices; - }; - - // No converted filter expression, return all row groups - if (not filter.has_value()) { return all_row_group_indices(); } + // Return all row groups if no filter expression + if (not filter.has_value()) { return all_row_group_indices(row_group_indices); } // Compute total number of 
input row groups cudf::size_type total_row_groups = [&]() { @@ -250,7 +255,7 @@ std::vector> aggregate_reader_metadata::filter_row_ filter.value(), stream); - return stats_filtered_row_group_indices.value_or(all_row_group_indices()); + return stats_filtered_row_group_indices.value_or(all_row_group_indices(row_group_indices); } } // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index c7861e67f55..79e7aed8adb 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -155,8 +155,8 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w parquet_reader_options const& options, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(not row_group_indices.empty(), "Input row group indices must not be empty"); - CUDF_EXPECTS(options.get_filter().has_value(), "Filter expression must not be empty"); + CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); + CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); select_columns(read_mode::FILTER_COLUMNS, options); From 9cf6598e7a5d809a5ab62bc08eea8d8ec9e77fa7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 30 Apr 2025 23:39:51 +0000 Subject: [PATCH 19/28] Minor improvements --- cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 3 ++- cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index d50be15b356..4b02c4b0d15 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -43,7 +43,8 @@ using parquet::detail::row_group_info; namespace { -auto all_row_group_indices(host_span const> 
row_group_indices) +[[nodiscard]] auto all_row_group_indices( + host_span const> row_group_indices) { std::vector> all_row_group_indices; std::transform(row_group_indices.begin(), diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 79e7aed8adb..d435db7f3d3 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -45,6 +45,7 @@ using parquet::detail::PageNestingDecodeInfo; using text::byte_range_info; namespace { + // Tests the passed in logical type for a FIXED_LENGTH_BYTE_ARRAY column to see if it should // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which From 8b4ba9aa48c51a61525020566838c060a5e2d956 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 30 Apr 2025 23:42:02 +0000 Subject: [PATCH 20/28] fix build error --- cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 4b02c4b0d15..dc7f72be48c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -256,7 +256,7 @@ std::vector> aggregate_reader_metadata::filter_row_ filter.value(), stream); - return stats_filtered_row_group_indices.value_or(all_row_group_indices(row_group_indices); + return stats_filtered_row_group_indices.value_or(all_row_group_indices(row_group_indices)); } } // namespace cudf::io::parquet::experimental::detail From e314dc61ac4a1f1b25b8397f21050c8aa6812b15 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 1 May 2025 19:06:31 +0000 Subject: [PATCH 21/28] Apply suggestions from reviews --- 
cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index dc7f72be48c..d485606b6f4 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -46,12 +46,8 @@ namespace { [[nodiscard]] auto all_row_group_indices( host_span const> row_group_indices) { - std::vector> all_row_group_indices; - std::transform(row_group_indices.begin(), - row_group_indices.end(), - std::back_inserter(all_row_group_indices), - [](auto rg_indices) { return rg_indices; }); - return all_row_group_indices; + return std::vector>(row_group_indices.begin(), + row_group_indices.end()); } } // namespace From eac62359e878fc19e2a6e79c21f9da375e5aca44 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 1 May 2025 20:51:23 +0000 Subject: [PATCH 22/28] Use `EXPECT_EQ` instead of `ASSERT_EQ` in tests --- cpp/tests/io/parquet_experimental_reader_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp index 48280ee6e81..cb604bb8f95 100644 --- a/cpp/tests/io/parquet_experimental_reader_test.cpp +++ b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -193,7 +193,7 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) // Get all row groups from the reader - API # 4 auto input_row_group_indices = reader->all_row_groups(options); // Expect 4 = 20000 rows / 5000 rows per row group - ASSERT_EQ(input_row_group_indices.size(), 4); + EXPECT_EQ(input_row_group_indices.size(), 4); // Explicitly set the row groups to read options.set_row_groups({{0, 1}}); @@ -201,7 +201,7 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) // Get all row groups from the reader again 
input_row_group_indices = reader->all_row_groups(options); // Expect only 2 row groups now - ASSERT_EQ(reader->all_row_groups(options).size(), 2); + EXPECT_EQ(reader->all_row_groups(options).size(), 2); } TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) @@ -232,9 +232,9 @@ TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) // Get all row groups from the reader - API # 4 auto input_row_group_indices = reader->all_row_groups(options); // Expect 4 = 20000 rows / 5000 rows per row group - ASSERT_EQ(input_row_group_indices.size(), 4); + EXPECT_EQ(input_row_group_indices.size(), 4); // Expect 3 row groups to be filtered out with stats - ASSERT_EQ( + EXPECT_EQ( reader ->filter_row_groups_with_stats(input_row_group_indices, options, cudf::get_default_stream()) .size(), @@ -243,7 +243,7 @@ TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) // Use custom input row group indices input_row_group_indices = {1, 2}; // Expect all row groups to be filtered out with stats - ASSERT_EQ( + EXPECT_EQ( reader ->filter_row_groups_with_stats(input_row_group_indices, options, cudf::get_default_stream()) .size(), From ca43c1e4cc50bacbd330a96afb45ad959974f54c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 5 May 2025 22:15:01 +0000 Subject: [PATCH 23/28] Suggestions from code review --- cpp/include/cudf/io/parquet.hpp | 16 ++++--- .../experimental/hybrid_scan_helpers.cpp | 28 ++++++----- .../experimental/hybrid_scan_helpers.hpp | 2 +- .../parquet/experimental/hybrid_scan_impl.cpp | 7 +-- cpp/src/io/parquet/reader_impl.cpp | 8 ++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 2 +- .../io/parquet_experimental_reader_test.cpp | 48 ++++++++----------- 7 files changed, 55 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 9c76f9969f3..05cc1ec967c 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp 
@@ -118,16 +118,18 @@ class parquet_reader_options { * @brief Default constructor. * * This has been added since Cython requires a default constructor to create objects on stack. + * The `hybrid_scan_reader` also uses this to create `parquet_reader_options` without a source. */ explicit parquet_reader_options() = default; /** - * @brief Creates a parquet_reader_options_builder which will build parquet_reader_options. + * @brief Creates a `parquet_reader_options_builder` to build `parquet_reader_options`. + * By default, build with empty data source info. * * @param src Source information to read parquet file * @return Builder to build reader options */ - static parquet_reader_options_builder builder(source_info src); + static parquet_reader_options_builder builder(source_info src = source_info{}); /** * @brief Returns source info. @@ -137,8 +139,7 @@ class parquet_reader_options { [[nodiscard]] source_info const& get_source() const { return _source; } /** - * @brief Returns true/false depending on whether strings should be converted to categories or - * not. + * @brief Returns boolean depending on whether strings should be converted to categories or not. * * @return `true` if strings should be converted to categories */ @@ -148,21 +149,21 @@ class parquet_reader_options { } /** - * @brief Returns true/false depending whether to use pandas metadata or not while reading. + * @brief Returns boolean depending on whether to use pandas metadata or not while reading. * * @return `true` if pandas metadata is used while reading */ [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } /** - * @brief Returns true/false depending whether to use arrow schema while reading. + * @brief Returns boolean depending on whether to use arrow schema while reading. 
* * @return `true` if arrow schema is used while reading */ [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; } /** - * @brief Returns true/false depending on whether to read matching projected and filter columns + * @brief Returns boolean depending on whether to read matching projected and filter columns * from mismatched Parquet sources. * * @return `true` if mismatched projected and filter columns will be read from mismatched Parquet @@ -372,6 +373,7 @@ class parquet_reader_options_builder { * @brief Default constructor. * * This has been added since Cython requires a default constructor to create objects on stack. + * The `hybrid_scan_reader` also uses this to construct `parquet_reader_options` without a source. */ parquet_reader_options_builder() = default; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index d485606b6f4..ec4682645a2 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace cudf::io::parquet::experimental::detail { @@ -172,14 +173,15 @@ aggregate_reader_metadata::select_payload_columns( if (payload_column_names.has_value()) { valid_payload_columns = *payload_column_names; // Remove filter columns from the provided payload column names - if (filter_column_names.has_value()) { + if (filter_column_names.has_value() and not filter_column_names->empty()) { + // Add filter column names to a hash set for faster lookup + std::unordered_set filter_columns(filter_column_names->begin(), + filter_column_names->end()); + // Remove a payload column name if it is also present in the hash set valid_payload_columns.erase(std::remove_if(valid_payload_columns.begin(), valid_payload_columns.end(), - [&](std::string const& col) { - return std::find(filter_column_names->begin(), - filter_column_names->end(), - col) != 
- filter_column_names->end(); + [&filter_columns](std::string const& col) { + return filter_columns.count(col) > 0; }), valid_payload_columns.end()); } @@ -196,9 +198,8 @@ aggregate_reader_metadata::select_payload_columns( std::string const curr_path = path_till_now + schema_elem.name; // If the current path is not a filter column, then add it and its children to the list of valid // payload columns - if (std::find(filter_column_names.value().cbegin(), - filter_column_names.value().cend(), - curr_path) == filter_column_names.value().cend()) { + if (std::find(filter_column_names->begin(), filter_column_names->end(), curr_path) == + filter_column_names->end()) { valid_payload_columns.push_back(curr_path); // Add all children as well for (auto const& child_idx : schema_elem.children_idx) { @@ -206,9 +207,12 @@ aggregate_reader_metadata::select_payload_columns( } } }; - // Add all base level columns to valid payload columns - for (auto const& child_idx : get_schema(0).children_idx) { - add_column_path("", child_idx); + + // Add all but filter columns to valid payload columns + if (not filter_column_names->empty()) { + for (auto const& child_idx : get_schema(0).children_idx) { + add_column_path("", child_idx); + } } // Call the base `select_columns()` method with all but filter columns diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index ff39c466a61..e5e51766b04 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -135,7 +135,7 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @param strings_to_categorical Type conversion parameter * @param timestamp_type_id Type conversion parameter * - * @return input column information, output column information, list of output column schema + * @return input column information, output column buffers, list of output column schema * indices */ 
[[nodiscard]] std:: diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index d435db7f3d3..0562aa98ef5 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -137,9 +137,10 @@ void hybrid_scan_reader_impl::select_columns(read_mode read_mode, _output_buffers_template.clear(); // Save the states of the output buffers for reuse. - for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); - } + std::transform(_output_buffers.begin(), + _output_buffers.end(), + std::back_inserter(_output_buffers_template), + [](auto const& buff) { return inline_column_buffer::empty_like(buff); }); } std::vector hybrid_scan_reader_impl::all_row_groups( diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 25d9cddb7a5..45eefa1c624 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -570,9 +570,11 @@ reader::impl::impl(std::size_t chunk_read_limit, _options.timestamp_type.id()); // Save the states of the output buffers for reuse in `chunk_read()`. 
- for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); - } + std::transform( + _output_buffers.begin(), + _output_buffers.end(), + std::back_inserter(_output_buffers_template), + [](auto const& buff) { return cudf::io::detail::inline_column_buffer::empty_like(buff); }); // Save the name to reference converter to extract output filter AST in // `preprocess_file()` and `finalize_output()` diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index e279285495d..bb6a485ac9b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -479,7 +479,7 @@ class aggregate_reader_metadata { * @param strings_to_categorical Type conversion parameter * @param timestamp_type_id Type conversion parameter * - * @return input column information, output column information, list of output column schema + * @return input column information, output column buffers, list of output column schema * indices */ [[nodiscard]] std::tuple, diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp index cb604bb8f95..290f81312f1 100644 --- a/cpp/tests/io/parquet_experimental_reader_test.cpp +++ b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -134,7 +134,7 @@ auto create_parquet_with_stats() cudf::io::write_parquet(out_opts); - return std::pair{std::move(table), buffer}; + return std::pair{std::move(table), std::vector(buffer.begin(), buffer.end())}; } } // namespace @@ -143,7 +143,7 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) { // Create a table with several row groups each with a single page. 
auto constexpr num_concat = 1; - auto [_, buffer] = create_parquet_with_stats(); + auto [_, file_buffer] = create_parquet_with_stats(); // Filtering AST - table[0] < 100 auto literal_value = cudf::numeric_scalar(100); @@ -153,32 +153,27 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) // Create reader options with empty source info cudf::io::parquet_reader_options options = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(nullptr, 0)) - .filter(filter_expression); - - // Input file buffer span - auto const file_buffer_span = - cudf::host_span(reinterpret_cast(buffer.data()), buffer.size()); + cudf::io::parquet_reader_options::builder().filter(filter_expression); // Fetch footer and page index bytes from the buffer. - auto const footer_buffer = fetch_footer_bytes(file_buffer_span); + auto const footer_buffer = fetch_footer_bytes(file_buffer); // Create hybrid scan reader with footer bytes auto const reader = std::make_unique(footer_buffer, options); - // Get Parquet file metadata from the reader - API # 1 + // Get Parquet file metadata from the reader auto parquet_metadata = reader->parquet_metadata(); // Check that the offset and column indices are not present ASSERT_FALSE(parquet_metadata.row_groups[0].columns[0].offset_index.has_value()); ASSERT_FALSE(parquet_metadata.row_groups[0].columns[0].column_index.has_value()); - // Get page index byte range from the reader - API # 2 + // Get page index byte range from the reader auto const page_index_byte_range = reader->page_index_byte_range(); // Fetch page index bytes from the input buffer - auto const page_index_buffer = fetch_page_index_bytes(file_buffer_span, page_index_byte_range); + auto const page_index_buffer = fetch_page_index_bytes(file_buffer, page_index_byte_range); // Setup page index - API # 3 reader->setup_page_index(page_index_buffer); @@ -190,7 +185,7 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) 
ASSERT_TRUE(parquet_metadata.row_groups[0].columns[0].offset_index.has_value()); ASSERT_TRUE(parquet_metadata.row_groups[0].columns[0].column_index.has_value()); - // Get all row groups from the reader - API # 4 + // Get all row groups from the reader auto input_row_group_indices = reader->all_row_groups(options); // Expect 4 = 20000 rows / 5000 rows per row group EXPECT_EQ(input_row_group_indices.size(), 4); @@ -207,8 +202,8 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) { // Create a table with 4 row groups each with a single page. - auto constexpr num_concat = 1; - auto [written_table, buffer] = create_parquet_with_stats(); + auto constexpr num_concat = 1; + auto [written_table, file_buffer] = create_parquet_with_stats(); // Filtering AST - table[0] < 50 auto literal_value = cudf::numeric_scalar(50); @@ -222,30 +217,25 @@ TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) .filter(filter_expression); // Fetch footer and page index bytes from the buffer. 
- auto const footer_buffer = fetch_footer_bytes( - cudf::host_span(reinterpret_cast(buffer.data()), buffer.size())); + auto const footer_buffer = fetch_footer_bytes(file_buffer); // Create hybrid scan reader with footer bytes auto const reader = std::make_unique(footer_buffer, options); - // Get all row groups from the reader - API # 4 + // Get all row groups from the reader auto input_row_group_indices = reader->all_row_groups(options); // Expect 4 = 20000 rows / 5000 rows per row group EXPECT_EQ(input_row_group_indices.size(), 4); + auto stats_filtered_row_groups = reader->filter_row_groups_with_stats( + input_row_group_indices, options, cudf::get_default_stream()); // Expect 3 row groups to be filtered out with stats - EXPECT_EQ( - reader - ->filter_row_groups_with_stats(input_row_group_indices, options, cudf::get_default_stream()) - .size(), - 1); + EXPECT_EQ(stats_filtered_row_groups.size(), 1); // Use custom input row group indices - input_row_group_indices = {1, 2}; + input_row_group_indices = {1, 2}; + stats_filtered_row_groups = reader->filter_row_groups_with_stats( + input_row_group_indices, options, cudf::get_default_stream()); // Expect all row groups to be filtered out with stats - EXPECT_EQ( - reader - ->filter_row_groups_with_stats(input_row_group_indices, options, cudf::get_default_stream()) - .size(), - 0); + EXPECT_EQ(stats_filtered_row_groups.size(), 0); } From e837f34679ea144536d00c5e9b00a36fe19d105b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 5 May 2025 22:23:39 +0000 Subject: [PATCH 24/28] Remove the missed source_info from test --- cpp/tests/io/parquet_experimental_reader_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp index 290f81312f1..2296a25a33f 100644 --- a/cpp/tests/io/parquet_experimental_reader_test.cpp +++ 
b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -213,8 +213,7 @@ TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) // Create reader options with empty source info cudf::io::parquet_reader_options options = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(nullptr, 0)) - .filter(filter_expression); + cudf::io::parquet_reader_options::builder().filter(filter_expression); // Fetch footer and page index bytes from the buffer. auto const footer_buffer = fetch_footer_bytes(file_buffer); From 8109a61d46df1376210c0021142a2b627ca9d2f2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 5 May 2025 22:28:22 +0000 Subject: [PATCH 25/28] Minor improvement --- cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index ec4682645a2..deb0f7a9d1c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -175,13 +175,13 @@ aggregate_reader_metadata::select_payload_columns( // Remove filter columns from the provided payload column names if (filter_column_names.has_value() and not filter_column_names->empty()) { // Add filter column names to a hash set for faster lookup - std::unordered_set filter_columns(filter_column_names->begin(), - filter_column_names->end()); + std::unordered_set filter_columns_set(filter_column_names->begin(), + filter_column_names->end()); // Remove a payload column name if it is also present in the hash set valid_payload_columns.erase(std::remove_if(valid_payload_columns.begin(), valid_payload_columns.end(), - [&filter_columns](std::string const& col) { - return filter_columns.count(col) > 0; + [&filter_columns_set](auto const& col) { + return filter_columns_set.count(col) > 0; }), 
valid_payload_columns.end()); } From ff7744d4c257200f1a656c90515858a5d460936b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 5 May 2025 22:42:05 +0000 Subject: [PATCH 26/28] Minor docs update --- cpp/include/cudf/io/parquet.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 05cc1ec967c..24bf1fbf916 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -139,7 +139,7 @@ class parquet_reader_options { [[nodiscard]] source_info const& get_source() const { return _source; } /** - * @brief Returns boolean depending on whether strings should be converted to categories or not. + * @brief Returns boolean depending on whether strings should be converted to categories. * * @return `true` if strings should be converted to categories */ @@ -149,7 +149,7 @@ class parquet_reader_options { } /** - * @brief Returns boolean depending on whether to use pandas metadata or not while reading. + * @brief Returns boolean depending on whether to use pandas metadata while reading. * * @return `true` if pandas metadata is used while reading */ @@ -309,14 +309,14 @@ class parquet_reader_options { /** * @brief Sets to enable/disable use of pandas metadata to read. * - * @param val Boolean value whether to use pandas metadata + * @param val Boolean indicating whether to use pandas metadata */ void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; } /** * @brief Sets to enable/disable use of arrow schema to read. * - * @param val Boolean value whether to use arrow schema + * @param val Boolean indicating whether to use arrow schema */ void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; } @@ -324,8 +324,8 @@ class parquet_reader_options { * @brief Sets to enable/disable reading of matching projected and filter columns from mismatched * Parquet sources. 
* - * @param val Boolean value whether to read matching projected and filter columns from mismatched - * Parquet sources. + * @param val Boolean indicating whether to read matching projected and filter columns from + * mismatched Parquet sources. */ void enable_allow_mismatched_pq_schemas(bool val) { _allow_mismatched_pq_schemas = val; } From 0cb0264238ce3f0a8f241a4175c135a7ebfaa259 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 6 May 2025 00:40:22 +0000 Subject: [PATCH 27/28] Use hash set to remove filter columns --- cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index deb0f7a9d1c..7e2ea3a5722 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -192,14 +192,18 @@ aggregate_reader_metadata::select_payload_columns( // Else if only filter columns are specified, select all columns that do not appear in the // filter expression + + // Add filter column names to a hash set for faster lookup + std::unordered_set filter_columns_set(filter_column_names->begin(), + filter_column_names->end()); + std::function add_column_path = [&](std::string path_till_now, int schema_idx) { auto const& schema_elem = get_schema(schema_idx); std::string const curr_path = path_till_now + schema_elem.name; // If the current path is not a filter column, then add it and its children to the list of valid // payload columns - if (std::find(filter_column_names->begin(), filter_column_names->end(), curr_path) == - filter_column_names->end()) { + if (filter_columns_set.count(curr_path) == 0) { valid_payload_columns.push_back(curr_path); // Add all children as well for (auto const& child_idx : schema_elem.children_idx) { From 7538030f72a849c850e5ece15eea498f75857dac Mon 
Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 6 May 2025 00:46:51 +0000 Subject: [PATCH 28/28] Revert the file buffer span in test --- .../io/parquet_experimental_reader_test.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/parquet_experimental_reader_test.cpp b/cpp/tests/io/parquet_experimental_reader_test.cpp index 2296a25a33f..4b9313ee94c 100644 --- a/cpp/tests/io/parquet_experimental_reader_test.cpp +++ b/cpp/tests/io/parquet_experimental_reader_test.cpp @@ -134,7 +134,7 @@ auto create_parquet_with_stats() cudf::io::write_parquet(out_opts); - return std::pair{std::move(table), std::vector(buffer.begin(), buffer.end())}; + return std::pair{std::move(table), std::move(buffer)}; } } // namespace @@ -155,8 +155,12 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) cudf::io::parquet_reader_options options = cudf::io::parquet_reader_options::builder().filter(filter_expression); + // Input file buffer span + auto const file_buffer_span = cudf::host_span( + reinterpret_cast(file_buffer.data()), file_buffer.size()); + // Fetch footer and page index bytes from the buffer. 
- auto const footer_buffer = fetch_footer_bytes(file_buffer); + auto const footer_buffer = fetch_footer_bytes(file_buffer_span); // Create hybrid scan reader with footer bytes auto const reader = @@ -173,9 +177,9 @@ TEST_F(ParquetExperimentalReaderTest, TestMetadata) auto const page_index_byte_range = reader->page_index_byte_range(); // Fetch page index bytes from the input buffer - auto const page_index_buffer = fetch_page_index_bytes(file_buffer, page_index_byte_range); + auto const page_index_buffer = fetch_page_index_bytes(file_buffer_span, page_index_byte_range); - // Setup page index - API # 3 + // Setup page index reader->setup_page_index(page_index_buffer); // Get Parquet file metadata from the reader again @@ -215,8 +219,12 @@ TEST_F(ParquetExperimentalReaderTest, TestFilterRowGroupWithStats) cudf::io::parquet_reader_options options = cudf::io::parquet_reader_options::builder().filter(filter_expression); + // Input file buffer span + auto const file_buffer_span = cudf::host_span( + reinterpret_cast(file_buffer.data()), file_buffer.size()); + // Fetch footer and page index bytes from the buffer. - auto const footer_buffer = fetch_footer_bytes(file_buffer); + auto const footer_buffer = fetch_footer_bytes(file_buffer_span); // Create hybrid scan reader with footer bytes auto const reader =