diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index c328263c7f3..56d0158a62f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -689,6 +689,7 @@ CUDF_KERNEL void __launch_bounds__(128) util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page]; + page_g.comp_data_size = comp_page_sizes[ck_g.first_page]; /* record the size of the same page whose compressed size was just added to comp_page_offset; indexing with first_page + num_pages would pick up a different page */ } page_headers_size += page_g.max_hdr_size; max_page_data_size = max(max_page_data_size, page_g.max_data_size); diff --git a/cpp/tests/io/compression_common.hpp b/cpp/tests/io/compression_common.hpp new file mode 100644 index 00000000000..b14c00a1243 --- /dev/null +++ b/cpp/tests/io/compression_common.hpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +class tmp_env_var { /* Scoped override of one environment variable: the constructor remembers any existing value and then sets the new one; the destructor restores the remembered value, or unsets the variable if it was not set before. Copy and move are deleted. */ + public: + explicit tmp_env_var(std::string name, std::string const& value) : name_(std::move(name)) + { + auto const previous_value = std::getenv(name_.c_str()); + if (previous_value != nullptr) { previous_value_ = std::string(previous_value); } + + setenv(name_.c_str(), value.c_str(), 1); + } + + tmp_env_var(tmp_env_var const&) = delete; + tmp_env_var& operator=(tmp_env_var const&) = delete; + tmp_env_var(tmp_env_var&&) = delete; + tmp_env_var& operator=(tmp_env_var&&) = delete; + + ~tmp_env_var() + { + if (previous_value_.has_value()) { + setenv(name_.c_str(), previous_value_->c_str(), 1); + } else { + unsetenv(name_.c_str()); + } + } + + private: + std::string name_; + std::optional previous_value_; +}; + +static constexpr char const* host_comp_env_var = "LIBCUDF_HOST_COMPRESSION"; +static constexpr char const* host_decomp_env_var = "LIBCUDF_HOST_DECOMPRESSION"; +static constexpr char const* nvcomp_policy_env_var = "LIBCUDF_NVCOMP_POLICY"; /* names of the environment variables overridden by the fixtures in this file */ + +template +struct CompressionTest + : public Base, + public ::testing::WithParamInterface> { + CompressionTest() + { /* The first element of the test parameter names the compression implementation (NVCOMP, DEVICE_INTERNAL or HOST); it is translated into environment-variable overrides, and any other name ends in CUDF_FAIL. */ + auto const comp_impl = std::get<0>(GetParam()); + + if (comp_impl == "NVCOMP") { + env_vars.emplace_back(host_comp_env_var, "OFF"); + env_vars.emplace_back(nvcomp_policy_env_var, "ALWAYS"); + } else if (comp_impl == "DEVICE_INTERNAL") { + env_vars.emplace_back(host_comp_env_var, "OFF"); + env_vars.emplace_back(nvcomp_policy_env_var, "OFF"); + } else if (comp_impl == "HOST") { + env_vars.emplace_back(host_comp_env_var, "ON"); + } else { + CUDF_FAIL("Invalid test parameter"); + } + } + + private: + std::list env_vars; /* overrides live as long as the fixture object; each element undoes its own change when destroyed */ +}; + +template +struct DecompressionTest + : public Base, + public ::testing::WithParamInterface> { + DecompressionTest() + { /* Same selection scheme as CompressionTest, but the host switch used is the decompression variable. */ + auto const comp_impl = std::get<0>(GetParam()); + + if (comp_impl == "NVCOMP") { + env_vars.emplace_back(host_decomp_env_var, "OFF"); + 
env_vars.emplace_back(nvcomp_policy_env_var, "ALWAYS"); + } else if (comp_impl == "DEVICE_INTERNAL") { + env_vars.emplace_back(host_decomp_env_var, "OFF"); + env_vars.emplace_back(nvcomp_policy_env_var, "OFF"); + } else if (comp_impl == "HOST") { + env_vars.emplace_back(host_decomp_env_var, "ON"); + } else { + CUDF_FAIL("Invalid test parameter"); + } + } + + private: + std::list env_vars; +}; diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 5f1aea71f73..72ee7d21b05 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "compression_common.hpp" + #include #include #include @@ -161,6 +163,8 @@ auto chunked_read(std::string const& filepath, struct OrcChunkedReaderTest : public cudf::test::BaseFixture {}; +using OrcChunkedDecompressionTest = DecompressionTest; + TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData) { std::vector> input_columns; @@ -1477,3 +1481,62 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) #endif // LOCAL_TEST } + +TEST_P(OrcChunkedDecompressionTest, RoundTripBasic) +{ /* Round trip: write a two-column table to an ORC file with the parameterized compression type, then check that chunked reads with three different input limits each reproduce the original table. */ + auto const compression_type = std::get<1>(GetParam()); + + auto const num_rows = 12'345; + + std::vector> input_columns; + auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 4; }); + input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); + input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release()); + auto expected = std::make_unique(std::move(input_columns)); + + auto const filepath = temp_env->get_temp_filepath("chunked_read_compressions.orc"); + auto const write_opts = + 
cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *expected) + .compression(compression_type) + .stripe_size_rows(2'000) + .row_index_stride(1'000) + .build(); + cudf::io::write_orc(write_opts); + + { + auto const [result, num_chunks] = + chunked_read(filepath, output_limit{0}, input_limit{2'400'000}); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0}, input_limit{240'000}); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath, output_limit{0}, input_limit{24'000}); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +INSTANTIATE_TEST_CASE_P(Nvcomp, + OrcChunkedDecompressionTest, + ::testing::Combine(::testing::Values("NVCOMP"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZSTD))); + +INSTANTIATE_TEST_CASE_P(DeviceInternal, + OrcChunkedDecompressionTest, + ::testing::Combine(::testing::Values("DEVICE_INTERNAL"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY))); + +INSTANTIATE_TEST_CASE_P(Host, + OrcChunkedDecompressionTest, + ::testing::Combine(::testing::Values("HOST"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::ZSTD))); /* each instantiation pairs one implementation name with the compression types exercised for it */ diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index bac259636a6..69044e44282 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "compression_common.hpp" + #include #include #include @@ -139,8 +141,9 @@ struct OrcStatisticsTest : public cudf::test::BaseFixture {}; // Test fixture for metadata tests struct OrcMetadataReaderTest : public cudf::test::BaseFixture {}; -struct OrcCompressionTest : public cudf::test::BaseFixture, - public ::testing::WithParamInterface {}; +using OrcCompressionTest = CompressionTest; + +using OrcDecompressionTest = DecompressionTest; /* tests on both fixtures read the compression type from element 1 of the test parameter */ namespace { // Generates a vector of uniform random values of type T @@ -1731,15 +1734,18 @@ TEST_F(OrcWriterTest, CompStats) EXPECT_FALSE(std::isnan(stats->compression_ratio())); } -TEST_F(OrcChunkedWriterTest, CompStats) +TEST_P(OrcCompressionTest, CompStats) { - auto table = create_random_fixed_table(1, 100000, true); + auto const compression_type = std::get<1>(GetParam()); + + auto table = create_random_fixed_table(1, 55000, true); auto const stats = std::make_shared(); std::vector unused_buffer; cudf::io::chunked_orc_writer_options opts = cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}) + .compression(compression_type) .compression_statistics(stats); cudf::io::orc_chunked_writer(opts).write(*table); @@ -1758,7 +1764,7 @@ TEST_F(OrcChunkedWriterTest, CompStats) EXPECT_EQ(stats->num_skipped_bytes(), 0); } -void expect_compression_stats_empty(std::shared_ptr stats) +void expect_compression_stats_empty(cudf::io::writer_compression_statistics const* stats) { EXPECT_EQ(stats->num_compressed_bytes(), 0); EXPECT_EQ(stats->num_failed_bytes(), 0); @@ -1766,8 +1772,10 @@ void expect_compression_stats_empty(std::shared_ptrcompression_ratio())); } -TEST_F(OrcWriterTest, CompStatsEmptyTable) +TEST_P(OrcCompressionTest, CompStatsEmptyTable) { + auto const compression_type = std::get<1>(GetParam()); + auto table_no_rows = create_random_fixed_table(20, 0, false); auto const stats = std::make_shared(); @@ -1775,14 +1783,17 @@ TEST_F(OrcWriterTest, CompStatsEmptyTable) std::vector unused_buffer; 
cudf::io::orc_writer_options opts = cudf::io::orc_writer_options::builder( cudf::io::sink_info{&unused_buffer}, table_no_rows->view()) + .compression(compression_type) .compression_statistics(stats); cudf::io::write_orc(opts); - expect_compression_stats_empty(stats); + expect_compression_stats_empty(stats.get()); } -TEST_F(OrcChunkedWriterTest, CompStatsEmptyTable) +TEST_P(OrcCompressionTest, ChunkedCompStatsEmptyTable) { + auto const compression_type = std::get<1>(GetParam()); + auto table_no_rows = create_random_fixed_table(20, 0, false); auto const stats = std::make_shared(); @@ -1790,10 +1801,11 @@ TEST_F(OrcChunkedWriterTest, CompStatsEmptyTable) std::vector unused_buffer; cudf::io::chunked_orc_writer_options opts = cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&unused_buffer}) + .compression(compression_type) .compression_statistics(stats); cudf::io::orc_chunked_writer(opts).write(*table_no_rows); - expect_compression_stats_empty(stats); + expect_compression_stats_empty(stats.get()); } TEST_F(OrcWriterTest, EmptyRowGroup) @@ -2024,15 +2036,9 @@ TEST_F(OrcStatisticsTest, Empty) EXPECT_EQ(ts6.count[0], 0); } -TEST_P(OrcCompressionTest, Basic) +void round_trip_basic(cudf::io::compression_type compression_type) /* Shared round-trip check used by the compression and decompression fixtures: writes a two-column table to a host buffer with the given compression type and statistics collection enabled, reads it back, and expects equal tables plus non-zero compressed bytes, zero failed and skipped bytes, and a non-NaN ratio. */ { - constexpr auto num_rows = 12000; - auto const compression_type = GetParam(); - - if (not cudf::io::is_supported_read_orc(compression_type) or - not cudf::io::is_supported_write_orc(compression_type)) { - GTEST_SKIP() << "Compression not supported with the current configuration"; - } + constexpr auto num_rows = 12345; // Generate compressible data auto int_sequence = @@ -2045,10 +2051,13 @@ TEST_P(OrcCompressionTest, Basic) table_view expected({int_col, float_col}); + auto const stats = std::make_shared(); + std::vector out_buffer; cudf::io::orc_writer_options out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected) - .compression(compression_type); + .compression(compression_type) + 
.compression_statistics(stats); cudf::io::write_orc(out_opts); cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder( @@ -2056,15 +2065,16 @@ TEST_P(OrcCompressionTest, Basic) auto result = cudf::io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + EXPECT_NE(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_FALSE(std::isnan(stats->compression_ratio())); } -INSTANTIATE_TEST_CASE_P(OrcCompressionTest, - OrcCompressionTest, - ::testing::Values(cudf::io::compression_type::NONE, - cudf::io::compression_type::AUTO, - cudf::io::compression_type::SNAPPY, - cudf::io::compression_type::LZ4, - cudf::io::compression_type::ZSTD)); +TEST_P(OrcCompressionTest, RoundTripBasic) { round_trip_basic(std::get<1>(GetParam())); } + +TEST_P(OrcDecompressionTest, RoundTripBasic) { round_trip_basic(std::get<1>(GetParam())); } TEST_F(OrcWriterTest, BounceBufferBug) { @@ -2278,4 +2288,46 @@ TEST_F(OrcWriterTest, MultipleBlocksInStripeFooter) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +INSTANTIATE_TEST_CASE_P(Nvcomp, + OrcCompressionTest, + ::testing::Combine(::testing::Values("NVCOMP"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZSTD))); + +INSTANTIATE_TEST_CASE_P(DeviceInternal, + OrcCompressionTest, + ::testing::Combine(::testing::Values("DEVICE_INTERNAL"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY))); + +INSTANTIATE_TEST_CASE_P(Host, + OrcCompressionTest, + ::testing::Combine(::testing::Values("HOST"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::ZSTD))); + +INSTANTIATE_TEST_CASE_P(Nvcomp, + OrcDecompressionTest, + ::testing::Combine(::testing::Values("NVCOMP"), + 
::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZSTD))); + +INSTANTIATE_TEST_CASE_P(DeviceInternal, + OrcDecompressionTest, + ::testing::Combine(::testing::Values("DEVICE_INTERNAL"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY))); + +INSTANTIATE_TEST_CASE_P(Host, + OrcDecompressionTest, + ::testing::Combine(::testing::Values("HOST"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::ZSTD))); + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 05c1d30624b..10f9d531e98 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "compression_common.hpp" #include "parquet_common.hpp" #include @@ -181,6 +182,8 @@ auto const read_table_and_nrows_per_source(cudf::io::chunked_parquet_reader cons struct ParquetChunkedReaderTest : public cudf::test::BaseFixture {}; +using ParquetChunkedDecompressionTest = DecompressionTest; + TEST_F(ParquetChunkedReaderTest, TestChunkedReadNoData) { std::vector> input_columns; @@ -1882,3 +1885,55 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceEmptyTable) EXPECT_TRUE( std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); } + +TEST_P(ParquetChunkedDecompressionTest, RoundTripBasic) +{ /* Round trip: write a table from create_compressible_fixed_table to a Parquet file with the parameterized compression type and 2000-row row groups, then check that three chunked reads with decreasing limit values reproduce it; the read with the largest limit is also expected to finish in one chunk. */ + auto const compression_type = std::get<1>(GetParam()); + + srand(31337); + auto expected = create_compressible_fixed_table(4, 23456, 3, true); + + auto const filepath = temp_env->get_temp_filepath("chunked_decompression"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) + .compression(compression_type) + 
.row_group_size_rows(2000); + cudf::io::write_parquet(args); + + { + auto const [result, num_chunks] = chunked_read(filepath, 0, 2'400'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath, 0, 240'000); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [result, num_chunks] = chunked_read(filepath, 0, 24'000); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } +} + +INSTANTIATE_TEST_CASE_P(Nvcomp, + ParquetChunkedDecompressionTest, + ::testing::Combine(::testing::Values("NVCOMP"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZSTD))); + +INSTANTIATE_TEST_CASE_P(DeviceInternal, + ParquetChunkedDecompressionTest, + ::testing::Combine(::testing::Values("DEVICE_INTERNAL"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY))); + +INSTANTIATE_TEST_CASE_P(Host, + ParquetChunkedDecompressionTest, + ::testing::Combine(::testing::Values("HOST"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::ZSTD))); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 20e1469a993..f0bce8ed064 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -30,9 +30,6 @@ template struct ParquetWriterDeltaTest : public ParquetWriterTest {}; -struct ParquetCompressionTest : public cudf::test::BaseFixture, - public ::testing::WithParamInterface {}; - TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) @@ -234,46 +231,3 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) EXPECT_EQ(ci.boundary_order, expected_orders[i]); } } - -TEST_P(ParquetCompressionTest, Basic) -{ - constexpr auto 
num_rows = 12000; - auto const compression_type = GetParam(); - - if (not cudf::io::is_supported_read_parquet(compression_type) or - not cudf::io::is_supported_write_parquet(compression_type)) { - GTEST_SKIP() << "Compression not supported with the current configuration"; - } - - // Generate compressible data - auto int_sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); - auto float_sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 32; }); - - cudf::test::fixed_width_column_wrapper int_col(int_sequence, int_sequence + num_rows); - cudf::test::fixed_width_column_wrapper float_col(float_sequence, - float_sequence + num_rows); - - table_view expected({int_col, float_col}); - - std::vector out_buffer; - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected) - .compression(compression_type); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -INSTANTIATE_TEST_CASE_P(ParquetCompressionTest, - ParquetCompressionTest, - ::testing::Values(cudf::io::compression_type::NONE, - cudf::io::compression_type::AUTO, - cudf::io::compression_type::SNAPPY, - cudf::io::compression_type::LZ4, - cudf::io::compression_type::ZSTD)); diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index fe0727b0eda..0af7e9f8c7e 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "compression_common.hpp" #include "parquet_common.hpp" #include @@ -33,6 +34,8 @@ #include +using ParquetDecompressionTest = DecompressionTest; + TEST_F(ParquetReaderTest, UserBounds) { // trying to read more rows than there are should result in @@ -2799,6 +2802,48 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) } } +TEST_P(ParquetDecompressionTest, RoundTripBasic) +{ /* Round trip through a host buffer: write a table from create_compressible_fixed_table with the parameterized compression type, read it back with default-built reader options, and expect the same table. */ + auto const compression_type = std::get<1>(GetParam()); + + srand(31337); + // Exercises multiple rowgroups + auto expected = create_compressible_fixed_table(4, 12345, 3, true); + + // Use a host buffer for faster I/O + std::vector buffer; + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, *expected) + .compression(compression_type); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{buffer.data(), buffer.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +INSTANTIATE_TEST_CASE_P(Nvcomp, + ParquetDecompressionTest, + ::testing::Combine(::testing::Values("NVCOMP"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZSTD))); + +INSTANTIATE_TEST_CASE_P(DeviceInternal, + ParquetDecompressionTest, + ::testing::Combine(::testing::Values("DEVICE_INTERNAL"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY))); + +INSTANTIATE_TEST_CASE_P(Host, + ParquetDecompressionTest, + ::testing::Combine(::testing::Values("HOST"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::ZSTD))); + ////////////////////// // wide tables tests diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp 
index 98423593867..746942763b4 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "compression_common.hpp" #include "parquet_common.hpp" #include @@ -37,32 +38,7 @@ using cudf::test::iterators::no_nulls; -struct CompressionTest - : public ParquetWriterTest, - public ::testing::WithParamInterface> { - CompressionTest() - { - auto const comp_impl = std::get<0>(GetParam()); - - if (comp_impl == "NVCOMP") { - setenv("LIBCUDF_HOST_COMPRESSION", "OFF", 1); - setenv("LIBCUDF_NVCOMP_POLICY", "ALWAYS", 1); - } else if (comp_impl == "DEVICE_INTERNAL") { - setenv("LIBCUDF_HOST_COMPRESSION", "OFF", 1); - setenv("LIBCUDF_NVCOMP_POLICY", "OFF", 1); - } else if (comp_impl == "HOST") { - setenv("LIBCUDF_HOST_COMPRESSION", "ON", 1); - setenv("LIBCUDF_NVCOMP_POLICY", "OFF", 1); - } else { - CUDF_FAIL("Invalid test parameter"); - } - } - ~CompressionTest() override - { - unsetenv("LIBCUDF_HOST_COMPRESSION"); - unsetenv("LIBCUDF_NVCOMP_POLICY"); - } -}; +using ParquetCompressionTest = CompressionTest; /* replaces the file-local fixture removed above with the shared CompressionTest template */ template void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema) @@ -1361,7 +1337,7 @@ TEST_F(ParquetWriterTest, UserNullabilityInvalid) EXPECT_THROW(cudf::io::write_parquet(write_opts), cudf::logic_error); } -TEST_P(CompressionTest, CompStats) +TEST_P(ParquetCompressionTest, CompStats) { auto const compression_type = std::get<1>(GetParam()); @@ -1388,7 +1364,7 @@ TEST_P(CompressionTest, CompStats) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table->view()); } -TEST_P(CompressionTest, CompStatsEmptyTable) +TEST_P(ParquetCompressionTest, CompStatsEmptyTable) { auto const compression_type = std::get<1>(GetParam()); @@ -1407,17 +1383,61 @@ TEST_P(CompressionTest, CompStatsEmptyTable) expect_compression_stats_empty(stats); } -INSTANTIATE_TEST_CASE_P(NvcompCompressionTest, - CompressionTest, +TEST_P(ParquetCompressionTest, RoundTripBasic) +{ /* Round trip through a host buffer: write a two-column table with the parameterized compression type and statistics collection enabled, read it back, and expect equal tables plus non-zero compressed bytes, zero failed and skipped bytes, and a non-NaN ratio. */ + constexpr auto 
num_rows = 12000; + auto const compression_type = std::get<1>(GetParam()); + + // Generate compressible data + auto int_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); + auto float_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 32; }); + + cudf::test::fixed_width_column_wrapper int_col(int_sequence, int_sequence + num_rows); + cudf::test::fixed_width_column_wrapper float_col(float_sequence, + float_sequence + num_rows); + + table_view expected({int_col, float_col}); + + auto const stats = std::make_shared(); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected) + .compression(compression_type) + .compression_statistics(stats); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + EXPECT_NE(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_FALSE(std::isnan(stats->compression_ratio())); +} + +INSTANTIATE_TEST_CASE_P(Nvcomp, + ParquetCompressionTest, ::testing::Combine(::testing::Values("NVCOMP"), ::testing::Values(cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD))); -INSTANTIATE_TEST_CASE_P(OtherCompressionTest, - CompressionTest, - ::testing::Combine(::testing::Values("DEVICE_INTERNAL", "HOST"), +INSTANTIATE_TEST_CASE_P(DeviceInternal, + ParquetCompressionTest, + ::testing::Combine(::testing::Values("DEVICE_INTERNAL"), + ::testing::Values(cudf::io::compression_type::AUTO, + cudf::io::compression_type::SNAPPY))); + +INSTANTIATE_TEST_CASE_P(Host, + 
ParquetCompressionTest, + ::testing::Combine(::testing::Values("HOST"), ::testing::Values(cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::ZSTD)));