-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-35746: [Parquet][C++][Python] Switch default Parquet version to 2.6 #36137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
14b5c6f
ed53e1c
53c0e47
4095601
6107218
213275b
115030c
932de52
b86b420
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1249,11 +1249,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) { | |
| ASSERT_OK(NullableArray<::arrow::UInt32Type>(LARGE_SIZE, 100, kDefaultSeed, &values)); | ||
| std::shared_ptr<Table> table = MakeSimpleTable(values, true); | ||
|
|
||
| // Parquet 2.4 roundtrip should yield an uint32_t column again | ||
| // Parquet 2.6 roundtrip should yield an uint32_t column again | ||
| this->ResetSink(); | ||
| std::shared_ptr<::parquet::WriterProperties> properties = | ||
| ::parquet::WriterProperties::Builder() | ||
| .version(ParquetVersion::PARQUET_2_4) | ||
| .version(ParquetVersion::PARQUET_2_6) | ||
| ->build(); | ||
| ASSERT_OK_NO_THROW( | ||
| WriteTable(*table, default_memory_pool(), this->sink_, 512, properties)); | ||
|
|
@@ -1613,7 +1613,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { | |
| ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4)); | ||
| } | ||
|
|
||
| void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false) { | ||
| void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected_micro = false) { | ||
| using ::arrow::ArrayFromVector; | ||
|
|
||
| std::vector<bool> is_valid = {true, true, true, false, true, true}; | ||
|
|
@@ -1629,7 +1629,7 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false) | |
| auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO)); | ||
|
|
||
| std::shared_ptr<::arrow::Schema> schema( | ||
| new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6})); | ||
| new ::arrow::Schema({f0, f1, f2, (expected_micro ? f3_x : f3), f4, f5, f6})); | ||
|
|
||
| std::vector<int32_t> t32_values = {1489269000, 1489270000, 1489271000, | ||
| 1489272000, 1489272000, 1489273000}; | ||
|
|
@@ -1654,20 +1654,30 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false) | |
| ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); | ||
| ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6); | ||
|
|
||
| *out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6}); | ||
| *out = Table::Make(schema, {a0, a1, a2, expected_micro ? a3_x : a3, a4, a5, a6}); | ||
| } | ||
|
|
||
| TEST(TestArrowReadWrite, DateTimeTypes) { | ||
| std::shared_ptr<Table> table, result; | ||
| std::shared_ptr<Table> table, result, expected; | ||
|
|
||
| // Parquet 2.6 nanoseconds are preserved | ||
| MakeDateTimeTypesTable(&table); | ||
| ASSERT_NO_FATAL_FAILURE( | ||
| DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); | ||
|
|
||
| MakeDateTimeTypesTable(&table, true); // build expected result | ||
| ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), | ||
| MakeDateTimeTypesTable(&expected, false); | ||
| ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*expected->schema(), | ||
| *result->schema(), | ||
| /*check_metadata=*/false)); | ||
| ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); | ||
| ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected, *result)); | ||
|
|
||
| // Parquet 2.4 nanoseconds are converted to microseconds | ||
| auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder() | ||
| .version(ParquetVersion::PARQUET_2_4) | ||
| ->build(); | ||
| MakeDateTimeTypesTable(&expected, true); | ||
| ASSERT_NO_FATAL_FAILURE( | ||
| CheckConfiguredRoundtrip(table, expected, parquet_version_2_4_properties)); | ||
| } | ||
|
|
||
| TEST(TestArrowReadWrite, UseDeprecatedInt96) { | ||
|
|
@@ -1973,7 +1983,9 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { | |
| field("ts:us", t_us), field("ts:ns", t_ns)}); | ||
| auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns}); | ||
|
|
||
| auto parquet_version_1_properties = ::parquet::default_writer_properties(); | ||
| auto parquet_version_1_properties = ::parquet::WriterProperties::Builder() | ||
| .version(ParquetVersion::PARQUET_1_0) | ||
| ->build(); | ||
| ARROW_SUPPRESS_DEPRECATION_WARNING | ||
| auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder() | ||
| .version(ParquetVersion::PARQUET_2_0) | ||
|
|
@@ -3334,7 +3346,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { | |
| auto arrow_writer_props = ArrowWriterProperties::Builder(); | ||
| arrow_writer_props.store_schema(); | ||
| if (inner_type->id() == ::arrow::Type::UINT32) { | ||
| writer_props.version(ParquetVersion::PARQUET_2_4); | ||
| writer_props.version(ParquetVersion::PARQUET_2_6); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The default writer_props created above with |
||
| } else if (inner_type->id() == ::arrow::Type::TIMESTAMP) { | ||
| // By default ns is coerced to us, override that | ||
| ::arrow::TimeUnit::type unit = | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -869,9 +869,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { | |
| /*is_from_converted_type=*/false, | ||
| /*force_set_converted_type=*/true), | ||
| ParquetType::INT64, -1}, | ||
| // Parquet v1, values converted to microseconds | ||
| {"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO), | ||
| LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS, | ||
| LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS, | ||
| /*is_from_converted_type=*/false, | ||
| /*force_set_converted_type=*/true), | ||
| ParquetType::INT64, -1}, | ||
|
|
@@ -882,7 +881,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { | |
| LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, | ||
| -1}, | ||
| {"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"), | ||
| LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, | ||
| LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64, | ||
| -1}, | ||
| {"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), | ||
| LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT64, | ||
|
|
@@ -891,7 +890,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { | |
| LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, | ||
| -1}, | ||
| {"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"), | ||
| LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, | ||
| LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we extract NANOS, and test with both 2_4 and 2_6? Since 2.4 might still be used for some time |
||
| -1}}; | ||
|
|
||
| std::vector<std::shared_ptr<Field>> arrow_fields; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -336,23 +336,24 @@ def get_table(pq_reader_method, filename, **kwargs): | |
| tm.assert_frame_equal(df, df_correct) | ||
|
|
||
|
|
||
| def test_timestamp_restore_timezone(): | ||
| @pytest.mark.parametrize('unit', ['ms', 'us', 'ns']) | ||
| def test_timestamp_restore_timezone(unit): | ||
| # ARROW-5888, restore timezone from serialized metadata | ||
| ty = pa.timestamp('ms', tz='America/New_York') | ||
| ty = pa.timestamp(unit, tz='America/New_York') | ||
| arr = pa.array([1, 2, 3], type=ty) | ||
| t = pa.table([arr], names=['f0']) | ||
| _check_roundtrip(t) | ||
|
|
||
|
|
||
| def test_timestamp_restore_timezone_nanosecond(): | ||
| # ARROW-9634, also restore timezone for nanosecond data that get stored | ||
| # as microseconds in the parquet file | ||
| # as microseconds in the parquet file for Parquet ver 2.4 and less | ||
| ty = pa.timestamp('ns', tz='America/New_York') | ||
| arr = pa.array([1000, 2000, 3000], type=ty) | ||
| table = pa.table([arr], names=['f0']) | ||
| ty_us = pa.timestamp('us', tz='America/New_York') | ||
| expected = pa.table([arr.cast(ty_us)], names=['f0']) | ||
| _check_roundtrip(table, expected=expected) | ||
| _check_roundtrip(table, expected=expected, version='2.4') | ||
|
|
||
|
|
||
| @pytest.mark.pandas | ||
|
|
@@ -378,28 +379,31 @@ def test_parquet_version_timestamp_differences(): | |
| a_us = pa.array(d_us, type=pa.timestamp('us')) | ||
| a_ns = pa.array(d_ns, type=pa.timestamp('ns')) | ||
|
|
||
| all_versions = ['1.0', '2.4', '2.6'] | ||
|
|
||
| names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns'] | ||
| table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names) | ||
|
|
||
| # Using Parquet version 1.0, seconds should be coerced to milliseconds | ||
| # Using Parquet version 1.0 and 2.4, seconds should be coerced to milliseconds | ||
| # and nanoseconds should be coerced to microseconds by default | ||
| expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names) | ||
| _check_roundtrip(table, expected) | ||
| _check_roundtrip(table, expected, version='1.0') | ||
| _check_roundtrip(table, expected, version='2.4') | ||
|
|
||
| # Using Parquet version 2.0, seconds should be coerced to milliseconds | ||
| # Using Parquet version 2.6, seconds should be coerced to milliseconds | ||
| # and nanoseconds should be retained by default | ||
| expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names) | ||
| _check_roundtrip(table, expected, version='2.6') | ||
|
|
||
| # Using Parquet version 1.0, coercing to milliseconds or microseconds | ||
| # For either Parquet version coercing to milliseconds or microseconds | ||
| # is allowed | ||
| expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names) | ||
| _check_roundtrip(table, expected, coerce_timestamps='ms') | ||
| for ver in all_versions: | ||
| _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver) | ||
|
|
||
| # Using Parquet version 2.0, coercing to milliseconds or microseconds | ||
| # is allowed | ||
| expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names) | ||
| _check_roundtrip(table, expected, version='2.6', coerce_timestamps='us') | ||
| for ver in all_versions: | ||
| _check_roundtrip(table, expected, version=ver, coerce_timestamps='us') | ||
|
|
||
| # TODO: after pyarrow allows coerce_timestamps='ns', tests like the | ||
| # following should pass ... | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could also be left for a follow-up, but we could now allow this (in |
||
|
|
@@ -416,10 +420,9 @@ def test_parquet_version_timestamp_differences(): | |
| # For either Parquet version, coercing to nanoseconds is allowed | ||
| # if Int96 storage is used | ||
| expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names) | ||
| _check_roundtrip(table, expected, | ||
| use_deprecated_int96_timestamps=True) | ||
| _check_roundtrip(table, expected, version='2.6', | ||
| use_deprecated_int96_timestamps=True) | ||
| for ver in all_versions: | ||
| _check_roundtrip(table, expected, version=ver, | ||
| use_deprecated_int96_timestamps=True) | ||
|
|
||
|
|
||
| @pytest.mark.pandas | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice catch!