diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 47570405c91..f82177b86f1 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1249,11 +1249,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) { ASSERT_OK(NullableArray<::arrow::UInt32Type>(LARGE_SIZE, 100, kDefaultSeed, &values)); std::shared_ptr table = MakeSimpleTable(values, true); - // Parquet 2.4 roundtrip should yield an uint32_t column again + // Parquet 2.6 roundtrip should yield an uint32_t column again this->ResetSink(); std::shared_ptr<::parquet::WriterProperties> properties = ::parquet::WriterProperties::Builder() - .version(ParquetVersion::PARQUET_2_4) + .version(ParquetVersion::PARQUET_2_6) ->build(); ASSERT_OK_NO_THROW( WriteTable(*table, default_memory_pool(), this->sink_, 512, properties)); @@ -1613,7 +1613,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4)); } -void MakeDateTimeTypesTable(std::shared_ptr
* out, bool expected = false) { +void MakeDateTimeTypesTable(std::shared_ptr
* out, bool expected_micro = false) { using ::arrow::ArrayFromVector; std::vector is_valid = {true, true, true, false, true, true}; @@ -1629,7 +1629,7 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out, bool expected = false) auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO)); std::shared_ptr<::arrow::Schema> schema( - new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6})); + new ::arrow::Schema({f0, f1, f2, (expected_micro ? f3_x : f3), f4, f5, f6})); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; @@ -1654,20 +1654,30 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out, bool expected = false) ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6); - *out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6}); + *out = Table::Make(schema, {a0, a1, a2, expected_micro ? a3_x : a3, a4, a5, a6}); } TEST(TestArrowReadWrite, DateTimeTypes) { - std::shared_ptr
table, result; + std::shared_ptr
table, result, expected; + // Parquet 2.6 nanoseconds are preserved MakeDateTimeTypesTable(&table); ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); - MakeDateTimeTypesTable(&table, true); // build expected result - ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), + MakeDateTimeTypesTable(&expected, false); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*expected->schema(), + *result->schema(), /*check_metadata=*/false)); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected, *result)); + + // Parquet 2.4 nanoseconds are converted to microseconds + auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder() + .version(ParquetVersion::PARQUET_2_4) + ->build(); + MakeDateTimeTypesTable(&expected, true); + ASSERT_NO_FATAL_FAILURE( + CheckConfiguredRoundtrip(table, expected, parquet_version_2_4_properties)); } TEST(TestArrowReadWrite, UseDeprecatedInt96) { @@ -1973,7 +1983,9 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { field("ts:us", t_us), field("ts:ns", t_ns)}); auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns}); - auto parquet_version_1_properties = ::parquet::default_writer_properties(); + auto parquet_version_1_properties = ::parquet::WriterProperties::Builder() + .version(ParquetVersion::PARQUET_1_0) + ->build(); ARROW_SUPPRESS_DEPRECATION_WARNING auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder() .version(ParquetVersion::PARQUET_2_0) @@ -3334,7 +3346,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { auto arrow_writer_props = ArrowWriterProperties::Builder(); arrow_writer_props.store_schema(); if (inner_type->id() == ::arrow::Type::UINT32) { - writer_props.version(ParquetVersion::PARQUET_2_4); + writer_props.version(ParquetVersion::PARQUET_2_6); } else if (inner_type->id() == ::arrow::Type::TIMESTAMP) { // By default ns is coerced to us, override that ::arrow::TimeUnit::type unit = diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 621f2a0e76f..7c608e44247 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -869,9 +869,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { /*is_from_converted_type=*/false, /*force_set_converted_type=*/true), ParquetType::INT64, -1}, - // Parquet v1, values converted to microseconds {"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO), - LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS, + LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS, /*is_from_converted_type=*/false, /*force_set_converted_type=*/true), ParquetType::INT64, -1}, @@ -882,7 +881,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"), - LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, + LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64, -1}, {"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT64, @@ -891,7 +890,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"), - LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64, + LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64, -1}}; std::vector> arrow_fields; diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index ed9bb4b4078..af9876370ee 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -650,6 +650,7 @@ TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) { TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) { this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4); + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6); } TEST(TestWriter, NullValuesBuffer) { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index f16f6c49d3e..0a69659508e 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -214,7 +214,7 @@ class PARQUET_EXPORT WriterProperties { write_batch_size_(DEFAULT_WRITE_BATCH_SIZE), max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH), pagesize_(kDefaultDataPageSize), - version_(ParquetVersion::PARQUET_2_4), + version_(ParquetVersion::PARQUET_2_6), data_page_version_(ParquetDataPageVersion::V1), created_by_(DEFAULT_CREATED_BY), store_decimal_as_integer_(false), @@ -296,7 +296,7 @@ class PARQUET_EXPORT WriterProperties { } /// Specify the Parquet file version. - /// Default PARQUET_2_4. + /// Default PARQUET_2_6. Builder* version(ParquetVersion::type version) { version_ = version; return this; diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index 5fd182f679c..2ba1b8a6046 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -44,7 +44,7 @@ TEST(TestWriterProperties, Basics) { ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize()); ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, props->dictionary_pagesize_limit()); - ASSERT_EQ(ParquetVersion::PARQUET_2_4, props->version()); + ASSERT_EQ(ParquetVersion::PARQUET_2_6, props->version()); ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version()); ASSERT_FALSE(props->page_checksum_enabled()); } diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 781c7e23be6..2f53d5fbbaa 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -710,7 +710,7 @@ cdef class FileMetaData(_Weakrefable): """ Parquet format version used in file (str, such as '1.0', '2.4'). - If version is missing or unparsable, will default to assuming '2.4'. + If version is missing or unparsable, will default to assuming '2.6'. """ cdef ParquetVersion version = self._metadata.version() if version == ParquetVersion_V1: @@ -722,9 +722,9 @@ cdef class FileMetaData(_Weakrefable): elif version == ParquetVersion_V2_6: return '2.6' else: - warnings.warn('Unrecognized file version, assuming 2.4: {}' + warnings.warn('Unrecognized file version, assuming 2.6: {}' .format(version)) - return '2.4' + return '2.6' @property def created_by(self): diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 3fa0d6dadd2..cfcaa475dc0 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -748,7 +748,7 @@ def _sanitize_table(table, new_schema, flavor): return table -_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4" +_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.6" Determine which Parquet logical types are available for use, whether the reduced set from the Parquet 1.x.x format or the expanded logical types added in later format versions. @@ -944,7 +944,7 @@ class ParquetWriter: def __init__(self, where, schema, filesystem=None, flavor=None, - version='2.4', + version='2.6', use_dictionary=True, compression='snappy', write_statistics=True, @@ -3060,7 +3060,7 @@ def read_pandas(source, columns=None, **kwargs): _DNF_filter_doc, "") -def write_table(table, where, row_group_size=None, version='2.4', +def write_table(table, where, row_group_size=None, version='2.6', use_dictionary=True, compression='snappy', write_statistics=True, use_deprecated_int96_timestamps=None, diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 5c0e1a077d0..9bc59cbcf96 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -630,7 +630,9 @@ def test_write_error_deletes_incomplete_file(tempdir): filename = tempdir / 'tmp_file' try: - _write_table(pdf, filename) + # Test relies on writing nanoseconds to raise an error + # true for Parquet 2.4 + _write_table(pdf, filename, version="2.4") except pa.ArrowException: pass diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index bb1107972b2..1cad82b8398 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -336,9 +336,10 @@ def get_table(pq_reader_method, filename, **kwargs): tm.assert_frame_equal(df, df_correct) -def test_timestamp_restore_timezone(): +@pytest.mark.parametrize('unit', ['ms', 'us', 'ns']) +def test_timestamp_restore_timezone(unit): # ARROW-5888, restore timezone from serialized metadata - ty = pa.timestamp('ms', tz='America/New_York') + ty = pa.timestamp(unit, tz='America/New_York') arr = pa.array([1, 2, 3], type=ty) t = pa.table([arr], names=['f0']) _check_roundtrip(t) @@ -346,13 +347,13 @@ def test_timestamp_restore_timezone(): def test_timestamp_restore_timezone_nanosecond(): # ARROW-9634, also restore timezone for nanosecond data that get stored - # as microseconds in the parquet file + # as microseconds in the parquet file for Parquet ver 2.4 and less ty = pa.timestamp('ns', tz='America/New_York') arr = pa.array([1000, 2000, 3000], type=ty) table = pa.table([arr], names=['f0']) ty_us = pa.timestamp('us', tz='America/New_York') expected = pa.table([arr.cast(ty_us)], names=['f0']) - _check_roundtrip(table, expected=expected) + _check_roundtrip(table, expected=expected, version='2.4') @pytest.mark.pandas @@ -378,28 +379,31 @@ def test_parquet_version_timestamp_differences(): a_us = pa.array(d_us, type=pa.timestamp('us')) a_ns = pa.array(d_ns, type=pa.timestamp('ns')) + all_versions = ['1.0', '2.4', '2.6'] + names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns'] table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names) - # Using Parquet version 1.0, seconds should be coerced to milliseconds + # Using Parquet version 1.0 and 2.4, seconds should be coerced to milliseconds # and nanoseconds should be coerced to microseconds by default expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names) - _check_roundtrip(table, expected) + _check_roundtrip(table, expected, version='1.0') + _check_roundtrip(table, expected, version='2.4') - # Using Parquet version 2.0, seconds should be coerced to milliseconds + # Using Parquet version 2.6, seconds should be coerced to milliseconds # and nanoseconds should be retained by default expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names) _check_roundtrip(table, expected, version='2.6') - # Using Parquet version 1.0, coercing to milliseconds or microseconds + # For either Parquet version coercing to milliseconds or microseconds # is allowed expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names) - _check_roundtrip(table, expected, coerce_timestamps='ms') + for ver in all_versions: + _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver) - # Using Parquet version 2.0, coercing to milliseconds or microseconds - # is allowed expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names) - _check_roundtrip(table, expected, version='2.6', coerce_timestamps='us') + for ver in all_versions: + _check_roundtrip(table, expected, version=ver, coerce_timestamps='us') # TODO: after pyarrow allows coerce_timestamps='ns', tests like the # following should pass ... @@ -416,10 +420,9 @@ def test_parquet_version_timestamp_differences(): # For either Parquet version, coercing to nanoseconds is allowed # if Int96 storage is used expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names) - _check_roundtrip(table, expected, - use_deprecated_int96_timestamps=True) - _check_roundtrip(table, expected, version='2.6', - use_deprecated_int96_timestamps=True) + for ver in all_versions: + _check_roundtrip(table, expected, version=ver, + use_deprecated_int96_timestamps=True) @pytest.mark.pandas diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index ea999064277..6bd68e08fc5 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -256,7 +256,7 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): arrow_table = pa.Table.from_pandas(df) with filename.open('wb') as f: - _write_table(arrow_table, f, version="2.4") + _write_table(arrow_table, f, version="2.6") data = io.BytesIO(filename.read_bytes())