34 changes: 23 additions & 11 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1249,11 +1249,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) {
ASSERT_OK(NullableArray<::arrow::UInt32Type>(LARGE_SIZE, 100, kDefaultSeed, &values));
std::shared_ptr<Table> table = MakeSimpleTable(values, true);

// Parquet 2.4 roundtrip should yield an uint32_t column again
// Parquet 2.6 roundtrip should yield an uint32_t column again
this->ResetSink();
std::shared_ptr<::parquet::WriterProperties> properties =
::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_4)
.version(ParquetVersion::PARQUET_2_6)
->build();
ASSERT_OK_NO_THROW(
WriteTable(*table, default_memory_pool(), this->sink_, 512, properties));
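
The user-visible effect of this uint32 handling, sketched from the Python side (a minimal pyarrow sketch assuming a build where the writer now defaults to format version 2.6; file names are illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"u": pa.array([1, 2, 3], type=pa.uint32())})

# Parquet 1.0 has no 32-bit unsigned logical type, so uint32 is widened to int64
pq.write_table(t, "u_v1.parquet", version="1.0")
assert pq.read_table("u_v1.parquet").column("u").type == pa.int64()

# Format versions 2.4+ keep the UINT_32 annotation, so the column round-trips
pq.write_table(t, "u_v26.parquet")  # no explicit version: assumed 2.6 default
assert pq.read_table("u_v26.parquet").column("u").type == pa.uint32()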
@@ -1613,7 +1613,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) {
ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4));
}

void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false) {
void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected_micro = false) {
using ::arrow::ArrayFromVector;

std::vector<bool> is_valid = {true, true, true, false, true, true};
@@ -1629,7 +1629,7 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false)
auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO));

std::shared_ptr<::arrow::Schema> schema(
new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6}));
new ::arrow::Schema({f0, f1, f2, (expected_micro ? f3_x : f3), f4, f5, f6}));

std::vector<int32_t> t32_values = {1489269000, 1489270000, 1489271000,
1489272000, 1489272000, 1489273000};
@@ -1654,20 +1654,30 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false)
ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5);
ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6);

*out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6});
*out = Table::Make(schema, {a0, a1, a2, expected_micro ? a3_x : a3, a4, a5, a6});
}

TEST(TestArrowReadWrite, DateTimeTypes) {
std::shared_ptr<Table> table, result;
std::shared_ptr<Table> table, result, expected;

// Parquet 2.6 nanoseconds are preserved
MakeDateTimeTypesTable(&table);
ASSERT_NO_FATAL_FAILURE(
DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result));

MakeDateTimeTypesTable(&table, true); // build expected result
ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
MakeDateTimeTypesTable(&expected, false);
ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*expected->schema(),
*result->schema(),
/*check_metadata=*/false));
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result));
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected, *result));

// Parquet 2.4 nanoseconds are converted to microseconds
auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_4)
->build();
MakeDateTimeTypesTable(&expected, true);
ASSERT_NO_FATAL_FAILURE(
CheckConfiguredRoundtrip(table, expected, parquet_version_2_4_properties));
}
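
The same before/after contrast, sketched from the Python side (assumes the new 2.6 default; not part of this PR's test suite):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("ns"))})

# Default (assumed 2.6): nanoseconds are preserved
pq.write_table(table, "ts_default.parquet")
assert pq.read_table("ts_default.parquet").column("ts").type == pa.timestamp("ns")

# Explicit 2.4: nanoseconds are coerced to microseconds
pq.write_table(table, "ts_v24.parquet", version="2.4")
assert pq.read_table("ts_v24.parquet").column("ts").type == pa.timestamp("us")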

TEST(TestArrowReadWrite, UseDeprecatedInt96) {
@@ -1973,7 +1983,9 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) {
field("ts:us", t_us), field("ts:ns", t_ns)});
auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns});

auto parquet_version_1_properties = ::parquet::default_writer_properties();
Review comment (Member): nice catch!

auto parquet_version_1_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_1_0)
->build();
ARROW_SUPPRESS_DEPRECATION_WARNING
auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_0)
@@ -3334,7 +3346,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) {
auto arrow_writer_props = ArrowWriterProperties::Builder();
arrow_writer_props.store_schema();
if (inner_type->id() == ::arrow::Type::UINT32) {
writer_props.version(ParquetVersion::PARQUET_2_4);
writer_props.version(ParquetVersion::PARQUET_2_6);
Review comment (Member): The default writer_props created above with WriterProperties::Builder() will now use 2_6 by default, so potentially this could be removed altogether.

} else if (inner_type->id() == ::arrow::Type::TIMESTAMP) {
// By default ns is coerced to us, override that
::arrow::TimeUnit::type unit =
7 changes: 3 additions & 4 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -869,9 +869,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
/*is_from_converted_type=*/false,
/*force_set_converted_type=*/true),
ParquetType::INT64, -1},
// Parquet v1, values converted to microseconds
{"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO),
LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS,
LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS,
/*is_from_converted_type=*/false,
/*force_set_converted_type=*/true),
ParquetType::INT64, -1},
@@ -882,7 +881,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-1},
{"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64,
-1},
{"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT64,
@@ -891,7 +890,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-1},
{"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64,
Review comment (Member): Should we extract NANOS, and test with both 2_4 and 2_6? Since 2.4 might still be used for some time.

-1}};

std::vector<std::shared_ptr<Field>> arrow_fields;
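How the NANOS mapping tested above surfaces through pyarrow's schema introspection; a hedged sketch, with the printed reprs being illustrative rather than exact:

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"ts": pa.array([1], type=pa.timestamp("ns", tz="UTC"))})
pq.write_table(t, "ns_utc.parquet")  # assumed default: format version 2.6
col = pq.ParquetFile("ns_utc.parquet").schema.column(0)
print(col.physical_type)  # INT64
print(col.logical_type)   # Timestamp logical type with timeUnit=nanoseconds
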
1 change: 1 addition & 0 deletions cpp/src/parquet/column_writer_test.cc
@@ -650,6 +650,7 @@ TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) {

TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6);
}

TEST(TestWriter, NullValuesBuffer) {
4 changes: 2 additions & 2 deletions cpp/src/parquet/properties.h
@@ -214,7 +214,7 @@ class PARQUET_EXPORT WriterProperties {
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
pagesize_(kDefaultDataPageSize),
version_(ParquetVersion::PARQUET_2_4),
version_(ParquetVersion::PARQUET_2_6),
data_page_version_(ParquetDataPageVersion::V1),
created_by_(DEFAULT_CREATED_BY),
store_decimal_as_integer_(false),
@@ -296,7 +296,7 @@ }
}

/// Specify the Parquet file version.
/// Default PARQUET_2_4.
/// Default PARQUET_2_6.
Builder* version(ParquetVersion::type version) {
version_ = version;
return this;
2 changes: 1 addition & 1 deletion cpp/src/parquet/properties_test.cc
@@ -44,7 +44,7 @@ TEST(TestWriterProperties, Basics) {

ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize());
ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, props->dictionary_pagesize_limit());
ASSERT_EQ(ParquetVersion::PARQUET_2_4, props->version());
ASSERT_EQ(ParquetVersion::PARQUET_2_6, props->version());
ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version());
ASSERT_FALSE(props->page_checksum_enabled());
}
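A quick way to observe the new default from Python (illustrative sketch, not part of this diff):

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"x": [1, 2, 3]}), "meta_check.parquet")
assert pq.ParquetFile("meta_check.parquet").metadata.format_version == "2.6"
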
6 changes: 3 additions & 3 deletions python/pyarrow/_parquet.pyx
@@ -710,7 +710,7 @@ cdef class FileMetaData(_Weakrefable):
"""
Parquet format version used in file (str, such as '1.0', '2.4').

If version is missing or unparsable, will default to assuming '2.4'.
If version is missing or unparsable, will default to assuming '2.6'.
"""
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
@@ -722,9 +722,9 @@ cdef class FileMetaData(_Weakrefable):
elif version == ParquetVersion_V2_6:
return '2.6'
else:
warnings.warn('Unrecognized file version, assuming 2.4: {}'
warnings.warn('Unrecognized file version, assuming 2.6: {}'
.format(version))
return '2.4'
return '2.6'

@property
def created_by(self):
6 changes: 3 additions & 3 deletions python/pyarrow/parquet/core.py
@@ -748,7 +748,7 @@ def _sanitize_table(table, new_schema, flavor):
return table


_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4"
_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.6"
Determine which Parquet logical types are available for use, whether the
reduced set from the Parquet 1.x.x format or the expanded logical types
added in later format versions.
Expand Down Expand Up @@ -944,7 +944,7 @@ class ParquetWriter:

def __init__(self, where, schema, filesystem=None,
flavor=None,
version='2.4',
version='2.6',
use_dictionary=True,
compression='snappy',
write_statistics=True,
@@ -3060,7 +3060,7 @@ def read_pandas(source, columns=None, **kwargs):
_DNF_filter_doc, "")


def write_table(table, where, row_group_size=None, version='2.4',
def write_table(table, where, row_group_size=None, version='2.6',
use_dictionary=True, compression='snappy',
write_statistics=True,
use_deprecated_int96_timestamps=None,
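Minimal usage sketch of the updated default in the writer API (file name is illustrative; behavior assumes the 2.6 default documented above):

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("ts", pa.timestamp("ns"))])
data = pa.table({"ts": pa.array([1, 2], type=pa.timestamp("ns"))})
with pq.ParquetWriter("writer_default.parquet", schema) as writer:  # version defaults to "2.6"
    writer.write_table(data)  # nanoseconds kept, no coercion needed
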
4 changes: 3 additions & 1 deletion python/pyarrow/tests/parquet/test_basic.py
@@ -630,7 +630,9 @@ def test_write_error_deletes_incomplete_file(tempdir):

filename = tempdir / 'tmp_file'
try:
_write_table(pdf, filename)
# The test relies on the write raising an error; writing nanosecond
# timestamps does so under Parquet format version 2.4
_write_table(pdf, filename, version="2.4")
except pa.ArrowException:
pass

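A hedged sketch of the failure mode the pinned version relies on: under 2.4, nanoseconds must be coerced to microseconds, and values with sub-microsecond precision raise unless truncation is explicitly allowed (assumes the default allow_truncated_timestamps=False):

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"ts": pa.array([1001], type=pa.timestamp("ns"))})  # not a whole microsecond
try:
    pq.write_table(t, "trunc.parquet", version="2.4")
except pa.ArrowInvalid as exc:
    print("write rejected, as the test expects:", exc)
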
35 changes: 19 additions & 16 deletions python/pyarrow/tests/parquet/test_datetime.py
@@ -336,23 +336,24 @@ def get_table(pq_reader_method, filename, **kwargs):
tm.assert_frame_equal(df, df_correct)


def test_timestamp_restore_timezone():
@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
def test_timestamp_restore_timezone(unit):
# ARROW-5888, restore timezone from serialized metadata
ty = pa.timestamp('ms', tz='America/New_York')
ty = pa.timestamp(unit, tz='America/New_York')
arr = pa.array([1, 2, 3], type=ty)
t = pa.table([arr], names=['f0'])
_check_roundtrip(t)


def test_timestamp_restore_timezone_nanosecond():
# ARROW-9634, also restore timezone for nanosecond data that get stored
# as microseconds in the parquet file
# as microseconds in the parquet file for Parquet format version 2.4 and earlier
ty = pa.timestamp('ns', tz='America/New_York')
arr = pa.array([1000, 2000, 3000], type=ty)
table = pa.table([arr], names=['f0'])
ty_us = pa.timestamp('us', tz='America/New_York')
expected = pa.table([arr.cast(ty_us)], names=['f0'])
_check_roundtrip(table, expected=expected)
_check_roundtrip(table, expected=expected, version='2.4')
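
A companion sketch, not in this diff: under the new 2.6 default the same nanosecond data should round-trip unchanged. The test name is hypothetical:

def test_timestamp_restore_timezone_nanosecond_v26():
    # Hypothetical companion test; relies on the assumed 2.6 default
    ty = pa.timestamp('ns', tz='America/New_York')
    arr = pa.array([1000, 2000, 3000], type=ty)
    table = pa.table([arr], names=['f0'])
    _check_roundtrip(table)  # expected == input, no microsecond cast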


@pytest.mark.pandas
@@ -378,28 +379,31 @@ def test_parquet_version_timestamp_differences():
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))

all_versions = ['1.0', '2.4', '2.6']

names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

# Using Parquet version 1.0, seconds should be coerced to milliseconds
# Using Parquet versions 1.0 and 2.4, seconds should be coerced to milliseconds
# and nanoseconds should be coerced to microseconds by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
_check_roundtrip(table, expected)
_check_roundtrip(table, expected, version='1.0')
_check_roundtrip(table, expected, version='2.4')

# Using Parquet version 2.0, seconds should be coerced to milliseconds
# Using Parquet version 2.6, seconds should be coerced to milliseconds
# and nanoseconds should be retained by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
_check_roundtrip(table, expected, version='2.6')

# Using Parquet version 1.0, coercing to milliseconds or microseconds
# For any Parquet version, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
_check_roundtrip(table, expected, coerce_timestamps='ms')
for ver in all_versions:
_check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)

# Using Parquet version 2.0, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
_check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')
for ver in all_versions:
_check_roundtrip(table, expected, version=ver, coerce_timestamps='us')

# TODO: after pyarrow allows coerce_timestamps='ns', tests like the
# following should pass ...
Review comment (Member): Could also be left for a follow-up, but we could now allow this (in _create_arrow_writer_properties in _parquet.pyx we would need to update the Cython bindings to pass it through; I think on the C++ side it's already implemented).

@@ -416,10 +420,9 @@ def test_parquet_version_timestamp_differences():
# For any Parquet version, coercing to nanoseconds is allowed
# if Int96 storage is used
expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
_check_roundtrip(table, expected,
use_deprecated_int96_timestamps=True)
_check_roundtrip(table, expected, version='2.6',
use_deprecated_int96_timestamps=True)
for ver in all_versions:
_check_roundtrip(table, expected, version=ver,
use_deprecated_int96_timestamps=True)
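
For reference, a standalone sketch of the Int96 path exercised here, which preserves nanoseconds under every format version (file name is illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("ns"))})
pq.write_table(t, "int96.parquet", version="1.0", use_deprecated_int96_timestamps=True)
assert pq.read_table("int96.parquet").column("ts").type == pa.timestamp("ns")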


@pytest.mark.pandas
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_pandas.py
@@ -256,7 +256,7 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
arrow_table = pa.Table.from_pandas(df)

with filename.open('wb') as f:
_write_table(arrow_table, f, version="2.4")
_write_table(arrow_table, f, version="2.6")

data = io.BytesIO(filename.read_bytes())
