diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 47570405c91..f82177b86f1 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1249,11 +1249,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) {
ASSERT_OK(NullableArray<::arrow::UInt32Type>(LARGE_SIZE, 100, kDefaultSeed, &values));
std::shared_ptr
table = MakeSimpleTable(values, true);
- // Parquet 2.4 roundtrip should yield an uint32_t column again
+ // Parquet 2.6 roundtrip should yield an uint32_t column again
this->ResetSink();
std::shared_ptr<::parquet::WriterProperties> properties =
::parquet::WriterProperties::Builder()
- .version(ParquetVersion::PARQUET_2_4)
+ .version(ParquetVersion::PARQUET_2_6)
->build();
ASSERT_OK_NO_THROW(
WriteTable(*table, default_memory_pool(), this->sink_, 512, properties));
@@ -1613,7 +1613,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) {
ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4));
}
-void MakeDateTimeTypesTable(std::shared_ptr* out, bool expected = false) {
+void MakeDateTimeTypesTable(std::shared_ptr* out, bool expected_micro = false) {
using ::arrow::ArrayFromVector;
std::vector is_valid = {true, true, true, false, true, true};
@@ -1629,7 +1629,7 @@ void MakeDateTimeTypesTable(std::shared_ptr* out, bool expected = false)
auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO));
std::shared_ptr<::arrow::Schema> schema(
- new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6}));
+ new ::arrow::Schema({f0, f1, f2, (expected_micro ? f3_x : f3), f4, f5, f6}));
std::vector t32_values = {1489269000, 1489270000, 1489271000,
1489272000, 1489272000, 1489273000};
@@ -1654,20 +1654,30 @@ void MakeDateTimeTypesTable(std::shared_ptr* out, bool expected = false)
ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5);
ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6);
- *out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6});
+ *out = Table::Make(schema, {a0, a1, a2, expected_micro ? a3_x : a3, a4, a5, a6});
}
TEST(TestArrowReadWrite, DateTimeTypes) {
- std::shared_ptr table, result;
+ std::shared_ptr table, result, expected;
+ // Parquet 2.6 nanoseconds are preserved
MakeDateTimeTypesTable(&table);
ASSERT_NO_FATAL_FAILURE(
DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result));
- MakeDateTimeTypesTable(&table, true); // build expected result
- ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
+ MakeDateTimeTypesTable(&expected, false);
+ ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*expected->schema(),
+ *result->schema(),
/*check_metadata=*/false));
- ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result));
+ ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected, *result));
+
+ // Parquet 2.4 nanoseconds are converted to microseconds
+ auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder()
+ .version(ParquetVersion::PARQUET_2_4)
+ ->build();
+ MakeDateTimeTypesTable(&expected, true);
+ ASSERT_NO_FATAL_FAILURE(
+ CheckConfiguredRoundtrip(table, expected, parquet_version_2_4_properties));
}
TEST(TestArrowReadWrite, UseDeprecatedInt96) {
@@ -1973,7 +1983,9 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) {
field("ts:us", t_us), field("ts:ns", t_ns)});
auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns});
- auto parquet_version_1_properties = ::parquet::default_writer_properties();
+ auto parquet_version_1_properties = ::parquet::WriterProperties::Builder()
+ .version(ParquetVersion::PARQUET_1_0)
+ ->build();
ARROW_SUPPRESS_DEPRECATION_WARNING
auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_0)
@@ -3334,7 +3346,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) {
auto arrow_writer_props = ArrowWriterProperties::Builder();
arrow_writer_props.store_schema();
if (inner_type->id() == ::arrow::Type::UINT32) {
- writer_props.version(ParquetVersion::PARQUET_2_4);
+ writer_props.version(ParquetVersion::PARQUET_2_6);
} else if (inner_type->id() == ::arrow::Type::TIMESTAMP) {
// By default ns is coerced to us, override that
::arrow::TimeUnit::type unit =
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 621f2a0e76f..7c608e44247 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -869,9 +869,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
/*is_from_converted_type=*/false,
/*force_set_converted_type=*/true),
ParquetType::INT64, -1},
- // Parquet v1, values converted to microseconds
{"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO),
- LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS,
+ LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS,
/*is_from_converted_type=*/false,
/*force_set_converted_type=*/true),
ParquetType::INT64, -1},
@@ -882,7 +881,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-1},
{"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"),
- LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
+ LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64,
-1},
{"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT64,
@@ -891,7 +890,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-1},
{"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"),
- LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
+ LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64,
-1}};
std::vector> arrow_fields;
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index ed9bb4b4078..af9876370ee 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -650,6 +650,7 @@ TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) {
TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4);
+ this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6);
}
TEST(TestWriter, NullValuesBuffer) {
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index f16f6c49d3e..0a69659508e 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -214,7 +214,7 @@ class PARQUET_EXPORT WriterProperties {
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
pagesize_(kDefaultDataPageSize),
- version_(ParquetVersion::PARQUET_2_4),
+ version_(ParquetVersion::PARQUET_2_6),
data_page_version_(ParquetDataPageVersion::V1),
created_by_(DEFAULT_CREATED_BY),
store_decimal_as_integer_(false),
@@ -296,7 +296,7 @@ class PARQUET_EXPORT WriterProperties {
}
/// Specify the Parquet file version.
- /// Default PARQUET_2_4.
+ /// Default PARQUET_2_6.
Builder* version(ParquetVersion::type version) {
version_ = version;
return this;
diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc
index 5fd182f679c..2ba1b8a6046 100644
--- a/cpp/src/parquet/properties_test.cc
+++ b/cpp/src/parquet/properties_test.cc
@@ -44,7 +44,7 @@ TEST(TestWriterProperties, Basics) {
ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize());
ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, props->dictionary_pagesize_limit());
- ASSERT_EQ(ParquetVersion::PARQUET_2_4, props->version());
+ ASSERT_EQ(ParquetVersion::PARQUET_2_6, props->version());
ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version());
ASSERT_FALSE(props->page_checksum_enabled());
}
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 781c7e23be6..2f53d5fbbaa 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -710,7 +710,7 @@ cdef class FileMetaData(_Weakrefable):
"""
Parquet format version used in file (str, such as '1.0', '2.4').
- If version is missing or unparsable, will default to assuming '2.4'.
+ If version is missing or unparsable, will default to assuming '2.6'.
"""
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
@@ -722,9 +722,9 @@ cdef class FileMetaData(_Weakrefable):
elif version == ParquetVersion_V2_6:
return '2.6'
else:
- warnings.warn('Unrecognized file version, assuming 2.4: {}'
+ warnings.warn('Unrecognized file version, assuming 2.6: {}'
.format(version))
- return '2.4'
+ return '2.6'
@property
def created_by(self):
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 3fa0d6dadd2..cfcaa475dc0 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -748,7 +748,7 @@ def _sanitize_table(table, new_schema, flavor):
return table
-_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4"
+_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.6"
Determine which Parquet logical types are available for use, whether the
reduced set from the Parquet 1.x.x format or the expanded logical types
added in later format versions.
@@ -944,7 +944,7 @@ class ParquetWriter:
def __init__(self, where, schema, filesystem=None,
flavor=None,
- version='2.4',
+ version='2.6',
use_dictionary=True,
compression='snappy',
write_statistics=True,
@@ -3060,7 +3060,7 @@ def read_pandas(source, columns=None, **kwargs):
_DNF_filter_doc, "")
-def write_table(table, where, row_group_size=None, version='2.4',
+def write_table(table, where, row_group_size=None, version='2.6',
use_dictionary=True, compression='snappy',
write_statistics=True,
use_deprecated_int96_timestamps=None,
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 5c0e1a077d0..9bc59cbcf96 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -630,7 +630,9 @@ def test_write_error_deletes_incomplete_file(tempdir):
filename = tempdir / 'tmp_file'
try:
- _write_table(pdf, filename)
+ # Test relies on writing nanosecond timestamps raising an error,
+ # which is the case for Parquet format version 2.4
+ _write_table(pdf, filename, version="2.4")
except pa.ArrowException:
pass
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
index bb1107972b2..1cad82b8398 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -336,9 +336,10 @@ def get_table(pq_reader_method, filename, **kwargs):
tm.assert_frame_equal(df, df_correct)
-def test_timestamp_restore_timezone():
+@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
+def test_timestamp_restore_timezone(unit):
# ARROW-5888, restore timezone from serialized metadata
- ty = pa.timestamp('ms', tz='America/New_York')
+ ty = pa.timestamp(unit, tz='America/New_York')
arr = pa.array([1, 2, 3], type=ty)
t = pa.table([arr], names=['f0'])
_check_roundtrip(t)
@@ -346,13 +347,13 @@ def test_timestamp_restore_timezone():
def test_timestamp_restore_timezone_nanosecond():
# ARROW-9634, also restore timezone for nanosecond data that get stored
- # as microseconds in the parquet file
+ # as microseconds in the parquet file for Parquet format version 2.4 and earlier
ty = pa.timestamp('ns', tz='America/New_York')
arr = pa.array([1000, 2000, 3000], type=ty)
table = pa.table([arr], names=['f0'])
ty_us = pa.timestamp('us', tz='America/New_York')
expected = pa.table([arr.cast(ty_us)], names=['f0'])
- _check_roundtrip(table, expected=expected)
+ _check_roundtrip(table, expected=expected, version='2.4')
@pytest.mark.pandas
@@ -378,28 +379,31 @@ def test_parquet_version_timestamp_differences():
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
+ all_versions = ['1.0', '2.4', '2.6']
+
names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
- # Using Parquet version 1.0, seconds should be coerced to milliseconds
+ # Using Parquet versions 1.0 and 2.4, seconds should be coerced to milliseconds
# and nanoseconds should be coerced to microseconds by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
- _check_roundtrip(table, expected)
+ _check_roundtrip(table, expected, version='1.0')
+ _check_roundtrip(table, expected, version='2.4')
- # Using Parquet version 2.0, seconds should be coerced to milliseconds
+ # Using Parquet version 2.6, seconds should be coerced to milliseconds
# and nanoseconds should be retained by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
_check_roundtrip(table, expected, version='2.6')
- # Using Parquet version 1.0, coercing to milliseconds or microseconds
+ # For all Parquet versions, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
- _check_roundtrip(table, expected, coerce_timestamps='ms')
+ for ver in all_versions:
+ _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)
- # Using Parquet version 2.0, coercing to milliseconds or microseconds
- # is allowed
expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
- _check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')
+ for ver in all_versions:
+ _check_roundtrip(table, expected, version=ver, coerce_timestamps='us')
# TODO: after pyarrow allows coerce_timestamps='ns', tests like the
# following should pass ...
@@ -416,10 +420,9 @@ def test_parquet_version_timestamp_differences():
# For either Parquet version, coercing to nanoseconds is allowed
# if Int96 storage is used
expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
- _check_roundtrip(table, expected,
- use_deprecated_int96_timestamps=True)
- _check_roundtrip(table, expected, version='2.6',
- use_deprecated_int96_timestamps=True)
+ for ver in all_versions:
+ _check_roundtrip(table, expected, version=ver,
+ use_deprecated_int96_timestamps=True)
@pytest.mark.pandas
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
index ea999064277..6bd68e08fc5 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -256,7 +256,7 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
arrow_table = pa.Table.from_pandas(df)
with filename.open('wb') as f:
- _write_table(arrow_table, f, version="2.4")
+ _write_table(arrow_table, f, version="2.6")
data = io.BytesIO(filename.read_bytes())