34 changes: 23 additions & 11 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1249,11 +1249,11 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) {
ASSERT_OK(NullableArray<::arrow::UInt32Type>(LARGE_SIZE, 100, kDefaultSeed, &values));
std::shared_ptr<Table> table = MakeSimpleTable(values, true);

// Parquet 2.4 roundtrip should yield an uint32_t column again
// Parquet 2.6 roundtrip should yield an uint32_t column again
this->ResetSink();
std::shared_ptr<::parquet::WriterProperties> properties =
::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_4)
.version(ParquetVersion::PARQUET_2_6)
->build();
ASSERT_OK_NO_THROW(
WriteTable(*table, default_memory_pool(), this->sink_, 512, properties));
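
The user-visible effect of this uint32 handling, sketched from the Python side (a minimal pyarrow sketch assuming a build where the writer now defaults to format version 2.6; file names are illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"u": pa.array([1, 2, 3], type=pa.uint32())})

# Parquet 1.0 has no 32-bit unsigned logical type, so uint32 is widened to int64
pq.write_table(t, "u_v1.parquet", version="1.0")
assert pq.read_table("u_v1.parquet").column("u").type == pa.int64()

# Format versions 2.4+ keep the UINT_32 annotation, so the column round-trips
pq.write_table(t, "u_v26.parquet")  # no explicit version: assumed 2.6 default
assert pq.read_table("u_v26.parquet").column("u").type == pa.uint32()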
@@ -1613,7 +1613,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) {
ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4));
}

void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false) {
void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected_micro = false) {
using ::arrow::ArrayFromVector;

std::vector<bool> is_valid = {true, true, true, false, true, true};
@@ -1629,7 +1629,7 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false)
auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO));

std::shared_ptr<::arrow::Schema> schema(
new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6}));
new ::arrow::Schema({f0, f1, f2, (expected_micro ? f3_x : f3), f4, f5, f6}));

std::vector<int32_t> t32_values = {1489269000, 1489270000, 1489271000,
1489272000, 1489272000, 1489273000};
@@ -1654,20 +1654,30 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>* out, bool expected = false)
ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5);
ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6);

*out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6});
*out = Table::Make(schema, {a0, a1, a2, expected_micro ? a3_x : a3, a4, a5, a6});
}

TEST(TestArrowReadWrite, DateTimeTypes) {
std::shared_ptr<Table> table, result;
std::shared_ptr<Table> table, result, expected;

// Parquet 2.6 nanoseconds are preserved
MakeDateTimeTypesTable(&table);
ASSERT_NO_FATAL_FAILURE(
DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result));

MakeDateTimeTypesTable(&table, true); // build expected result
ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
MakeDateTimeTypesTable(&expected, false);
ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*expected->schema(),
*result->schema(),
/*check_metadata=*/false));
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result));
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected, *result));

// Parquet 2.4 nanoseconds are converted to microseconds
auto parquet_version_2_4_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_4)
->build();
MakeDateTimeTypesTable(&expected, true);
ASSERT_NO_FATAL_FAILURE(
CheckConfiguredRoundtrip(table, expected, parquet_version_2_4_properties));
}
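
The same before/after contrast, sketched from the Python side (assumes the new 2.6 default; not part of this PR's test suite):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("ns"))})

# Default (assumed 2.6): nanoseconds are preserved
pq.write_table(table, "ts_default.parquet")
assert pq.read_table("ts_default.parquet").column("ts").type == pa.timestamp("ns")

# Explicit 2.4: nanoseconds are coerced to microseconds
pq.write_table(table, "ts_v24.parquet", version="2.4")
assert pq.read_table("ts_v24.parquet").column("ts").type == pa.timestamp("us")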

TEST(TestArrowReadWrite, UseDeprecatedInt96) {
@@ -1973,7 +1983,9 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) {
field("ts:us", t_us), field("ts:ns", t_ns)});
auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns});

auto parquet_version_1_properties = ::parquet::default_writer_properties();
Review comment (Member): nice catch!

auto parquet_version_1_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_1_0)
->build();
ARROW_SUPPRESS_DEPRECATION_WARNING
auto parquet_version_2_0_properties = ::parquet::WriterProperties::Builder()
.version(ParquetVersion::PARQUET_2_0)
@@ -3334,7 +3346,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) {
auto arrow_writer_props = ArrowWriterProperties::Builder();
arrow_writer_props.store_schema();
if (inner_type->id() == ::arrow::Type::UINT32) {
writer_props.version(ParquetVersion::PARQUET_2_4);
writer_props.version(ParquetVersion::PARQUET_2_6);
Review comment (Member): The default writer_props created above with WriterProperties::Builder() will now use 2_6 by default, so potentially this could be removed altogether.

} else if (inner_type->id() == ::arrow::Type::TIMESTAMP) {
// By default ns is coerced to us, override that
::arrow::TimeUnit::type unit =
7 changes: 3 additions & 4 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -869,9 +869,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
/*is_from_converted_type=*/false,
/*force_set_converted_type=*/true),
ParquetType::INT64, -1},
// Parquet v1, values converted to microseconds
{"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO),
LogicalType::Timestamp(false, LogicalType::TimeUnit::MICROS,
LogicalType::Timestamp(false, LogicalType::TimeUnit::NANOS,
/*is_from_converted_type=*/false,
/*force_set_converted_type=*/true),
ParquetType::INT64, -1},
@@ -882,7 +881,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-1},
{"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64,
-1},
{"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT64,
@@ -891,7 +890,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-1},
{"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
LogicalType::Timestamp(true, LogicalType::TimeUnit::NANOS), ParquetType::INT64,
Review comment (Member): Should we extract NANOS, and test with both 2_4 and 2_6? Since 2.4 might still be used for some time.

-1}};

std::vector<std::shared_ptr<Field>> arrow_fields;
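How the NANOS mapping tested above surfaces through pyarrow's schema introspection; a hedged sketch, with the printed reprs being illustrative rather than exact:

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"ts": pa.array([1], type=pa.timestamp("ns", tz="UTC"))})
pq.write_table(t, "ns_utc.parquet")  # assumed default: format version 2.6
col = pq.ParquetFile("ns_utc.parquet").schema.column(0)
print(col.physical_type)  # INT64
print(col.logical_type)   # Timestamp logical type with timeUnit=nanoseconds
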
1 change: 1 addition & 0 deletions cpp/src/parquet/column_writer_test.cc
@@ -650,6 +650,7 @@ TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) {

TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6);
}

TEST(TestWriter, NullValuesBuffer) {
4 changes: 2 additions & 2 deletions cpp/src/parquet/properties.h
@@ -214,7 +214,7 @@ class PARQUET_EXPORT WriterProperties {
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
pagesize_(kDefaultDataPageSize),
version_(ParquetVersion::PARQUET_2_4),
version_(ParquetVersion::PARQUET_2_6),
data_page_version_(ParquetDataPageVersion::V1),
created_by_(DEFAULT_CREATED_BY),
store_decimal_as_integer_(false),
@@ -296,7 +296,7 @@ }
}

/// Specify the Parquet file version.
/// Default PARQUET_2_4.
/// Default PARQUET_2_6.
Builder* version(ParquetVersion::type version) {
version_ = version;
return this;
2 changes: 1 addition & 1 deletion cpp/src/parquet/properties_test.cc
@@ -44,7 +44,7 @@ TEST(TestWriterProperties, Basics) {

ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize());
ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, props->dictionary_pagesize_limit());
ASSERT_EQ(ParquetVersion::PARQUET_2_4, props->version());
ASSERT_EQ(ParquetVersion::PARQUET_2_6, props->version());
ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version());
ASSERT_FALSE(props->page_checksum_enabled());
}
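A quick way to observe the new default from Python (illustrative sketch, not part of this diff):

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"x": [1, 2, 3]}), "meta_check.parquet")
assert pq.ParquetFile("meta_check.parquet").metadata.format_version == "2.6"
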
6 changes: 3 additions & 3 deletions python/pyarrow/_parquet.pyx
@@ -710,7 +710,7 @@ cdef class FileMetaData(_Weakrefable):
"""
Parquet format version used in file (str, such as '1.0', '2.4').

If version is missing or unparsable, will default to assuming '2.4'.
If version is missing or unparsable, will default to assuming '2.6'.
"""
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
@@ -722,9 +722,9 @@ cdef class FileMetaData(_Weakrefable):
elif version == ParquetVersion_V2_6:
return '2.6'
else:
warnings.warn('Unrecognized file version, assuming 2.4: {}'
warnings.warn('Unrecognized file version, assuming 2.6: {}'
.format(version))
return '2.4'
return '2.6'

@property
def created_by(self):
6 changes: 3 additions & 3 deletions python/pyarrow/parquet/core.py
@@ -748,7 +748,7 @@ def _sanitize_table(table, new_schema, flavor):
return table


_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4"
_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.6"
Determine which Parquet logical types are available for use, whether the
reduced set from the Parquet 1.x.x format or the expanded logical types
added in later format versions.
Expand Down Expand Up @@ -944,7 +944,7 @@ class ParquetWriter:

def __init__(self, where, schema, filesystem=None,
flavor=None,
version='2.4',
version='2.6',
use_dictionary=True,
compression='snappy',
write_statistics=True,
@@ -3060,7 +3060,7 @@ def read_pandas(source, columns=None, **kwargs):
_DNF_filter_doc, "")


def write_table(table, where, row_group_size=None, version='2.4',
def write_table(table, where, row_group_size=None, version='2.6',
use_dictionary=True, compression='snappy',
write_statistics=True,
use_deprecated_int96_timestamps=None,
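Minimal usage sketch of the updated default in the writer API (file name is illustrative; behavior assumes the 2.6 default documented above):

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("ts", pa.timestamp("ns"))])
data = pa.table({"ts": pa.array([1, 2], type=pa.timestamp("ns"))})
with pq.ParquetWriter("writer_default.parquet", schema) as writer:  # version defaults to "2.6"
    writer.write_table(data)  # nanoseconds kept, no coercion needed
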
4 changes: 3 additions & 1 deletion python/pyarrow/tests/parquet/test_basic.py
@@ -630,7 +630,9 @@ def test_write_error_deletes_incomplete_file(tempdir):

filename = tempdir / 'tmp_file'
try:
_write_table(pdf, filename)
# The test relies on the write raising an error; writing nanosecond
# timestamps does so under Parquet format version 2.4
_write_table(pdf, filename, version="2.4")
except pa.ArrowException:
pass

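A hedged sketch of the failure mode the pinned version relies on: under 2.4, nanoseconds must be coerced to microseconds, and values with sub-microsecond precision raise unless truncation is explicitly allowed (assumes the default allow_truncated_timestamps=False):

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"ts": pa.array([1001], type=pa.timestamp("ns"))})  # not a whole microsecond
try:
    pq.write_table(t, "trunc.parquet", version="2.4")
except pa.ArrowInvalid as exc:
    print("write rejected, as the test expects:", exc)
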
35 changes: 19 additions & 16 deletions python/pyarrow/tests/parquet/test_datetime.py
@@ -336,23 +336,24 @@ def get_table(pq_reader_method, filename, **kwargs):
tm.assert_frame_equal(df, df_correct)


def test_timestamp_restore_timezone():
@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
def test_timestamp_restore_timezone(unit):
# ARROW-5888, restore timezone from serialized metadata
ty = pa.timestamp('ms', tz='America/New_York')
ty = pa.timestamp(unit, tz='America/New_York')
arr = pa.array([1, 2, 3], type=ty)
t = pa.table([arr], names=['f0'])
_check_roundtrip(t)


def test_timestamp_restore_timezone_nanosecond():
# ARROW-9634, also restore timezone for nanosecond data that get stored
# as microseconds in the parquet file
# as microseconds in the parquet file for Parquet format version 2.4 and earlier
ty = pa.timestamp('ns', tz='America/New_York')
arr = pa.array([1000, 2000, 3000], type=ty)
table = pa.table([arr], names=['f0'])
ty_us = pa.timestamp('us', tz='America/New_York')
expected = pa.table([arr.cast(ty_us)], names=['f0'])
_check_roundtrip(table, expected=expected)
_check_roundtrip(table, expected=expected, version='2.4')
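
A companion sketch, not in this diff: under the new 2.6 default the same nanosecond data should round-trip unchanged. The test name is hypothetical:

def test_timestamp_restore_timezone_nanosecond_v26():
    # Hypothetical companion test; relies on the assumed 2.6 default
    ty = pa.timestamp('ns', tz='America/New_York')
    arr = pa.array([1000, 2000, 3000], type=ty)
    table = pa.table([arr], names=['f0'])
    _check_roundtrip(table)  # expected == input, no microsecond cast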


@pytest.mark.pandas
@@ -378,28 +379,31 @@ def test_parquet_version_timestamp_differences():
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))

all_versions = ['1.0', '2.4', '2.6']

names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

# Using Parquet version 1.0, seconds should be coerced to milliseconds
# Using Parquet versions 1.0 and 2.4, seconds should be coerced to milliseconds
# and nanoseconds should be coerced to microseconds by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
_check_roundtrip(table, expected)
_check_roundtrip(table, expected, version='1.0')
_check_roundtrip(table, expected, version='2.4')

# Using Parquet version 2.0, seconds should be coerced to milliseconds
# Using Parquet version 2.6, seconds should be coerced to milliseconds
# and nanoseconds should be retained by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
_check_roundtrip(table, expected, version='2.6')

# Using Parquet version 1.0, coercing to milliseconds or microseconds
# For any Parquet version, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
_check_roundtrip(table, expected, coerce_timestamps='ms')
for ver in all_versions:
_check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)

# Using Parquet version 2.0, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
_check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')
for ver in all_versions:
_check_roundtrip(table, expected, version=ver, coerce_timestamps='us')

# TODO: after pyarrow allows coerce_timestamps='ns', tests like the
# following should pass ...
Review comment (Member): Could also be left for a follow-up, but we could now allow this (in _create_arrow_writer_properties in _parquet.pyx we would need to update the Cython bindings to pass it through; I think on the C++ side it's already implemented).

@@ -416,10 +420,9 @@ def test_parquet_version_timestamp_differences():
# For any Parquet version, coercing to nanoseconds is allowed
# if Int96 storage is used
expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
_check_roundtrip(table, expected,
use_deprecated_int96_timestamps=True)
_check_roundtrip(table, expected, version='2.6',
use_deprecated_int96_timestamps=True)
for ver in all_versions:
_check_roundtrip(table, expected, version=ver,
use_deprecated_int96_timestamps=True)
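
For reference, a standalone sketch of the Int96 path exercised here, which preserves nanoseconds under every format version (file name is illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

t = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("ns"))})
pq.write_table(t, "int96.parquet", version="1.0", use_deprecated_int96_timestamps=True)
assert pq.read_table("int96.parquet").column("ts").type == pa.timestamp("ns")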


@pytest.mark.pandas
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_pandas.py
@@ -256,7 +256,7 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
arrow_table = pa.Table.from_pandas(df)

with filename.open('wb') as f:
_write_table(arrow_table, f, version="2.4")
_write_table(arrow_table, f, version="2.6")

data = io.BytesIO(filename.read_bytes())
