8 changes: 7 additions & 1 deletion python/pyarrow/_dataset_parquet.pyx
@@ -531,6 +531,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
"use_deprecated_int96_timestamps",
"coerce_timestamps",
"allow_truncated_timestamps",
"use_compliant_nested_type",
}

setters = set()
@@ -586,7 +587,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._properties = dict(
use_dictionary=True,
compression="snappy",
version="1.0",
version="2.6",
write_statistics=None,
data_page_size=None,
compression_level=None,
@@ -601,6 +602,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._set_properties()
self._set_arrow_properties()

def __repr__(self):
return "<pyarrow.dataset.ParquetFileWriteOptions {0}>".format(
" ".join([f"{key}={value}" for key, value in self._properties.items()])
)


cdef set _PARQUET_READ_OPTIONS = {
'dictionary_columns', 'coerce_int96_timestamp_unit'
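With this change, write options built through the dataset API default to Parquet format version 2.6, recognise `use_compliant_nested_type` as an Arrow-level writer option, and gain a readable repr. A minimal sketch of the resulting behaviour (not part of the diff; assumes a pyarrow build with this patch applied):

```python
import pyarrow.dataset as ds

fmt = ds.ParquetFileFormat()

# No explicit version: the dataset writer now defaults to "2.6",
# matching pyarrow.parquet.write_table.
opts = fmt.make_write_options()

# The new __repr__ lists every property, e.g.
# <pyarrow.dataset.ParquetFileWriteOptions ... compression=snappy ... version=2.6 ...>
print(repr(opts))

# With "use_compliant_nested_type" added to the recognised keys (first hunk),
# it can be passed through make_write_options as well.
opts = fmt.make_write_options(use_compliant_nested_type=True)
```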
2 changes: 2 additions & 0 deletions python/pyarrow/parquet/core.py
@@ -3078,6 +3078,8 @@ def write_table(table, where, row_group_size=None, version='2.6',
dictionary_pagesize_limit=None,
store_schema=True,
**kwargs):
# Implementor's note: when adding keywords here / updating defaults, also
# update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions
row_group_size = kwargs.pop('chunk_size', row_group_size)
use_int96 = use_deprecated_int96_timestamps
try:
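The note above cross-references `write_table`, whose defaults the dataset writer now mirrors. For illustration only (not part of the diff; the file name is made up), the standalone writer already defaults to format version 2.6:

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"f1": pa.array(range(5), type="uint32")})

# version defaults to "2.6", so uint32 is stored as-is rather than
# being widened to int64 as under format version "1.0".
pq.write_table(table, "example.parquet")
assert pq.read_metadata("example.parquet").format_version == "2.6"
```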
18 changes: 16 additions & 2 deletions python/pyarrow/tests/test_dataset.py
@@ -4539,7 +4539,9 @@ def test_write_table_partitioned_dict(tempdir):
@pytest.mark.parquet
def test_write_dataset_parquet(tempdir):
table = pa.table([
pa.array(range(20)), pa.array(np.random.randn(20)),
pa.array(range(20), type="uint32"),
pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype(
"datetime64[ns]")),
pa.array(np.repeat(['a', 'b'], 10))
], names=["f1", "f2", "part"])

@@ -4551,20 +4553,32 @@ def test_write_dataset_parquet(tempdir):
file_paths = list(base_dir.rglob("*"))
expected_paths = [base_dir / "part-0.parquet"]
assert set(file_paths) == set(expected_paths)
# check Table roundtrip
# check Table roundtrip with default version
result = ds.dataset(base_dir, format="parquet").to_table()
assert result.equals(table)

# using custom options
for version in ["1.0", "2.4", "2.6"]:
format = ds.ParquetFileFormat()
opts = format.make_write_options(version=version)
assert "<pyarrow.dataset.ParquetFileWriteOptions" in repr(opts)
base_dir = tempdir / 'parquet_dataset_version{0}'.format(version)
ds.write_dataset(table, base_dir, format=format, file_options=opts)
meta = pq.read_metadata(base_dir / "part-0.parquet")
expected_version = "1.0" if version == "1.0" else "2.6"
assert meta.format_version == expected_version

# ensure version is actually honored based on supported datatypes
result = ds.dataset(base_dir, format="parquet").to_table()
schema = table.schema
if version == "1.0":
# uint32 is written as int64
schema = schema.set(0, schema.field(0).with_type(pa.int64()))
if version in ("1.0", "2.4"):
schema = schema.set(1, schema.field(1).with_type(pa.timestamp("us")))
expected = table.cast(schema)
assert result.equals(expected)


def test_write_dataset_csv(tempdir):
table = pa.table([