Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 41 additions & 26 deletions python/pyarrow/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
)


def partitioning(field_names=None, flavor=None):
def partitioning(schema=None, field_names=None, flavor=None):
"""
Specify a partitioning scheme.

Expand All @@ -81,9 +81,13 @@ def partitioning(field_names=None, flavor=None):

Parameters
----------
field_names : pyarrow.Schema or list of str
The schema that describes the partitions present in the file path. If
a list of strings (field names) is passed, the schema's types are
schema : pyarrow.Schema, default None
The schema that describes the partitions present in the file path.
If not specified, and `field_names` and/or `flavor` are specified,
the schema will be inferred from the file path (and a
PartitioningFactory is returned).
field_names : list of str, default None
A list of strings (field names). If specified, the schema's types are
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bkietz @kszucs following up on #6022 (comment), I went with splitting the keyword into two separate ones.
But I can also rename it to schema_or_field_names and keep it as a single keyword (it's long, but you can use it positionally), if there is a stronger preference for that.

I think having separate ones is a bit cleaner, and easier to explain.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And it is easier to merge the two keywords later than to separate them later.

inferred from the file paths (only valid for DirectoryPartitioning).
flavor : str, default None
The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
Expand All @@ -102,7 +106,7 @@ def partitioning(field_names=None, flavor=None):

or let the types be inferred by only specifying the field names:

>>> partitioning(["year", "month"])
>>> partitioning(field_names=["year", "month"])

For paths like "/2009/June", the year will be inferred as int32 while month
will be inferred as string.
Expand All @@ -121,27 +125,34 @@ def partitioning(field_names=None, flavor=None):
"""
if flavor is None:
# default flavor
if isinstance(field_names, pa.Schema):
return DirectoryPartitioning(field_names)
elif isinstance(field_names, list):
return DirectoryPartitioning.discover(field_names)
elif field_names is None:
raise ValueError(
"For the default directory flavor, need to specify "
"'field_names' as Schema or list of field names")
if schema is not None:
if field_names is not None:
raise ValueError(
"Cannot specify both 'schema' and 'field_names'")
return DirectoryPartitioning(schema)
elif field_names is not None:
if isinstance(field_names, list):
return DirectoryPartitioning.discover(field_names)
else:
raise ValueError(
"Expected list of field names, got {0}".format(
type(field_names)))
else:
raise ValueError(
"Expected Schema or list of field names, got {0}".format(
type(field_names)))
"For the default directory flavor, need to specify "
"a Schema or a list of field names")
elif flavor == 'hive':
if isinstance(field_names, pa.Schema):
return HivePartitioning(field_names)
elif field_names is None:
return HivePartitioning.discover()
if field_names is not None:
raise ValueError("Cannot specify 'field_names' for flavor 'hive'")
elif schema is not None:
if isinstance(schema, pa.Schema):
return HivePartitioning(schema)
else:
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is why I found the ability to restrict the HivePartitioning to certain field names useful. Perhaps I'm only interested in a subset of the partitioning fields, and it'd be easier to define them here.

@bkietz created an issue where we can discuss this: https://issues.apache.org/jira/browse/ARROW-7646

"Expected Schema for 'schema', got {0}".format(
type(schema)))
else:
raise ValueError(
"Expected Schema or None for 'field_names', got {0}".format(
type(field_names)))
return HivePartitioning.discover()
else:
raise ValueError("Unsupported flavor")

Expand Down Expand Up @@ -181,6 +192,8 @@ def _ensure_partitioning(scheme):
pass
elif isinstance(scheme, str):
scheme = partitioning(flavor=scheme)
elif isinstance(scheme, list):
scheme = partitioning(field_names=scheme)
elif isinstance(scheme, (Partitioning, PartitioningFactory)):
pass
else:
Expand All @@ -202,9 +215,10 @@ def source(path_or_paths, filesystem=None, partitioning=None,
a list of paths.
filesystem : FileSystem, default None
By default will be inferred from the path.
partitioning : Partitioning(Factory) or str
partitioning : Partitioning(Factory), str or list of str
The partitioning scheme specified with the ``partitioning()``
function. A flavor string can be used as shortcut.
function. A flavor string can be used as shortcut, and with a list of
field names a DirectoryPartitioning will be inferred.
format : str
Currently only "parquet" is supported.

Expand Down Expand Up @@ -273,9 +287,10 @@ def dataset(sources, filesystem=None, partitioning=None, format=None):
case, the additional keywords will be ignored).
filesystem : FileSystem, default None
By default will be inferred from the path.
partitioning : Partitioning(Factory) or str
partitioning : Partitioning(Factory), str, list of str
The partitioning scheme specified with the ``partitioning()``
function. A flavor string can be used as shortcut.
function. A flavor string can be used as shortcut, and with a list of
field names a DirectoryPartitioning will be inferred.
format : str
Currently only "parquet" is supported.

Expand Down
10 changes: 8 additions & 2 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,11 +394,15 @@ def test_partitioning_function():

part = ds.partitioning(schema)
assert isinstance(part, ds.DirectoryPartitioning)
part = ds.partitioning(names)
part = ds.partitioning(field_names=names)
assert isinstance(part, ds.PartitioningFactory)
# needs schema or names
# needs schema or list of names
with pytest.raises(ValueError):
ds.partitioning()
with pytest.raises(ValueError, match="Expected list"):
ds.partitioning(field_names=schema)
with pytest.raises(ValueError, match="Cannot specify both"):
ds.partitioning(schema, field_names=schema)

# Hive partitioning

Expand All @@ -409,6 +413,8 @@ def test_partitioning_function():
# cannot pass list of names
with pytest.raises(ValueError):
ds.partitioning(names, flavor="hive")
with pytest.raises(ValueError, match="Cannot specify 'field_names'"):
ds.partitioning(field_names=names, flavor="hive")

# unsupported flavor
with pytest.raises(ValueError):
Expand Down