apache · jorisvandenbossche · Jan 21, 2020 · Jan 21, 2020 · jorisvandenbossche · Jan 21, 2020
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
@@ -59,7 +59,7 @@
 )
 
 
-def partitioning(field_names=None, flavor=None):
+def partitioning(schema=None, field_names=None, flavor=None):
     """
     Specify a partitioning scheme.
 
@@ -81,9 +81,13 @@ def partitioning(field_names=None, flavor=None):
 
     Parameters
     ----------
-    field_names : pyarrow.Schema or list of str
-        The schema that describes the partitions present in the file path. If
-        a list of strings (field names) is passed, the schema's types are
+    schema : pyarrow.Schema, default None
+        The schema that describes the partitions present in the file path.
+        If not specified, and `field_names` and/or `flavor` are specified,
+        the schema will be inferred from the file path (and a
+        PartitioningFactory is returned).
+    field_names : list of str, default None
+        A list of strings (field names). If specified, the schema's types are
         inferred from the file paths (only valid for DirectoryPartitioning).
     flavor : str, default None
         The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
@@ -102,7 +106,7 @@ def partitioning(field_names=None, flavor=None):
 
     or let the types be inferred by only specifying the field names:
 
-    >>> partitioning(["year", "month"])
+    >>> partitioning(field_names=["year", "month"])
 
     For paths like "/2009/June", the year will be inferred as int32 while month
     will be inferred as string.
@@ -121,27 +125,34 @@ def partitioning(field_names=None, flavor=None):
     """
     if flavor is None:
         # default flavor
-        if isinstance(field_names, pa.Schema):
-            return DirectoryPartitioning(field_names)
-        elif isinstance(field_names, list):
-            return DirectoryPartitioning.discover(field_names)
-        elif field_names is None:
-            raise ValueError(
-                "For the default directory flavor, need to specify "
-                "'field_names' as Schema or list of field names")
+        if schema is not None:
+            if field_names is not None:
+                raise ValueError(
+                    "Cannot specify both 'schema' and 'field_names'")
+            return DirectoryPartitioning(schema)
+        elif field_names is not None:
+            if isinstance(field_names, list):
+                return DirectoryPartitioning.discover(field_names)
+            else:
+                raise ValueError(
+                    "Expected list of field names, got {0}".format(
+                        type(field_names)))
         else:
             raise ValueError(
-                "Expected Schema or list of field names, got {0}".format(
-                    type(field_names)))
+                "For the default directory flavor, need to specify "
+                "a Schema or a list of field names")
     elif flavor == 'hive':
-        if isinstance(field_names, pa.Schema):
-            return HivePartitioning(field_names)
-        elif field_names is None:
-            return HivePartitioning.discover()
+        if field_names is not None:
+            raise ValueError("Cannot specify 'field_names' for flavor 'hive'")
+        elif schema is not None:
+            if isinstance(schema, pa.Schema):
+                return HivePartitioning(schema)
+            else:
+                raise ValueError(
+                    "Expected Schema for 'schema', got {0}".format(
+                        type(schema)))
         else:
-            raise ValueError(
-                "Expected Schema or None for 'field_names', got {0}".format(
-                    type(field_names)))
+            return HivePartitioning.discover()
     else:
         raise ValueError("Unsupported flavor")
 
@@ -181,6 +192,8 @@ def _ensure_partitioning(scheme):
         pass
     elif isinstance(scheme, str):
         scheme = partitioning(flavor=scheme)
+    elif isinstance(scheme, list):
+        scheme = partitioning(field_names=scheme)
     elif isinstance(scheme, (Partitioning, PartitioningFactory)):
         pass
     else:
@@ -202,9 +215,10 @@ def source(path_or_paths, filesystem=None, partitioning=None,
         a list of paths.
     filesystem : FileSystem, default None
         By default will be inferred from the path.
-    partitioning : Partitioning(Factory) or str
+    partitioning : Partitioning(Factory), str or list of str
         The partitioning scheme specified with the ``partitioning()``
-        function. A flavor string can be used as shortcut.
+        function. A flavor string can be used as shortcut, and with a list of
+        field names a DirectoryPartitioning will be inferred.
     format : str
         Currently only "parquet" is supported.
 
@@ -273,9 +287,10 @@ def dataset(sources, filesystem=None, partitioning=None, format=None):
         case, the additional keywords will be ignored).
     filesystem : FileSystem, default None
         By default will be inferred from the path.
-    partitioning : Partitioning(Factory) or str
+    partitioning : Partitioning(Factory), str or list of str
         The partitioning scheme specified with the ``partitioning()``
-        function. A flavor string can be used as shortcut.
+        function. A flavor string can be used as shortcut, and with a list of
+        field names a DirectoryPartitioning will be inferred.
     format : str
         Currently only "parquet" is supported.
 

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
@@ -394,11 +394,15 @@ def test_partitioning_function():
 
     part = ds.partitioning(schema)
     assert isinstance(part, ds.DirectoryPartitioning)
-    part = ds.partitioning(names)
+    part = ds.partitioning(field_names=names)
     assert isinstance(part, ds.PartitioningFactory)
-    # needs schema or names
+    # needs schema or list of names
     with pytest.raises(ValueError):
         ds.partitioning()
+    with pytest.raises(ValueError, match="Expected list"):
+        ds.partitioning(field_names=schema)
+    with pytest.raises(ValueError, match="Cannot specify both"):
+        ds.partitioning(schema, field_names=schema)
 
     # Hive partitioning
 
@@ -409,6 +413,8 @@ def test_partitioning_function():
     # cannot pass list of names
     with pytest.raises(ValueError):
         ds.partitioning(names, flavor="hive")
+    with pytest.raises(ValueError, match="Cannot specify 'field_names'"):
+        ds.partitioning(field_names=names, flavor="hive")
 
     # unsupported flavor
     with pytest.raises(ValueError):