diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
index 04e4f9e88d0b..dd1c2d3c5105 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -235,6 +235,10 @@ def createDataFrame(
             # If no schema supplied by user then get the names of columns only
             if schema is None:
                 _cols = [str(x) if not isinstance(x, str) else x for x in data.columns]
+            elif isinstance(schema, (list, tuple)) and cast(int, _num_cols) < len(data.columns):
+                assert isinstance(_cols, list)
+                _cols.extend([f"_{i + 1}" for i in range(cast(int, _num_cols), len(data.columns))])
+                _num_cols = len(_cols)
 
             # Determine arrow types to coerce data when creating batches
             if isinstance(schema, StructType):
@@ -309,6 +313,9 @@ def createDataFrame(
 
             _inferred_schema = self._inferSchemaFromList(_data, _cols)
 
+            if _cols is not None and cast(int, _num_cols) < len(_cols):
+                _num_cols = len(_cols)
+
             if _has_nulltype(_inferred_schema):
                 # For cases like createDataFrame([("Alice", None, 80.1)], schema)
                 # we can not infer the schema from the data itself.
diff --git a/python/pyspark/sql/tests/connect/test_parity_types.py b/python/pyspark/sql/tests/connect/test_parity_types.py
index 3d54d488a5d6..67d5a17660e2 100644
--- a/python/pyspark/sql/tests/connect/test_parity_types.py
+++ b/python/pyspark/sql/tests/connect/test_parity_types.py
@@ -90,11 +90,6 @@ def test_infer_nested_schema(self):
     def test_infer_schema(self):
         super().test_infer_schema()
 
-    # TODO(SPARK-42022): createDataFrame should autogenerate missing column names
-    @unittest.skip("Fails in Spark Connect, should enable.")
-    def test_infer_schema_not_enough_names(self):
-        super().test_infer_schema_not_enough_names()
-
     # TODO(SPARK-42020): createDataFrame with UDT
     @unittest.skip("Fails in Spark Connect, should enable.")
     def test_infer_schema_specification(self):
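
The patch makes Spark Connect match classic PySpark behavior (and un-skips the
existing parity test for SPARK-42022): when a list/tuple schema supplies fewer
names than the data has columns, the remaining columns get autogenerated names
of the form "_<position>" (1-based), covering both the local-data path and the
pandas DataFrame path. A minimal sketch of the resulting behavior; the remote
address below is a placeholder, not part of this patch:

    # Assumes a Spark Connect server is reachable at this (placeholder) address.
    import pandas as pd
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

    # List-of-rows path: one name supplied for two columns, so the second
    # column name is autogenerated as "_2".
    df = spark.createDataFrame([("Alice", 1)], ["name"])
    print(df.columns)  # ['name', '_2']

    # pandas path: same autogeneration rule applies to the trailing column.
    pdf = pd.DataFrame({"x": [1], "y": [2]})
    df2 = spark.createDataFrame(pdf, schema=["a"])
    print(df2.columns)  # ['a', '_2']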