Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions python/pyspark/sql/connect/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def createDataFrame(
# If no schema supplied by user then get the names of columns only
if schema is None:
_cols = [str(x) if not isinstance(x, str) else x for x in data.columns]
elif isinstance(schema, (list, tuple)) and _num_cols < len(data.columns):
_cols = _cols + [f"_{i + 1}" for i in range(_num_cols, len(data.columns))]

@amaliujia amaliujia Mar 7, 2023

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact, I think we can probably do a bit more: we need to make sure the user-provided column names are not the same as the auto-generated ones.

That said, the probability of a collision is small, so maybe this is not a big concern.

_num_cols = len(_cols)

# Determine arrow types to coerce data when creating batches
if isinstance(schema, StructType):
Expand Down Expand Up @@ -309,6 +312,9 @@ def createDataFrame(

_inferred_schema = self._inferSchemaFromList(_data, _cols)

if _cols is not None and _num_cols < len(_cols):
_num_cols = len(_cols)

if _has_nulltype(_inferred_schema):
# For cases like createDataFrame([("Alice", None, 80.1)], schema)
# we can not infer the schema from the data itself.
Expand Down
5 changes: 0 additions & 5 deletions python/pyspark/sql/tests/connect/test_parity_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,6 @@ def test_infer_nested_schema(self):
def test_infer_schema(self):
super().test_infer_schema()

# TODO(SPARK-42022): createDataFrame should autogenerate missing column names
@unittest.skip("Fails in Spark Connect, should enable.")
def test_infer_schema_not_enough_names(self):
super().test_infer_schema_not_enough_names()

# TODO(SPARK-42020): createDataFrame with UDT
@unittest.skip("Fails in Spark Connect, should enable.")
def test_infer_schema_specification(self):
Expand Down