apache · ueshin · Mar 7, 2023 · Mar 7, 2023
diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
@@ -235,6 +235,10 @@ def createDataFrame(
             # If no schema supplied by user then get the names of columns only
             if schema is None:
                 _cols = [str(x) if not isinstance(x, str) else x for x in data.columns]
+            elif isinstance(schema, (list, tuple)) and cast(int, _num_cols) < len(data.columns):
+                assert isinstance(_cols, list)
+                _cols.extend([f"_{i + 1}" for i in range(cast(int, _num_cols), len(data.columns))])
+                _num_cols = len(_cols)
 
             # Determine arrow types to coerce data when creating batches
             if isinstance(schema, StructType):
@@ -309,6 +313,9 @@ def createDataFrame(
 
             _inferred_schema = self._inferSchemaFromList(_data, _cols)
 
+            if _cols is not None and cast(int, _num_cols) < len(_cols):
+                _num_cols = len(_cols)
+
             if _has_nulltype(_inferred_schema):
                 # For cases like createDataFrame([("Alice", None, 80.1)], schema)
                 # we can not infer the schema from the data itself.

diff --git a/python/pyspark/sql/tests/connect/test_parity_types.py b/python/pyspark/sql/tests/connect/test_parity_types.py
@@ -90,11 +90,6 @@ def test_infer_nested_schema(self):
     def test_infer_schema(self):
         super().test_infer_schema()
 
-    # TODO(SPARK-42022): createDataFrame should autogenerate missing column names
-    @unittest.skip("Fails in Spark Connect, should enable.")
-    def test_infer_schema_not_enough_names(self):
-        super().test_infer_schema_not_enough_names()
-
     # TODO(SPARK-42020): createDataFrame with UDT
     @unittest.skip("Fails in Spark Connect, should enable.")
     def test_infer_schema_specification(self):