238 changes: 202 additions & 36 deletions python/pyspark/sql/dataframe.py
@@ -363,9 +363,18 @@ def schema(self) -> StructType:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.schema
StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])
StructType([StructField('age', LongType(), True), StructField('name', StringType(), True)])
"""
if self._schema is None:
try:
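A side note on why the expected output changed: createDataFrame infers Python int values as LongType, so the doctest above now expects LongType where the old example showed IntegerType. A minimal sketch, assuming an active SparkSession named spark, of pinning the narrower type with an explicit DDL schema:

>>> df = spark.createDataFrame([(14, "Tom")], schema="age INT, name STRING")
>>> df.schema
StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])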
@@ -571,29 +580,42 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =

Examples
--------
>>> df
DataFrame[age: int, name: string]
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
[Review comment — Member] ditto for indentation

>>> df.show()
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.show(2)
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
+---+-----+
only showing top 2 rows
>>> df.show(truncate=3)
+---+----+
|age|name|
+---+----+
|  2| Ali|
|  5| Bob|
| 14| Tom|
| 23| Ali|
| 16| Bob|
+---+----+
>>> df.show(vertical=True)
-RECORD 0-----
age | 2
name | Alice
age | 14
name | Tom
-RECORD 1-----
age | 5
name | Bob
age | 23
name | Alice
-RECORD 2-----
age | 16
name | Bob
"""

if not isinstance(n, int) or isinstance(n, bool):
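One variant not shown in the diff: truncate=False prints full, left-aligned cell contents instead of cutting them at 20 characters. A hedged sketch against the same df; the exact column padding is an assumption:

>>> df.show(truncate=False)
+---+-----+
|age|name |
+---+-----+
|14 |Tom  |
|23 |Alice|
|16 |Bob  |
+---+-----+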
@@ -798,8 +820,18 @@ def count(self) -> int:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.count()
2
3
"""
return int(self._jdf.count())

@@ -862,8 +894,18 @@ def take(self, num: int) -> List[Row]:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.take(2)
[Row(age=2, name='Alice'), Row(age=5, name='Bob')]
[Row(age=14, name='Tom'), Row(age=23, name='Alice')]
"""
return self.limit(num).collect()

@@ -878,8 +920,18 @@ def tail(self, num: int) -> List[Row]:

Examples
--------
>>> df.tail(1)
[Row(age=5, name='Bob')]
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.tail(2)
[Row(age=23, name='Alice'), Row(age=16, name='Bob')]
"""
with SCCallSiteSync(self._sc):
sock_info = self._jdf.tailToPython(num)
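A brief contrast of the two methods, as the implementation above suggests: take(num) runs limit(num).collect() and returns the first rows, while tail(num) collects the last rows to the driver, so both should be used with a small num. A sketch against the same df:

>>> df.take(1)
[Row(age=14, name='Tom')]
>>> df.tail(1)
[Row(age=16, name='Bob')]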
@@ -1179,6 +1231,16 @@ def distinct(self) -> "DataFrame":

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (23, "Alice")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 23|Alice|
+---+-----+
>>> df.distinct().count()
2
"""
@@ -1375,8 +1437,17 @@ def dtypes(self) -> List[Tuple[str, str]]:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"),
...     (23, "Alice")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
+---+-----+
>>> df.dtypes
[('age', 'int'), ('name', 'string')]
[('age', 'bigint'), ('name', 'string')]
"""
return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
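The 'bigint' entry follows from the same LongType inference noted under schema(). A minimal sketch, assuming the same df, of casting back if the former 'int' dtype is wanted:

>>> df.withColumn("age", df.age.cast("int")).dtypes
[('age', 'int'), ('name', 'string')]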

@@ -2743,7 +2814,20 @@ def fillna(

Examples
--------
>>> df4.na.fill(50).show()
Fill all null values with 50 when the data type of the column is an integer

>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
[Review comment — Member] indentation

>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  10|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+
>>> df.na.fill(50).show()
+---+------+-----+
|age|height| name|
+---+------+-----+
@@ -2753,7 +2837,19 @@
| 50|    50| null|
+---+------+-----+

>>> df5.na.fill(False).show()
Fill all null values with False when the data type of the column is a boolean

>>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None),
...     (None, "Mallory", True)], ["age", "name", "spy"])
>>> df.show()
+----+-------+----+
| age|   name| spy|
+----+-------+----+
|  10|  Alice|null|
|   5|    Bob|null|
|null|Mallory|true|
+----+-------+----+
>>> df.na.fill(False).show()
+----+-------+-----+
| age|   name|  spy|
+----+-------+-----+
@@ -2762,7 +2858,20 @@
|null|Mallory| true|
+----+-------+-----+
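
Not covered by the diff: fillna also accepts an optional subset of column names, so only those columns are filled. A hedged sketch reusing the age/height/name data from the first example:

>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.na.fill(50, subset=["age"]).show()
+---+------+-----+
|age|height| name|
+---+------+-----+
| 10|    80|Alice|
|  5|  null|  Bob|
| 50|  null|  Tom|
| 50|  null| null|
+---+------+-----+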

>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
Fill all null values in the 'age' column with 50 and in the 'name' column with "unknown"

>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  10|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+
>>> df.na.fill({'age': 50, 'name': 'unknown'}).show()
+---+------+-------+
|age|height|   name|
+---+------+-------+
@@ -2869,7 +2978,18 @@ def replace(  # type: ignore[misc]

Examples
--------
>>> df4.na.replace(10, 20).show()
>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  10|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+
>>> df.na.replace(10, 20).show()
+----+------+-----+
| age|height| name|
+----+------+-----+
@@ -2879,17 +2999,20 @@
|null|  null| null|
+----+------+-----+

>>> df4.na.replace('Alice', None).show()
+----+------+----+
| age|height|name|
+----+------+----+
|  10|    80|null|
|   5|  null| Bob|
|null|  null| Tom|
|null|  null|null|
+----+------+----+
Replace all instances of Alice with null

>>> df4.na.replace({'Alice': None}).show()
>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  10|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+
>>> df.na.replace('Alice', None).show()
+----+------+----+
| age|height|name|
+----+------+----+
@@ -2899,7 +3022,20 @@
|null|  null|null|
+----+------+----+
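
replace likewise takes an optional subset parameter. A minimal sketch, assuming the same df as above, that replaces 10 with 20 only where it appears in the 'age' column:

>>> df.na.replace(10, 20, subset=["age"]).show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  20|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+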

>>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
Replace all instances of Alice with 'A' and Bob with 'B' in the 'name' column

>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  10|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+

[Review comment — Contributor] Why do we need to create a duplicate DataFrame here? If we don't need it, better to delete it. replace() does not modify the DataFrame in place, so the one defined above can be reused, e.g.:

>>> df.na.replace('Alice', None).show()
+----+------+----+
| age|height|name|
+----+------+----+
|  10|    80|null|
|   5|  null| Bob|
|null|  null| Tom|
|null|  null|null|
+----+------+----+

>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  10|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+

>>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
+----+------+----+
|  10|    80|   A|
|   5|  null|   B|
|null|  null| Tom|
|null|  null|null|
+----+------+----+

>>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
+----+------+----+
@@ -3356,11 +3492,31 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.drop('age').collect()
[Row(name='Alice'), Row(name='Bob')]
[Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]

>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.drop(df.age).collect()
[Row(name='Alice'), Row(name='Bob')]
[Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]

[Review comment — Contributor] I think we don't need to create a new DataFrame here, since drop() doesn't remove the column in-place, e.g.:

>>> df.drop('age').show()
+-----+
| name|
+-----+
|  Tom|
|Alice|
|  Bob|
+-----+

>>> df.drop(df.age).show()
+-----+
| name|
+-----+
|  Tom|
|Alice|
|  Bob|
+-----+

>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
[Row(age=5, height=85, name='Bob')]

[Review comment — Contributor Author] I am not sure what these 3 inner joins do exactly. I don't see anywhere an instantiation of df2. What should I do with these 3 examples?

[Review comment — Member] I think it's showing a common example that joins and then drops the join key.
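
Since df2 is never instantiated in this diff, a minimal runnable sketch of the join-then-drop-the-join-key pattern; the hypothetical df2 and its values are assumptions chosen to reproduce the expected output:

>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df2 = spark.createDataFrame([(85, "Bob")], ["height", "name"])
>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
[Row(age=5, height=85, name='Bob')]

Passing the Column df.name rather than the string 'name' disambiguates which of the two name columns to drop after the join.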
@@ -3393,12 +3549,22 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
Parameters
----------
cols : str
new column names
new column names. The number of new names must match the number of columns in the initial DataFrame

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
...     (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.toDF('f1', 'f2').collect()
[Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')]
[Row(f1=14, f2='Tom'), Row(f1=23, f2='Alice'), Row(f1=16, f2='Bob')]
"""
jdf = self._jdf.toDF(self._jseq(cols))
return DataFrame(jdf, self.sparkSession)
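A hedged sketch of the length requirement stated above, using the same df: supply exactly one new name per existing column, e.g. by unpacking a list:

>>> names = ['f1', 'f2']
>>> df.toDF(*names).collect()
[Row(f1=14, f2='Tom'), Row(f1=23, f2='Alice'), Row(f1=16, f2='Bob')]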