Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 22 additions & 25 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,8 +909,7 @@ def dropDuplicates(self, subset=None):
@since("1.3.1")
def dropna(self, how='any', thresh=None, subset=None):
"""Returns a new :class:`DataFrame` omitting rows with null values.

This is an alias for ``na.drop()``.
:func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.

:param how: 'any' or 'all'.
If 'any', drop a row if it contains any nulls.
Expand All @@ -920,13 +919,6 @@ def dropna(self, how='any', thresh=None, subset=None):
This overwrites the `how` parameter.
:param subset: optional list of column names to consider.

>>> df4.dropna().show()
+---+------+-----+
|age|height| name|
+---+------+-----+
| 10| 80|Alice|
+---+------+-----+

>>> df4.na.drop().show()
+---+------+-----+
|age|height| name|
Expand All @@ -952,6 +944,7 @@ def dropna(self, how='any', thresh=None, subset=None):
@since("1.3.1")
def fillna(self, value, subset=None):
"""Replace null values, alias for ``na.fill()``.
:func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.

:param value: int, long, float, string, or dict.
Value to replace null values with.
Expand All @@ -963,7 +956,7 @@ def fillna(self, value, subset=None):
For example, if `value` is a string, and subset contains a non-string column,
then the non-string column is simply ignored.

>>> df4.fillna(50).show()
>>> df4.na.fill(50).show()
+---+------+-----+
|age|height| name|
+---+------+-----+
Expand All @@ -973,16 +966,6 @@ def fillna(self, value, subset=None):
| 50| 50| null|
+---+------+-----+

>>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
+---+------+-------+
|age|height| name|
+---+------+-------+
| 10| 80| Alice|
| 5| null| Bob|
| 50| null| Tom|
| 50| null|unknown|
+---+------+-------+

>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+---+------+-------+
|age|height| name|
Expand Down Expand Up @@ -1014,6 +997,8 @@ def fillna(self, value, subset=None):
@since(1.4)
def replace(self, to_replace, value, subset=None):
"""Returns a new :class:`DataFrame` replacing a value with another value.
:func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are
aliases of each other.

:param to_replace: int, long, float, string, or list.
Value to be replaced.
Expand All @@ -1029,7 +1014,7 @@ def replace(self, to_replace, value, subset=None):
For example, if `value` is a string, and subset contains a non-string column,
then the non-string column is simply ignored.

>>> df4.replace(10, 20).show()
>>> df4.na.replace(10, 20).show()
+----+------+-----+
| age|height| name|
+----+------+-----+
Expand All @@ -1039,7 +1024,7 @@ def replace(self, to_replace, value, subset=None):
|null| null| null|
+----+------+-----+

>>> df4.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
>>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
+----+------+----+
Expand Down Expand Up @@ -1090,9 +1075,9 @@ def replace(self, to_replace, value, subset=None):
@since(1.4)
def corr(self, col1, col2, method=None):
"""
Calculates the correlation of two columns of a DataFrame as a double value. Currently only
supports the Pearson Correlation Coefficient.
:func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases.
Calculates the correlation of two columns of a DataFrame as a double value.
Currently only supports the Pearson Correlation Coefficient.
:func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.

:param col1: The name of the first column
:param col2: The name of the second column
Expand Down Expand Up @@ -1241,7 +1226,10 @@ def toPandas(self):
import pandas as pd
return pd.DataFrame.from_records(self.collect(), columns=self.columns)

##########################################################################################
# Pandas compatibility
##########################################################################################

# Bind snake_case names to the existing camelCase methods so both spellings work.
groupby = groupBy
drop_duplicates = dropDuplicates

Expand All @@ -1261,6 +1249,8 @@ def _to_scala_map(sc, jm):

class DataFrameNaFunctions(object):
"""Functionality for working with missing data in :class:`DataFrame`.

.. versionadded:: 1.4
"""

def __init__(self, df):
Expand All @@ -1276,9 +1266,16 @@ def fill(self, value, subset=None):

fill.__doc__ = DataFrame.fillna.__doc__

def replace(self, to_replace, value, subset=None):
    # Thin pass-through: forward the arguments unchanged to the wrapped
    # DataFrame's own ``replace`` and hand back whatever it returns.
    frame = self.df
    result = frame.replace(to_replace, value, subset)
    return result

replace.__doc__ = DataFrame.replace.__doc__


class DataFrameStatFunctions(object):
"""Functionality for statistical functions on :class:`DataFrame`.

.. versionadded:: 1.4
"""

def __init__(self, df):
Expand Down
1 change: 0 additions & 1 deletion python/pyspark/sql/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def _to_java_cols(cols):


class Window(object):

"""
Utility functions for defining window in DataFrames.

Expand Down