1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
@@ -510,6 +510,7 @@ def __hash__(self):
"pyspark.sql.connect.window",
"pyspark.sql.connect.column",
"pyspark.sql.connect.readwriter",
"pyspark.sql.connect.dataframe",
# unittests
"pyspark.sql.tests.connect.test_connect_plan",
"pyspark.sql.tests.connect.test_connect_basic",
99 changes: 98 additions & 1 deletion python/pyspark/sql/connect/dataframe.py
@@ -35,7 +35,7 @@
import warnings
from collections.abc import Iterable

from pyspark import _NoValue
from pyspark import _NoValue, SparkContext, SparkConf
from pyspark._globals import _NoValueType
from pyspark.sql.types import DataType, StructType, Row

@@ -1373,3 +1373,100 @@ def sampleBy(


DataFrameStatFunctions.__doc__ = PySparkDataFrameStatFunctions.__doc__


def _test() -> None:
    import os
    import sys
    import doctest
    from pyspark.sql import SparkSession as PySparkSession
    from pyspark.testing.connectutils import should_test_connect, connect_requirement_message

    os.chdir(os.environ["SPARK_HOME"])

    if should_test_connect:
        import pyspark.sql.connect.dataframe

        globs = pyspark.sql.connect.dataframe.__dict__.copy()
        # Workaround to create a regular Spark session
        sc = SparkContext("local[4]", "sql.connect.dataframe tests", conf=SparkConf())
        globs["_spark"] = PySparkSession(
            sc, options={"spark.app.name": "sql.connect.dataframe tests"}
        )

        # TODO(SPARK-41819): Implement RDD.getNumPartitions
        del pyspark.sql.connect.dataframe.DataFrame.coalesce.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.repartition.__doc__

        # TODO(SPARK-41820): Fix SparkConnectException: requirement failed
        del pyspark.sql.connect.dataframe.DataFrame.createOrReplaceGlobalTempView.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.createOrReplaceTempView.__doc__

        # TODO(SPARK-41821): Fix DataFrame.describe
        del pyspark.sql.connect.dataframe.DataFrame.describe.__doc__

        # TODO(SPARK-41823): ambiguous column names
        del pyspark.sql.connect.dataframe.DataFrame.drop.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.join.__doc__

        # TODO(SPARK-41824): DataFrame.explain format is different
        del pyspark.sql.connect.dataframe.DataFrame.explain.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.hint.__doc__

        # TODO(SPARK-41825): DataFrame.show formatting int as double
        del pyspark.sql.connect.dataframe.DataFrame.fillna.__doc__
        del pyspark.sql.connect.dataframe.DataFrameNaFunctions.replace.__doc__
        del pyspark.sql.connect.dataframe.DataFrameNaFunctions.fill.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.replace.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.intersect.__doc__

        # TODO(SPARK-41826): Implement DataFrame.readStream
        del pyspark.sql.connect.dataframe.DataFrame.isStreaming.__doc__

        # TODO(SPARK-41827): groupBy requires all cols be Column or str
        del pyspark.sql.connect.dataframe.DataFrame.groupBy.__doc__

        # TODO(SPARK-41828): Implement creating empty DataFrame
        del pyspark.sql.connect.dataframe.DataFrame.isEmpty.__doc__

        # TODO(SPARK-41829): Add DataFrame sort ordering
        del pyspark.sql.connect.dataframe.DataFrame.sort.__doc__
        del pyspark.sql.connect.dataframe.DataFrame.sortWithinPartitions.__doc__

        # TODO(SPARK-41830): fix sample parameters
        del pyspark.sql.connect.dataframe.DataFrame.sample.__doc__

        # TODO(SPARK-41831): fix transform to accept ColumnReference
        del pyspark.sql.connect.dataframe.DataFrame.transform.__doc__

        # TODO(SPARK-41832): fix unionByName
        del pyspark.sql.connect.dataframe.DataFrame.unionByName.__doc__

        # TODO(SPARK-41818): Support saveAsTable
        del pyspark.sql.connect.dataframe.DataFrame.write.__doc__

        # Creates a remote Spark session.
        os.environ["SPARK_REMOTE"] = "sc://localhost"
        globs["spark"] = PySparkSession.builder.remote("sc://localhost").getOrCreate()

        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.connect.dataframe,
            globs=globs,
            optionflags=doctest.ELLIPSIS
            | doctest.NORMALIZE_WHITESPACE
            | doctest.IGNORE_EXCEPTION_DETAIL,
        )

        globs["spark"].stop()
        globs["_spark"].stop()
        if failure_count:
            sys.exit(-1)
    else:
        print(
            f"Skipping pyspark.sql.connect.dataframe doctests: {connect_requirement_message}",
            file=sys.stderr,
        )


if __name__ == "__main__":
    _test()
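
A note on the mechanism behind the long run of `del ...__doc__` statements above: `doctest.testmod` only collects examples from objects that still carry a docstring, so deleting a method's `__doc__` is a lightweight way to exclude its examples until the corresponding Connect feature is implemented. A minimal standalone sketch, not taken from PySpark (the function names below are invented for illustration):

    import doctest


    def supported():
        """
        >>> supported()
        1
        """
        return 1


    def unsupported():
        """
        >>> unsupported()
        999
        """
        return 0


    # Deleting the docstring removes the failing example from doctest's view,
    # mirroring the `del ...__doc__` lines in _test() above.
    del unsupported.__doc__

    print(doctest.testmod(verbose=False))
    # expected: TestResults(failed=0, attempted=1) -- only supported()'s example ran
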
10 changes: 5 additions & 5 deletions python/pyspark/sql/dataframe.py
@@ -189,7 +189,7 @@ def sparkSession(self) -> "SparkSession":
--------
>>> df = spark.range(1)
>>> type(df.sparkSession)
<class 'pyspark.sql.session.SparkSession'>
<class '...session.SparkSession'>
"""
return self._session

@@ -233,7 +233,7 @@ def na(self) -> "DataFrameNaFunctions":
--------
>>> df = spark.sql("SELECT 1 AS c1, int(NULL) AS c2")
>>> type(df.na)
<class 'pyspark.sql.dataframe.DataFrameNaFunctions'>
<class '...dataframe.DataFrameNaFunctions'>

Replace the missing values as 2.

@@ -264,7 +264,7 @@ def stat(self) -> "DataFrameStatFunctions":
>>> import pyspark.sql.functions as f
>>> df = spark.range(3).withColumn("c", f.expr("id + 1"))
>>> type(df.stat)
<class 'pyspark.sql.dataframe.DataFrameStatFunctions'>
<class '...dataframe.DataFrameStatFunctions'>
>>> df.stat.corr("id", "c")
1.0
"""
@@ -355,7 +355,7 @@ def createTempView(self, name: str) -> None:

Throw an exception if the table already exists.

>>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL
>>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL, +SKIP
Traceback (most recent call last):
...
AnalysisException: "Temporary table 'people' already exists;"
@@ -438,7 +438,7 @@ def createGlobalTempView(self, name: str) -> None:

Throws an exception if the global temporary view already exists.

>>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL
>>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL, +SKIP
Traceback (most recent call last):
...
AnalysisException: "Temporary table 'people' already exists;"
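
The two view-creation doctests above gain a `+SKIP` directive on top of the existing `+IGNORE_EXCEPTION_DETAIL`: the expected `AnalysisException` traceback stays in the documentation, but the example is no longer executed during the shared doctest run. A minimal standalone sketch (not from the PR; the function and messages are made up) of how the two directives behave:

    import doctest


    def make_view():
        """
        With IGNORE_EXCEPTION_DETAIL only the exception type has to match,
        not the message, so this example still runs and passes:

        >>> raise ValueError("details differ per backend")  # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
        ...
        ValueError: some other message

        A SKIP-ed example is never executed; its expected traceback is kept
        purely as documentation:

        >>> raise ValueError("never raised")  # doctest: +SKIP
        Traceback (most recent call last):
        ...
        ValueError: never raised
        """


    print(doctest.testmod(verbose=False))
    # expected: TestResults(failed=0, attempted=1) -- only the first example is run
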