diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index fe01b92036377..29f98ed36e2ec 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -200,7 +200,9 @@ jobs:
architecture: x64
- name: Install Python linter dependencies
run: |
- pip3 install flake8 sphinx numpy
+ # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+ # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+ pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
@@ -218,7 +220,9 @@ jobs:
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
- pip install sphinx mkdocs numpy
+ # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+ # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+ pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
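
Both linter jobs now pin Sphinx with the requirement specifier `'sphinx<3.1.0'`. As a quick illustration of what that pin accepts, a sketch using the third-party `packaging` library (an assumption on our part; it ships with pip and is not part of this patch):

```python
# Hypothetical check of the 'sphinx<3.1.0' pin used above; `packaging` is an
# assumption (it is bundled with pip), the version strings come from this patch.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("<3.1.0")
for candidate in ("2.3.1", "3.0.4", "3.1.0", "3.1.2"):
    # 3.0.4 satisfies the pin; 3.1.0 and later do not (SPARK-32407).
    print(candidate, Version(candidate) in spec)
```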
diff --git a/.gitignore b/.gitignore
index 198fdee39be95..0d8addeb10e21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,6 +64,7 @@ python/lib/pyspark.zip
python/.eggs/
python/deps
python/docs/_site/
+python/docs/source/reference/api/
python/test_coverage/coverage_data
python/test_coverage/htmlcov
python/pyspark/python
diff --git a/LICENSE b/LICENSE
index af2cdd275d28d..8cec4f5ea5379 100644
--- a/LICENSE
+++ b/LICENSE
@@ -223,7 +223,7 @@ Python Software Foundation License
----------------------------------
pyspark/heapq3.py
-python/docs/_static/copybutton.js
+python/docs/source/_static/copybutton.js
BSD 3-Clause
------------
diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile
index 44d602415b262..a02a6b7bccf27 100644
--- a/dev/create-release/spark-rm/Dockerfile
+++ b/dev/create-release/spark-rm/Dockerfile
@@ -33,7 +33,10 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true
# These arguments are just for reuse and not really meant to be customized.
ARG APT_INSTALL="apt-get install --no-install-recommends -y"
-ARG PIP_PKGS="sphinx==2.3.1 mkdocs==1.0.4 numpy==1.18.1"
+# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+# See also https://github.com/sphinx-doc/sphinx/issues/7551.
+# We should use the latest Sphinx version once this is fixed.
+ARG PIP_PKGS="sphinx==3.0.4 mkdocs==1.0.4 numpy==1.18.1 pydata_sphinx_theme==0.3.1"
ARG GEM_PKGS="jekyll:4.0.0 jekyll-redirect-from:0.16.0 rouge:3.15.0"
# Install extra needed repos and refresh.
diff --git a/dev/lint-python b/dev/lint-python
index 1fddbfa64b32c..48dd94e36fae8 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -173,7 +173,6 @@ function sphinx_test {
return
fi
- # TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
@@ -181,6 +180,23 @@ function sphinx_test {
return
fi
+ # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+ # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+ PYTHON_HAS_SPHINX_3_0=$("$PYTHON_EXECUTABLE" -c 'from distutils.version import LooseVersion; import sphinx; print(LooseVersion(sphinx.__version__) < LooseVersion("3.1.0"))')
+ if [[ "$PYTHON_HAS_SPHINX_3_0" == "False" ]]; then
+ echo "$PYTHON_EXECUTABLE has Sphinx 3.1+ installed but it requires lower then 3.1. Skipping Sphinx build for now."
+ echo
+ return
+ fi
+
+ # TODO(SPARK-32391): Install pydata_sphinx_theme in Jenkins machines
+ PYTHON_HAS_THEME=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("pydata_sphinx_theme") is not None)')
+ if [[ "$PYTHON_HAS_THEME" == "False" ]]; then
+ echo "$PYTHON_EXECUTABLE does not have pydata_sphinx_theme installed. Skipping Sphinx build for now."
+ echo
+ return
+ fi
+
echo "starting $SPHINX_BUILD tests..."
pushd python/docs &> /dev/null
make clean &> /dev/null
diff --git a/dev/requirements.txt b/dev/requirements.txt
index baea9213dbc97..a862a6e986791 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -3,3 +3,4 @@ jira==1.0.3
PyGithub==1.26.0
Unidecode==0.04.19
sphinx
+pydata_sphinx_theme
diff --git a/dev/tox.ini b/dev/tox.ini
index ba5df084daad7..e25595aa6c9a6 100644
--- a/dev/tox.ini
+++ b/dev/tox.ini
@@ -16,4 +16,4 @@
[pycodestyle]
ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504
max-line-length=100
-exclude=python/pyspark/cloudpickle/*.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
+exclude=python/pyspark/cloudpickle/*.py,heapq3.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
diff --git a/docs/README.md b/docs/README.md
index 22039871cf63d..e2002a66b0433 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -57,8 +57,13 @@ Note: Other versions of roxygen2 might work in SparkR documentation generation b
To generate API docs for any language, you'll need to install these libraries:
+<!--
+TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+See also https://github.com/sphinx-doc/sphinx/issues/7551.
+-->
+
```sh
-$ sudo pip install sphinx mkdocs numpy
+$ sudo pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
```
## Generating the Documentation HTML
diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb
index 8e2a06e4bc9a8..17da22bf8a433 100644
--- a/docs/_plugins/copy_api_dirs.rb
+++ b/docs/_plugins/copy_api_dirs.rb
@@ -126,8 +126,8 @@
puts "Making directory api/python"
mkdir_p "api/python"
- puts "cp -r ../python/docs/_build/html/. api/python"
- cp_r("../python/docs/_build/html/.", "api/python")
+ puts "cp -r ../python/docs/build/html/. api/python"
+ cp_r("../python/docs/build/html/.", "api/python")
end
if not (ENV['SKIP_RDOC'] == '1')
diff --git a/docs/img/spark-logo-reverse.png b/docs/img/spark-logo-reverse.png
new file mode 100644
index 0000000000000..a3e4ed4bb3d08
Binary files /dev/null and b/docs/img/spark-logo-reverse.png differ
diff --git a/python/docs/Makefile b/python/docs/Makefile
index 4272b7488d9a0..763f493a0eb58 100644
--- a/python/docs/Makefile
+++ b/python/docs/Makefile
@@ -3,8 +3,8 @@
# You can set these variables from the command line.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
-SOURCEDIR ?= .
-BUILDDIR ?= _build
+SOURCEDIR ?= source
+BUILDDIR ?= build
export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9-src.zip)
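
With `SOURCEDIR` and `BUILDDIR` switched to the conventional `source`/`build` layout, the default `make html` target is equivalent to invoking `sphinx-build -M html source build` from `python/docs`. A rough Python equivalent, assuming `sphinx-build` is on the PATH (the subprocess call is our illustration, not part of the patch):

```python
# Rough Python equivalent of `make html` under the new source/build layout.
import subprocess

subprocess.run(
    ["sphinx-build", "-M", "html", "source", "build"],  # SOURCEDIR, BUILDDIR
    cwd="python/docs",
    check=True,
)
```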
diff --git a/python/docs/_static/pyspark.css b/python/docs/_static/pyspark.css
deleted file mode 100644
index 41106f2f6e26d..0000000000000
--- a/python/docs/_static/pyspark.css
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-body {
- background-color: #ffffff;
-}
-
-div.sphinxsidebar {
- width: 274px;
-}
-
-div.bodywrapper {
- margin: 0 0 0 274px;
-}
-
-div.sphinxsidebar ul {
- margin-right: 10px;
-}
-
-div.sphinxsidebar li a {
- word-break: break-all;
-}
-
-span.pys-tag {
- font-size: 11px;
- font-weight: bold;
- margin: 0 0 0 2px;
- padding: 1px 3px 1px 3px;
- -moz-border-radius: 3px;
- -webkit-border-radius: 3px;
- border-radius: 3px;
- text-align: center;
- text-decoration: none;
-}
-
-span.pys-tag-experimental {
- background-color: rgb(37, 112, 128);
- color: rgb(255, 255, 255);
-}
-
-span.pys-tag-deprecated {
- background-color: rgb(238, 238, 238);
- color: rgb(62, 67, 73);
-}
-
-div.pys-note-experimental {
- background-color: rgb(88, 151, 165);
- border-color: rgb(59, 115, 127);
- color: rgb(255, 255, 255);
-}
-
-div.pys-note-deprecated {
-}
-
-.hasTooltip {
- position:relative;
-}
-.hasTooltip span {
- display:none;
-}
-
-.hasTooltip:hover span.tooltip {
- display: inline-block;
- -moz-border-radius: 2px;
- -webkit-border-radius: 2px;
- border-radius: 2px;
- background-color: rgb(250, 250, 250);
- color: rgb(68, 68, 68);
- font-weight: normal;
- box-shadow: 1px 1px 3px rgb(127, 127, 127);
- position: absolute;
- padding: 0 3px 0 3px;
- top: 1.3em;
- left: 14px;
- z-index: 9999
-}
diff --git a/python/docs/_static/pyspark.js b/python/docs/_static/pyspark.js
deleted file mode 100644
index 75e4c42492a48..0000000000000
--- a/python/docs/_static/pyspark.js
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-$(function (){
-
- function startsWith(s, prefix) {
- return s && s.indexOf(prefix) === 0;
- }
-
- function buildSidebarLinkMap() {
- var linkMap = {};
- $('div.sphinxsidebar a.reference.internal').each(function (i,a) {
- var href = $(a).attr('href');
- if (startsWith(href, '#module-')) {
- var id = href.substr(8);
- linkMap[id] = [$(a), null];
- }
- })
- return linkMap;
- };
-
- function getAdNoteDivs(dd) {
- var noteDivs = {};
- dd.find('> div.admonition.note > p.last').each(function (i, p) {
- var text = $(p).text();
- if (!noteDivs.experimental && startsWith(text, 'Experimental')) {
- noteDivs.experimental = $(p).parent();
- }
- if (!noteDivs.deprecated && startsWith(text, 'Deprecated')) {
- noteDivs.deprecated = $(p).parent();
- }
- });
- return noteDivs;
- }
-
- function getParentId(name) {
- var last_idx = name.lastIndexOf('.');
- return last_idx == -1? '': name.substr(0, last_idx);
- }
-
- function buildTag(text, cls, tooltip) {
- return '<span class="pys-tag ' + cls + ' hasTooltip">' + text + '<span class="tooltip">'
- + tooltip + '</span></span>'
- }
-
-
- var sidebarLinkMap = buildSidebarLinkMap();
-
- $('dl.class, dl.function').each(function (i,dl) {
-
- dl = $(dl);
- dt = dl.children('dt').eq(0);
- dd = dl.children('dd').eq(0);
- var id = dt.attr('id');
- var desc = dt.find('> .descname').text();
- var adNoteDivs = getAdNoteDivs(dd);
-
- if (id) {
- var parent_id = getParentId(id);
-
- var r = sidebarLinkMap[parent_id];
- if (r) {
- if (r[1] === null) {
- r[1] = $('<ul/>');
- r[0].parent().append(r[1]);
- }
- var tags = '';
- if (adNoteDivs.experimental) {
- tags += buildTag('E', 'pys-tag-experimental', 'Experimental');
- adNoteDivs.experimental.addClass('pys-note pys-note-experimental');
- }
- if (adNoteDivs.deprecated) {
- tags += buildTag('D', 'pys-tag-deprecated', 'Deprecated');
- adNoteDivs.deprecated.addClass('pys-note pys-note-deprecated');
- }
- var li = $('<li/>');
- var a = $('<a href="#' + id + '">' + desc + '</a>');
- li.append(a);
- li.append(tags);
- r[1].append(li);
- sidebarLinkMap[id] = [a, null];
- }
- }
- });
-});
diff --git a/python/docs/_templates/layout.html b/python/docs/_templates/layout.html
deleted file mode 100644
index ab36ebababf88..0000000000000
--- a/python/docs/_templates/layout.html
+++ /dev/null
@@ -1,6 +0,0 @@
-{% extends "!layout.html" %}
-{% set script_files = script_files + ["_static/pyspark.js"] %}
-{% set css_files = css_files + ['_static/pyspark.css'] %}
-{% block rootrellink %}
- {{ super() }}
-{% endblock %}
diff --git a/python/docs/index.rst b/python/docs/index.rst
deleted file mode 100644
index 6e059264e6bbb..0000000000000
--- a/python/docs/index.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-.. pyspark documentation master file, created by
- sphinx-quickstart on Thu Aug 28 15:17:47 2014.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to Spark Python API Docs!
-===================================
-
-Contents:
-
-.. toctree::
- :maxdepth: 2
-
- pyspark
- pyspark.sql
- pyspark.streaming
- pyspark.ml
- pyspark.mllib
- pyspark.resource
-
-
-Core classes:
----------------
-
- :class:`pyspark.SparkContext`
-
- Main entry point for Spark functionality.
-
- :class:`pyspark.RDD`
-
- A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
-
- :class:`pyspark.streaming.StreamingContext`
-
- Main entry point for Spark Streaming functionality.
-
- :class:`pyspark.streaming.DStream`
-
- A Discretized Stream (DStream), the basic abstraction in Spark Streaming.
-
- :class:`pyspark.sql.SparkSession`
-
- Main entry point for DataFrame and SQL functionality.
-
- :class:`pyspark.sql.DataFrame`
-
- A distributed collection of data grouped into named columns.
-
-
-Indices and tables
-==================
-
-* :ref:`search`
diff --git a/python/docs/make2.bat b/python/docs/make2.bat
index 7955a83051b8e..2f87032820f42 100644
--- a/python/docs/make2.bat
+++ b/python/docs/make2.bat
@@ -5,8 +5,8 @@ REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
-set SOURCEDIR=.
-set BUILDDIR=_build
+set SOURCEDIR=source
+set BUILDDIR=build
set PYTHONPATH=..;..\lib\py4j-0.10.9-src.zip
diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst
deleted file mode 100644
index e31dfddd5988e..0000000000000
--- a/python/docs/pyspark.ml.rst
+++ /dev/null
@@ -1,122 +0,0 @@
-pyspark.ml package
-==================
-
-ML Pipeline APIs
-----------------
-
-.. automodule:: pyspark.ml
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.param module
------------------------
-
-.. automodule:: pyspark.ml.param
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.feature module
--------------------------
-
-.. automodule:: pyspark.ml.feature
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.classification module
---------------------------------
-
-.. automodule:: pyspark.ml.classification
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.clustering module
-----------------------------
-
-.. automodule:: pyspark.ml.clustering
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.functions module
-----------------------------
-
-.. automodule:: pyspark.ml.functions
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.linalg module
-----------------------------
-
-.. automodule:: pyspark.ml.linalg
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.recommendation module
---------------------------------
-
-.. automodule:: pyspark.ml.recommendation
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.regression module
-----------------------------
-
-.. automodule:: pyspark.ml.regression
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.stat module
-----------------------
-
-.. automodule:: pyspark.ml.stat
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.tuning module
-------------------------
-
-.. automodule:: pyspark.ml.tuning
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.evaluation module
-----------------------------
-
-.. automodule:: pyspark.ml.evaluation
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.fpm module
-----------------------------
-
-.. automodule:: pyspark.ml.fpm
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.image module
-----------------------------
-
-.. automodule:: pyspark.ml.image
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.ml.util module
-----------------------------
-
-.. automodule:: pyspark.ml.util
- :members:
- :undoc-members:
- :inherited-members:
diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst
deleted file mode 100644
index 2d54ab118b94b..0000000000000
--- a/python/docs/pyspark.mllib.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-pyspark.mllib package
-=====================
-
-pyspark.mllib.classification module
------------------------------------
-
-.. automodule:: pyspark.mllib.classification
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.mllib.clustering module
--------------------------------
-
-.. automodule:: pyspark.mllib.clustering
- :members:
- :undoc-members:
-
-pyspark.mllib.evaluation module
--------------------------------
-
-.. automodule:: pyspark.mllib.evaluation
- :members:
- :undoc-members:
-
-pyspark.mllib.feature module
--------------------------------
-
-.. automodule:: pyspark.mllib.feature
- :members:
- :undoc-members:
- :show-inheritance:
-
-pyspark.mllib.fpm module
-------------------------
-
-.. automodule:: pyspark.mllib.fpm
- :members:
- :undoc-members:
-
-pyspark.mllib.linalg module
----------------------------
-
-.. automodule:: pyspark.mllib.linalg
- :members:
- :undoc-members:
- :show-inheritance:
-
-pyspark.mllib.linalg.distributed module
----------------------------------------
-
-.. automodule:: pyspark.mllib.linalg.distributed
- :members:
- :undoc-members:
- :show-inheritance:
-
-pyspark.mllib.random module
----------------------------
-
-.. automodule:: pyspark.mllib.random
- :members:
- :undoc-members:
-
-pyspark.mllib.recommendation module
------------------------------------
-
-.. automodule:: pyspark.mllib.recommendation
- :members:
- :undoc-members:
-
-pyspark.mllib.regression module
--------------------------------
-
-.. automodule:: pyspark.mllib.regression
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.mllib.stat module
--------------------------
-
-.. automodule:: pyspark.mllib.stat
- :members:
- :undoc-members:
-
-pyspark.mllib.tree module
--------------------------
-
-.. automodule:: pyspark.mllib.tree
- :members:
- :undoc-members:
- :inherited-members:
-
-pyspark.mllib.util module
--------------------------
-
-.. automodule:: pyspark.mllib.util
- :members:
- :undoc-members:
diff --git a/python/docs/pyspark.resource.rst b/python/docs/pyspark.resource.rst
deleted file mode 100644
index 7f3a79b9e5b52..0000000000000
--- a/python/docs/pyspark.resource.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-pyspark.resource module
-=======================
-
-Module Contents
----------------
-
-.. automodule:: pyspark.resource
- :members:
- :undoc-members:
- :inherited-members:
-
diff --git a/python/docs/pyspark.rst b/python/docs/pyspark.rst
deleted file mode 100644
index 402d6ce9eb016..0000000000000
--- a/python/docs/pyspark.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-pyspark package
-===============
-
-Subpackages
------------
-
-.. toctree::
- :maxdepth: 1
-
- pyspark.sql
- pyspark.streaming
- pyspark.ml
- pyspark.mllib
- pyspark.resource
-
-Contents
---------
-
-.. automodule:: pyspark
- :members:
- :undoc-members:
diff --git a/python/docs/pyspark.sql.rst b/python/docs/pyspark.sql.rst
deleted file mode 100644
index 406ada701941a..0000000000000
--- a/python/docs/pyspark.sql.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-pyspark.sql module
-==================
-
-Module Contents
----------------
-
-.. automodule:: pyspark.sql
- :members:
- :undoc-members:
- :inherited-members:
- :exclude-members: builder
-.. We need `exclude-members` to prevent default description generations
- as a workaround for old Sphinx (< 1.6.6).
-
-pyspark.sql.types module
-------------------------
-.. automodule:: pyspark.sql.types
- :members:
- :undoc-members:
-
-pyspark.sql.functions module
-----------------------------
-.. automodule:: pyspark.sql.functions
- :members:
- :undoc-members:
-
-pyspark.sql.avro.functions module
----------------------------------
-.. automodule:: pyspark.sql.avro.functions
- :members:
- :undoc-members:
-
-pyspark.sql.streaming module
-----------------------------
-.. automodule:: pyspark.sql.streaming
- :members:
- :undoc-members:
diff --git a/python/docs/pyspark.streaming.rst b/python/docs/pyspark.streaming.rst
deleted file mode 100644
index f7df6438b9169..0000000000000
--- a/python/docs/pyspark.streaming.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-pyspark.streaming module
-========================
-
-Module contents
----------------
-
-.. automodule:: pyspark.streaming
- :members:
- :undoc-members:
- :show-inheritance:
-
-pyspark.streaming.kinesis module
---------------------------------
-.. automodule:: pyspark.streaming.kinesis
- :members:
- :undoc-members:
- :show-inheritance:
diff --git a/python/docs/_static/copybutton.js b/python/docs/source/_static/copybutton.js
similarity index 100%
rename from python/docs/_static/copybutton.js
rename to python/docs/source/_static/copybutton.js
diff --git a/python/docs/source/_static/css/pyspark.css b/python/docs/source/_static/css/pyspark.css
new file mode 100644
index 0000000000000..2fd8720e2fa0d
--- /dev/null
+++ b/python/docs/source/_static/css/pyspark.css
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+/* PySpark style CSS overwrite */
+
+/* The Lato font inherited from the parent theme does not render bold weights distinctly, so default to 'Source Sans Pro'. */
+body {
+ font-family:"Source Sans Pro",sans-serif!important;
+}
+
+h1,h2 {
+ color:#1B5162!important;
+}
+
+h3 {
+ color: #555555;
+}
+
+/* Top menu */
+#navbar-main {
+ background: #1B5162!important;
+ box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.11);
+}
+
+#navbar-main-elements li.nav-item a {
+ color: rgba(255, 255, 255, 0.8);
+}
+
+#navbar-main-elements li.active a {
+ font-weight: 600;
+ color: #FFFFFF!important;
+}
+
+.col-9 {
+ flex: 0 0 80%;
+ max-width: 80%;
+}
+
+/* Left panel size */
+@media (min-width: 768px) {
+ .col-md-3 {
+ flex: 0 0 20%;
+ max-width: 20%;
+ }
+}
+
+/* Top menu right button */
+.navbar-toggler {
+ color:rgba(255,255,255,.5)!important;
+ border-color:rgba(255,255,255,.5)!important;
+}
+
+.navbar-toggler-icon {
+ background-image:url("data:image/svg+xml;charset=utf-8,%3Csvg xmlns='http://www.w3.org/2000/svg' width='30' height='30'%3E%3Cpath stroke='rgba(255,255,255,.5)' stroke-linecap='round' stroke-miterlimit='10' stroke-width='2' d='M4 7h22M4 15h22M4 23h22'/%3E%3C/svg%3E")!important;
+}
+
+/* Left bar list colors */
+.bd-sidebar .nav>.active>a {
+ color: #1B5162!important;
+}
+
+.bd-sidebar .nav>li>a:hover {
+ color: #1B5162!important;
+}
+
+.bd-sidebar .nav>.active:hover>a,.bd-sidebar .nav>.active>a {
+ color: #1B5162!important;
+}
+
+.bd-sidebar .nav>li>ul>.active:hover>a,.bd-sidebar .nav>li>ul>.active>a {
+ color: #1B5162!important;
+}
+
+/* Right bar list colors */
+.toc-entry>.nav-link.active {
+ color: #1B5162!important;
+ border-left: 2px solid #1B5162!important;
+}
+
diff --git a/python/docs/source/_templates/class_with_docs.rst b/python/docs/source/_templates/class_with_docs.rst
new file mode 100644
index 0000000000000..7c37b83c0e90e
--- /dev/null
+++ b/python/docs/source/_templates/class_with_docs.rst
@@ -0,0 +1,79 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+{{ objname }}
+{{ underline }}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+
+ {% if '__init__' in methods %}
+ {% set caught_result = methods.remove('__init__') %}
+ {% endif %}
+
+ {% block methods_summary %}
+ {% if methods %}
+
+ .. rubric:: Methods
+
+ .. autosummary::
+ {% for item in methods %}
+ ~{{ name }}.{{ item }}
+ {%- endfor %}
+
+ {% endif %}
+ {% endblock %}
+
+ {% block attributes_summary %}
+ {% if attributes %}
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+ {% for item in attributes %}
+ ~{{ name }}.{{ item }}
+ {%- endfor %}
+
+ {% endif %}
+ {% endblock %}
+
+ {% block methods_documentation %}
+ {% if methods %}
+
+ .. rubric:: Methods Documentation
+
+ {% for item in methods %}
+ .. automethod:: {{ item }}
+ {%- endfor %}
+
+ {% endif %}
+ {% endblock %}
+
+ {% block attributes_documentation %}
+ {% if attributes %}
+
+ .. rubric:: Attributes Documentation
+
+ {% for item in attributes %}
+ .. autoattribute:: {{ item }}
+ {%- endfor %}
+
+ {% endif %}
+ {% endblock %}
+
diff --git a/python/docs/conf.py b/python/docs/source/conf.py
similarity index 89%
rename from python/docs/conf.py
rename to python/docs/source/conf.py
index 9e7afb7c07298..7b1939d976080 100644
--- a/python/docs/conf.py
+++ b/python/docs/source/conf.py
@@ -14,12 +14,24 @@
import sys
import os
+import shutil
+import errno
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('.'))
+# Remove previously generated rst files. Errors are ignored so that a failed
+# cleanup cannot stop generation of the whole docs.
+shutil.rmtree(
+ "%s/reference/api" % os.path.dirname(os.path.abspath(__file__)), ignore_errors=True)
+try:
+ os.mkdir("%s/reference/api" % os.path.dirname(os.path.abspath(__file__)))
+except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
@@ -32,6 +44,7 @@
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx.ext.mathjax',
+ 'sphinx.ext.autosummary',
]
# Add any paths that contain templates here, relative to this directory.
@@ -47,8 +60,8 @@
master_doc = 'index'
# General information about the project.
-project = u'PySpark'
-copyright = u''
+project = 'PySpark'
+copyright = ''
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -101,12 +114,13 @@
# Look at the first line of the docstring for function and method signatures.
autodoc_docstring_signature = True
+autosummary_generate = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'nature'
+html_theme = 'pydata_sphinx_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -125,7 +139,7 @@
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-html_logo = "../../docs/img/spark-logo-hd.png"
+html_logo = "../../../docs/img/spark-logo-reverse.png"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
@@ -137,6 +151,10 @@
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
+html_css_files = [
+ 'css/pyspark.css',
+]
+
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
@@ -204,8 +222,8 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- ('index', 'pyspark.tex', u'pyspark Documentation',
- u'Author', 'manual'),
+ ('index', 'pyspark.tex', 'pyspark Documentation',
+ 'Author', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
@@ -234,8 +252,8 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- ('index', 'pyspark', u'pyspark Documentation',
- [u'Author'], 1)
+ ('index', 'pyspark', 'pyspark Documentation',
+ ['Author'], 1)
]
# If true, show URL addresses after external links.
@@ -248,8 +266,8 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- ('index', 'pyspark', u'pyspark Documentation',
- u'Author', 'pyspark', 'One line description of project.',
+ ('index', 'pyspark', 'pyspark Documentation',
+ 'Author', 'pyspark', 'One line description of project.',
'Miscellaneous'),
]
@@ -269,13 +287,13 @@
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
-epub_title = u'pyspark'
-epub_author = u'Author'
-epub_publisher = u'Author'
-epub_copyright = u'2014, Author'
+epub_title = 'pyspark'
+epub_author = 'Author'
+epub_publisher = 'Author'
+epub_copyright = '2014, Author'
# The basename for the epub file. It defaults to the project name.
-#epub_basename = u'pyspark'
+#epub_basename = 'pyspark'
# The HTML theme for the epub output. Since the default themes are not optimized
# for small screen space, using the same theme for HTML and epub output is
@@ -335,7 +353,8 @@
# If false, no index is generated.
#epub_use_index = True
def setup(app):
- app.add_javascript('copybutton.js')
+ # app.add_javascript() was deprecated in Sphinx 1.8 in favor of add_js_file().
+ getattr(app, "add_js_file", getattr(app, "add_javascript"))('copybutton.js')
# Skip sample endpoint link (not expected to resolve)
linkcheck_ignore = [r'https://kinesis.us-east-1.amazonaws.com']
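
The `setup()` hook now resolves the script-registration method dynamically because Sphinx 1.8 renamed `add_javascript()` to `add_js_file()`. A minimal sketch of that compatibility shim in isolation (`add_script` is our name, for illustration):

```python
# Minimal sketch of the shim in setup() above: prefer the modern
# add_js_file() API and fall back to the deprecated add_javascript()
# on Sphinx versions older than 1.8.
def add_script(app, filename):
    add = getattr(app, "add_js_file", None) or getattr(app, "add_javascript")
    add(filename)

def setup(app):
    add_script(app, 'copybutton.js')
```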
diff --git a/python/docs/source/development/index.rst b/python/docs/source/development/index.rst
new file mode 100644
index 0000000000000..db9f667332635
--- /dev/null
+++ b/python/docs/source/development/index.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+===========
+Development
+===========
+
diff --git a/python/docs/source/getting_started/index.rst b/python/docs/source/getting_started/index.rst
new file mode 100644
index 0000000000000..457368c8194cb
--- /dev/null
+++ b/python/docs/source/getting_started/index.rst
@@ -0,0 +1,22 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+===============
+Getting Started
+===============
+
diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst
new file mode 100644
index 0000000000000..34011ec7c5573
--- /dev/null
+++ b/python/docs/source/index.rst
@@ -0,0 +1,32 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+.. PySpark documentation master file
+
+=====================
+PySpark Documentation
+=====================
+
+.. toctree::
+ :maxdepth: 2
+
+ getting_started/index
+ user_guide/index
+ reference/index
+ development/index
+ migration_guide/index
+
diff --git a/python/docs/source/migration_guide/index.rst b/python/docs/source/migration_guide/index.rst
new file mode 100644
index 0000000000000..fc12668f81a58
--- /dev/null
+++ b/python/docs/source/migration_guide/index.rst
@@ -0,0 +1,22 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+===============
+Migration Guide
+===============
+
diff --git a/python/docs/source/reference/index.rst b/python/docs/source/reference/index.rst
new file mode 100644
index 0000000000000..77f17da720dd5
--- /dev/null
+++ b/python/docs/source/reference/index.rst
@@ -0,0 +1,34 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+=============
+API Reference
+=============
+
+This page gives an overview of all public PySpark modules, classes, functions and methods.
+
+.. toctree::
+ :maxdepth: 2
+
+ pyspark.sql
+ pyspark.ss
+ pyspark.ml
+ pyspark.streaming
+ pyspark.mllib
+ pyspark
+ pyspark.resource
diff --git a/python/docs/source/reference/pyspark.ml.rst b/python/docs/source/reference/pyspark.ml.rst
new file mode 100644
index 0000000000000..b6e7d10276603
--- /dev/null
+++ b/python/docs/source/reference/pyspark.ml.rst
@@ -0,0 +1,363 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+ML
+==
+
+ML Pipeline APIs
+----------------
+
+.. currentmodule:: pyspark.ml
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Transformer
+ UnaryTransformer
+ Estimator
+ Model
+ Predictor
+ PredictionModel
+ Pipeline
+ PipelineModel
+
+
+Parameters
+----------
+
+.. currentmodule:: pyspark.ml.param
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Param
+ Params
+ TypeConverters
+
+
+Feature
+-------
+
+.. currentmodule:: pyspark.ml.feature
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ ANOVASelector
+ ANOVASelectorModel
+ Binarizer
+ BucketedRandomProjectionLSH
+ BucketedRandomProjectionLSHModel
+ Bucketizer
+ ChiSqSelector
+ ChiSqSelectorModel
+ CountVectorizer
+ CountVectorizerModel
+ DCT
+ ElementwiseProduct
+ FeatureHasher
+ FValueSelector
+ FValueSelectorModel
+ HashingTF
+ IDF
+ IDFModel
+ Imputer
+ ImputerModel
+ IndexToString
+ Interaction
+ MaxAbsScaler
+ MaxAbsScalerModel
+ MinHashLSH
+ MinHashLSHModel
+ MinMaxScaler
+ MinMaxScalerModel
+ NGram
+ Normalizer
+ OneHotEncoder
+ OneHotEncoderModel
+ PCA
+ PCAModel
+ PolynomialExpansion
+ QuantileDiscretizer
+ RobustScaler
+ RobustScalerModel
+ RegexTokenizer
+ RFormula
+ RFormulaModel
+ SQLTransformer
+ StandardScaler
+ StandardScalerModel
+ StopWordsRemover
+ StringIndexer
+ StringIndexerModel
+ Tokenizer
+ VarianceThresholdSelector
+ VarianceThresholdSelectorModel
+ VectorAssembler
+ VectorIndexer
+ VectorIndexerModel
+ VectorSizeHint
+ VectorSlicer
+ Word2Vec
+ Word2VecModel
+
+
+Classification
+--------------
+
+.. currentmodule:: pyspark.ml.classification
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ LinearSVC
+ LinearSVCModel
+ LinearSVCSummary
+ LinearSVCTrainingSummary
+ LogisticRegression
+ LogisticRegressionModel
+ LogisticRegressionSummary
+ LogisticRegressionTrainingSummary
+ BinaryLogisticRegressionSummary
+ BinaryLogisticRegressionTrainingSummary
+ DecisionTreeClassifier
+ DecisionTreeClassificationModel
+ GBTClassifier
+ GBTClassificationModel
+ RandomForestClassifier
+ RandomForestClassificationModel
+ RandomForestClassificationSummary
+ RandomForestClassificationTrainingSummary
+ BinaryRandomForestClassificationSummary
+ BinaryRandomForestClassificationTrainingSummary
+ NaiveBayes
+ NaiveBayesModel
+ MultilayerPerceptronClassifier
+ MultilayerPerceptronClassificationModel
+ OneVsRest
+ OneVsRestModel
+ FMClassifier
+ FMClassificationModel
+ FMClassificationSummary
+ FMClassificationTrainingSummary
+
+
+Clustering
+----------
+
+.. currentmodule:: pyspark.ml.clustering
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ BisectingKMeans
+ BisectingKMeansModel
+ BisectingKMeansSummary
+ KMeans
+ KMeansModel
+ GaussianMixture
+ GaussianMixtureModel
+ GaussianMixtureSummary
+ LDA
+ LDAModel
+ LocalLDAModel
+ DistributedLDAModel
+ PowerIterationClustering
+
+
+ML Functions
+----------------------------
+
+.. currentmodule:: pyspark.ml.functions
+
+.. autosummary::
+ :toctree: api/
+
+ vector_to_array
+
+
+Vector and Matrix
+-----------------
+
+.. currentmodule:: pyspark.ml.linalg
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Vector
+ DenseVector
+ SparseVector
+ Vectors
+ Matrix
+ DenseMatrix
+ SparseMatrix
+ Matrices
+
+
+Recommendation
+--------------
+
+.. currentmodule:: pyspark.ml.recommendation
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ ALS
+ ALSModel
+
+
+Regression
+----------
+
+.. currentmodule:: pyspark.ml.regression
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ AFTSurvivalRegression
+ AFTSurvivalRegressionModel
+ DecisionTreeRegressor
+ DecisionTreeRegressionModel
+ GBTRegressor
+ GBTRegressionModel
+ GeneralizedLinearRegression
+ GeneralizedLinearRegressionModel
+ GeneralizedLinearRegressionSummary
+ GeneralizedLinearRegressionTrainingSummary
+ IsotonicRegression
+ IsotonicRegressionModel
+ LinearRegression
+ LinearRegressionModel
+ LinearRegressionSummary
+ LinearRegressionTrainingSummary
+ RandomForestRegressor
+ RandomForestRegressionModel
+ FMRegressor
+ FMRegressionModel
+
+
+Statistics
+----------
+
+.. currentmodule:: pyspark.ml.stat
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ ANOVATest
+ ChiSquareTest
+ Correlation
+ FValueTest
+ KolmogorovSmirnovTest
+ MultivariateGaussian
+ Summarizer
+ SummaryBuilder
+
+
+Tuning
+------
+
+.. currentmodule:: pyspark.ml.tuning
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ ParamGridBuilder
+ CrossValidator
+ CrossValidatorModel
+ TrainValidationSplit
+ TrainValidationSplitModel
+
+
+Evaluation
+----------
+
+.. currentmodule:: pyspark.ml.evaluation
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Evaluator
+ BinaryClassificationEvaluator
+ RegressionEvaluator
+ MulticlassClassificationEvaluator
+ MultilabelClassificationEvaluator
+ ClusteringEvaluator
+ RankingEvaluator
+
+
+Frequency Pattern Mining
+----------------------------
+
+.. currentmodule:: pyspark.ml.fpm
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ FPGrowth
+ FPGrowthModel
+ PrefixSpan
+
+
+Image
+-----
+
+.. currentmodule:: pyspark.ml.image
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ ImageSchema
+ _ImageSchema
+
+
+Utilities
+---------
+
+.. currentmodule:: pyspark.ml.util
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ BaseReadWrite
+ DefaultParamsReadable
+ DefaultParamsReader
+ DefaultParamsWritable
+ DefaultParamsWriter
+ GeneralMLWriter
+ HasTrainingSummary
+ Identifiable
+ MLReadable
+ MLReader
+ MLWritable
+ MLWriter
+
diff --git a/python/docs/source/reference/pyspark.mllib.rst b/python/docs/source/reference/pyspark.mllib.rst
new file mode 100644
index 0000000000000..1251b1df752c7
--- /dev/null
+++ b/python/docs/source/reference/pyspark.mllib.rst
@@ -0,0 +1,253 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+MLlib
+=====
+
+Classification
+--------------
+
+.. currentmodule:: pyspark.mllib.classification
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ LogisticRegressionModel
+ LogisticRegressionWithSGD
+ LogisticRegressionWithLBFGS
+ SVMModel
+ SVMWithSGD
+ NaiveBayesModel
+ NaiveBayes
+ StreamingLogisticRegressionWithSGD
+
+
+Clustering
+----------
+
+.. currentmodule:: pyspark.mllib.clustering
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ BisectingKMeansModel
+ BisectingKMeans
+ KMeansModel
+ KMeans
+ GaussianMixtureModel
+ GaussianMixture
+ PowerIterationClusteringModel
+ PowerIterationClustering
+ StreamingKMeans
+ StreamingKMeansModel
+ LDA
+ LDAModel
+
+
+Evaluation
+----------
+
+.. currentmodule:: pyspark.mllib.evaluation
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ BinaryClassificationMetrics
+ RegressionMetrics
+ MulticlassMetrics
+ RankingMetrics
+
+
+Feature
+-------
+
+.. currentmodule:: pyspark.mllib.feature
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Normalizer
+ StandardScalerModel
+ StandardScaler
+ HashingTF
+ IDFModel
+ IDF
+ Word2Vec
+ Word2VecModel
+ ChiSqSelector
+ ChiSqSelectorModel
+ ElementwiseProduct
+
+
+Frequency Pattern Mining
+------------------------
+
+.. currentmodule:: pyspark.mllib.fpm
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ FPGrowth
+ FPGrowthModel
+ PrefixSpan
+ PrefixSpanModel
+
+
+Vector and Matrix
+-----------------
+
+.. currentmodule:: pyspark.mllib.linalg
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Vector
+ DenseVector
+ SparseVector
+ Vectors
+ Matrix
+ DenseMatrix
+ SparseMatrix
+ Matrices
+ QRDecomposition
+
+
+Distributed Representation
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. currentmodule:: pyspark.mllib.linalg.distributed
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ BlockMatrix
+ CoordinateMatrix
+ DistributedMatrix
+ IndexedRow
+ IndexedRowMatrix
+ MatrixEntry
+ RowMatrix
+ SingularValueDecomposition
+
+
+Random
+------
+
+.. currentmodule:: pyspark.mllib.random
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ RandomRDDs
+
+
+Recommendation
+--------------
+
+.. currentmodule:: pyspark.mllib.recommendation
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ MatrixFactorizationModel
+ ALS
+ Rating
+
+
+Regression
+----------
+
+.. currentmodule:: pyspark.mllib.regression
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ LabeledPoint
+ LinearModel
+ LinearRegressionModel
+ LinearRegressionWithSGD
+ RidgeRegressionModel
+ RidgeRegressionWithSGD
+ LassoModel
+ LassoWithSGD
+ IsotonicRegressionModel
+ IsotonicRegression
+ StreamingLinearAlgorithm
+ StreamingLinearRegressionWithSGD
+
+
+Statistics
+----------
+
+.. currentmodule:: pyspark.mllib.stat
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ Statistics
+ MultivariateStatisticalSummary
+ ChiSqTestResult
+ MultivariateGaussian
+ KernelDensity
+
+
+Tree
+----
+
+.. currentmodule:: pyspark.mllib.tree
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ DecisionTreeModel
+ DecisionTree
+ RandomForestModel
+ RandomForest
+ GradientBoostedTreesModel
+ GradientBoostedTrees
+
+
+Utilities
+---------
+
+.. currentmodule:: pyspark.mllib.util
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ JavaLoader
+ JavaSaveable
+ LinearDataGenerator
+ Loader
+ MLUtils
+ Saveable
+
diff --git a/python/docs/source/reference/pyspark.resource.rst b/python/docs/source/reference/pyspark.resource.rst
new file mode 100644
index 0000000000000..a1d885c44c480
--- /dev/null
+++ b/python/docs/source/reference/pyspark.resource.rst
@@ -0,0 +1,38 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+===================
+Resource Management
+===================
+
+Core Classes
+------------
+
+.. currentmodule:: pyspark.resource
+
+.. autosummary::
+ :toctree: api/
+
+ ResourceInformation
+ ResourceProfile
+ ResourceProfileBuilder
+ ExecutorResourceRequest
+ ExecutorResourceRequests
+ TaskResourceRequest
+ TaskResourceRequests
+
diff --git a/python/docs/source/reference/pyspark.rst b/python/docs/source/reference/pyspark.rst
new file mode 100644
index 0000000000000..b50ae37b99690
--- /dev/null
+++ b/python/docs/source/reference/pyspark.rst
@@ -0,0 +1,275 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+==========
+Spark Core
+==========
+
+Public Classes
+--------------
+
+.. currentmodule:: pyspark
+
+.. autosummary::
+ :toctree: api/
+
+ SparkContext
+ RDD
+ Broadcast
+ Accumulator
+ SparkConf
+ SparkFiles
+ StorageLevel
+ TaskContext
+ RDDBarrier
+ BarrierTaskContext
+ BarrierTaskInfo
+
+
+Spark Context APIs
+------------------
+
+.. currentmodule:: pyspark
+
+.. autosummary::
+ :toctree: api/
+
+ SparkContext.PACKAGE_EXTENSIONS
+ SparkContext.accumulator
+ SparkContext.addFile
+ SparkContext.addPyFile
+ SparkContext.applicationId
+ SparkContext.binaryFiles
+ SparkContext.binaryRecords
+ SparkContext.broadcast
+ SparkContext.cancelAllJobs
+ SparkContext.cancelJobGroup
+ SparkContext.defaultMinPartitions
+ SparkContext.defaultParallelism
+ SparkContext.dump_profiles
+ SparkContext.emptyRDD
+ SparkContext.getConf
+ SparkContext.getLocalProperty
+ SparkContext.getOrCreate
+ SparkContext.hadoopFile
+ SparkContext.hadoopRDD
+ SparkContext.newAPIHadoopFile
+ SparkContext.newAPIHadoopRDD
+ SparkContext.parallelize
+ SparkContext.pickleFile
+ SparkContext.range
+ SparkContext.resources
+ SparkContext.runJob
+ SparkContext.sequenceFile
+ SparkContext.setCheckpointDir
+ SparkContext.setJobDescription
+ SparkContext.setJobGroup
+ SparkContext.setLocalProperty
+ SparkContext.setLogLevel
+ SparkContext.setSystemProperty
+ SparkContext.show_profiles
+ SparkContext.sparkUser
+ SparkContext.startTime
+ SparkContext.statusTracker
+ SparkContext.stop
+ SparkContext.textFile
+ SparkContext.uiWebUrl
+ SparkContext.union
+ SparkContext.version
+ SparkContext.wholeTextFiles
+
+
+RDD APIs
+--------
+
+.. currentmodule:: pyspark
+
+.. autosummary::
+ :toctree: api/
+
+ RDD.aggregate
+ RDD.aggregateByKey
+ RDD.barrier
+ RDD.cache
+ RDD.cartesian
+ RDD.checkpoint
+ RDD.coalesce
+ RDD.cogroup
+ RDD.collect
+ RDD.collectAsMap
+ RDD.collectWithJobGroup
+ RDD.combineByKey
+ RDD.context
+ RDD.count
+ RDD.countApprox
+ RDD.countApproxDistinct
+ RDD.countByKey
+ RDD.countByValue
+ RDD.distinct
+ RDD.filter
+ RDD.first
+ RDD.flatMap
+ RDD.flatMapValues
+ RDD.fold
+ RDD.foldByKey
+ RDD.foreach
+ RDD.foreachPartition
+ RDD.fullOuterJoin
+ RDD.getCheckpointFile
+ RDD.getNumPartitions
+ RDD.getResourceProfile
+ RDD.getStorageLevel
+ RDD.glom
+ RDD.groupBy
+ RDD.groupByKey
+ RDD.groupWith
+ RDD.histogram
+ RDD.id
+ RDD.intersection
+ RDD.isCheckpointed
+ RDD.isEmpty
+ RDD.isLocallyCheckpointed
+ RDD.join
+ RDD.keyBy
+ RDD.keys
+ RDD.leftOuterJoin
+ RDD.localCheckpoint
+ RDD.lookup
+ RDD.map
+ RDD.mapPartitions
+ RDD.mapPartitionsWithIndex
+ RDD.mapPartitionsWithSplit
+ RDD.mapValues
+ RDD.max
+ RDD.mean
+ RDD.meanApprox
+ RDD.min
+ RDD.name
+ RDD.partitionBy
+ RDD.persist
+ RDD.pipe
+ RDD.randomSplit
+ RDD.reduce
+ RDD.reduceByKey
+ RDD.reduceByKeyLocally
+ RDD.repartition
+ RDD.repartitionAndSortWithinPartitions
+ RDD.rightOuterJoin
+ RDD.sample
+ RDD.sampleByKey
+ RDD.sampleStdev
+ RDD.sampleVariance
+ RDD.saveAsHadoopDataset
+ RDD.saveAsHadoopFile
+ RDD.saveAsNewAPIHadoopDataset
+ RDD.saveAsNewAPIHadoopFile
+ RDD.saveAsPickleFile
+ RDD.saveAsSequenceFile
+ RDD.saveAsTextFile
+ RDD.setName
+ RDD.sortBy
+ RDD.sortByKey
+ RDD.stats
+ RDD.stdev
+ RDD.subtract
+ RDD.subtractByKey
+ RDD.sum
+ RDD.sumApprox
+ RDD.take
+ RDD.takeOrdered
+ RDD.takeSample
+ RDD.toDebugString
+ RDD.toLocalIterator
+ RDD.top
+ RDD.treeAggregate
+ RDD.treeReduce
+ RDD.union
+ RDD.unpersist
+ RDD.values
+ RDD.variance
+ RDD.withResources
+ RDD.zip
+ RDD.zipWithIndex
+ RDD.zipWithUniqueId
+
+
+Broadcast and Accumulator
+-------------------------
+
+.. currentmodule:: pyspark
+
+.. autosummary::
+ :toctree: api/
+
+ Broadcast.destroy
+ Broadcast.dump
+ Broadcast.load
+ Broadcast.load_from_path
+ Broadcast.unpersist
+ Broadcast.value
+ Accumulator.add
+ Accumulator.value
+
+
+Management
+----------
+
+.. currentmodule:: pyspark
+
+.. autosummary::
+ :toctree: api/
+
+ SparkConf.contains
+ SparkConf.get
+ SparkConf.getAll
+ SparkConf.set
+ SparkConf.setAll
+ SparkConf.setAppName
+ SparkConf.setExecutorEnv
+ SparkConf.setIfMissing
+ SparkConf.setMaster
+ SparkConf.setSparkHome
+ SparkConf.toDebugString
+ SparkFiles.get
+ SparkFiles.getRootDirectory
+ StorageLevel.DISK_ONLY
+ StorageLevel.DISK_ONLY_2
+ StorageLevel.MEMORY_AND_DISK
+ StorageLevel.MEMORY_AND_DISK_2
+ StorageLevel.MEMORY_ONLY
+ StorageLevel.MEMORY_ONLY_2
+ StorageLevel.OFF_HEAP
+ TaskContext.attemptNumber
+ TaskContext.get
+ TaskContext.getLocalProperty
+ TaskContext.partitionId
+ TaskContext.resources
+ TaskContext.stageId
+ TaskContext.taskAttemptId
+ RDDBarrier.mapPartitions
+ RDDBarrier.mapPartitionsWithIndex
+ BarrierTaskContext.allGather
+ BarrierTaskContext.attemptNumber
+ BarrierTaskContext.barrier
+ BarrierTaskContext.get
+ BarrierTaskContext.getLocalProperty
+ BarrierTaskContext.getTaskInfos
+ BarrierTaskContext.partitionId
+ BarrierTaskContext.resources
+ BarrierTaskContext.stageId
+ BarrierTaskContext.taskAttemptId
diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst
new file mode 100644
index 0000000000000..7e0357cf9d858
--- /dev/null
+++ b/python/docs/source/reference/pyspark.sql.rst
@@ -0,0 +1,542 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+=========
+Spark SQL
+=========
+
+Core Classes
+------------
+
+.. currentmodule:: pyspark.sql
+
+.. autosummary::
+ :toctree: api/
+
+ SparkSession
+ DataFrame
+ Column
+ Row
+ GroupedData
+ DataFrameNaFunctions
+ DataFrameStatFunctions
+ Window
+
+
+Spark Session APIs
+------------------
+
+.. currentmodule:: pyspark.sql
+
+The entry point to programming Spark with the Dataset and DataFrame API.
+To create a Spark session, you should use ``SparkSession.builder`` attribute.
+See also :class:`SparkSession`.
+
+.. autosummary::
+ :toctree: api/
+
+ SparkSession.builder.appName
+ SparkSession.builder.config
+ SparkSession.builder.enableHiveSupport
+ SparkSession.builder.getOrCreate
+ SparkSession.builder.master
+ SparkSession.catalog
+ SparkSession.conf
+ SparkSession.createDataFrame
+ SparkSession.getActiveSession
+ SparkSession.newSession
+ SparkSession.range
+ SparkSession.read
+ SparkSession.readStream
+ SparkSession.sparkContext
+ SparkSession.sql
+ SparkSession.stop
+ SparkSession.streams
+ SparkSession.table
+ SparkSession.udf
+ SparkSession.version
+
+
+Input and Output
+----------------
+
+.. currentmodule:: pyspark.sql
+
+.. autosummary::
+ :toctree: api/
+
+ DataFrameReader.csv
+ DataFrameReader.format
+ DataFrameReader.jdbc
+ DataFrameReader.json
+ DataFrameReader.load
+ DataFrameReader.option
+ DataFrameReader.options
+ DataFrameReader.orc
+ DataFrameReader.parquet
+ DataFrameReader.schema
+ DataFrameReader.table
+ DataFrameWriter.bucketBy
+ DataFrameWriter.csv
+ DataFrameWriter.format
+ DataFrameWriter.insertInto
+ DataFrameWriter.jdbc
+ DataFrameWriter.json
+ DataFrameWriter.mode
+ DataFrameWriter.option
+ DataFrameWriter.options
+ DataFrameWriter.orc
+ DataFrameWriter.parquet
+ DataFrameWriter.partitionBy
+ DataFrameWriter.save
+ DataFrameWriter.saveAsTable
+ DataFrameWriter.sortBy
+ DataFrameWriter.text
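+
+A typical round trip through the reader and writer might look as follows (a
+sketch assuming an active ``spark`` session; the path is illustrative):
+
+.. code-block:: python
+
+    df = spark.range(10)
+    df.write.mode("overwrite").parquet("/tmp/example")
+    spark.read.parquet("/tmp/example").show()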
+
+
+DataFrame APIs
+--------------
+
+.. currentmodule:: pyspark.sql
+
+.. autosummary::
+ :toctree: api/
+
+ DataFrame.agg
+ DataFrame.alias
+ DataFrame.approxQuantile
+ DataFrame.cache
+ DataFrame.checkpoint
+ DataFrame.coalesce
+ DataFrame.colRegex
+ DataFrame.collect
+ DataFrame.columns
+ DataFrame.corr
+ DataFrame.count
+ DataFrame.cov
+ DataFrame.createGlobalTempView
+ DataFrame.createOrReplaceGlobalTempView
+ DataFrame.createOrReplaceTempView
+ DataFrame.createTempView
+ DataFrame.crossJoin
+ DataFrame.crosstab
+ DataFrame.cube
+ DataFrame.describe
+ DataFrame.distinct
+ DataFrame.drop
+ DataFrame.dropDuplicates
+ DataFrame.drop_duplicates
+ DataFrame.dropna
+ DataFrame.dtypes
+ DataFrame.exceptAll
+ DataFrame.explain
+ DataFrame.fillna
+ DataFrame.filter
+ DataFrame.first
+ DataFrame.foreach
+ DataFrame.foreachPartition
+ DataFrame.freqItems
+ DataFrame.groupBy
+ DataFrame.head
+ DataFrame.hint
+ DataFrame.inputFiles
+ DataFrame.intersect
+ DataFrame.intersectAll
+ DataFrame.isLocal
+ DataFrame.isStreaming
+ DataFrame.join
+ DataFrame.limit
+ DataFrame.localCheckpoint
+ DataFrame.mapInPandas
+ DataFrame.na
+ DataFrame.orderBy
+ DataFrame.persist
+ DataFrame.printSchema
+ DataFrame.randomSplit
+ DataFrame.rdd
+ DataFrame.registerTempTable
+ DataFrame.repartition
+ DataFrame.repartitionByRange
+ DataFrame.replace
+ DataFrame.rollup
+ DataFrame.sameSemantics
+ DataFrame.sample
+ DataFrame.sampleBy
+ DataFrame.schema
+ DataFrame.select
+ DataFrame.selectExpr
+ DataFrame.semanticHash
+ DataFrame.show
+ DataFrame.sort
+ DataFrame.sortWithinPartitions
+ DataFrame.stat
+ DataFrame.storageLevel
+ DataFrame.subtract
+ DataFrame.summary
+ DataFrame.tail
+ DataFrame.take
+ DataFrame.toDF
+ DataFrame.toJSON
+ DataFrame.toLocalIterator
+ DataFrame.toPandas
+ DataFrame.transform
+ DataFrame.union
+ DataFrame.unionAll
+ DataFrame.unionByName
+ DataFrame.unpersist
+ DataFrame.where
+ DataFrame.withColumn
+ DataFrame.withColumnRenamed
+ DataFrame.withWatermark
+ DataFrame.write
+ DataFrame.writeStream
+ DataFrame.writeTo
+ DataFrameNaFunctions.drop
+ DataFrameNaFunctions.fill
+ DataFrameNaFunctions.replace
+ DataFrameStatFunctions.approxQuantile
+ DataFrameStatFunctions.corr
+ DataFrameStatFunctions.cov
+ DataFrameStatFunctions.crosstab
+ DataFrameStatFunctions.freqItems
+ DataFrameStatFunctions.sampleBy
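+
+Most of these methods are transformations that compose lazily until an action
+such as ``show`` or ``collect`` runs. A brief sketch (``df`` and its columns
+are illustrative):
+
+.. code-block:: python
+
+    (df.filter(df.age > 21)
+       .select("name", "age")
+       .orderBy(df.age.desc())
+       .show())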
+
+
+Data Types
+----------
+
+.. currentmodule:: pyspark.sql.types
+
+.. autosummary::
+ :template: class_with_docs.rst
+ :toctree: api/
+
+ ArrayType
+ BinaryType
+ BooleanType
+ ByteType
+ DataType
+ DateType
+ DecimalType
+ DoubleType
+ FloatType
+ IntegerType
+ LongType
+ MapType
+ NullType
+ ShortType
+ StringType
+ StructField
+ StructType
+ TimestampType
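+
+A schema is typically assembled from these types by nesting ``StructField``
+entries in a ``StructType`` (a sketch assuming an active ``spark`` session):
+
+.. code-block:: python
+
+    from pyspark.sql.types import (
+        IntegerType, StringType, StructField, StructType)
+
+    schema = StructType([
+        StructField("name", StringType(), nullable=True),
+        StructField("age", IntegerType(), nullable=True),
+    ])
+    df = spark.createDataFrame([("Alice", 2)], schema)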
+
+
+Row
+---
+
+.. currentmodule:: pyspark.sql
+
+.. autosummary::
+ :toctree: api/
+
+ Row.asDict
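+
+For example (a minimal sketch):
+
+.. code-block:: python
+
+    from pyspark.sql import Row
+
+    row = Row(name="Alice", age=2)
+    row.asDict()  # {'name': 'Alice', 'age': 2}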
+
+
+Functions
+---------
+
+.. currentmodule:: pyspark.sql.functions
+
+.. autosummary::
+ :toctree: api/
+
+ abs
+ acos
+ add_months
+ aggregate
+ approxCountDistinct
+ approx_count_distinct
+ array
+ array_contains
+ array_distinct
+ array_except
+ array_intersect
+ array_join
+ array_max
+ array_min
+ array_position
+ array_remove
+ array_repeat
+ array_sort
+ array_union
+ arrays_overlap
+ arrays_zip
+ asc
+ asc_nulls_first
+ asc_nulls_last
+ ascii
+ asin
+ atan
+ atan2
+ avg
+ base64
+ bin
+ bitwiseNOT
+ broadcast
+ bround
+ bucket
+ cbrt
+ ceil
+ coalesce
+ col
+ collect_list
+ collect_set
+ column
+ concat
+ concat_ws
+ conv
+ corr
+ cos
+ cosh
+ count
+ countDistinct
+ covar_pop
+ covar_samp
+ crc32
+ create_map
+ cume_dist
+ current_date
+ current_timestamp
+ date_add
+ date_format
+ date_sub
+ date_trunc
+ datediff
+ dayofmonth
+ dayofweek
+ dayofyear
+ days
+ decode
+ degrees
+ dense_rank
+ desc
+ desc_nulls_first
+ desc_nulls_last
+ element_at
+ encode
+ exists
+ exp
+ explode
+ explode_outer
+ expm1
+ expr
+ factorial
+ filter
+ first
+ flatten
+ floor
+ forall
+ format_number
+ format_string
+ from_csv
+ from_json
+ from_unixtime
+ from_utc_timestamp
+ get_json_object
+ greatest
+ grouping
+ grouping_id
+ hash
+ hex
+ hour
+ hours
+ hypot
+ initcap
+ input_file_name
+ instr
+ isnan
+ isnull
+ json_tuple
+ kurtosis
+ lag
+ last
+ last_day
+ lead
+ least
+ length
+ levenshtein
+ lit
+ locate
+ log
+ log10
+ log1p
+ log2
+ lower
+ lpad
+ ltrim
+ map_concat
+ map_entries
+ map_filter
+ map_from_arrays
+ map_from_entries
+ map_keys
+ map_values
+ map_zip_with
+ max
+ md5
+ mean
+ min
+ minute
+ monotonically_increasing_id
+ month
+ months
+ months_between
+ nanvl
+ next_day
+ ntile
+ overlay
+ pandas_udf
+ percent_rank
+ percentile_approx
+ posexplode
+ posexplode_outer
+ pow
+ quarter
+ radians
+ rand
+ randn
+ rank
+ regexp_extract
+ regexp_replace
+ repeat
+ reverse
+ rint
+ round
+ row_number
+ rpad
+ rtrim
+ schema_of_csv
+ schema_of_json
+ second
+ sequence
+ sha1
+ sha2
+ shiftLeft
+ shiftRight
+ shiftRightUnsigned
+ shuffle
+ signum
+ sin
+ sinh
+ size
+ skewness
+ slice
+ sort_array
+ soundex
+ spark_partition_id
+ split
+ sqrt
+ stddev
+ stddev_pop
+ stddev_samp
+ struct
+ substring
+ substring_index
+ sum
+ sumDistinct
+ tan
+ tanh
+ timestamp_seconds
+ toDegrees
+ toRadians
+ to_csv
+ to_date
+ to_json
+ to_timestamp
+ to_utc_timestamp
+ transform
+ transform_keys
+ transform_values
+ translate
+ trim
+ trunc
+ udf
+ unbase64
+ unhex
+ unix_timestamp
+ upper
+ var_pop
+ var_samp
+ variance
+ weekofyear
+ when
+ window
+ xxhash64
+ year
+ years
+ zip_with
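+
+Functions are usually imported under an alias and combined inside ``select``
+or ``withColumn`` (a sketch; ``df`` and its columns are illustrative):
+
+.. code-block:: python
+
+    from pyspark.sql import functions as F
+
+    df.select(
+        F.upper(F.col("name")).alias("NAME"),
+        F.when(F.col("age") > 21, "adult").otherwise("minor").alias("group"),
+    )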
+
+
+.. currentmodule:: pyspark.sql.avro.functions
+
+.. autosummary::
+ :toctree: api/
+
+ from_avro
+ to_avro
+
+
+Window
+------
+
+.. currentmodule:: pyspark.sql
+
+.. autosummary::
+ :toctree: api/
+
+ Window.currentRow
+ Window.orderBy
+ Window.partitionBy
+ Window.rangeBetween
+ Window.rowsBetween
+ Window.unboundedFollowing
+ Window.unboundedPreceding
+ WindowSpec.orderBy
+ WindowSpec.partitionBy
+ WindowSpec.rangeBetween
+ WindowSpec.rowsBetween
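+
+A window specification is built from :class:`Window` and then passed to a
+function's ``over`` method (a sketch; the column names are illustrative):
+
+.. code-block:: python
+
+    from pyspark.sql import Window
+    from pyspark.sql import functions as F
+
+    w = Window.partitionBy("dept").orderBy(F.col("salary").desc())
+    df.withColumn("rank", F.row_number().over(w))
+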
+
+Grouping
+--------
+
+.. currentmodule:: pyspark.sql
+
+.. autosummary::
+ :toctree: api/
+
+ GroupedData.agg
+ GroupedData.apply
+ GroupedData.applyInPandas
+ GroupedData.avg
+ GroupedData.cogroup
+ GroupedData.count
+ GroupedData.max
+ GroupedData.mean
+ GroupedData.min
+ GroupedData.pivot
+ GroupedData.sum
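+
+For instance (a sketch; ``df`` and its columns are illustrative):
+
+.. code-block:: python
+
+    from pyspark.sql import functions as F
+
+    df.groupBy("dept").agg(F.avg("salary"), F.max("salary"))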
+
diff --git a/python/docs/source/reference/pyspark.ss.rst b/python/docs/source/reference/pyspark.ss.rst
new file mode 100644
index 0000000000000..a7936a4f2a59c
--- /dev/null
+++ b/python/docs/source/reference/pyspark.ss.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+====================
+Structured Streaming
+====================
+
+Core Classes
+------------
+
+.. currentmodule:: pyspark.sql.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ DataStreamReader
+ DataStreamWriter
+ ForeachBatchFunction
+ StreamingQuery
+ StreamingQueryException
+ StreamingQueryManager
+
+Input and Output
+----------------
+
+.. currentmodule:: pyspark.sql.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ DataStreamReader.csv
+ DataStreamReader.format
+ DataStreamReader.json
+ DataStreamReader.load
+ DataStreamReader.option
+ DataStreamReader.options
+ DataStreamReader.orc
+ DataStreamReader.parquet
+ DataStreamReader.schema
+ DataStreamReader.text
+ DataStreamWriter.foreach
+ DataStreamWriter.foreachBatch
+ DataStreamWriter.format
+ DataStreamWriter.option
+ DataStreamWriter.options
+ DataStreamWriter.outputMode
+ DataStreamWriter.partitionBy
+ DataStreamWriter.queryName
+ DataStreamWriter.start
+ DataStreamWriter.trigger
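+
+For instance, the built-in ``rate`` source and ``console`` sink give a quick
+end-to-end sketch (assuming an active ``spark`` session):
+
+.. code-block:: python
+
+    sdf = spark.readStream.format("rate").load()
+    query = (sdf.writeStream
+                .format("console")
+                .outputMode("append")
+                .start())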
+
+Query Management
+----------------
+
+.. currentmodule:: pyspark.sql.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ StreamingQuery.awaitTermination
+ StreamingQuery.exception
+ StreamingQuery.explain
+ StreamingQuery.id
+ StreamingQuery.isActive
+ StreamingQuery.lastProgress
+ StreamingQuery.name
+ StreamingQuery.processAllAvailable
+ StreamingQuery.recentProgress
+ StreamingQuery.runId
+ StreamingQuery.status
+ StreamingQuery.stop
+ StreamingQueryManager.active
+ StreamingQueryManager.awaitAnyTermination
+ StreamingQueryManager.get
+ StreamingQueryManager.resetTerminated
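+
+A running query can be monitored and stopped through these methods (a sketch
+continuing the example above):
+
+.. code-block:: python
+
+    query.awaitTermination(timeout=5)  # block for up to five seconds
+    print(query.status)
+    query.stop()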
diff --git a/python/docs/source/reference/pyspark.streaming.rst b/python/docs/source/reference/pyspark.streaming.rst
new file mode 100644
index 0000000000000..57cbd00b67e4c
--- /dev/null
+++ b/python/docs/source/reference/pyspark.streaming.rst
@@ -0,0 +1,130 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+===============
+Spark Streaming
+===============
+
+Core Classes
+------------
+
+.. currentmodule:: pyspark.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ StreamingContext
+ DStream
+
+
+Streaming Management
+--------------------
+
+.. currentmodule:: pyspark.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ StreamingContext.addStreamingListener
+ StreamingContext.awaitTermination
+ StreamingContext.awaitTerminationOrTimeout
+ StreamingContext.checkpoint
+ StreamingContext.getActive
+ StreamingContext.getActiveOrCreate
+ StreamingContext.getOrCreate
+ StreamingContext.remember
+ StreamingContext.sparkContext
+ StreamingContext.start
+ StreamingContext.stop
+ StreamingContext.transform
+ StreamingContext.union
+
+
+Input and Output
+----------------
+
+.. currentmodule:: pyspark.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ StreamingContext.binaryRecordsStream
+ StreamingContext.queueStream
+ StreamingContext.socketTextStream
+ StreamingContext.textFileStream
+ DStream.pprint
+ DStream.saveAsTextFiles
+
+
+Transformations and Actions
+---------------------------
+
+.. currentmodule:: pyspark.streaming
+
+.. autosummary::
+ :toctree: api/
+
+ DStream.cache
+ DStream.checkpoint
+ DStream.cogroup
+ DStream.combineByKey
+ DStream.context
+ DStream.count
+ DStream.countByValue
+ DStream.countByValueAndWindow
+ DStream.countByWindow
+ DStream.filter
+ DStream.flatMap
+ DStream.flatMapValues
+ DStream.foreachRDD
+ DStream.fullOuterJoin
+ DStream.glom
+ DStream.groupByKey
+ DStream.groupByKeyAndWindow
+ DStream.join
+ DStream.leftOuterJoin
+ DStream.map
+ DStream.mapPartitions
+ DStream.mapPartitionsWithIndex
+ DStream.mapValues
+ DStream.partitionBy
+ DStream.persist
+ DStream.reduce
+ DStream.reduceByKey
+ DStream.reduceByKeyAndWindow
+ DStream.reduceByWindow
+ DStream.repartition
+ DStream.rightOuterJoin
+ DStream.slice
+ DStream.transform
+ DStream.transformWith
+ DStream.union
+ DStream.updateStateByKey
+ DStream.window
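+
+The classic streaming word count combines several of these transformations (a
+sketch assuming an existing ``SparkContext`` ``sc``; the host and port are
+illustrative):
+
+.. code-block:: python
+
+    from pyspark.streaming import StreamingContext
+
+    ssc = StreamingContext(sc, batchDuration=1)
+    lines = ssc.socketTextStream("localhost", 9999)
+    counts = (lines.flatMap(lambda line: line.split())
+                   .map(lambda word: (word, 1))
+                   .reduceByKey(lambda a, b: a + b))
+    counts.pprint()
+    ssc.start()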
+
+
+Kinesis
+-------
+
+.. currentmodule:: pyspark.streaming.kinesis
+
+.. autosummary::
+ :toctree: api/
+
+ KinesisUtils.createStream
+ InitialPositionInStream.LATEST
+ InitialPositionInStream.TRIM_HORIZON
+
diff --git a/python/docs/source/user_guide/index.rst b/python/docs/source/user_guide/index.rst
new file mode 100644
index 0000000000000..e8a8d905f46d1
--- /dev/null
+++ b/python/docs/source/user_guide/index.rst
@@ -0,0 +1,22 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+==========
+User Guide
+==========
+
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3ca4edafa6873..1807df4bacc85 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1196,6 +1196,8 @@ def to_date(col, format=None):
By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
is omitted. Equivalent to ``col.cast("date")``.
+ .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(to_date(df.t).alias('date')).collect()
[Row(date=datetime.date(1997, 2, 28))]
@@ -1219,6 +1221,8 @@ def to_timestamp(col, format=None):
By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format
is omitted. Equivalent to ``col.cast("timestamp")``.
+ .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(to_timestamp(df.t).alias('dt')).collect()
[Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 6925adf567fb6..e5553a8bb162b 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -335,6 +335,9 @@ def parquet(self, *paths, **options):
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+
>>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
@@ -367,6 +370,9 @@ def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None,
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+
>>> df = spark.read.text('python/test_support/sql/text-test.txt')
>>> df.collect()
[Row(value='hello'), Row(value='this')]
@@ -502,6 +508,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+ .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
>>> df = spark.read.csv('python/test_support/sql/ages.csv')
>>> df.dtypes
[('_c0', 'string'), ('_c1', 'string')]
@@ -561,6 +571,9 @@ def orc(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLookup=N
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+
>>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
>>> df.dtypes
[('a', 'bigint'), ('b', 'int'), ('c', 'int')]
@@ -893,6 +906,8 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm
:param ignoreNullFields: Whether to ignore null fields when generating JSON objects.
If None is set, it uses the default value, ``true``.
+ .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
>>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)
@@ -1007,6 +1022,8 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
:param lineSep: defines the line separator that should be used for writing. If None is
set, it uses the default value, ``\\n``. Maximum length is 1 character.
+ .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 5c528c1d54df7..4ec47305aa13e 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -541,6 +541,9 @@ def orc(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLookup=N
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+
>>> orc_sdf = spark.readStream.schema(sdf_schema).orc(tempfile.mkdtemp())
>>> orc_sdf.isStreaming
True
@@ -571,6 +574,9 @@ def parquet(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLook
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+
>>> parquet_sdf = spark.readStream.schema(sdf_schema).parquet(tempfile.mkdtemp())
>>> parquet_sdf.isStreaming
True
@@ -607,6 +613,9 @@ def text(self, path, wholetext=False, lineSep=None, pathGlobFilter=None,
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+
>>> text_sdf = spark.readStream.text(tempfile.mkdtemp())
>>> text_sdf.isStreaming
True
@@ -737,6 +746,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
:param recursiveFileLookup: recursively scan a directory for files. Using this option
disables `partition discovery`_.
+ .. _partition discovery:
+ https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
+ .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
>>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
>>> csv_sdf.isStreaming
True