From 982ce47b51d44d442cd2d5feb62d1639fb4d98fa Mon Sep 17 00:00:00 2001
From: David Toneian
Date: Sun, 17 May 2020 22:15:51 +0200
Subject: [PATCH 1/2] [SPARK-31739][PYSPARK][DOCS][MINOR] Fix docstring syntax issues and misplaced space characters.

---
 python/pyspark/ml/clustering.py  |  2 +-
 python/pyspark/ml/util.py        |  1 +
 python/pyspark/mllib/util.py     |  4 ++--
 python/pyspark/sql/dataframe.py  |  6 +++---
 python/pyspark/sql/readwriter.py | 14 +++++++-------
 python/pyspark/sql/streaming.py  | 26 +++++++++++++-------------
 6 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 8f0e9aad01b53..54a184bc081ee 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -802,7 +802,7 @@ def computeCost(self, dataset):
         Computes the sum of squared distances between the input points and
         their corresponding cluster centers.
 
-        ..note:: Deprecated in 3.0.0. It will be removed in future versions. Use
+        .. note:: Deprecated in 3.0.0. It will be removed in future versions. Use
            ClusteringEvaluator instead. You can also get the cost on the training dataset in the
            summary.
         """
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 35ad5518e1c1f..aac2b38d3f57d 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -563,6 +563,7 @@ def loadParamsInstance(path, sc):
 
 class HasTrainingSummary(object):
     """
     Base class for models that provides Training summary.
+
     .. versionadded:: 3.0.0
     """
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 1a0ce42dc4e4f..f0f9cda4672b1 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -372,7 +372,7 @@ def save(self, sc, path):
         * human-readable (JSON) model metadata to path/metadata/
         * Parquet formatted data to path/data/
 
-        The model may be loaded using py:meth:`Loader.load`.
+        The model may be loaded using :py:meth:`Loader.load`.
 
         :param sc: Spark context used to save model data.
         :param path: Path specifying the directory in which to save
@@ -412,7 +412,7 @@ class Loader(object):
     def load(cls, sc, path):
         """
         Load a model from the given path. The model should have been
-        saved using py:meth:`Saveable.save`.
+        saved using :py:meth:`Saveable.save`.
 
         :param sc: Spark context used for loading model files.
         :param path: Path specifying the directory to which the model
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 78b574685327c..a3ce87096e790 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2138,7 +2138,7 @@ def drop(self, *cols):
 
     @ignore_unicode_prefix
     def toDF(self, *cols):
-        """Returns a new class:`DataFrame` that with new specified column names
+        """Returns a new :class:`DataFrame` that with new specified column names
 
         :param cols: list of new column names (string)
 
@@ -2150,9 +2150,9 @@ def toDF(self, *cols):
     @since(3.0)
     def transform(self, func):
-        """Returns a new class:`DataFrame`. Concise syntax for chaining custom transformations.
+        """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.
 
-        :param func: a function that takes and returns a class:`DataFrame`.
+        :param func: a function that takes and returns a :class:`DataFrame`.
 
         >>> from pyspark.sql.functions import col
         >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 6ad6377288ec5..336345e383729 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -223,15 +223,15 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param mode: allows a mode for dealing with corrupt records during parsing. If None is
                      set, it uses the default value, ``PERMISSIVE``.
 
-                * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \
+                * ``PERMISSIVE``: when it meets a corrupted record, puts the malformed string \
                   into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \
                   fields to ``null``. To keep corrupt records, an user can set a string type \
                   field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \
                   schema does not have the field, it drops corrupt records during parsing. \
                   When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \
                   field in an output schema.
-                * ``DROPMALFORMED`` : ignores the whole corrupted records.
-                * ``FAILFAST`` : throws an exception when it meets corrupted records.
+                * ``DROPMALFORMED``: ignores the whole corrupted records.
+                * ``FAILFAST``: throws an exception when it meets corrupted records.
 
         :param columnNameOfCorruptRecord: allows renaming the new field having malformed string
                                           created by ``PERMISSIVE`` mode. This overrides
@@ -470,7 +470,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                       be controlled by ``spark.sql.csv.parser.columnPruning.enabled``
                       (enabled by default).
 
-                * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \
+                * ``PERMISSIVE``: when it meets a corrupted record, puts the malformed string \
                   into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \
                   fields to ``null``. To keep corrupt records, an user can set a string type \
                   field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \
@@ -479,8 +479,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                   When it meets a record having fewer tokens than the length of the schema, \
                   sets ``null`` to extra fields. When the record has more tokens than the \
                   length of the schema, it drops extra tokens.
-                * ``DROPMALFORMED`` : ignores the whole corrupted records.
-                * ``FAILFAST`` : throws an exception when it meets corrupted records.
+                * ``DROPMALFORMED``: ignores the whole corrupted records.
+                * ``FAILFAST``: throws an exception when it meets corrupted records.
 
         :param columnNameOfCorruptRecord: allows renaming the new field having malformed string
                                           created by ``PERMISSIVE`` mode. This overrides
@@ -830,7 +830,7 @@ def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
     def insertInto(self, tableName, overwrite=None):
         """Inserts the content of the :class:`DataFrame` to the specified table.
 
-        It requires that the schema of the class:`DataFrame` is the same as the
+        It requires that the schema of the :class:`DataFrame` is the same as the
         schema of the table.
 
         Optionally overwriting any existing data.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 05cf331d897a2..2450a4c93c460 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -461,15 +461,15 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param mode: allows a mode for dealing with corrupt records during parsing. If None is
                      set, it uses the default value, ``PERMISSIVE``.
 
-                * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \
+                * ``PERMISSIVE``: when it meets a corrupted record, puts the malformed string \
                   into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \
                   fields to ``null``. To keep corrupt records, an user can set a string type \
                   field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \
                   schema does not have the field, it drops corrupt records during parsing. \
                   When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \
                   field in an output schema.
-                * ``DROPMALFORMED`` : ignores the whole corrupted records.
-                * ``FAILFAST`` : throws an exception when it meets corrupted records.
+                * ``DROPMALFORMED``: ignores the whole corrupted records.
+                * ``FAILFAST``: throws an exception when it meets corrupted records.
 
         :param columnNameOfCorruptRecord: allows renaming the new field having malformed string
                                           created by ``PERMISSIVE`` mode. This overrides
@@ -707,7 +707,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param mode: allows a mode for dealing with corrupt records during parsing. If None is
                      set, it uses the default value, ``PERMISSIVE``.
 
-                * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \
+                * ``PERMISSIVE``: when it meets a corrupted record, puts the malformed string \
                   into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \
                   fields to ``null``. To keep corrupt records, an user can set a string type \
                   field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \
@@ -716,8 +716,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                   When it meets a record having fewer tokens than the length of the schema, \
                   sets ``null`` to extra fields. When the record has more tokens than the \
                   length of the schema, it drops extra tokens.
-                * ``DROPMALFORMED`` : ignores the whole corrupted records.
-                * ``FAILFAST`` : throws an exception when it meets corrupted records.
+                * ``DROPMALFORMED``: ignores the whole corrupted records.
+                * ``FAILFAST``: throws an exception when it meets corrupted records.
 
         :param columnNameOfCorruptRecord: allows renaming the new field having malformed string
                                           created by ``PERMISSIVE`` mode. This overrides
@@ -795,11 +795,11 @@ def outputMode(self, outputMode):
 
         Options include:
 
-        * `append`:Only the new rows in the streaming DataFrame/Dataset will be written to
+        * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to
           the sink
-        * `complete`:All the rows in the streaming DataFrame/Dataset will be written to the sink
+        * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the sink
           every time these is some updates
-        * `update`:only the rows that were updated in the streaming DataFrame/Dataset will be
+        * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
           written to the sink every time there are some updates. If the query doesn't contain
           aggregations, it will be equivalent to `append` mode.
 
@@ -1170,11 +1170,11 @@ def start(self, path=None, format=None, outputMode=None, partitionBy=None, query
         :param outputMode: specifies how data of a streaming DataFrame/Dataset is written to a
                            streaming sink.
 
-        * `append`:Only the new rows in the streaming DataFrame/Dataset will be written to the
+        * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to the
           sink
-        * `complete`:All the rows in the streaming DataFrame/Dataset will be written to the sink
-          every time these is some updates
-        * `update`:only the rows that were updated in the streaming DataFrame/Dataset will be
+        * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the
+          sink every time these is some updates
+        * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
           written to the sink every time there are some updates. If the query doesn't contain
           aggregations, it will be equivalent to `append` mode.
         :param partitionBy: names of partitioning columns

From 239bf9c0d1ee20deb1705ce4c0658b64596a8e1e Mon Sep 17 00:00:00 2001
From: David Toneian
Date: Mon, 18 May 2020 12:54:26 +0200
Subject: [PATCH 2/2] [SPARK-31739][PYSPARK][DOCS][MINOR] Additional fixes of docstring syntax issues.

---
 python/pyspark/ml/classification.py | 2 +-
 python/pyspark/ml/regression.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 3bc862cc42af9..c43a79fac3312 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -197,7 +197,7 @@ class _JavaClassificationModel(ClassificationModel, JavaPredictionModel):
     """
     Java Model produced by a ``Classifier``.
     Classes are indexed {0, 1, ..., numClasses - 1}.
-    To be mixed in with class:`pyspark.ml.JavaModel`
+    To be mixed in with :class:`pyspark.ml.JavaModel`
     """
 
     @property
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 2ce467308e6cb..b58255ea12afc 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -77,7 +77,7 @@ class _JavaRegressor(Regressor, JavaPredictor):
 class _JavaRegressionModel(RegressionModel, JavaPredictionModel):
     """
     Java Model produced by a ``_JavaRegressor``.
-    To be mixed in with class:`pyspark.ml.JavaModel`
+    To be mixed in with :class:`pyspark.ml.JavaModel`
 
     .. versionadded:: 3.0.0
     """
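
For reviewers who do not often touch Sphinx/reST markup, the short Python sketch below illustrates the two kinds of fix these patches make: a directive needs a space after the two leading dots (".. note::", not "..note::"), and cross-reference roles need their leading colon (":class:`...`", ":py:meth:`...`"). The sketch is illustrative only and is not part of either patch; the ExampleModel class and its methods are hypothetical.

# Illustrative sketch only -- not part of either patch; names are hypothetical.
class ExampleModel(object):
    """A hypothetical model used only to demonstrate the corrected docstring syntax.

    .. note:: Deprecated in 3.0.0. Use :class:`pyspark.ml.evaluation.ClusteringEvaluator`
        instead.

    .. versionadded:: 3.0.0
    """

    def save(self, sc, path):
        """Saves this model; it can be read back with :py:meth:`ExampleModel.load`.

        :param sc: Spark context used to save model data.
        :param path: Path specifying the directory in which to save this model.
        """
        raise NotImplementedError("illustrative sketch only")

    @classmethod
    def load(cls, sc, path):
        """Loads a model previously written by :py:meth:`ExampleModel.save`."""
        raise NotImplementedError("illustrative sketch only")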