Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions dev/create-release/generate-contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
print("Release tag: %s" % RELEASE_TAG)
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
print("Number of commits in this range: %s" % len(new_commits))
print
print("")
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Works in Python 2 and 3



def print_indented(_list):
Expand All @@ -88,10 +88,10 @@ def print_indented(_list):


def is_release(commit_title):
return re.findall("\[release\]", commit_title.lower()) or \
"preparing spark release" in commit_title.lower() or \
"preparing development version" in commit_title.lower() or \
"CHANGES.txt" in commit_title
return ("[release]" in commit_title.lower() or
"preparing spark release" in commit_title.lower() or
"preparing development version" in commit_title.lower() or
"CHANGES.txt" in commit_title)


def is_maintenance(commit_title):
Expand Down
2 changes: 1 addition & 1 deletion dev/create-release/releaseutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
# Parse components in the commit message
# The returned components are already filtered and translated
def find_components(commit, commit_hash):
components = re.findall("\[\w*\]", commit.lower())
components = re.findall(r"\[\w*\]", commit.lower())
components = [translate_component(c, commit_hash)
for c in components if c in known_components]
return components
Expand Down
4 changes: 2 additions & 2 deletions dev/merge_spark_pr.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
versions = sorted(versions, key=lambda x: x.name, reverse=True)
versions = filter(lambda x: x.raw['released'] is False, versions)
# Consider only x.y.z versions
versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)

default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
for v in default_fix_versions:
Expand Down Expand Up @@ -403,7 +403,7 @@ def standardize_jira_ref(text):

# Extract spark component(s):
# Look for alphanumeric chars, spaces, dashes, periods, and/or commas
pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two issues: the `\.` escape is unnecessary inside a character class, and `,-.` is misleading because it denotes a range, not three separate characters in the class. As it happens, the ASCII range from comma to period covers exactly those three characters!

for component in pattern.findall(text):
components.append(component.upper())
text = text.replace(component, '')
Expand Down
3 changes: 2 additions & 1 deletion dev/run-tests-jenkins.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ def run_tests(tests_timeout):
os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()

failure_note_by_errcode = {
1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures
# error to denote run-tests script failures:
1: 'executing the `dev/run-tests` script', # noqa: W605
ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',
Expand Down
2 changes: 1 addition & 1 deletion dev/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def determine_java_version(java_exe):
# find raw version string, eg 'java version "1.8.0_25"'
raw_version_str = next(x for x in raw_output_lines if " version " in x)

match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)

major = int(match.group(1))
minor = int(match.group(2))
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,8 +773,8 @@ def roc(self):
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.

.. seealso:: `Wikipedia reference \
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. seealso:: `Wikipedia reference
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LogisticRegression.weightCol`. This will change in later Spark
Expand Down
16 changes: 8 additions & 8 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
.. note:: Experimental

Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
<a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the abstract:
`Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
PIC finds a very low-dimensional embedding of a dataset using truncated power
iteration on a normalized pair-wise similarity matrix of the data.

This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
to run the PowerIterationClustering algorithm.

.. seealso:: `Wikipedia on Spectral clustering \
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
.. seealso:: `Wikipedia on Spectral clustering
<http://en.wikipedia.org/wiki/Spectral_clustering>`_

>>> data = [(1, 0, 0.5), \
(2, 0, 0.5), (2, 1, 0.7), \
(3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
(4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
(5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> data = [(1, 0, 0.5),
... (2, 0, 0.5), (2, 1, 0.7),
... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
>>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
>>> assignments = pic.assignClusters(df)
Expand Down
16 changes: 8 additions & 8 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
distance space. The output will be vectors of configurable dimension. Hash values in the same
dimension are calculated by the same hash function.

.. seealso:: `Stable Distributions \
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Stable Distributions
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_

>>> from pyspark.ml.linalg import Vectors
Expand Down Expand Up @@ -303,7 +303,7 @@ def _create_model(self, java_model):


class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few docstrings have backslash or backticks in them. This should make sure they don't have surprising effects some day.

.. note:: Experimental

Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
Expand Down Expand Up @@ -653,8 +653,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).

.. seealso:: `More information on Wikipedia \
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
.. seealso:: `More information on Wikipedia
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.

>>> from pyspark.ml.linalg import Vectors
>>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
Expand Down Expand Up @@ -1353,7 +1353,7 @@ def _create_model(self, java_model):


class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
.. note:: Experimental

Model produced by :py:class:`MinHashLSH`, where multiple hash functions are stored. Each
Expand All @@ -1362,8 +1362,8 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
:math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
independent according to the reference.

.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
permutations." Electronic Journal of Combinatorics 7 (2000): R26.

.. versionadded:: 2.2.0
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/fpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
HasMinSupport, HasNumPartitions, HasMinConfidence,
JavaMLWritable, JavaMLReadable):

"""
r"""
.. note:: Experimental

A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
Expand Down
20 changes: 10 additions & 10 deletions python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ def intercept(self):
@property
@since("2.3.0")
def scale(self):
"""
The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
r"""
The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
"""
return self._call_java("scale")

Expand Down Expand Up @@ -279,12 +279,12 @@ def featuresCol(self):
@property
@since("2.0.0")
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`

.. seealso:: `Wikipedia explain variation \
<http://en.wikipedia.org/wiki/Explained_variation>`_
.. seealso:: `Wikipedia explain variation
<http://en.wikipedia.org/wiki/Explained_variation>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
Expand Down Expand Up @@ -339,8 +339,8 @@ def r2(self):
"""
Returns R^2, the coefficient of determination.

.. seealso:: `Wikipedia coefficient of determination \
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
.. seealso:: `Wikipedia coefficient of determination
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
Expand All @@ -354,8 +354,8 @@ def r2adj(self):
"""
Returns Adjusted R^2, the adjusted coefficient of determination.

.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark versions.
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ class PowerIterationClustering(object):
@classmethod
@since('1.5.0')
def train(cls, rdd, k, maxIterations=100, initMode="random"):
"""
r"""
:param rdd:
An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
affinity matrix, which is the matrix A in the PIC paper. The
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/mllib/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ def __init__(self, predictionAndObservations):
@property
@since('1.4.0')
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
"""
return self.call("explainedVariance")

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def transform(self, vector):


class Normalizer(VectorTransformer):
"""
r"""
Normalizes samples individually to unit L\ :sup:`p`\ norm

For any 1 <= `p` < float('inf'), normalizes samples using
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,7 +2399,7 @@ def barrier(self):
:return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.

.. seealso:: :class:`BarrierTaskContext`
.. seealso:: `SPIP: Barrier Execution Mode \
.. seealso:: `SPIP: Barrier Execution Mode
<http://jira.apache.org/jira/browse/SPARK-24374>`_
.. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
sqlContext = spark._wrapped
sqlCtx = sqlContext

print("""Welcome to
print(r"""Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
Expand Down
14 changes: 8 additions & 6 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):

@since(2.1)
def approx_count_distinct(col, rsd=None):
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line too long

column `col`.

:param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
efficient to use :func:`countDistinct`
Expand Down Expand Up @@ -346,7 +347,8 @@ def coalesce(*cols):

@since(1.6)
def corr(col1, col2):
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
and ``col2``.

>>> a = range(20)
>>> b = [2 * x for x in range(20)]
Expand Down Expand Up @@ -1688,14 +1690,14 @@ def split(str, pattern):
@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
"""Extract a specific group matched by a Java regex, from the specified string column.
r"""Extract a specific group matched by a Java regex, from the specified string column.
If the regex did not match, or the specified group did not match, an empty string is returned.

>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
[Row(d=u'')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
Expand All @@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
"""Replace all substrings of the specified string value that match regexp with rep.

>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
>>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
[Row(d=u'-----')]
"""
sc = SparkContext._active_spark_context
Expand Down
12 changes: 6 additions & 6 deletions python/pyspark/sql/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
samplingRatio=None, enforceSchema=None, emptyValue=None):
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file and returns the result as a :class:`DataFrame`.

This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable
Expand Down Expand Up @@ -519,8 +519,8 @@ def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPar

If both ``column`` and ``predicates`` are specified, ``column`` will be used.

.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.

:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: the name of the table
Expand Down Expand Up @@ -862,7 +862,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.

:param path: the path in any Hadoop supported file system
:param mode: specifies the behavior of the save operation when data already exists.
Expand Down Expand Up @@ -962,8 +962,8 @@ def orc(self, path, mode=None, partitionBy=None, compression=None):
def jdbc(self, url, table, mode=None, properties=None):
"""Saves the content of the :class:`DataFrame` to an external database table via JDBC.

.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.

:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: Name of the table in the external database.
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
enforceSchema=None, emptyValue=None):
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.

This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ def __eq__(self, other):
for v in [ArrayType, MapType, StructType])


_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")


def _parse_datatype_string(s):
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/storagelevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def __str__(self):
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)

"""
.. note:: The following four storage level constants are deprecated in 2.0, since the records \
will always be serialized in Python.
.. note:: The following four storage level constants are deprecated in 2.0, since the records
will always be serialized in Python.
"""
StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def majorMinorVersion(sparkVersion):
(2, 3)

"""
m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
if m is not None:
return (int(m.group(1)), int(m.group(2)))
else:
Expand Down
Loading