diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
index 131d81c8a75cf..d9135173419ae 100755
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@@ -67,7 +67,7 @@
 print("Release tag: %s" % RELEASE_TAG)
 print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
 print("Number of commits in this range: %s" % len(new_commits))
-print
+print("")


 def print_indented(_list):
@@ -88,10 +88,10 @@ def print_indented(_list):


 def is_release(commit_title):
-    return re.findall("\[release\]", commit_title.lower()) or \
-        "preparing spark release" in commit_title.lower() or \
-        "preparing development version" in commit_title.lower() or \
-        "CHANGES.txt" in commit_title
+    return ("[release]" in commit_title.lower() or
+            "preparing spark release" in commit_title.lower() or
+            "preparing development version" in commit_title.lower() or
+            "CHANGES.txt" in commit_title)


 def is_maintenance(commit_title):
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 8cc990d871842..f273b337fdb4e 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
 # Parse components in the commit message
 # The returned components are already filtered and translated
 def find_components(commit, commit_hash):
-    components = re.findall("\[\w*\]", commit.lower())
+    components = re.findall(r"\[\w*\]", commit.lower())
     components = [translate_component(c, commit_hash)
                   for c in components if c in known_components]
     return components
diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index 81daa909e019c..cca6f405e89ac 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
     versions = sorted(versions, key=lambda x: x.name, reverse=True)
     versions = filter(lambda x: x.raw['released'] is False, versions)
     # Consider only x.y.z versions
-    versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
+    versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)

     default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
     for v in default_fix_versions:
@@ -403,7 +403,7 @@ def standardize_jira_ref(text):

     # Extract spark component(s):
     # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
-    pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
+    pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
     for component in pattern.findall(text):
         components.append(component.upper())
         text = text.replace(component, '')
diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py
index e6fe3b82ed202..6e943898ffed9 100755
--- a/dev/run-tests-jenkins.py
+++ b/dev/run-tests-jenkins.py
@@ -115,7 +115,8 @@ def run_tests(tests_timeout):
                              os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()

     failure_note_by_errcode = {
-        1: 'executing the `dev/run-tests` script',  # error to denote run-tests script failures
+        # error to denote run-tests script failures:
+        1: 'executing the `dev/run-tests` script',  # noqa: W605
         ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
         ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
         ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',
diff --git a/dev/run-tests.py b/dev/run-tests.py
index d9d3789ac1255..f534637b80d6b 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -169,7 +169,7 @@ def determine_java_version(java_exe):

     # find raw version string, eg 'java version "1.8.0_25"'
     raw_version_str = next(x for x in raw_output_lines if " version " in x)
-    match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
+    match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)

     major = int(match.group(1))
     minor = int(match.group(2))
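Every hunk in the dev/ tools above is the same class of fix: a regex pattern written as a plain string literal becomes a raw string (or, in generate-contributors.py, is replaced by a plain substring test). Since Python 3.6, an unrecognized escape such as \d or \[ in a non-raw literal emits a DeprecationWarning, which pycodestyle reports as W605, and is slated to become a syntax error. A minimal standalone sketch, not part of the patch, showing the warning and confirming the raw-string rewrite is behavior-preserving:

import re
import warnings

# Compiling source that contains "\[" in a plain string literal triggers the
# escape-sequence warning (DeprecationWarning on 3.6-3.11, SyntaxWarning on
# newer interpreters). The source text is built with "\\[" so that this
# snippet itself stays warning-free.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile('pattern = "\\["', "<demo>", "exec")
assert any("invalid escape sequence" in str(w.message) for w in caught)

# The rewrite is purely cosmetic at runtime: both spellings denote the same
# two-character string, so the compiled regex is identical.
assert "\\[" == r"\["
assert re.findall(r"\[\w*\]", "[core] improve docs") == ["[core]"]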
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index d5963f4f7042c..ce028512357f2 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -773,8 +773,8 @@ def roc(self):
         which is a Dataframe having two fields (FPR, TPR) with
         (0.0, 0.0) prepended and (1.0, 1.0) appended to it.

-        .. seealso:: `Wikipedia reference \
-            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
+        .. seealso:: `Wikipedia reference
+            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

         .. note:: This ignores instance weights (setting all to 1.0) from
             `LogisticRegression.weightCol`. This will change in later Spark
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index ab449bc3f8f51..5ef4e765ea4e1 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
     .. note:: Experimental

     Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
-    Lin and Cohen. From the abstract:
+    `Lin and Cohen <http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf>`_. From the abstract:
     PIC finds a very low-dimensional embedding of a dataset using truncated power iteration on
     a normalized pair-wise similarity matrix of the data.

     This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
     to run the PowerIterationClustering algorithm.

-    .. seealso:: `Wikipedia on Spectral clustering \
-        <https://en.wikipedia.org/wiki/Spectral_clustering>`_
+    .. seealso:: `Wikipedia on Spectral clustering
+        <https://en.wikipedia.org/wiki/Spectral_clustering>`_

-    >>> data = [(1, 0, 0.5), \
-                (2, 0, 0.5), (2, 1, 0.7), \
-                (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
-                (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
-                (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
+    >>> data = [(1, 0, 0.5),
+    ...         (2, 0, 0.5), (2, 1, 0.7),
+    ...         (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
+    ...         (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
+    ...         (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
     >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
     >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
     >>> assignments = pic.assignClusters(df)
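The PowerIterationClustering doctest above swaps backslash line continuations for doctest's native "..." continuation prompt. The backslash form leaned on string-literal line joining to hand doctest one long input line; the "..." form is the REPL-native spelling, and it keeps working if the docstring is later made raw, since a raw string does not treat a trailing backslash as a continuation. A self-contained check, with made-up data, that the "..." form parses and runs cleanly:

import doctest

SNIPPET = """
>>> data = [(1, 0, 0.5),
...         (2, 0, 0.5), (2, 1, 0.7)]
>>> len(data)
3
"""

# Parse the snippet the way doctest parses a real docstring, then run it;
# failed == 0 means the "..." continuation lines were accepted as input.
test = doctest.DocTestParser().get_doctest(SNIPPET, {}, "pic_example", None, 0)
results = doctest.DocTestRunner(verbose=False).run(test)
assert results.failed == 0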
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 760aa82168f5a..eccb7acae5b98 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -207,8 +207,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
     distance space. The output will be vectors of configurable dimension. Hash values in the same
     dimension are calculated by the same hash function.

-    .. seealso:: `Stable Distributions \
-        <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
+    .. seealso:: `Stable Distributions
+        <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
     .. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_

     >>> from pyspark.ml.linalg import Vectors
@@ -303,7 +303,7 @@ def _create_model(self, java_model):


 class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
-    """
+    r"""
     .. note:: Experimental

     Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
@@ -653,8 +653,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
     The return vector is scaled such that the transform matrix is
     unitary (aka scaled DCT-II).

-    .. seealso:: `More information on Wikipedia \
-        <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.
+    .. seealso:: `More information on Wikipedia
+        <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.

     >>> from pyspark.ml.linalg import Vectors
     >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
@@ -1353,7 +1353,7 @@ def _create_model(self, java_model):


 class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
-    """
+    r"""
     .. note:: Experimental

     Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each
@@ -1362,8 +1362,8 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
     :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)`
     This hash family is approximately min-wise independent according to the reference.

-    .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
-        permutations." Electronic Journal of Combinatorics 7 (2000): R26.
+    .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
+        permutations." Electronic Journal of Combinatorics 7 (2000): R26.

     .. versionadded:: 2.2.0
     """
diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py
index c2b29b73460ff..886ad8409ca66 100644
--- a/python/pyspark/ml/fpm.py
+++ b/python/pyspark/ml/fpm.py
@@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
                HasMinSupport, HasNumPartitions, HasMinConfidence,
                JavaMLWritable, JavaMLReadable):
-    """
+    r"""
     .. note:: Experimental

     A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 513ca5a9df85e..98f4361351847 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -188,8 +188,8 @@ def intercept(self):
     @property
     @since("2.3.0")
     def scale(self):
-        """
-        The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
+        r"""
+        The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
         """
         return self._call_java("scale")

@@ -279,12 +279,12 @@ def featuresCol(self):
     @property
     @since("2.0.0")
     def explainedVariance(self):
-        """
+        r"""
         Returns the explained variance regression score.
-        explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+        explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`

-        .. seealso:: `Wikipedia explain variation \
-            <https://en.wikipedia.org/wiki/Explained_variation>`_
+        .. seealso:: `Wikipedia explain variation
+            <https://en.wikipedia.org/wiki/Explained_variation>`_

         .. note:: This ignores instance weights (setting all to 1.0) from
             `LinearRegression.weightCol`. This will change in later Spark
@@ -339,8 +339,8 @@ def r2(self):
         """
         Returns R^2, the coefficient of determination.

-        .. seealso:: `Wikipedia coefficient of determination \
-            <https://en.wikipedia.org/wiki/Coefficient_of_determination>`_
+        .. seealso:: `Wikipedia coefficient of determination
+            <https://en.wikipedia.org/wiki/Coefficient_of_determination>`_

         .. note:: This ignores instance weights (setting all to 1.0) from
             `LinearRegression.weightCol`. This will change in later Spark
@@ -354,8 +354,8 @@ def r2adj(self):
         """
         Returns Adjusted R^2, the adjusted coefficient of determination.

-        .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
-            <https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
+        .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
+            <https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_

         .. note:: This ignores instance weights (setting all to 1.0) from
             `LinearRegression.weightCol`. This will change in later Spark versions.
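The r""" prefixes added through these ml modules address the docstring variant of the same problem: a docstring is an ordinary string literal, so LaTeX markup such as \cdot, \hat{y}, or \|y - X'w\| inside it goes through escape processing and trips W605. Making the docstring raw keeps every backslash verbatim for Sphinx. A small sketch with a hypothetical function, showing that the raw and hand-escaped forms hold identical text:

def huber_scale():
    r"""The value by which :math:`\|y - X'w\|` is scaled down for "huber" loss."""

def huber_scale_escaped():
    """The value by which :math:`\\|y - X'w\\|` is scaled down for "huber" loss."""

# Same characters either way; the raw form simply cannot lose a backslash
# or trip the escape-sequence warning.
assert huber_scale.__doc__ == huber_scale_escaped.__doc__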
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index b09469b9f5c2d..b1a8af6bcc094 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -647,7 +647,7 @@ class PowerIterationClustering(object):
     @classmethod
     @since('1.5.0')
     def train(cls, rdd, k, maxIterations=100, initMode="random"):
-        """
+        r"""
         :param rdd:
           An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
           affinity matrix, which is the matrix A in the PIC paper. The
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index 6c65da58e4e2b..0bb0ca37c1ab6 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -117,9 +117,9 @@ def __init__(self, predictionAndObservations):
     @property
     @since('1.4.0')
     def explainedVariance(self):
-        """
+        r"""
         Returns the explained variance regression score.
-        explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+        explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
         """
         return self.call("explainedVariance")

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 40ecd2e0ff4be..6d7d4d61db043 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -59,7 +59,7 @@ def transform(self, vector):


 class Normalizer(VectorTransformer):
-    """
+    r"""
     Normalizes samples individually to unit L\ :sup:`p`\ norm

     For any 1 <= `p` < float('inf'), normalizes samples using
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index b317156885e51..ccf39e1ffbe96 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2399,7 +2399,7 @@ def barrier(self):
         :return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.

         .. seealso:: :class:`BarrierTaskContext`
-        .. seealso:: `SPIP: Barrier Execution Mode \
+        .. seealso:: `SPIP: Barrier Execution Mode
             <http://jira.apache.org/jira/browse/SPARK-24374>`_
         .. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_

diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py
index 472c3cd4452f0..65e3bdbc05ce8 100644
--- a/python/pyspark/shell.py
+++ b/python/pyspark/shell.py
@@ -54,7 +54,7 @@
 sqlContext = spark._wrapped
 sqlCtx = sqlContext

-print("""Welcome to
+print(r"""Welcome to
       ____              __
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/  '_/
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 81f35f54aa54d..e288ec818b404 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):

 @since(2.1)
 def approx_count_distinct(col, rsd=None):
-    """Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
+    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
+    column `col`.

     :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
         efficient to use :func:`countDistinct`
@@ -346,7 +347,8 @@ def coalesce(*cols):

 @since(1.6)
 def corr(col1, col2):
-    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
+    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
+    and ``col2``.

     >>> a = range(20)
     >>> b = [2 * x for x in range(20)]
@@ -1688,14 +1690,14 @@ def split(str, pattern):
 @ignore_unicode_prefix
 @since(1.5)
 def regexp_extract(str, pattern, idx):
-    """Extract a specific group matched by a Java regex, from the specified string column.
+    r"""Extract a specific group matched by a Java regex, from the specified string column.
     If the regex did not match, or the specified group did not match, an empty string is returned.

     >>> df = spark.createDataFrame([('100-200',)], ['str'])
-    >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
+    >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
     [Row(d=u'100')]
     >>> df = spark.createDataFrame([('foo',)], ['str'])
-    >>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
+    >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
     [Row(d=u'')]
     >>> df = spark.createDataFrame([('aaaac',)], ['str'])
     >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
@@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
     """Replace all substrings of the specified string value that match regexp with rep.

     >>> df = spark.createDataFrame([('100-200',)], ['str'])
-    >>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
+    >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
     [Row(d=u'-----')]
     """
     sc = SparkContext._active_spark_context
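The regexp_extract and regexp_replace doctests now pass raw strings through to the JVM regex engine; only the exact character sequence leaving Python matters, and the raw form spells it without doubled backslashes. A rough stand-in using Python's own re module (the real doctests exercise Spark, which is not spun up here):

import re

# '(\\d+)-(\\d+)' and r'(\d+)-(\d+)' are the same string, so the engine
# receives an identical pattern either way.
assert "(\\d+)-(\\d+)" == r"(\d+)-(\d+)"

# Analogues of the patched doctest inputs:
assert re.match(r"(\d+)-(\d+)", "100-200").group(1) == "100"
assert re.search(r"(\d+)", "foo") is None                  # no digits to extract
assert re.match("(a+)(b)?(c)", "aaaac").group(2) is None   # optional group unmatched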
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 3ca5d548ae7d6..690b13072244b 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -350,7 +350,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
             columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
             samplingRatio=None, enforceSchema=None, emptyValue=None):
-        """Loads a CSV file and returns the result as a :class:`DataFrame`.
+        r"""Loads a CSV file and returns the result as a :class:`DataFrame`.

         This function will go through the input once to determine the input schema if
         ``inferSchema`` is enabled. To avoid going through the entire data once, disable
@@ -519,8 +519,8 @@ def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPar

         If both ``column`` and ``predicates`` are specified, ``column`` will be used.

-        .. note:: Don't create too many partitions in parallel on a large cluster; \
-            otherwise Spark might crash your external database systems.
+        .. note:: Don't create too many partitions in parallel on a large cluster;
+            otherwise Spark might crash your external database systems.

         :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
         :param table: the name of the table
@@ -862,7 +862,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
             header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
             timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
             charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
-        """Saves the content of the :class:`DataFrame` in CSV format at the specified path.
+        r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.

         :param path: the path in any Hadoop supported file system
         :param mode: specifies the behavior of the save operation when data already exists.
@@ -962,8 +962,8 @@ def orc(self, path, mode=None, partitionBy=None, compression=None):
     def jdbc(self, url, table, mode=None, properties=None):
         """Saves the content of the :class:`DataFrame` to an external database table via JDBC.

-        .. note:: Don't create too many partitions in parallel on a large cluster; \
-            otherwise Spark might crash your external database systems.
+        .. note:: Don't create too many partitions in parallel on a large cluster;
+            otherwise Spark might crash your external database systems.

         :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
         :param table: Name of the table in the external database.
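The .. note:: rewrites in readwriter.py drop a trailing backslash that served as a string-literal line continuation, so the directive used to reach Sphinx as one long line. Indenting the second line under the directive renders the same way with no escape at all, and survives a later r""" conversion unchanged. A two-assertion sketch of the underlying string behavior:

# In a non-raw triple-quoted string, a trailing backslash swallows the
# newline and joins the two physical lines into one:
joined = """Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems."""
assert "\n" not in joined

# Without the backslash the newline survives, and reST folds the indented
# second line into the directive body when rendering:
kept = """Don't create too many partitions in parallel on a large cluster;
    otherwise Spark might crash your external database systems."""
assert "\n" in kept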
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 522900bf6684c..b18453b2a4f96 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -565,7 +565,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
             columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
             enforceSchema=None, emptyValue=None):
-        """Loads a CSV file stream and returns the result as a :class:`DataFrame`.
+        r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.

         This function will go through the input once to determine the input schema if
         ``inferSchema`` is enabled. To avoid going through the entire data once, disable
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index ce1d004c6c8ff..1d24c40e5858e 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -752,7 +752,7 @@ def __eq__(self, other):
                 for v in [ArrayType, MapType, StructType])


-_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
+_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")


 def _parse_datatype_string(s):
diff --git a/python/pyspark/storagelevel.py b/python/pyspark/storagelevel.py
index ef012d27cb22f..7f29646c07432 100644
--- a/python/pyspark/storagelevel.py
+++ b/python/pyspark/storagelevel.py
@@ -58,8 +58,8 @@ def __str__(self):
 StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)

 """
-.. note:: The following four storage level constants are deprecated in 2.0, since the records \
-will always be serialized in Python.
+.. note:: The following four storage level constants are deprecated in 2.0, since the records
+    will always be serialized in Python.
 """
 StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
 """.. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""
diff --git a/python/pyspark/util.py b/python/pyspark/util.py
index f015542c8799d..f906f49595438 100644
--- a/python/pyspark/util.py
+++ b/python/pyspark/util.py
@@ -80,7 +80,7 @@ def majorMinorVersion(sparkVersion):
     (2, 3)

     """
-    m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
+    m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
     if m is not None:
         return (int(m.group(1)), int(m.group(2)))
     else:
diff --git a/python/run-tests.py b/python/run-tests.py
index 4c90926cfa350..ccbdfac3f3850 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -138,7 +138,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
         # 2 (or --verbose option is enabled).
         decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
         skipped_tests = list(filter(
-            lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
+            lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
             decoded_lines))
         skipped_counts = len(skipped_tests)
         if skipped_counts > 0:
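With the patch applied end to end, one plausible spot check is to lint for just this warning class across the trees the diff touches. This assumes pycodestyle >= 2.4.0, the release that introduced the W605 check; the snippet is illustrative, not part of the patch:

import subprocess
import sys

# Exit status 0 means pycodestyle found no remaining invalid escape
# sequences under the two source trees modified above.
result = subprocess.run(
    [sys.executable, "-m", "pycodestyle", "--select=W605", "dev/", "python/"])
print("clean" if result.returncode == 0 else "W605 findings remain")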