Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions dev/create-release/generate-contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
print("Release tag: %s" % RELEASE_TAG)
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
print("Number of commits in this range: %s" % len(new_commits))
print
print("")
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Works in Python 2 and 3



def print_indented(_list):
Expand All @@ -88,10 +88,10 @@ def print_indented(_list):


def is_release(commit_title):
return re.findall("\[release\]", commit_title.lower()) or \
"preparing spark release" in commit_title.lower() or \
"preparing development version" in commit_title.lower() or \
"CHANGES.txt" in commit_title
return ("[release]" in commit_title.lower() or
"preparing spark release" in commit_title.lower() or
"preparing development version" in commit_title.lower() or
"CHANGES.txt" in commit_title)


def is_maintenance(commit_title):
Expand Down
2 changes: 1 addition & 1 deletion dev/create-release/releaseutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
# Parse components in the commit message
# The returned components are already filtered and translated
def find_components(commit, commit_hash):
components = re.findall("\[\w*\]", commit.lower())
components = re.findall(r"\[\w*\]", commit.lower())
components = [translate_component(c, commit_hash)
for c in components if c in known_components]
return components
Expand Down
4 changes: 2 additions & 2 deletions dev/merge_spark_pr.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
versions = sorted(versions, key=lambda x: x.name, reverse=True)
versions = filter(lambda x: x.raw['released'] is False, versions)
# Consider only x.y.z versions
versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)

default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
for v in default_fix_versions:
Expand Down Expand Up @@ -403,7 +403,7 @@ def standardize_jira_ref(text):

# Extract spark component(s):
# Look for alphanumeric chars, spaces, dashes, periods, and/or commas
pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two issues: the `\.` escape is unnecessary inside a character class, and `,-.` is misleading because it denotes a range, not three separate characters in the class. As it happens, the ASCII range from comma to period covers exactly those three characters!

for component in pattern.findall(text):
components.append(component.upper())
text = text.replace(component, '')
Expand Down
3 changes: 2 additions & 1 deletion dev/run-tests-jenkins.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ def run_tests(tests_timeout):
os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()

failure_note_by_errcode = {
1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures
# error to denote run-tests script failures:
1: 'executing the `dev/run-tests` script', # noqa: W605
ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',
Expand Down
2 changes: 1 addition & 1 deletion dev/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def determine_java_version(java_exe):
# find raw version string, eg 'java version "1.8.0_25"'
raw_version_str = next(x for x in raw_output_lines if " version " in x)

match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)

major = int(match.group(1))
minor = int(match.group(2))
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,8 +773,8 @@ def roc(self):
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.

.. seealso:: `Wikipedia reference \
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. seealso:: `Wikipedia reference
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LogisticRegression.weightCol`. This will change in later Spark
Expand Down
16 changes: 8 additions & 8 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
.. note:: Experimental

Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
<a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the abstract:
`Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
PIC finds a very low-dimensional embedding of a dataset using truncated power
iteration on a normalized pair-wise similarity matrix of the data.

This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
to run the PowerIterationClustering algorithm.

.. seealso:: `Wikipedia on Spectral clustering \
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
.. seealso:: `Wikipedia on Spectral clustering
<http://en.wikipedia.org/wiki/Spectral_clustering>`_

>>> data = [(1, 0, 0.5), \
(2, 0, 0.5), (2, 1, 0.7), \
(3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
(4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
(5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> data = [(1, 0, 0.5),
... (2, 0, 0.5), (2, 1, 0.7),
... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
>>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
>>> assignments = pic.assignClusters(df)
Expand Down
16 changes: 8 additions & 8 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
distance space. The output will be vectors of configurable dimension. Hash values in the same
dimension are calculated by the same hash function.

.. seealso:: `Stable Distributions \
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Stable Distributions
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_

>>> from pyspark.ml.linalg import Vectors
Expand Down Expand Up @@ -303,7 +303,7 @@ def _create_model(self, java_model):


class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few docstrings have backslash or backticks in them. This should make sure they don't have surprising effects some day.

.. note:: Experimental

Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
Expand Down Expand Up @@ -653,8 +653,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).

.. seealso:: `More information on Wikipedia \
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
.. seealso:: `More information on Wikipedia
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.

>>> from pyspark.ml.linalg import Vectors
>>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
Expand Down Expand Up @@ -1353,7 +1353,7 @@ def _create_model(self, java_model):


class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
.. note:: Experimental

Model produced by :py:class:`MinHashLSH`, where multiple hash functions are stored. Each
Expand All @@ -1362,8 +1362,8 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
:math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
independent according to the reference.

.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
permutations." Electronic Journal of Combinatorics 7 (2000): R26.

.. versionadded:: 2.2.0
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/fpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
HasMinSupport, HasNumPartitions, HasMinConfidence,
JavaMLWritable, JavaMLReadable):

"""
r"""
.. note:: Experimental

A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
Expand Down
20 changes: 10 additions & 10 deletions python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ def intercept(self):
@property
@since("2.3.0")
def scale(self):
"""
The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
r"""
The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
"""
return self._call_java("scale")

Expand Down Expand Up @@ -279,12 +279,12 @@ def featuresCol(self):
@property
@since("2.0.0")
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`

.. seealso:: `Wikipedia explain variation \
<http://en.wikipedia.org/wiki/Explained_variation>`_
.. seealso:: `Wikipedia explain variation
<http://en.wikipedia.org/wiki/Explained_variation>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
Expand Down Expand Up @@ -339,8 +339,8 @@ def r2(self):
"""
Returns R^2, the coefficient of determination.

.. seealso:: `Wikipedia coefficient of determination \
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
.. seealso:: `Wikipedia coefficient of determination
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
Expand All @@ -354,8 +354,8 @@ def r2adj(self):
"""
Returns Adjusted R^2, the adjusted coefficient of determination.

.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_

.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark versions.
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ class PowerIterationClustering(object):
@classmethod
@since('1.5.0')
def train(cls, rdd, k, maxIterations=100, initMode="random"):
"""
r"""
:param rdd:
An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
affinity matrix, which is the matrix A in the PIC paper. The
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/mllib/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ def __init__(self, predictionAndObservations):
@property
@since('1.4.0')
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
"""
return self.call("explainedVariance")

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def transform(self, vector):


class Normalizer(VectorTransformer):
"""
r"""
Normalizes samples individually to unit L\ :sup:`p`\ norm

For any 1 <= `p` < float('inf'), normalizes samples using
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,7 +2399,7 @@ def barrier(self):
:return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.

.. seealso:: :class:`BarrierTaskContext`
.. seealso:: `SPIP: Barrier Execution Mode \
.. seealso:: `SPIP: Barrier Execution Mode
<http://jira.apache.org/jira/browse/SPARK-24374>`_
.. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
sqlContext = spark._wrapped
sqlCtx = sqlContext

print("""Welcome to
print(r"""Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
Expand Down
14 changes: 8 additions & 6 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):

@since(2.1)
def approx_count_distinct(col, rsd=None):
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line too long

column `col`.

:param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
efficient to use :func:`countDistinct`
Expand Down Expand Up @@ -346,7 +347,8 @@ def coalesce(*cols):

@since(1.6)
def corr(col1, col2):
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
and ``col2``.

>>> a = range(20)
>>> b = [2 * x for x in range(20)]
Expand Down Expand Up @@ -1688,14 +1690,14 @@ def split(str, pattern):
@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
"""Extract a specific group matched by a Java regex, from the specified string column.
r"""Extract a specific group matched by a Java regex, from the specified string column.
If the regex did not match, or the specified group did not match, an empty string is returned.

>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
[Row(d=u'')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
Expand All @@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
"""Replace all substrings of the specified string value that match regexp with rep.

>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
>>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
[Row(d=u'-----')]
"""
sc = SparkContext._active_spark_context
Expand Down
12 changes: 6 additions & 6 deletions python/pyspark/sql/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
samplingRatio=None, enforceSchema=None, emptyValue=None):
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file and returns the result as a :class:`DataFrame`.

This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable
Expand Down Expand Up @@ -519,8 +519,8 @@ def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPar

If both ``column`` and ``predicates`` are specified, ``column`` will be used.

.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.

:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: the name of the table
Expand Down Expand Up @@ -862,7 +862,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.

:param path: the path in any Hadoop supported file system
:param mode: specifies the behavior of the save operation when data already exists.
Expand Down Expand Up @@ -962,8 +962,8 @@ def orc(self, path, mode=None, partitionBy=None, compression=None):
def jdbc(self, url, table, mode=None, properties=None):
"""Saves the content of the :class:`DataFrame` to an external database table via JDBC.

.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.

:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: Name of the table in the external database.
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
enforceSchema=None, emptyValue=None):
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.

This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ def __eq__(self, other):
for v in [ArrayType, MapType, StructType])


_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")


def _parse_datatype_string(s):
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/storagelevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def __str__(self):
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)

"""
.. note:: The following four storage level constants are deprecated in 2.0, since the records \
will always be serialized in Python.
.. note:: The following four storage level constants are deprecated in 2.0, since the records
will always be serialized in Python.
"""
StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def majorMinorVersion(sparkVersion):
(2, 3)

"""
m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
if m is not None:
return (int(m.group(1)), int(m.group(2)))
else:
Expand Down
Loading