From 7c49184c411230fe3c263570d6c2d24bb6700bf6 Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Sun, 4 Sep 2022 19:15:15 +0100 Subject: [PATCH 1/6] Docstring improvements --- python/pyspark/sql/functions.py | 574 +++++++++++++++++++++++++++++++- 1 file changed, 567 insertions(+), 7 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 89e86414fbf46..76b590829010d 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -4486,6 +4486,11 @@ def window( start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`. + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + Examples -------- >>> import datetime @@ -4548,6 +4553,11 @@ def session_window(timeColumn: "ColumnOrName", gapDuration: Union[Column, str]) static value, e.g. `10 minutes`, `1 second`, or an expression/UDF that specifies gap duration dynamically based on the input row. + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + Examples -------- >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val") @@ -4579,6 +4589,16 @@ def crc32(col: "ColumnOrName") -> Column: Calculates the cyclic redundancy check value (CRC32) of a binary column and returns the value as a bigint. + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + .. versionadded:: 1.5.0 Examples @@ -4594,6 +4614,16 @@ def md5(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + Examples -------- >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() @@ -4607,6 +4637,16 @@ def sha1(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + Examples -------- >>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() @@ -4622,6 +4662,19 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + numBits : int + the desired bit length of the result, which must have a + value of 224, 256, 384, 512, or 0 (which is equivalent to 256). + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + Examples -------- >>> digests = df.select(sha2(df.name, 256).alias('s')).collect() @@ -4638,6 +4691,16 @@ def hash(*cols: "ColumnOrName") -> Column: .. versionadded:: 2.0.0 + Parameters + ---------- + cols : :class:`~pyspark.sql.Column` or str + list of columns to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + hash value as int column. + Examples -------- >>> spark.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect() @@ -4652,6 +4715,16 @@ def xxhash64(*cols: "ColumnOrName") -> Column: .. versionadded:: 3.0.0 + Parameters + ---------- + cols : :class:`~pyspark.sql.Column` or str + list of columns to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + hash value as long column. 
+ Examples -------- >>> spark.createDataFrame([('ABC',)], ['a']).select(xxhash64('a').alias('hash')).collect() @@ -4662,8 +4735,8 @@ def xxhash64(*cols: "ColumnOrName") -> Column: def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None) -> Column: """ - Returns null if the input column is true; throws an exception with the provided error message - otherwise. + Returns `null` if the input column is `true`; throws an exception + with the provided error message otherwise. .. versionadded:: 3.1.0 @@ -4671,20 +4744,27 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None ---------- col : :class:`~pyspark.sql.Column` or str column name or column that represents the input column to test - errMsg : :class:`~pyspark.sql.Column` or str + errMsg : :class:`~pyspark.sql.Column` or str, optional A Python string literal or column containing the error message + Returns + ------- + :class:`~pyspark.sql.Column` + `null` if the input column is `true` otherwise throws an error with specified message. + Examples -------- >>> df = spark.createDataFrame([(0,1)], ['a', 'b']) >>> df.select(assert_true(df.a < df.b).alias('r')).collect() [Row(r=None)] - >>> df = spark.createDataFrame([(0,1)], ['a', 'b']) >>> df.select(assert_true(df.a < df.b, df.a).alias('r')).collect() [Row(r=None)] - >>> df = spark.createDataFrame([(0,1)], ['a', 'b']) >>> df.select(assert_true(df.a < df.b, 'error').alias('r')).collect() [Row(r=None)] + >>> df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect() # doctest: +SKIP + 22/09/03 20:18:15 ERROR Executor: Exception in task 15.0 in stage 45.0 (TID 383) + java.lang.RuntimeException: My error msg + ... """ if errMsg is None: return _invoke_function_over_columns("assert_true", col) @@ -4707,6 +4787,19 @@ def raise_error(errMsg: Union[Column, str]) -> Column: ---------- errMsg : :class:`~pyspark.sql.Column` or str A Python string literal or column containing the error message + + Returns + ------- + :class:`~pyspark.sql.Column` + throws an error with specified message. + + Examples + -------- + >>> df = spark.range(1) + >>> df.select(raise_error("My error message")).show() # doctest: +SKIP + 22/09/03 20:26:49 ERROR Executor: Exception in task 15.0 in stage 46.0 (TID 399) + java.lang.RuntimeException: My error message + ... """ if not isinstance(errMsg, (str, Column)): raise TypeError("errMsg should be a Column or a str, got {}".format(type(errMsg))) @@ -4725,6 +4818,29 @@ def upper(col: "ColumnOrName") -> Column: Converts a string expression to upper case. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + upper case values. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df.select(upper("value")).show() + +------------+ + |upper(value)| + +------------+ + | SPARK| + | PYSPARK| + | PANDAS API| + +------------+ """ return _invoke_function_over_columns("upper", col) @@ -4734,6 +4850,29 @@ def lower(col: "ColumnOrName") -> Column: Converts a string expression to lower case. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + lower case values. 
+ + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df.select(lower("value")).show() + +------------+ + |lower(value)| + +------------+ + | spark| + | pyspark| + | pandas api| + +------------+ """ return _invoke_function_over_columns("lower", col) @@ -4743,6 +4882,29 @@ def ascii(col: "ColumnOrName") -> Column: Computes the numeric value of the first character of the string column. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + numeric value. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df.select(ascii("value")).show() + +------------+ + |ascii(value)| + +------------+ + | 83| + | 80| + | 80| + +------------+ """ return _invoke_function_over_columns("ascii", col) @@ -4752,6 +4914,29 @@ def base64(col: "ColumnOrName") -> Column: Computes the BASE64 encoding of a binary column and returns it as a string column. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + BASE64 encoding of string value. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df.select(base64("value")).show() + +----------------+ + | base64(value)| + +----------------+ + | U3Bhcms=| + | UHlTcGFyaw==| + |UGFuZGFzIEFQSQ==| + +----------------+ """ return _invoke_function_over_columns("base64", col) @@ -4761,6 +4946,31 @@ def unbase64(col: "ColumnOrName") -> Column: Decodes a BASE64 encoded string column and returns it as a binary column. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + encoded string value. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame(["U3Bhcms=", + ... "UHlTcGFyaw==", + ... "UGFuZGFzIEFQSQ=="], types.StringType()) + >>> df.select(unbase64("value")).show() + +--------------------+ + | unbase64(value)| + +--------------------+ + | [53 70 61 72 6B]| + |[50 79 53 70 61 7...| + |[50 61 6E 64 61 7...| + +--------------------+ """ return _invoke_function_over_columns("unbase64", col) @@ -4770,6 +4980,29 @@ def ltrim(col: "ColumnOrName") -> Column: Trim the spaces from left end for the specified string value. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + left trimmed values. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], types.StringType()) + >>> df.select(ltrim("value").alias("r")).withColumn("length", length("r")).show() + +-------+------+ + | r|length| + +-------+------+ + | Spark| 5| + |Spark | 7| + | Spark| 5| + +-------+------+ """ return _invoke_function_over_columns("ltrim", col) @@ -4779,6 +5012,29 @@ def rtrim(col: "ColumnOrName") -> Column: Trim the spaces from right end for the specified string value. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. 
+ + Returns + ------- + :class:`~pyspark.sql.Column` + right trimmed values. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], types.StringType()) + >>> df.select(rtrim("value").alias("r")).withColumn("length", length("r")).show() + +--------+------+ + | r|length| + +--------+------+ + | Spark| 8| + | Spark| 5| + | Spark| 6| + +--------+------+ """ return _invoke_function_over_columns("rtrim", col) @@ -4788,6 +5044,29 @@ def trim(col: "ColumnOrName") -> Column: Trim the spaces from both ends for the specified string column. .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + trimmed values from both sides. + + Examples + -------- + >>> from pyspark.sql import types + >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], types.StringType()) + >>> df.select(trim("value").alias("r")).withColumn("length", length("r")).show() + +-----+------+ + | r|length| + +-----+------+ + |Spark| 5| + |Spark| 5| + |Spark| 5| + +-----+------+ """ return _invoke_function_over_columns("trim", col) @@ -4799,6 +5078,18 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + sep : str + words separator. + cols : :class:`~pyspark.sql.Column` or str + list of columns to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + string of concatenated words. + Examples -------- >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) @@ -4816,6 +5107,28 @@ def decode(col: "ColumnOrName", charset: str) -> Column: (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + charset : str + charset to use to decode. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + + Examples + -------- + >>> df = spark.createDataFrame([('abcd',)], ['a']) + >>> df.select(decode("a", "UTF-8")).show() + +----------------------+ + |stringdecode(a, UTF-8)| + +----------------------+ + | abcd| + +----------------------+ """ return _invoke_function("decode", _to_java_column(col), charset) @@ -4826,6 +5139,28 @@ def encode(col: "ColumnOrName", charset: str) -> Column: (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). .. versionadded:: 1.5.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + charset : str + charset to use to encode. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column for computed results. + + Examples + -------- + >>> df = spark.createDataFrame([('abcd',)], ['c']) + >>> df.select(encode("c", "UTF-8")).show() + +----------------+ + |encode(c, UTF-8)| + +----------------+ + | [61 62 63 64]| + +----------------+ """ return _invoke_function("encode", _to_java_column(col), charset) @@ -4844,6 +5179,11 @@ def format_number(col: "ColumnOrName", d: int) -> Column: d : int the N decimal places + Returns + ------- + :class:`~pyspark.sql.Column` + the column of formatted results.
+ >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() [Row(v='5.0000')] """ return _invoke_function("format_number", _to_java_column(col), d) @@ -4863,6 +5203,11 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column: cols : :class:`~pyspark.sql.Column` or str column names or :class:`~pyspark.sql.Column`\\s to be used in formatting + Returns + ------- + :class:`~pyspark.sql.Column` + the column of formatted results. + Examples -------- >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b']) @@ -4886,6 +5231,20 @@ def instr(str: "ColumnOrName", substr: str) -> Column: The position is not zero based, but 1 based index. Returns 0 if substr could not be found in str. + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + target column to work on. + substr : str + substring to look for. + + Returns + ------- + :class:`~pyspark.sql.Column` + location of the first occurrence of the substring as integer. + + Examples + -------- >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(instr(df.s, 'b').alias('s')).collect() [Row(s=2)] @@ -4913,10 +5272,15 @@ def overlay( column name or column containing the substitution string pos : :class:`~pyspark.sql.Column` or str or int column name, column, or int containing the starting position in src - len : :class:`~pyspark.sql.Column` or str or int + len : :class:`~pyspark.sql.Column` or str or int, optional column name, column, or int containing the number of bytes to replace in src string by 'replace' defaults to -1, which represents the length of the 'replace' string + Returns + ------- + :class:`~pyspark.sql.Column` + string with replaced values. + Examples -------- >>> df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y")) @@ -4962,6 +5326,11 @@ def sentences( country : :class:`~pyspark.sql.Column` or str, optional a country of the locale + Returns + ------- + :class:`~pyspark.sql.Column` + arrays of split sentences. + Examples -------- >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"]) >>> sentences(df.string, lit("en"), lit("US")).show(truncate=False) # doctest: +SKIP +-----------------------------------+ |[[This, is, an, example, sentence]]| +-----------------------------------+ + >>> df = spark.createDataFrame([["Hello world. How are you?"]], ["s"]) + >>> df.select(sentences("s")).show(truncate=False) + +---------------------------------+ + |sentences(s, , ) | + +---------------------------------+ + |[[Hello, world], [How, are, you]]| + +---------------------------------+ """ if language is None: language = lit("") @@ -4992,6 +5368,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: ----- The position is not zero based, but 1 based index. + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + target column to work on. + pos : int + starting position in str. + len : int + length of chars. + + Returns + ------- + :class:`~pyspark.sql.Column` + substring of given value. + Examples -------- >>> df = spark.createDataFrame([('abcd',)], ['s',]) @@ -5010,6 +5400,20 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + target column to work on. + delim : str + delimiter of values. + count : int + number of occurrences. + + Returns + ------- + :class:`~pyspark.sql.Column` + substring of given value. + Examples -------- >>> df = spark.createDataFrame([('a.b.c.d',)], ['s']) @@ -5026,6 +5430,18 @@ def levenshtein(left: "ColumnOrName", right: "ColumnOrName") -> Column: ..
versionadded:: 1.5.0 + Parameters + ---------- + left : :class:`~pyspark.sql.Column` or str + first column value. + right : :class:`~pyspark.sql.Column` or str + second column value. + + Returns + ------- + :class:`~pyspark.sql.Column` + Levenshtein distance as integer value. + Examples -------- >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) @@ -5050,6 +5466,11 @@ def locate(substr: str, str: "ColumnOrName", pos: int = 1) -> Column: pos : int, optional start position (zero based) + Returns + ------- + :class:`~pyspark.sql.Column` + position of the substring. + Notes ----- The position is not zero based, but 1 based index. Returns 0 if substr @@ -5070,6 +5491,20 @@ def lpad(col: "ColumnOrName", len: int, pad: str) -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + len : int + length of the final string. + pad : str + chars to prepend. + + Returns + ------- + :class:`~pyspark.sql.Column` + left padded result. + Examples -------- >>> df = spark.createDataFrame([('abcd',)], ['s',]) @@ -5085,6 +5520,20 @@ def rpad(col: "ColumnOrName", len: int, pad: str) -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + len : int + length of the final string. + pad : str + chars to append. + + Returns + ------- + :class:`~pyspark.sql.Column` + right padded result. + Examples -------- >>> df = spark.createDataFrame([('abcd',)], ['s',]) @@ -5100,6 +5549,18 @@ def repeat(col: "ColumnOrName", n: int) -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + n : int + number of times to repeat value. + + Returns + ------- + :class:`~pyspark.sql.Column` + string with repeated values. + Examples -------- >>> df = spark.createDataFrame([('ab',)], ['s',]) @@ -5134,6 +5595,11 @@ def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column: .. versionchanged:: 3.0 `split` now takes an optional `limit` field. If not provided, default limit value is -1. + Returns + ------- + :class:`~pyspark.sql.Column` + array of separated strings. + Examples -------- >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) @@ -5151,6 +5617,20 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + target column to work on. + pattern : str + regex pattern to apply. + idx : int + matched group id. + + Returns + ------- + :class:`~pyspark.sql.Column` + matched value specified by `idx` group id. + Examples -------- >>> df = spark.createDataFrame([('100-200',)], ['str']) @@ -5182,6 +5662,11 @@ def regexp_replace( replacement : :class:`~pyspark.sql.Column` or str column object or str containing the replacement + Returns + ------- + :class:`~pyspark.sql.Column` + string with all substrings replaced. + Examples -------- >>> df = spark.createDataFrame([("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"]) @@ -5206,6 +5691,16 @@ def initcap(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + string with all first letters are uppercase in each word. 
+ Examples -------- >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect() @@ -5220,6 +5715,16 @@ def soundex(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + SoundEx encoded string. + Examples -------- >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name']) @@ -5234,6 +5739,16 @@ def bin(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + binary representation of given value as string. + Examples -------- >>> df.select(bin(df.age).alias('c')).collect() @@ -5249,6 +5764,16 @@ def hex(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + hexadecimal representation of given value as string. + Examples -------- >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() @@ -5263,6 +5788,16 @@ def unhex(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + string representation of given hexadecimal value. + Examples -------- >>> spark.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect() @@ -5278,6 +5813,16 @@ def length(col: "ColumnOrName") -> Column: .. versionadded:: 1.5.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to work on. + + Returns + ------- + :class:`~pyspark.sql.Column` + length of the value. + Examples -------- >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect() @@ -5341,11 +5886,26 @@ def bit_length(col: "ColumnOrName") -> Column: def translate(srcCol: "ColumnOrName", matching: str, replace: str) -> Column: """A function translate any character in the `srcCol` by a character in `matching`. The characters in `replace` is corresponding to the characters in `matching`. - The translate will happen when any character in the string matching with the character + Translation will happen whenever any character in the string is matching with the character in the `matching`. .. versionadded:: 1.5.0 + Parameters + ---------- + srcCol : :class:`~pyspark.sql.Column` or str + Source column or strings + matching : str + matching characters. + replace : str + characters for replacement. If this is shorter than `matching` string then + those chars that don't have replacement will be dropped. + + Returns + ------- + :class:`~pyspark.sql.Column` + replaced value. 
+ Examples -------- >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\ From 2c898e7b62ef45bdeae05588cfe24e8bc5be88d0 Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Tue, 6 Sep 2022 08:48:32 +0100 Subject: [PATCH 2/6] Review corrections --- python/pyspark/sql/functions.py | 65 ++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 76b590829010d..133f3df49f6fc 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -4694,7 +4694,7 @@ def hash(*cols: "ColumnOrName") -> Column: Parameters ---------- cols : :class:`~pyspark.sql.Column` or str - list of columns to work on. + one or more columns to compute on. Returns ------- @@ -4703,8 +4703,25 @@ def hash(*cols: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect() - [Row(hash=-757602832)] + >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) + + Hash for one column + + >>> df.select(hash('c1').alias('hash')).show() + +----------+ + | hash| + +----------+ + |-757602832| + +----------+ + + Two or more columns + + >>> df.select(hash('c1', 'c2').alias('hash')).show() + +---------+ + | hash| + +---------+ + |599895104| + +---------+ """ return _invoke_function_over_seq_of_columns("hash", cols) @@ -4718,7 +4735,7 @@ def xxhash64(*cols: "ColumnOrName") -> Column: Parameters ---------- cols : :class:`~pyspark.sql.Column` or str - list of columns to work on. + one or more columns to compute on. Returns ------- @@ -4727,8 +4744,25 @@ def xxhash64(*cols: "ColumnOrName") -> Column: Examples -------- - >>> spark.createDataFrame([('ABC',)], ['a']).select(xxhash64('a').alias('hash')).collect() - [Row(hash=4105715581806190027)] + >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) + + Hash for one column + + >>> df.select(xxhash64('c1').alias('hash')).show() + +-------------------+ + | hash| + +-------------------+ + |4105715581806190027| + +-------------------+ + + Two or more columns + + >>> df.select(xxhash64('c1', 'c2').alias('hash')).show() + +-------------------+ + | hash| + +-------------------+ + |3233247871021311208| + +-------------------+ """ return _invoke_function_over_seq_of_columns("xxhash64", cols) @@ -4762,7 +4796,7 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None >>> df.select(assert_true(df.a < df.b, 'error').alias('r')).collect() [Row(r=None)] >>> df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect() # doctest: +SKIP - 22/09/03 20:18:15 ERROR Executor: Exception in task 15.0 in stage 45.0 (TID 383) + ... java.lang.RuntimeException: My error msg ... """ @@ -4797,7 +4831,7 @@ def raise_error(errMsg: Union[Column, str]) -> Column: -------- >>> df = spark.range(1) >>> df.select(raise_error("My error message")).show() # doctest: +SKIP - 22/09/03 20:26:49 ERROR Executor: Exception in task 15.0 in stage 46.0 (TID 399) + ... java.lang.RuntimeException: My error message ... 
""" @@ -4831,8 +4865,7 @@ def upper(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") >>> df.select(upper("value")).show() +------------+ |upper(value)| @@ -4863,8 +4896,7 @@ def lower(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") >>> df.select(lower("value")).show() +------------+ |lower(value)| @@ -4895,8 +4927,7 @@ def ascii(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") >>> df.select(ascii("value")).show() +------------+ |ascii(value)| @@ -4927,8 +4958,7 @@ def base64(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], types.StringType()) + >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") >>> df.select(base64("value")).show() +----------------+ | base64(value)| @@ -4959,10 +4989,9 @@ def unbase64(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types >>> df = spark.createDataFrame(["U3Bhcms=", ... "UHlTcGFyaw==", - ... "UGFuZGFzIEFQSQ=="], types.StringType()) + ... "UGFuZGFzIEFQSQ=="], "STRING") >>> df.select(unbase64("value")).show() +--------------------+ | unbase64(value)| From b1ba4321d6176367a79adbee8404da53b505b8ea Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Tue, 6 Sep 2022 11:49:38 +0100 Subject: [PATCH 3/6] Simplify df creation Co-authored-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 133f3df49f6fc..e42fdac11cefc 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -5054,8 +5054,7 @@ def rtrim(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], types.StringType()) + >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") >>> df.select(rtrim("value").alias("r")).withColumn("length", length("r")).show() +--------+------+ | r|length| From 303f240b4121ddf612a070dd6f737e75d4077e11 Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Tue, 6 Sep 2022 11:49:48 +0100 Subject: [PATCH 4/6] Simplify df creation Co-authored-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e42fdac11cefc..0a60b3ce2ce79 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -5085,8 +5085,7 @@ def trim(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], types.StringType()) + >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") >>> df.select(trim("value").alias("r")).withColumn("length", length("r")).show() +-----+------+ | r|length| From 
e180740500dc818bee4060428fbc3f54196b17e4 Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Tue, 6 Sep 2022 11:50:08 +0100 Subject: [PATCH 5/6] Simplify df creation Co-authored-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 0a60b3ce2ce79..e7e2628e71412 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -5022,8 +5022,7 @@ def ltrim(col: "ColumnOrName") -> Column: Examples -------- - >>> from pyspark.sql import types - >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], types.StringType()) + >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") >>> df.select(ltrim("value").alias("r")).withColumn("length", length("r")).show() +-------+------+ | r|length| From edf862484041cccbb66f0ebb00ad56a460f7cc0e Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Tue, 6 Sep 2022 21:46:47 +0100 Subject: [PATCH 6/6] Trigger pipeline
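The doctest examples added in this series assume the PySpark doctest environment, where a SparkSession is already bound to the name `spark`. As a rough, self-contained sketch (not part of any patch above, assuming only a local `pyspark` installation), the documented behaviour can be reproduced in a plain Python shell along these lines; the expected outputs are copied from the examples added in the series:

>>> from pyspark.sql import SparkSession
>>> from pyspark.sql.functions import upper, xxhash64
>>> spark = SparkSession.builder.getOrCreate()  # the doctests assume this session already exists
>>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")  # one STRING column named 'value'
>>> df.select(upper("value")).show()
+------------+
|upper(value)|
+------------+
|       SPARK|
|     PYSPARK|
|  PANDAS API|
+------------+
>>> df2 = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])
>>> df2.select(xxhash64('c1', 'c2').alias('hash')).show()  # two-column hash, as in the xxhash64 example
+-------------------+
|               hash|
+-------------------+
|3233247871021311208|
+-------------------+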