diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index df71969ccb7fe..1c0ae426a2111 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -962,6 +962,13 @@ def cos(col: "ColumnOrName") -> Column:
     -------
     :class:`~pyspark.sql.Column`
         cosine of the angle, as if computed by `java.lang.Math.cos()`.
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(cos(lit(math.pi))).first()
+    Row(COS(3.14159...)=-1.0)
     """
     return _invoke_function_over_columns("cos", col)
 
@@ -981,6 +988,12 @@ def cosh(col: "ColumnOrName") -> Column:
     -------
     :class:`~pyspark.sql.Column`
         hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(cosh(lit(1))).first()
+    Row(COSH(1)=1.54308...)
     """
     return _invoke_function_over_columns("cosh", col)
 
@@ -994,12 +1007,19 @@ def cot(col: "ColumnOrName") -> Column:
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
-        Angle in radians
+        angle in radians.
 
     Returns
     -------
     :class:`~pyspark.sql.Column`
-        Cotangent of the angle.
+        cotangent of the angle.
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(cot(lit(math.radians(45)))).first()
+    Row(COT(0.78539...)=1.00000...)
     """
     return _invoke_function_over_columns("cot", col)
 
@@ -1013,12 +1033,19 @@ def csc(col: "ColumnOrName") -> Column:
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
-        Angle in radians
+        angle in radians.
 
     Returns
     -------
     :class:`~pyspark.sql.Column`
-        Cosecant of the angle.
+        cosecant of the angle.
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(csc(lit(math.radians(90)))).first()
+    Row(CSC(1.57079...)=1.0)
     """
     return _invoke_function_over_columns("csc", col)
 
@@ -1028,6 +1055,26 @@ def exp(col: "ColumnOrName") -> Column:
     Computes the exponential of the given value.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        column to calculate exponential for.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        exponential of the given value.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(exp(lit(0))).show()
+    +------+
+    |EXP(0)|
+    +------+
+    |   1.0|
+    +------+
     """
     return _invoke_function_over_columns("exp", col)
 
@@ -1037,6 +1084,22 @@ def expm1(col: "ColumnOrName") -> Column:
     Computes the exponential of the given value minus one.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        column to calculate exponential for.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        exponential less one.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(expm1(lit(1))).first()
+    Row(EXPM1(1)=1.71828...)
     """
     return _invoke_function_over_columns("expm1", col)
 
@@ -1046,6 +1109,26 @@ def floor(col: "ColumnOrName") -> Column:
     Computes the floor of the given value.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        column to find floor for.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        nearest integer that is less than or equal to the given value.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(floor(lit(2.5))).show()
+    +----------+
+    |FLOOR(2.5)|
+    +----------+
+    |         2|
+    +----------+
     """
     return _invoke_function_over_columns("floor", col)
 
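The doctests above rely on the `spark` session and `lit` import that PySpark's doctest harness injects. Outside that harness, a minimal self-contained sketch of the same calls might look like the following; the local-mode master and app name are illustrative assumptions, not taken from functions.py:

    import math

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import cos, floor, lit

    # Stand-in for the `spark` fixture provided by the doctest harness.
    spark = SparkSession.builder.master("local[1]").appName("docs-check").getOrCreate()

    df = spark.range(1)
    # cos(pi) evaluates to exactly -1.0; FLOOR(2.5) rounds toward negative infinity.
    df.select(cos(lit(math.pi)), floor(lit(2.5))).show()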
@@ -1055,6 +1138,23 @@ def log(col: "ColumnOrName") -> Column:
     Computes the natural logarithm of the given value.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        column to calculate natural logarithm for.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        natural logarithm of the given value.
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(log(lit(math.e))).first()
+    Row(ln(2.71828...)=1.0)
     """
     return _invoke_function_over_columns("log", col)
 
@@ -1064,15 +1164,57 @@ def log10(col: "ColumnOrName") -> Column:
     Computes the logarithm of the given value in Base 10.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        column to calculate logarithm for.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        logarithm of the given value in Base 10.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(log10(lit(100))).show()
+    +----------+
+    |LOG10(100)|
+    +----------+
+    |       2.0|
+    +----------+
     """
     return _invoke_function_over_columns("log10", col)
 
 
 def log1p(col: "ColumnOrName") -> Column:
     """
-    Computes the natural logarithm of the given value plus one.
+    Computes the natural logarithm of the "given value plus one".
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        column to calculate natural logarithm for.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        natural logarithm of the "given value plus one".
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(log1p(lit(math.e))).first()
+    Row(LOG1P(2.71828...)=1.31326...)
+
+    Same as:
+
+    >>> df.select(log(lit(math.e+1))).first()
+    Row(ln(3.71828...)=1.31326...)
     """
     return _invoke_function_over_columns("log1p", col)
 
@@ -1083,6 +1225,33 @@ def rint(col: "ColumnOrName") -> Column:
     is equal to a mathematical integer.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        the column for computed results.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(rint(lit(10.6))).show()
+    +----------+
+    |rint(10.6)|
+    +----------+
+    |      11.0|
+    +----------+
+
+    >>> df.select(rint(lit(10.3))).show()
+    +----------+
+    |rint(10.3)|
+    +----------+
+    |      10.0|
+    +----------+
     """
     return _invoke_function_over_columns("rint", col)
 
@@ -1102,6 +1271,12 @@ def sec(col: "ColumnOrName") -> Column:
     -------
     :class:`~pyspark.sql.Column`
         Secant of the angle.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(sec(lit(1.5))).first()
+    Row(SEC(1.5)=14.13683...)
     """
     return _invoke_function_over_columns("sec", col)
 
@@ -1111,6 +1286,33 @@ def signum(col: "ColumnOrName") -> Column:
     Computes the signum of the given value.
 
     .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        the column for computed results.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(signum(lit(-5))).show()
+    +----------+
+    |SIGNUM(-5)|
+    +----------+
+    |      -1.0|
+    +----------+
+
+    >>> df.select(signum(lit(6))).show()
+    +---------+
+    |SIGNUM(6)|
+    +---------+
+    |      1.0|
+    +---------+
     """
     return _invoke_function_over_columns("signum", col)
 
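One nuance behind the `log1p` "Same as:" comparison above: the equivalence holds only at that magnitude. For inputs near zero, `LOG1P(x)` is numerically safer than `ln(1 + x)`, which is the usual reason to prefer it. A small illustrative sketch, reusing the assumed local session from the earlier snippet:

    from pyspark.sql.functions import lit, log, log1p

    # For tiny x, computing 1 + x first discards most of x's precision,
    # while log1p(x) keeps it -- the same trade-off as math.log1p vs math.log.
    tiny = 1e-15
    spark.range(1).select(
        log1p(lit(tiny)).alias("log1p_x"),
        log(lit(1.0) + lit(tiny)).alias("log_1_plus_x"),
    ).show(truncate=False)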
@@ -1124,11 +1326,19 @@ def sin(col: "ColumnOrName") -> Column:
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
 
     Returns
     -------
     :class:`~pyspark.sql.Column`
         sine of the angle, as if computed by `java.lang.Math.sin()`
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(sin(lit(math.radians(90)))).first()
+    Row(SIN(1.57079...)=1.0)
     """
     return _invoke_function_over_columns("sin", col)
 
@@ -1142,13 +1352,19 @@ def sinh(col: "ColumnOrName") -> Column:
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
-        hyperbolic angle
+        hyperbolic angle.
 
     Returns
     -------
     :class:`~pyspark.sql.Column`
         hyperbolic sine of the given value,
         as if computed by `java.lang.Math.sinh()`
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(sinh(lit(1.1))).first()
+    Row(SINH(1.1)=1.33564...)
     """
     return _invoke_function_over_columns("sinh", col)
 
@@ -1168,6 +1384,13 @@ def tan(col: "ColumnOrName") -> Column:
     -------
     :class:`~pyspark.sql.Column`
         tangent of the given value, as if computed by `java.lang.Math.tan()`
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(tan(lit(math.radians(45)))).first()
+    Row(TAN(0.78539...)=0.99999...)
     """
     return _invoke_function_over_columns("tan", col)
 
@@ -1188,6 +1411,13 @@ def tanh(col: "ColumnOrName") -> Column:
     :class:`~pyspark.sql.Column`
         hyperbolic tangent of the given value
         as if computed by `java.lang.Math.tanh()`
+
+    Examples
+    --------
+    >>> import math
+    >>> df = spark.range(1)
+    >>> df.select(tanh(lit(math.radians(90)))).first()
+    Row(TANH(1.57079...)=0.91715...)
     """
     return _invoke_function_over_columns("tanh", col)
 
@@ -1232,6 +1462,32 @@ def bitwise_not(col: "ColumnOrName") -> Column:
     Computes bitwise not.
 
     .. versionadded:: 3.2.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        the column for computed results.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(bitwise_not(lit(0))).show()
+    +---+
+    | ~0|
+    +---+
+    | -1|
+    +---+
+    >>> df.select(bitwise_not(lit(1))).show()
+    +---+
+    | ~1|
+    +---+
+    | -2|
+    +---+
     """
     return _invoke_function_over_columns("bitwise_not", col)
 
@@ -1242,6 +1498,31 @@ def asc_nulls_first(col: "ColumnOrName") -> Column:
     column name, and null values return before non-null values.
 
     .. versionadded:: 2.4.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to sort by in the ascending order.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        the column specifying the order.
+
+    Examples
+    --------
+    >>> df1 = spark.createDataFrame([(1, "Bob"),
+    ...                              (0, None),
+    ...                              (2, "Alice")], ["age", "name"])
+    >>> df1.sort(asc_nulls_first(df1.name)).show()
+    +---+-----+
+    |age| name|
+    +---+-----+
+    |  0| null|
+    |  2|Alice|
+    |  1|  Bob|
+    +---+-----+
+
     """
     return (
         col.asc_nulls_first()
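For the null-ordering examples, it may help to note that Spark's default ascending sort already places nulls first, so `asc_nulls_first` chiefly makes that intent explicit, while `asc_nulls_last` actually changes the ordering. A quick comparison, with the same assumed session and data mirroring the doctest above:

    from pyspark.sql.functions import asc, asc_nulls_last

    df1 = spark.createDataFrame([(1, "Bob"), (0, None), (2, "Alice")], ["age", "name"])
    df1.sort(asc(df1.name)).show()             # nulls first (default for ascending)
    df1.sort(asc_nulls_last(df1.name)).show()  # nulls pushed to the end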
(2, "Alice")], ["age", "name"]) + >>> df1.sort(asc_nulls_last(df1.name)).show() + +---+-----+ + |age| name| + +---+-----+ + | 2|Alice| + | 1| Bob| + | 0| null| + +---+-----+ + """ return ( col.asc_nulls_last() if isinstance(col, Column) else _invoke_function("asc_nulls_last", col) @@ -1268,6 +1574,31 @@ def desc_nulls_first(col: "ColumnOrName") -> Column: column name, and null values appear before non-null values. .. versionadded:: 2.4.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to sort by in the descending order. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column specifying the order. + + Examples + -------- + >>> df1 = spark.createDataFrame([(0, None), + ... (1, "Bob"), + ... (2, "Alice")], ["age", "name"]) + >>> df1.sort(desc_nulls_first(df1.name)).show() + +---+-----+ + |age| name| + +---+-----+ + | 0| null| + | 1| Bob| + | 2|Alice| + +---+-----+ + """ return ( col.desc_nulls_first() @@ -1282,6 +1613,31 @@ def desc_nulls_last(col: "ColumnOrName") -> Column: column name, and null values appear after non-null values. .. versionadded:: 2.4.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to sort by in the descending order. + + Returns + ------- + :class:`~pyspark.sql.Column` + the column specifying the order. + + Examples + -------- + >>> df1 = spark.createDataFrame([(0, None), + ... (1, "Bob"), + ... (2, "Alice")], ["age", "name"]) + >>> df1.sort(desc_nulls_last(df1.name)).show() + +---+-----+ + |age| name| + +---+-----+ + | 1| Bob| + | 2|Alice| + | 0| null| + +---+-----+ + """ return ( col.desc_nulls_last() @@ -1295,6 +1651,22 @@ def stddev(col: "ColumnOrName") -> Column: Aggregate function: alias for stddev_samp. .. versionadded:: 1.6.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + standard deviation of given column. + + Examples + -------- + >>> df = spark.range(6) + >>> df.select(stddev(df.id)).first() + Row(stddev_samp(id)=1.87082...) """ return _invoke_function_over_columns("stddev", col) @@ -1305,6 +1677,22 @@ def stddev_samp(col: "ColumnOrName") -> Column: the expression in a group. .. versionadded:: 1.6.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + standard deviation of given column. + + Examples + -------- + >>> df = spark.range(6) + >>> df.select(stddev_samp(df.id)).first() + Row(stddev_samp(id)=1.87082...) """ return _invoke_function_over_columns("stddev_samp", col) @@ -1315,6 +1703,22 @@ def stddev_pop(col: "ColumnOrName") -> Column: the expression in a group. .. versionadded:: 1.6.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + standard deviation of given column. + + Examples + -------- + >>> df = spark.range(6) + >>> df.select(stddev_pop(df.id)).first() + Row(stddev_pop(id)=1.70782...) """ return _invoke_function_over_columns("stddev_pop", col) @@ -1324,6 +1728,26 @@ def variance(col: "ColumnOrName") -> Column: Aggregate function: alias for var_samp .. versionadded:: 1.6.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + target column to compute on. + + Returns + ------- + :class:`~pyspark.sql.Column` + variance of given column. 
@@ -1324,6 +1728,26 @@ def variance(col: "ColumnOrName") -> Column:
     Aggregate function: alias for var_samp
 
     .. versionadded:: 1.6.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        variance of given column.
+
+    Examples
+    --------
+    >>> df = spark.range(6)
+    >>> df.select(variance(df.id)).show()
+    +------------+
+    |var_samp(id)|
+    +------------+
+    |         3.5|
+    +------------+
     """
     return _invoke_function_over_columns("variance", col)
 
@@ -1334,6 +1758,26 @@ def var_samp(col: "ColumnOrName") -> Column:
     the values in a group.
 
     .. versionadded:: 1.6.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        variance of given column.
+
+    Examples
+    --------
+    >>> df = spark.range(6)
+    >>> df.select(var_samp(df.id)).show()
+    +------------+
+    |var_samp(id)|
+    +------------+
+    |         3.5|
+    +------------+
     """
     return _invoke_function_over_columns("var_samp", col)
 
@@ -1343,6 +1787,22 @@ def var_pop(col: "ColumnOrName") -> Column:
     Aggregate function: returns the population variance of the values in a group.
 
     .. versionadded:: 1.6.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        variance of given column.
+
+    Examples
+    --------
+    >>> df = spark.range(6)
+    >>> df.select(var_pop(df.id)).first()
+    Row(var_pop(id)=2.91666...)
     """
     return _invoke_function_over_columns("var_pop", col)
 
@@ -1352,6 +1812,22 @@ def skewness(col: "ColumnOrName") -> Column:
     Aggregate function: returns the skewness of the values in a group.
 
     .. versionadded:: 1.6.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        skewness of given column.
+
+    Examples
+    --------
+    >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])
+    >>> df.select(skewness(df.c)).first()
+    Row(skewness(c)=0.70710...)
     """
     return _invoke_function_over_columns("skewness", col)
 
@@ -1361,6 +1837,26 @@ def kurtosis(col: "ColumnOrName") -> Column:
     Aggregate function: returns the kurtosis of the values in a group.
 
     .. versionadded:: 1.6.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        kurtosis of given column.
+
+    Examples
+    --------
+    >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])
+    >>> df.select(kurtosis(df.c)).show()
+    +-----------+
+    |kurtosis(c)|
+    +-----------+
+    |       -1.5|
+    +-----------+
     """
     return _invoke_function_over_columns("kurtosis", col)
 
@@ -1376,6 +1872,16 @@ def collect_list(col: "ColumnOrName") -> Column:
     The function is non-deterministic because the order of collected results depends
     on the order of the rows which may be non-deterministic after a shuffle.
 
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        list of objects with duplicates.
+
     Examples
     --------
     >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
 
@@ -1396,6 +1902,16 @@ def collect_set(col: "ColumnOrName") -> Column:
     The function is non-deterministic because the order of collected results depends
     on the order of the rows which may be non-deterministic after a shuffle.
 
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target column to compute on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        list of objects with no duplicates.
+
     Examples
     --------
     >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
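Since both collectors warn that result order is non-deterministic, one way to keep examples stable is to sort the collected array. A sketch of that pattern, which also shows the duplicates-versus-distinct difference between the two functions (assumed session as above; `array_sort` has been available since Spark 2.4):

    from pyspark.sql.functions import array_sort, collect_list, collect_set

    df2 = spark.createDataFrame([(2,), (5,), (5,)], ("age",))
    df2.select(
        array_sort(collect_list(df2.age)).alias("with_dups"),  # [2, 5, 5]
        array_sort(collect_set(df2.age)).alias("distinct"),    # [2, 5]
    ).show()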