diff --git a/velox/docs/functions/presto/math.rst b/velox/docs/functions/presto/math.rst index b61a623aca6..91e3cc7e6a2 100644 --- a/velox/docs/functions/presto/math.rst +++ b/velox/docs/functions/presto/math.rst @@ -383,6 +383,10 @@ Probability Functions: cdf Compute the Poisson cdf with given lambda (mean) parameter: P(N <= value; lambda). The lambda parameter must be a positive real number (of type DOUBLE) and value must be a non-negative integer. +.. function:: t_cdf(df, value) -> double + + Compute the Student's t cdf with given degrees of freedom: P(N < value; df). + The degrees of freedom must be a positive real number and value must be a real value. .. function:: weibull_cdf(a, b, value) -> double @@ -455,6 +459,12 @@ Probability Functions: inverse_cdf probability (p): P(N < n). The df parameter must be positive real values. The probability p must lie on the interval [0, 1]. +.. function:: inverse_t_cdf(df, p) -> double + + Compute the inverse of the Student's t cdf with given degrees of freedom for the cumulative + probability (p): P(N < n). The degrees of freedom must be a positive real value. + The probability p must lie on the interval [0, 1]. + ==================================== Statistical Functions ==================================== diff --git a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp index 377b880e9bf..9fd31de64d8 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp @@ -261,6 +261,9 @@ std::unordered_set skipFunctions = { }; std::unordered_set skipFunctionsSOT = { + "t_cdf", // New function, not yet widely deployed in Presto instances + "inverse_t_cdf", // New function, not yet widely deployed in Presto + // instances "array_subset", // Velox-only function, not available in Presto "remap_keys", // Velox-only function, not available in Presto "noisy_empty_approx_set_sfm", // non-deterministic because of privacy. diff --git a/velox/functions/prestosql/Probability.h b/velox/functions/prestosql/Probability.h index db42a8da3e6..8e9c0957f3e 100644 --- a/velox/functions/prestosql/Probability.h +++ b/velox/functions/prestosql/Probability.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "boost/math/distributions/beta.hpp" #include "boost/math/distributions/binomial.hpp" @@ -468,5 +469,40 @@ struct InverseChiSquaredCdf { } }; +template +struct TCDFFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call(double& result, double df, double value) { + VELOX_USER_CHECK_GT(df, 0, "df must be greater than 0"); + + boost::math::students_t_distribution<> dist(df); + result = boost::math::cdf(dist, value); + } +}; + +template +struct InverseTCDFFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call(double& result, double df, double p) { + static constexpr double kInf = std::numeric_limits::infinity(); + + VELOX_USER_CHECK_GT(df, 0, "df must be greater than 0"); + VELOX_USER_CHECK( + p >= 0 && p <= 1 && p != kInf, "p must be in the interval [0, 1]"); + + if (p == 0.0) { + result = -std::numeric_limits::infinity(); + return; + } else if (p == 1.0) { + result = std::numeric_limits::infinity(); + return; + } + boost::math::students_t_distribution<> dist(df); + result = boost::math::quantile(dist, p); + } +}; + } // namespace } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/registration/ProbabilityTrigonometricFunctionsRegistration.cpp b/velox/functions/prestosql/registration/ProbabilityTrigonometricFunctionsRegistration.cpp index a2687b55814..818a3d9145b 100644 --- a/velox/functions/prestosql/registration/ProbabilityTrigonometricFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/ProbabilityTrigonometricFunctionsRegistration.cpp @@ -93,6 +93,9 @@ void registerProbTrigFunctions(const std::string& prefix) { {prefix + "inverse_f_cdf"}); registerFunction( {prefix + "inverse_chi_squared_cdf"}); + registerFunction({prefix + "t_cdf"}); + registerFunction( + {prefix + "inverse_t_cdf"}); } } // namespace diff --git a/velox/functions/prestosql/tests/ProbabilityTest.cpp b/velox/functions/prestosql/tests/ProbabilityTest.cpp index ce4974aeb1d..71d918630c2 100644 --- a/velox/functions/prestosql/tests/ProbabilityTest.cpp +++ b/velox/functions/prestosql/tests/ProbabilityTest.cpp @@ -853,5 +853,64 @@ TEST_F(ProbabilityTest, inverseChiSquaredCdf) { "inverseChiSquaredCdf Function: p must be in the interval [0, 1]"); } +TEST_F(ProbabilityTest, tCDF) { + const auto tCDF = [&](std::optional df, std::optional value) { + return evaluateOnce("t_cdf(c0, c1)", df, value); + }; + + EXPECT_EQ(0.5, tCDF(1000, 0.0)); + EXPECT_EQ(1.0, tCDF(1000, kInf)); + EXPECT_EQ(0.0, tCDF(1000, -kInf)); + + EXPECT_EQ(std::nullopt, tCDF(std::nullopt, 0.5)); + EXPECT_EQ(std::nullopt, tCDF(1000, std::nullopt)); + + VELOX_ASSERT_THROW(tCDF(0, 0.5), "df must be greater than 0"); + VELOX_ASSERT_THROW(tCDF(-1, 0.5), "df must be greater than 0"); + + EXPECT_NEAR(0.95, tCDF(5, 2.015).value(), 0.001); + EXPECT_NEAR(0.05, tCDF(5, -2.015).value(), 0.001); + + EXPECT_NEAR(0.975, tCDF(10, 2.228).value(), 0.001); + EXPECT_NEAR(0.025, tCDF(10, -2.228).value(), 0.001); + + EXPECT_NEAR(0.99, tCDF(20, 2.528).value(), 0.001); + + EXPECT_NEAR(0.90, tCDF(30, 1.310).value(), 0.001); + + EXPECT_NEAR(0.95, tCDF(100, 1.660).value(), 0.001); +} + +TEST_F(ProbabilityTest, inverseTCDF) { + const auto inverseTCDF = [&](std::optional df, + std::optional p) { + return evaluateOnce("inverse_t_cdf(c0, c1)", df, p); + }; + + EXPECT_EQ(0.0, inverseTCDF(1000, 0.5)); + EXPECT_EQ(-kInf, inverseTCDF(1000, 0.0)); + EXPECT_EQ(kInf, inverseTCDF(1000, 1.0)); + + EXPECT_EQ(std::nullopt, inverseTCDF(std::nullopt, 0.5)); + EXPECT_EQ(std::nullopt, inverseTCDF(1000, std::nullopt)); + + VELOX_ASSERT_THROW(inverseTCDF(0, 0.5), "df must be greater than 0"); + VELOX_ASSERT_THROW(inverseTCDF(-1, 0.5), "df must be greater than 0"); + VELOX_ASSERT_THROW(inverseTCDF(3, -0.1), "p must be in the interval [0, 1]"); + VELOX_ASSERT_THROW(inverseTCDF(3, 1.1), "p must be in the interval [0, 1]"); + + EXPECT_NEAR(2.015, inverseTCDF(5, 0.95).value(), 0.001); + EXPECT_NEAR(-2.015, inverseTCDF(5, 0.05).value(), 0.001); + + EXPECT_NEAR(2.228, inverseTCDF(10, 0.975).value(), 0.001); + EXPECT_NEAR(-2.228, inverseTCDF(10, 0.025).value(), 0.001); + + EXPECT_NEAR(2.528, inverseTCDF(20, 0.99).value(), 0.001); + + EXPECT_NEAR(1.310, inverseTCDF(30, 0.90).value(), 0.001); + + EXPECT_NEAR(1.660, inverseTCDF(100, 0.95).value(), 0.001); +} + } // namespace } // namespace facebook::velox