From 945b4322308316517cebaa4e42901d3ccc38a628 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Wed, 19 Aug 2020 09:15:49 -0400 Subject: [PATCH 01/14] Kolmogorov-Smirnov distribution #421 Add a new distribution, kolmogorov_smirnov_distribution, which takes a parameter that represents the number of observations used in a Kolmogorov-Smirnov test. (The K-S test is a popular test for comparing two CDFs, but the test statistic is not implemented here.) This implementation includes Kolmogorov's original 1st order Taylor expansion. There is a literature on the distribution's other mathematical properties (higher order terms and exact version); this literature is summarized in the main header file for anyone who may want to expand the implementation later. The CDF is implemented using a Jacobi theta function, and the PDF is a hand-rolled derivative of that function. Quantiles plug the CDF and PDF into a Newton-Raphson iteration. The mean and variance have nice closed-form expressions, and the mode uses a dumb run-time maximizer. This commit includes graphs, a ULP plotter for the PDF, and the usual compilation and numerical tests. The test file is on the small side, but it integrates the distribution from zero to infinity, and covers the quantiles pretty well. As of now the numerical tests only verify self-consistency (e.g. distribution moments and CDF-quantile relations), so there's room to add some external checks. I will add user-facing documentation after the API is approved and the implementation is finalized. --- doc/graphs/dist_graphs.cpp | 17 +- doc/graphs/kolmogorov_smirnov_pdf.svg | 70 +++ include/boost/math/distributions.hpp | 1 + include/boost/math/distributions/fwd.hpp | 4 + .../math/distributions/kolmogorov_smirnov.hpp | 451 ++++++++++++++++++ .../accuracy/plot_kolmogorov_smirnov_pdf.cpp | 34 ++ reporting/performance/test_distributions.cpp | 10 + test/Jamfile.v2 | 1 + .../dist_kolmogorov_smirnov_incl_test.cpp | 25 + .../distribution_concept_check.cpp | 1 + test/compile_test/instantiate.hpp | 2 + test/test_kolmogorov_smirnov.cpp | 87 ++++ 12 files changed, 699 insertions(+), 4 deletions(-) create mode 100644 doc/graphs/kolmogorov_smirnov_pdf.svg create mode 100644 include/boost/math/distributions/kolmogorov_smirnov.hpp create mode 100644 reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp create mode 100644 test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp create mode 100644 test/test_kolmogorov_smirnov.cpp diff --git a/doc/graphs/dist_graphs.cpp b/doc/graphs/dist_graphs.cpp index bcf0bb23af..e324224198 100644 --- a/doc/graphs/dist_graphs.cpp +++ b/doc/graphs/dist_graphs.cpp @@ -84,8 +84,9 @@ class distribution_plotter m_distributions.push_back(std::make_pair(name, d)); // // Get the extent of the distribution from the support: - double a, b; - std::tr1::tie(a, b) = support(d); + std::pair r = support(d); + double a = r.first; + double b = r.second; // // PDF maximum is at the mode (probably): double mod; @@ -253,7 +254,7 @@ class distribution_plotter if(!is_discrete_distribution::value) { // Continuous distribution: - for(std::list >::const_iterator i = m_distributions.begin(); + for(typename std::list >::const_iterator i = m_distributions.begin(); i != m_distributions.end(); ++i) { double x = m_min_x; @@ -280,7 +281,7 @@ class distribution_plotter // Discrete distribution: double x_width = 0.75 / m_distributions.size(); double x_off = -0.5 * 0.75; - for(std::list >::const_iterator i = m_distributions.begin(); + for(typename std::list >::const_iterator i = m_distributions.begin(); i != m_distributions.end(); ++i) { double x = ceil(m_min_x); @@ -469,6 +470,14 @@ int main() fisher_f_plotter.add(boost::math::fisher_f_distribution<>(4, 10), "n=4, m=10"); fisher_f_plotter.plot("F Distribution PDF", "fisher_f_pdf.svg"); + distribution_plotter > + kolmogorov_smirnov_plotter; + kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(1), "n=1"); + kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(2), "n=2"); + kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(5), "n=5"); + kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(10), "n=10"); + kolmogorov_smirnov_plotter.plot("Kolmogorov-Smirnov Distribution PDF", "kolmogorov_smirnov_pdf.svg"); + distribution_plotter > lognormal_plotter; lognormal_plotter.add(boost::math::lognormal_distribution<>(-1), "location=-1"); diff --git a/doc/graphs/kolmogorov_smirnov_pdf.svg b/doc/graphs/kolmogorov_smirnov_pdf.svg new file mode 100644 index 0000000000..46dcb98aa7 --- /dev/null +++ b/doc/graphs/kolmogorov_smirnov_pdf.svg @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +0 +0.5 +1 +1.5 +0 + +0 +1 +2 +3 +4 +5 +0 + +Probability + +Random Variable + + + + + + + + + +n=1 +n=2 +n=5 +n=10 + +Kolmogorov-Smirnov Distribution PDF + + + diff --git a/include/boost/math/distributions.hpp b/include/boost/math/distributions.hpp index 300f2312ce..64da99415e 100644 --- a/include/boost/math/distributions.hpp +++ b/include/boost/math/distributions.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/include/boost/math/distributions/fwd.hpp b/include/boost/math/distributions/fwd.hpp index 5b212c8ec6..a3c1a41df5 100644 --- a/include/boost/math/distributions/fwd.hpp +++ b/include/boost/math/distributions/fwd.hpp @@ -63,6 +63,9 @@ class inverse_gamma_distribution; template class inverse_gaussian_distribution; +template +class kolmogorov_smirnov_distribution; + template class laplace_distribution; @@ -129,6 +132,7 @@ class weibull_distribution; typedef boost::math::gamma_distribution gamma;\ typedef boost::math::geometric_distribution geometric;\ typedef boost::math::hypergeometric_distribution hypergeometric;\ + typedef boost::math::kolmogorov_smirnov_distribution kolmogorov_smirnov;\ typedef boost::math::inverse_chi_squared_distribution inverse_chi_squared;\ typedef boost::math::inverse_gaussian_distribution inverse_gaussian;\ typedef boost::math::inverse_gamma_distribution inverse_gamma;\ diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp new file mode 100644 index 0000000000..3f1b09f237 --- /dev/null +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -0,0 +1,451 @@ +// Kolmogorov-Smirnov 1st order asymptotic distribution +// Copyright Evan Miller 2020 +// +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// The Kolmogorov-Smirnov test in statistics compares two empirical distributions, +// or an empirical distribution against any theoretical distribution. It makes +// use of a specific distribution which doesn't have a formal name, but which +// is often called the Kolmogorv-Smirnov distribution for lack of anything +// better. This file implements the limiting form of this distribution, first +// identified by Andrey Kolmogorov in +// +// Kolmogorov, A. (1933) “Sulla Determinazione Empirica di una Legge di +// Distribuzione.” Giornale dell’ Istituto Italiano degli Attuari +// +// This limiting form of the CDF is a first-order Taylor expansion that is +// easily implemented by the fourth Jacobi Theta function (setting z=0). The +// PDF is then implemented here as a derivative of the Theta function. Note +// that this derivative is with respect to x, which enters into \tau, and not +// with respect to the z argument, which is always zero, and so the derivative +// identities in DLMF 20.4 do not apply here. +// +// A higher order order expansion is possible, and was first outlined by +// +// Pelz W, Good IJ (1976). “Approximating the Lower Tail-Areas of the +// Kolmogorov-Smirnov One-sample Statistic.” Journal of the Royal Statistical +// Society B. +// +// The terms in this expansion get fairly complicated, and as far as I know the +// Pelz-Good expansion is not used in any statistics software. Someone could +// consider updating this implementation to use the Pelz-Good expansion in the +// future, but the math gets considerably hairier with each additional term. +// +// A formula for an exact version of the Kolmogorov-Smirnov test is laid out in +// Equation 2.4.4 of +// +// Durbin J (1973). “Distribution Theory for Tests Based on the Sample +// Distribution Func- tion.” In SIAM CBMS-NSF Regional Conference Series in +// Applied Mathematics. SIAM, Philadelphia, PA. +// +// which is available in book form from Amazon and others. This exact version +// involves taking powers of large matrices. To do that right you need to +// compute eigenvalues and eigenvectors, which are beyond the scope of Boost. +// (Some recent work indicates the exact form can also be computed via FFT, see +// https://cran.r-project.org/web/packages/KSgeneral/KSgeneral.pdf). +// +// Even if the CDF of the exact distribution could be computed using Boost +// libraries (which would be cumbersome), the PDF would present another +// difficulty. Therefore I am limiting this implementation to the asymptotic +// form, even though the exact form has trivial values for certain specific +// values of x and n. For more on trivial values see +// +// Ruben H, Gambino J (1982). “The Exact Distribution of Kolmogorov’s Statistic +// Dn for n ≤ 10.” Annals of the Institute of Statistical Mathematics. +// +// For a good bibliography and overview of the various algorithms, including +// both exact and asymptotic forms, see +// https://www.jstatsoft.org/article/view/v039i11 +// +// As for this implementation: the distribution is parameterized by n (number +// of observations) in the spirit of chi-squared's degrees of freedom. It then +// takes a single argument x. In terms of the Kolmogorov-Smirnov statistical +// test, x represents the distribution of D_n, where D_n is the maximum +// difference between the CDFs being compared, that is, +// +// D_n = sup|F_n(x) - G(x)| +// +// In the exact distribution, x is confined to the support [0, 1], but in this +// limiting approximation, we allow x to exceed unity (similar to how a normal +// approximation always spills over any boundaries). +// +// As mentioned previously, the CDF is implemented using the \tau +// parameterization of the fourth Jacobi Theta function as +// +// CDF=θ₄(0|2*x*x*n/pi) +// +// The PDF is a hand-coded derivative of that function. Actually, there are two +// (independent) derivatives, as separate code paths are used for "small x" +// (2*x*x*n < pi) and "large x", mirroring the separate code paths in the +// Jacobi Theta implementation to achieve fast convergence. Quantiles are +// computed using a Newton-Raphson iteration from an initial guess that I +// arrived at by trial and error. +// +// The mean and variance are implemented using simple closed-form expressions. +// The mode is calculated at run-time by maximizing the PDF. If you have an +// analytical solution for the mode, feel free to plop it in. +// +// The CDF and PDF could almost certainly be re-implemented and sped up using a +// polynomial or rational approximation, since the only meaningful argument is +// x * sqrt(n). But that is left as an exercise for the next maintainer. +// +// In the future, the Pelz-Good approximation could be added. I suggest adding +// a second parameter representing the order, e.g. +// +// kolmogorov_smirnov_dist<>(100) // N=100, order=1 +// kolmogorov_smirnov_dist<>(100, 1) // N=100, order=1, i.e. Kolmogorov's formula +// kolmogorov_smirnov_dist<>(100, 4) // N=100, order=4, i.e. Pelz-Good formula +// +// The exact distribution could be added to the API with a special order +// parameter (e.g. 0 or INT_MAX), or a separate distribution type altogether +// (e.g. kolmogorov_smirnov_exact_dist). +// +#ifndef BOOST_STATS_KOLMOGOROV_ASYMPTOTIC_HPP +#define BOOST_STATS_KOLMOGOROV_ASYMPTOTIC_HPP + +#include +#include +#include +#include +#include +#include // Newton-Raphson +#include // For the mode + +// #include + +namespace boost { namespace math { + +namespace detail { +template +inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { + RealType k = 0.0; + // Choose a starting point for the Newton-Raphson iteration + if (p > 0.9) { + k = 1.8 - 5 * (1 - p); + } else if (p < 0.3) { + k = p + 0.45; + } else { + k = p + 0.3; + } + return k; +} + +// d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI)) +template +RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { + RealType value = 0.0, delta = 0.0, last_delta = 0.0; + RealType eps = policies::get_epsilon(); + int i = 0; + RealType pi2 = constants::pi_sqr(); + RealType x2n = x*x*n; + if (x2n*x2n == 0.0) { + return static_cast(0); + } + while (1) { + delta = exp(-0.5*(i+0.5)*(i+0.5)*pi2/x2n) * ((i+0.5)*(i+0.5)*pi2 - x2n); + + if (delta == 0.0) + break; + + if (last_delta != 0.0 && fabs(delta/last_delta) < eps) + break; + + value += delta + delta; + last_delta = delta; + i++; + } + + return value * sqrt(n) * constants::root_half_pi() / (x2n*x2n); +} + +// d/dx (theta4(0, 2*x*x*n/M_PI)) +template +inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Policy&) { + BOOST_MATH_STD_USING + RealType value = 0.0, delta = 0.0, last_delta = 0.0; + RealType eps = policies::get_epsilon(); + int i = 1; + while (1) { + delta = 8*x*i*i*exp(-2.0*i*i*x*x*n); + + if (delta == 0.0) + break; + + if (last_delta != 0.0 && fabs(delta / last_delta) < eps) + break; + + if (i%2 == 0) + delta *= -1.0; + + value += delta; + last_delta = delta; + i++; + } + + return value * n; +} + +}; // detail + +template > + class kolmogorov_smirnov_distribution +{ + public: + typedef RealType value_type; + typedef Policy policy_type; + + // Constructor + kolmogorov_smirnov_distribution( RealType n ) : n_obs(n) + { + RealType result; + detail::check_df( + "boost::math::kolmogorov_smirnov_distribution<%1%>::kolmogorov_smirnov_distribution", n_obs, &result, Policy()); + } + + RealType number_of_observations()const + { + return n_obs; + } + + private: + + RealType n_obs; // positive integer +}; + +typedef kolmogorov_smirnov_distribution kolmogorov_k; // Convenience typedef for double version. + +namespace detail { +template +struct kolmogorov_smirnov_quantile_functor +{ + kolmogorov_smirnov_quantile_functor(const boost::math::kolmogorov_smirnov_distribution dist, RealType const& p) + : distribution(dist), prob(p) + { + } + + boost::math::tuple operator()(RealType const& x) + { + RealType fx = cdf(distribution, x) - prob; // Difference cdf - value - to minimize. + RealType dx = pdf(distribution, x); // pdf is 1st derivative. + // return both function evaluation difference f(x) and 1st derivative f'(x). + return boost::math::make_tuple(fx, dx); + } +private: + const boost::math::kolmogorov_smirnov_distribution distribution; + RealType prob; +}; + +template +struct kolmogorov_smirnov_complementary_quantile_functor +{ + kolmogorov_smirnov_complementary_quantile_functor(const boost::math::kolmogorov_smirnov_distribution dist, RealType const& p) + : distribution(dist), prob(p) + { + } + + boost::math::tuple operator()(RealType const& x) + { + RealType fx = cdf(complement(distribution, x)) - prob; // Difference cdf - value - to minimize. + RealType dx = -pdf(distribution, x); // pdf is the negative of the derivative of (1-CDF) + // return both function evaluation difference f(x) and 1st derivative f'(x). + return boost::math::make_tuple(fx, dx); + } +private: + const boost::math::kolmogorov_smirnov_distribution distribution; + RealType prob; +}; + +template +struct kolmogorov_smirnov_negative_pdf_functor +{ + RealType operator()(RealType const& x) { + if (2*x*x < constants::pi()) { + return -kolmogorov_smirnov_pdf_small_x(x, static_cast(1), Policy()); + } + return -kolmogorov_smirnov_pdf_large_x(x, static_cast(1), Policy()); + } +}; +} // namespace detail + +template +inline const std::pair range(const kolmogorov_smirnov_distribution& /*dist*/) +{ // Range of permissible values for random variable x. + using boost::math::tools::max_value; + return std::pair(static_cast(0), max_value()); +} + +template +inline const std::pair support(const kolmogorov_smirnov_distribution& /*dist*/) +{ // Range of supported values for random variable x. + // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. + // In the exact distribution, the upper limit would be 1. + using boost::math::tools::max_value; + return std::pair(static_cast(0), max_value()); +} + +template +inline RealType pdf(const kolmogorov_smirnov_distribution& dist, const RealType& x) +{ + BOOST_FPU_EXCEPTION_GUARD + BOOST_MATH_STD_USING // for ADL of std functions. + + RealType n = dist.number_of_observations(); + RealType error_result; + static const char* function = "boost::math::pdf(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + if(false == detail::check_x_not_NaN(function, x, &error_result, Policy())) + return error_result; + + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + if (x < 0 || !(boost::math::isfinite)(x)) + { + return policies::raise_domain_error( + function, "Kolmogorov-Smirnov parameter was %1%, but must be > 0 !", x, Policy()); + } + + if (2*x*x*n < constants::pi()) { + return detail::kolmogorov_smirnov_pdf_small_x(x, n, Policy()); + } + + return detail::kolmogorov_smirnov_pdf_large_x(x, n, Policy()); +} // pdf + +template +inline RealType cdf(const kolmogorov_smirnov_distribution& dist, const RealType& x) +{ + BOOST_MATH_STD_USING // for ADL of std function exp. + static const char* function = "boost::math::cdf(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + RealType error_result; + RealType n = dist.number_of_observations(); + if(false == detail::check_x_not_NaN(function, x, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + if((x < 0) || !(boost::math::isfinite(x))) { + return policies::raise_domain_error( + function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy()); + } + + if (x*x*n == 0) + return 0; + + return jacobi_theta4tau(RealType(0), 2*x*x*n/constants::pi(), Policy()); +} // cdf + +template +inline RealType cdf(const complemented2_type, RealType>& c) { + BOOST_MATH_STD_USING // for ADL of std function exp. + RealType x = c.param; + static const char* function = "boost::math::cdf(const complemented2_type&, %1%>)"; + RealType error_result; + kolmogorov_smirnov_distribution const& dist = c.dist; + RealType n = dist.number_of_observations(); + + if(false == detail::check_x_not_NaN(function, x, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + if((x < 0) || !(boost::math::isfinite(x))) + return policies::raise_domain_error( + function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy()); + + if (x*x*n == 0) + return 1; + + if (2*x*x*n > constants::pi()) + return -jacobi_theta4m1tau(RealType(0), 2*x*x*n/constants::pi(), Policy()); + + return RealType(1) - jacobi_theta4tau(RealType(0), 2*x*x*n/constants::pi(), Policy()); +} // cdf (complemented) + +template +inline RealType quantile(const kolmogorov_smirnov_distribution& dist, const RealType& p) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + // Error check: + RealType error_result = 0; + RealType n = dist.number_of_observations(); + if(false == detail::check_probability(function, p, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + RealType k = detail::kolmogorov_smirnov_quantile_guess(p) / sqrt(n); + const int get_digits = policies::digits();// get digits from policy, + boost::uintmax_t m = policies::get_max_root_iterations(); // and max iterations. + + return tools::newton_raphson_iterate(detail::kolmogorov_smirnov_quantile_functor(dist, p), + k, RealType(0), boost::math::tools::max_value(), get_digits, m); +} // quantile + +template +inline RealType quantile(const complemented2_type, RealType>& c) { + static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + kolmogorov_smirnov_distribution const& dist = c.dist; + RealType n = dist.number_of_observations(); + // Error check: + RealType error_result = 0; + RealType p = c.param; + + if(false == detail::check_probability(function, p, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + RealType k = detail::kolmogorov_smirnov_quantile_guess(1-p) / sqrt(n); + + const int get_digits = policies::digits();// get digits from policy, + boost::uintmax_t m = policies::get_max_root_iterations(); // and max iterations. + + return tools::newton_raphson_iterate( + detail::kolmogorov_smirnov_complementary_quantile_functor(dist, p), + k, RealType(0), boost::math::tools::max_value(), get_digits, m); +} // quantile (complemented) + +template +inline RealType mode(const kolmogorov_smirnov_distribution& dist) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::mode(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result = 0; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + std::pair r = boost::math::tools::brent_find_minima( + detail::kolmogorov_smirnov_negative_pdf_functor(), + static_cast(0), static_cast(1), policies::digits()); + return r.first / sqrt(n); +} + +// https://www.jstatsoft.org/article/view/v008i18 Section 3 +template +inline RealType mean(const kolmogorov_smirnov_distribution& dist) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::mean(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result = 0; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + return constants::root_half_pi() * constants::ln_two() / sqrt(n); +} + +template +inline RealType variance(const kolmogorov_smirnov_distribution& dist) +{ + static const char* function = "boost::math::variance(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result = 0; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + return 0.5 * (constants::pi_sqr_div_six() + - constants::pi() * constants::ln_two() * constants::ln_two()) / n; +} +}} +#endif diff --git a/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp b/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp new file mode 100644 index 0000000000..1c7fc419ab --- /dev/null +++ b/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp @@ -0,0 +1,34 @@ + +#include +#include +#include +#include + +using boost::math::tools::ulps_plot; + +int main() { + using PreciseReal = long double; + using CoarseReal = float; + + boost::math::kolmogorov_smirnov_distribution dist_coarse(10); + auto pdf_coarse = [&, dist_coarse](CoarseReal x) { + return boost::math::pdf(dist_coarse, x); + }; + boost::math::kolmogorov_smirnov_distribution dist_precise(10); + auto pdf_precise = [&, dist_precise](PreciseReal x) { + return boost::math::pdf(dist_precise, x); + }; + + int samples = 2500; + int width = 800; + PreciseReal clip = 100; + + std::string filename1 = "kolmogorov_smirnov_pdf_" + boost::core::demangle(typeid(CoarseReal).name()) + ".svg"; + auto plot1 = ulps_plot(pdf_precise, 0.0, 1.0, samples); + plot1.clip(clip).width(width); + std::string title1 = "Kolmogorov-Smirnov PDF (N=10) ULP plot at " + boost::core::demangle(typeid(CoarseReal).name()) + " precision"; + plot1.title(title1); + plot1.vertical_lines(10); + plot1.add_fn(pdf_coarse); + plot1.write(filename1); +} diff --git a/reporting/performance/test_distributions.cpp b/reporting/performance/test_distributions.cpp index ddd72ee96f..2b54d4760e 100644 --- a/reporting/performance/test_distributions.cpp +++ b/reporting/performance/test_distributions.cpp @@ -436,6 +436,16 @@ int main() test_boost_2_param(inverse_gaussian); + distribution_tester kolmogorov("KolmogorovSmirnov"); + kolmogorov.add_test_case(3, one_param_quantile >()); + kolmogorov.add_test_case(20, one_param_quantile >()); + kolmogorov.add_test_case(200, one_param_quantile >()); + kolmogorov.add_test_case(2000, one_param_quantile >()); + kolmogorov.add_test_case(20000, one_param_quantile >()); + kolmogorov.add_test_case(200000, one_param_quantile >()); + + test_boost_1_param(kolmogorov); + distribution_tester laplace("Laplace"); laplace.add_test_case(0, 1, two_param_quantile >()); laplace.add_test_case(20, 20, two_param_quantile >()); diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 9110a0130f..0d93453003 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -630,6 +630,7 @@ test-suite distribution_tests : [ run test_inverse_chi_squared_distribution.cpp ../../test/build//boost_unit_test_framework ] [ run test_inverse_gamma_distribution.cpp ../../test/build//boost_unit_test_framework ] [ run test_inverse_gaussian.cpp ../../test/build//boost_unit_test_framework ] + [ run test_kolmogorov_smirnov.cpp ../../test/build//boost_unit_test_framework ] [ run test_laplace.cpp ../../test/build//boost_unit_test_framework ] [ run test_inv_hyp.cpp pch ../../test/build//boost_unit_test_framework ] [ run test_logistic_dist.cpp ../../test/build//boost_unit_test_framework ] diff --git a/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp b/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp new file mode 100644 index 0000000000..e1bbc95f8a --- /dev/null +++ b/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp @@ -0,0 +1,25 @@ +// Copyright John Maddock 2006. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Basic sanity check that header +// #includes all the files that it needs to. +// +#include +// +// Note this header includes no other headers, this is +// important if this test is to be meaningful: +// +#include "test_compile_result.hpp" + +void compile_and_link_test() +{ + TEST_DIST_FUNC(kolmogorov_smirnov) +} + +template class boost::math::kolmogorov_smirnov_distribution >; +template class boost::math::kolmogorov_smirnov_distribution >; +#ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS +template class boost::math::kolmogorov_smirnov_distribution >; +#endif diff --git a/test/compile_test/distribution_concept_check.cpp b/test/compile_test/distribution_concept_check.cpp index 26b236a55c..fc8d2fab84 100644 --- a/test/compile_test/distribution_concept_check.cpp +++ b/test/compile_test/distribution_concept_check.cpp @@ -33,6 +33,7 @@ void instantiate(RealType) function_requires > >(); function_requires > >(); function_requires > >(); + function_requires > >(); function_requires > >(); function_requires > >(); function_requires > >(); diff --git a/test/compile_test/instantiate.hpp b/test/compile_test/instantiate.hpp index be9dae129c..16bd7ece0c 100644 --- a/test/compile_test/instantiate.hpp +++ b/test/compile_test/instantiate.hpp @@ -76,6 +76,7 @@ void instantiate(RealType) function_requires > >(); function_requires > >(); function_requires > >(); + function_requires > >(); function_requires > >(); function_requires > >(); function_requires > >(); @@ -111,6 +112,7 @@ void instantiate(RealType) function_requires > >(); function_requires > >(); function_requires > >(); + function_requires > >(); function_requires > >(); function_requires > >(); function_requires > >(); diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp new file mode 100644 index 0000000000..5bf912b6eb --- /dev/null +++ b/test/test_kolmogorov_smirnov.cpp @@ -0,0 +1,87 @@ +// Copyright Evan Miller 2020 +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt +// or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#include +#include + +#define BOOST_TEST_MAIN +#include // for test_main +#include // for BOOST_CHECK_CLOSE +#include +#include + +template // Any floating-point type RealType. +void test_spots(RealType T) +{ + using namespace boost::math; + // Test quantiles, CDFs, and complements + RealType eps = tools::epsilon(); + RealType tol = tools::epsilon() * 25; + for (int n=10; n<100; n += 10) { + kolmogorov_smirnov_distribution dist(n); + for (int i=0; i<1000; i++) { + RealType p = 1.0 * (i+1) / 1001; + RealType crit1 = quantile(dist, 1 - p); + RealType crit2 = quantile(complement(dist, p)); + RealType p1 = cdf(dist, crit1); + BOOST_CHECK_CLOSE_FRACTION(crit1, crit2, tol); + BOOST_CHECK_CLOSE_FRACTION(1 - p, p1, tol); + } + + for (int i=0; i<1000; i++) { + RealType x = 1.0 * (i+1) / 1001; + RealType p = cdf(dist, x); + RealType p1 = cdf(complement(dist, x)); + RealType x1; + if (p < 0.5) + x1 = quantile(dist, p); + else + x1 = quantile(complement(dist, p1)); + if (p > tol && p1 > tol) // skip the extreme tails + BOOST_CHECK_CLOSE_FRACTION(x, x1, tol); + } + } + + kolmogorov_smirnov_distribution dist(100); + + // Basics + BOOST_CHECK_THROW(pdf(dist, RealType(-1.0)), std::domain_error); + BOOST_CHECK_THROW(cdf(dist, RealType(-1.0)), std::domain_error); + BOOST_CHECK_THROW(quantile(dist, RealType(-1.0)), std::domain_error); + BOOST_CHECK_THROW(quantile(dist, RealType(2.0)), std::domain_error); + + // Confirm mode is at least a local minimum + RealType mode = boost::math::mode(dist); + + BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode - 100 * tol)); + BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode + 100 * tol)); + + // Test first three moments - pretty well covers the entire distribution + RealType mean = boost::math::mean(dist); + RealType var = variance(dist); + quadrature::exp_sinh integrator; + auto f_one = [&, dist](RealType t) { return pdf(dist, t); }; + auto f_mean = [&, dist](RealType t) { return pdf(dist, t) * t; }; + auto f_var = [&, dist, mean](RealType t) { return pdf(dist, t) * (t - mean) * (t - mean); }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_one, eps), RealType(1), tol); + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_mean, eps), mean, tol); + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_var, eps), var, tol); +} + +BOOST_AUTO_TEST_CASE( test_main ) +{ + BOOST_MATH_CONTROL_FP; + + // (Parameter value, arbitrarily zero, only communicates the floating point type). + test_spots(0.0F); // Test float. + test_spots(0.0); // Test double. +#ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS + test_spots(0.0L); // Test long double. +#if !defined(BOOST_MATH_NO_REAL_CONCEPT_TESTS) + test_spots(boost::math::concepts::real_concept(0.)); // Test real concept. +#endif +#endif +} From de347f169b1187b1933d32260a2b271e76a5a9bd Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Wed, 19 Aug 2020 12:42:37 -0400 Subject: [PATCH 02/14] Fix copyright notices [CI SKIP] --- reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp | 4 ++++ test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp b/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp index 1c7fc419ab..04261a03ee 100644 --- a/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp +++ b/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp @@ -1,3 +1,7 @@ +// Copyright Evan Miller 2020 +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #include #include diff --git a/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp b/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp index e1bbc95f8a..9bfe384ace 100644 --- a/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp +++ b/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp @@ -1,4 +1,4 @@ -// Copyright John Maddock 2006. +// Copyright Evan Miller 2020 // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) From 59c4da1dc1d4a643283438be504edcf97094b70e Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Thu, 20 Aug 2020 14:04:37 -0400 Subject: [PATCH 03/14] Implement skewness for K-S distribution [CI SKIP] The third moment integrates nicely with the help of Apery's constant (zeta_three). Verify the result via quadrature. --- .../math/distributions/kolmogorov_smirnov.hpp | 14 ++++++++++++++ test/test_kolmogorov_smirnov.cpp | 6 +++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index 3f1b09f237..c9e4ef8052 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -447,5 +447,19 @@ inline RealType variance(const kolmogorov_smirnov_distribution return 0.5 * (constants::pi_sqr_div_six() - constants::pi() * constants::ln_two() * constants::ln_two()) / n; } + +template +inline RealType skewness(const kolmogorov_smirnov_distribution& dist) +{ + static const char* function = "boost::math::skewness(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result = 0; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + RealType ex3 = 9.0 / 16.0 * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); + RealType mean = boost::math::mean(dist); + RealType var = boost::math::variance(dist); + return (ex3 - 3 * mean * var - mean * mean * mean) / var / sqrt(var); +} }} #endif diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp index 5bf912b6eb..ff611cb47b 100644 --- a/test/test_kolmogorov_smirnov.cpp +++ b/test/test_kolmogorov_smirnov.cpp @@ -59,16 +59,20 @@ void test_spots(RealType T) BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode - 100 * tol)); BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode + 100 * tol)); - // Test first three moments - pretty well covers the entire distribution + // Test first four moments - pretty well covers the entire distribution RealType mean = boost::math::mean(dist); RealType var = variance(dist); + RealType skew = skewness(dist); quadrature::exp_sinh integrator; auto f_one = [&, dist](RealType t) { return pdf(dist, t); }; auto f_mean = [&, dist](RealType t) { return pdf(dist, t) * t; }; auto f_var = [&, dist, mean](RealType t) { return pdf(dist, t) * (t - mean) * (t - mean); }; + auto f_skew = [&, dist, mean, var](RealType t) { return pdf(dist, t) + * (t - mean) * (t - mean) * (t - mean) / var / sqrt(var); }; BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_one, eps), RealType(1), tol); BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_mean, eps), mean, tol); BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_var, eps), var, tol); + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_skew, eps), skew, 4*tol); } BOOST_AUTO_TEST_CASE( test_main ) From 73f0dec11d6e633451477243c4586e3d3ba426df Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Thu, 20 Aug 2020 22:39:31 -0400 Subject: [PATCH 04/14] Implement kurtosis for the K-S distribution Verify the result via quadrature. --- .../math/distributions/kolmogorov_smirnov.hpp | 31 ++++++++++++++++++- test/test_kolmogorov_smirnov.cpp | 25 ++++++++++----- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index c9e4ef8052..6a686f5286 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -423,6 +423,7 @@ inline RealType mode(const kolmogorov_smirnov_distribution& di return r.first / sqrt(n); } +// Mean and variance come directly from // https://www.jstatsoft.org/article/view/v008i18 Section 3 template inline RealType mean(const kolmogorov_smirnov_distribution& dist) @@ -448,6 +449,8 @@ inline RealType variance(const kolmogorov_smirnov_distribution - constants::pi() * constants::ln_two() * constants::ln_two()) / n; } +// Skewness and kurtosis come from integrating the PDF +// The alternating series pops out a Dirichlet eta function which is related to the zeta function template inline RealType skewness(const kolmogorov_smirnov_distribution& dist) { @@ -456,10 +459,36 @@ inline RealType skewness(const kolmogorov_smirnov_distribution RealType error_result = 0; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; - RealType ex3 = 9.0 / 16.0 * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); + RealType ex3 = 0.5625 * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); RealType mean = boost::math::mean(dist); RealType var = boost::math::variance(dist); return (ex3 - 3 * mean * var - mean * mean * mean) / var / sqrt(var); } + +template +inline RealType kurtosis(const kolmogorov_smirnov_distribution& dist) +{ + static const char* function = "boost::math::kurtosis(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result = 0; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + RealType ex4 = 7.0 * constants::pi_sqr_div_six() * constants::pi_sqr_div_six() / 20.0 / n / n; + RealType mean = boost::math::mean(dist); + RealType var = boost::math::variance(dist); + RealType skew = boost::math::skewness(dist); + return (ex4 - 4 * mean * skew * var * sqrt(var) - 6 * mean * mean * var - mean * mean * mean * mean) / var / var; +} + +template +inline RealType kurtosis_excess(const kolmogorov_smirnov_distribution& dist) +{ + static const char* function = "boost::math::kurtosis_excess(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result = 0; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + return kurtosis(dist) - 3; +} }} #endif diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp index ff611cb47b..8ddb00d43d 100644 --- a/test/test_kolmogorov_smirnov.cpp +++ b/test/test_kolmogorov_smirnov.cpp @@ -59,20 +59,31 @@ void test_spots(RealType T) BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode - 100 * tol)); BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode + 100 * tol)); - // Test first four moments - pretty well covers the entire distribution - RealType mean = boost::math::mean(dist); - RealType var = variance(dist); - RealType skew = skewness(dist); + // Test the moments - each one integrates the entire distribution quadrature::exp_sinh integrator; + auto f_one = [&, dist](RealType t) { return pdf(dist, t); }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_one, eps), RealType(1), tol); + + RealType mean = boost::math::mean(dist); auto f_mean = [&, dist](RealType t) { return pdf(dist, t) * t; }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_mean, eps), mean, tol); + + RealType var = variance(dist); auto f_var = [&, dist, mean](RealType t) { return pdf(dist, t) * (t - mean) * (t - mean); }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_var, eps), var, tol); + + RealType skew = skewness(dist); auto f_skew = [&, dist, mean, var](RealType t) { return pdf(dist, t) * (t - mean) * (t - mean) * (t - mean) / var / sqrt(var); }; - BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_one, eps), RealType(1), tol); - BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_mean, eps), mean, tol); - BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_var, eps), var, tol); BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_skew, eps), skew, 4*tol); + + RealType kurt = kurtosis(dist); + auto f_kurt= [&, dist, mean, var](RealType t) { return pdf(dist, t) + * (t - mean) * (t - mean) * (t - mean) * (t - mean) / var / var; }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_kurt, eps), kurt, 5*tol); + + BOOST_CHECK_CLOSE_FRACTION(kurt, kurtosis_excess(dist) + 3, eps); } BOOST_AUTO_TEST_CASE( test_main ) From 8ef4d3feef7571d7bd644dc4fb79f29bf13528ce Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Fri, 21 Aug 2020 20:48:51 -0400 Subject: [PATCH 05/14] Fix "0.0" conversion errors with multiprecision types --- .../boost/math/distributions/kolmogorov_smirnov.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index 6a686f5286..9249a8c435 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -120,7 +120,7 @@ namespace boost { namespace math { namespace detail { template inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { - RealType k = 0.0; + RealType k = 0; // Choose a starting point for the Newton-Raphson iteration if (p > 0.9) { k = 1.8 - 5 * (1 - p); @@ -135,8 +135,8 @@ inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { // d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI)) template RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { - RealType value = 0.0, delta = 0.0, last_delta = 0.0; - RealType eps = policies::get_epsilon(); + RealType value = 0, delta = 0, last_delta = 0; + RealType eps = policies::get_epsilon(); int i = 0; RealType pi2 = constants::pi_sqr(); RealType x2n = x*x*n; @@ -164,8 +164,8 @@ RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { template inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Policy&) { BOOST_MATH_STD_USING - RealType value = 0.0, delta = 0.0, last_delta = 0.0; - RealType eps = policies::get_epsilon(); + RealType value = 0, delta = 0, last_delta = 0; + RealType eps = policies::get_epsilon(); int i = 1; while (1) { delta = 8*x*i*i*exp(-2.0*i*i*x*x*n); From 91fd5f6322f3806a76e3df8e3b6ff654f1c9cdc6 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sat, 22 Aug 2020 04:10:36 -0400 Subject: [PATCH 06/14] Kolmogorov-Smirnov build fixes --- .../math/distributions/kolmogorov_smirnov.hpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index 9249a8c435..8a1ed6fe2b 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -120,7 +120,7 @@ namespace boost { namespace math { namespace detail { template inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { - RealType k = 0; + RealType k = RealType(0); // Choose a starting point for the Newton-Raphson iteration if (p > 0.9) { k = 1.8 - 5 * (1 - p); @@ -135,7 +135,7 @@ inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { // d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI)) template RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { - RealType value = 0, delta = 0, last_delta = 0; + RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0); RealType eps = policies::get_epsilon(); int i = 0; RealType pi2 = constants::pi_sqr(); @@ -164,7 +164,7 @@ RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { template inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Policy&) { BOOST_MATH_STD_USING - RealType value = 0, delta = 0, last_delta = 0; + RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0); RealType eps = policies::get_epsilon(); int i = 1; while (1) { @@ -324,7 +324,7 @@ inline RealType cdf(const kolmogorov_smirnov_distribution& dis return error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; - if((x < 0) || !(boost::math::isfinite(x))) { + if((x < 0) || !(boost::math::isfinite)(x)) { return policies::raise_domain_error( function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy()); } @@ -349,7 +349,7 @@ inline RealType cdf(const complemented2_type( function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy()); @@ -368,7 +368,7 @@ inline RealType quantile(const kolmogorov_smirnov_distribution BOOST_MATH_STD_USING static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; // Error check: - RealType error_result = 0; + RealType error_result; RealType n = dist.number_of_observations(); if(false == detail::check_probability(function, p, &error_result, Policy())) return error_result; @@ -389,7 +389,7 @@ inline RealType quantile(const complemented2_type const& dist = c.dist; RealType n = dist.number_of_observations(); // Error check: - RealType error_result = 0; + RealType error_result; RealType p = c.param; if(false == detail::check_probability(function, p, &error_result, Policy())) @@ -413,7 +413,7 @@ inline RealType mode(const kolmogorov_smirnov_distribution& di BOOST_MATH_STD_USING static const char* function = "boost::math::mode(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); - RealType error_result = 0; + RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; @@ -431,7 +431,7 @@ inline RealType mean(const kolmogorov_smirnov_distribution& di BOOST_MATH_STD_USING static const char* function = "boost::math::mean(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); - RealType error_result = 0; + RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; return constants::root_half_pi() * constants::ln_two() / sqrt(n); @@ -442,7 +442,7 @@ inline RealType variance(const kolmogorov_smirnov_distribution { static const char* function = "boost::math::variance(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); - RealType error_result = 0; + RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; return 0.5 * (constants::pi_sqr_div_six() @@ -456,7 +456,7 @@ inline RealType skewness(const kolmogorov_smirnov_distribution { static const char* function = "boost::math::skewness(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); - RealType error_result = 0; + RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; RealType ex3 = 0.5625 * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); @@ -470,7 +470,7 @@ inline RealType kurtosis(const kolmogorov_smirnov_distribution { static const char* function = "boost::math::kurtosis(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); - RealType error_result = 0; + RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; RealType ex4 = 7.0 * constants::pi_sqr_div_six() * constants::pi_sqr_div_six() / 20.0 / n / n; @@ -485,7 +485,7 @@ inline RealType kurtosis_excess(const kolmogorov_smirnov_distribution Date: Sat, 22 Aug 2020 09:13:09 -0400 Subject: [PATCH 07/14] Kolmogorov-Smirnov needs more BOOST_MATH_STD_USING --- include/boost/math/distributions/kolmogorov_smirnov.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index 8a1ed6fe2b..d59fd1f295 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -120,7 +120,7 @@ namespace boost { namespace math { namespace detail { template inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { - RealType k = RealType(0); + RealType k; // Choose a starting point for the Newton-Raphson iteration if (p > 0.9) { k = 1.8 - 5 * (1 - p); @@ -135,6 +135,7 @@ inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { // d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI)) template RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { + BOOST_MATH_STD_USING RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0); RealType eps = policies::get_epsilon(); int i = 0; @@ -385,6 +386,7 @@ inline RealType quantile(const kolmogorov_smirnov_distribution template inline RealType quantile(const complemented2_type, RealType>& c) { + BOOST_MATH_STD_USING static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; kolmogorov_smirnov_distribution const& dist = c.dist; RealType n = dist.number_of_observations(); @@ -454,6 +456,7 @@ inline RealType variance(const kolmogorov_smirnov_distribution template inline RealType skewness(const kolmogorov_smirnov_distribution& dist) { + BOOST_MATH_STD_USING static const char* function = "boost::math::skewness(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); RealType error_result; @@ -468,6 +471,7 @@ inline RealType skewness(const kolmogorov_smirnov_distribution template inline RealType kurtosis(const kolmogorov_smirnov_distribution& dist) { + BOOST_MATH_STD_USING static const char* function = "boost::math::kurtosis(const kolmogorov_smirnov_distribution<%1%>&)"; RealType n = dist.number_of_observations(); RealType error_result; From ae6c2d8644460f290713268ff4efc646fcc3f86e Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sun, 23 Aug 2020 07:14:54 -0400 Subject: [PATCH 08/14] Kolmogorov-Smirnov build fixes --- include/boost/math/distributions/kolmogorov_smirnov.hpp | 2 +- test/test_kolmogorov_smirnov.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index d59fd1f295..2869c4aaf6 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -399,7 +399,7 @@ inline RealType quantile(const complemented2_type();// get digits from policy, boost::uintmax_t m = policies::get_max_root_iterations(); // and max iterations. diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp index 8ddb00d43d..641a3478e5 100644 --- a/test/test_kolmogorov_smirnov.cpp +++ b/test/test_kolmogorov_smirnov.cpp @@ -76,7 +76,7 @@ void test_spots(RealType T) RealType skew = skewness(dist); auto f_skew = [&, dist, mean, var](RealType t) { return pdf(dist, t) * (t - mean) * (t - mean) * (t - mean) / var / sqrt(var); }; - BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_skew, eps), skew, 4*tol); + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_skew, eps), skew, 10*tol); RealType kurt = kurtosis(dist); auto f_kurt= [&, dist, mean, var](RealType t) { return pdf(dist, t) From 1599c1daa3b42b727e54b443f38184c874e81b31 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sun, 23 Aug 2020 14:36:19 -0400 Subject: [PATCH 09/14] Require C++11 features for Kolmogorov-Smirnov tests --- test/Jamfile.v2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 0d93453003..60dc9c154d 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -630,7 +630,7 @@ test-suite distribution_tests : [ run test_inverse_chi_squared_distribution.cpp ../../test/build//boost_unit_test_framework ] [ run test_inverse_gamma_distribution.cpp ../../test/build//boost_unit_test_framework ] [ run test_inverse_gaussian.cpp ../../test/build//boost_unit_test_framework ] - [ run test_kolmogorov_smirnov.cpp ../../test/build//boost_unit_test_framework ] + [ run test_kolmogorov_smirnov.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_hdr_initializer_list cxx11_auto_declarations cxx11_lambdas cxx11_unified_initialization_syntax cxx11_smart_ptr ] ] [ run test_laplace.cpp ../../test/build//boost_unit_test_framework ] [ run test_inv_hyp.cpp pch ../../test/build//boost_unit_test_framework ] [ run test_logistic_dist.cpp ../../test/build//boost_unit_test_framework ] From dafb60258451a2f44415cba8fc4fbc002b192bff Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Mon, 24 Aug 2020 08:08:43 -0400 Subject: [PATCH 10/14] Fix double-to-float warnings with double literals Should fix Appveyor build --- .../math/distributions/kolmogorov_smirnov.hpp | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index 2869c4aaf6..da7fc690b1 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -120,16 +120,12 @@ namespace boost { namespace math { namespace detail { template inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { - RealType k; // Choose a starting point for the Newton-Raphson iteration - if (p > 0.9) { - k = 1.8 - 5 * (1 - p); - } else if (p < 0.3) { - k = p + 0.45; - } else { - k = p + 0.3; - } - return k; + if (p > 0.9) + return RealType(1.8) - 5 * (1 - p); + if (p < 0.3) + return p + RealType(0.45); + return p + RealType(0.3); } // d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI)) @@ -145,7 +141,7 @@ RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { return static_cast(0); } while (1) { - delta = exp(-0.5*(i+0.5)*(i+0.5)*pi2/x2n) * ((i+0.5)*(i+0.5)*pi2 - x2n); + delta = exp(-RealType(i+0.5)*RealType(i+0.5)*pi2/(2*x2n)) * (RealType(i+0.5)*RealType(i+0.5)*pi2 - x2n); if (delta == 0.0) break; @@ -169,7 +165,7 @@ inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Pol RealType eps = policies::get_epsilon(); int i = 1; while (1) { - delta = 8*x*i*i*exp(-2.0*i*i*x*x*n); + delta = 8*x*i*i*exp(-2*i*i*x*x*n); if (delta == 0.0) break; @@ -178,7 +174,7 @@ inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Pol break; if (i%2 == 0) - delta *= -1.0; + delta = -delta; value += delta; last_delta = delta; @@ -447,8 +443,8 @@ inline RealType variance(const kolmogorov_smirnov_distribution RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; - return 0.5 * (constants::pi_sqr_div_six() - - constants::pi() * constants::ln_two() * constants::ln_two()) / n; + return (constants::pi_sqr_div_six() + - constants::pi() * constants::ln_two() * constants::ln_two()) / (2*n); } // Skewness and kurtosis come from integrating the PDF @@ -462,7 +458,7 @@ inline RealType skewness(const kolmogorov_smirnov_distribution RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; - RealType ex3 = 0.5625 * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); + RealType ex3 = RealType(0.5625) * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); RealType mean = boost::math::mean(dist); RealType var = boost::math::variance(dist); return (ex3 - 3 * mean * var - mean * mean * mean) / var / sqrt(var); @@ -477,7 +473,7 @@ inline RealType kurtosis(const kolmogorov_smirnov_distribution RealType error_result; if(false == detail::check_df(function, n, &error_result, Policy())) return error_result; - RealType ex4 = 7.0 * constants::pi_sqr_div_six() * constants::pi_sqr_div_six() / 20.0 / n / n; + RealType ex4 = 7 * constants::pi_sqr_div_six() * constants::pi_sqr_div_six() / 20 / n / n; RealType mean = boost::math::mean(dist); RealType var = boost::math::variance(dist); RealType skew = boost::math::skewness(dist); From 07791a70d06b14d14ac1d28cf37b6840e75ef1e2 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sun, 30 Aug 2020 18:11:42 -0400 Subject: [PATCH 11/14] Kolmogorov-Smirnov user documentation [CI SKIP] --- doc/distributions/dist_reference.qbk | 1 + doc/distributions/kolmogorov_smirnov.qbk | 122 + .../kolmogorov_smirnov_distribution.svg | 1 + .../kolmogorov_smirnov_test_statistic.svg | 1 + doc/graphs/dist_graphs.cpp | 20 +- doc/graphs/kolmogorov_smirnov_pdf_ulp.svg | 2551 +++++++++++++++++ doc/html/dist.html | 2 + .../dists/kolmogorov_smirnov_dist.html | 362 +++ doc/html/standalone_HTML.manifest | 1 + doc/math.qbk | 1 + 10 files changed, 3056 insertions(+), 6 deletions(-) create mode 100644 doc/distributions/kolmogorov_smirnov.qbk create mode 100644 doc/equations/kolmogorov_smirnov_distribution.svg create mode 100644 doc/equations/kolmogorov_smirnov_test_statistic.svg create mode 100644 doc/graphs/kolmogorov_smirnov_pdf_ulp.svg create mode 100644 doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html diff --git a/doc/distributions/dist_reference.qbk b/doc/distributions/dist_reference.qbk index 63fe12bf32..c225d1953e 100644 --- a/doc/distributions/dist_reference.qbk +++ b/doc/distributions/dist_reference.qbk @@ -21,6 +21,7 @@ [include inverse_chi_squared.qbk] [include inverse_gamma.qbk] [include inverse_gaussian.qbk] +[include kolmogorov_smirnov.qbk] [include laplace.qbk] [include logistic.qbk] [include lognormal.qbk] diff --git a/doc/distributions/kolmogorov_smirnov.qbk b/doc/distributions/kolmogorov_smirnov.qbk new file mode 100644 index 0000000000..8f854770ce --- /dev/null +++ b/doc/distributions/kolmogorov_smirnov.qbk @@ -0,0 +1,122 @@ +[section:kolmogorov_smirnov_dist Kolmogorov-Smirnov Distribution] + +``#include `` + + namespace boost{ namespace math{ + + template + class kolmogorov_smirnov_distribution; + + typedef kolmogorov_smirnov_distribution<> kolmogorov_smirnov; + + template + class kolmogorov_smirnov_distribution + { + public: + typedef RealType value_type; + typedef Policy policy_type; + + // Constructor: + kolmogorov_smirnov_distribution(RealType n); + + // Accessor to parameter: + RealType number_of_observations()const; + + }} // namespaces + +The Kolmogorov-Smirnov test in statistics compares two empirical distributions, +or an empirical distribution against any theoretical distribution.[footnote +[@https://en.wikipedia.org/wiki/Kolmogorov–Smirnov_test Wikipedia: +Kolmogorov-Smirnov test]] It makes use of a specific distribution which is +informally known in the literature as the Kolmogorv-Smirnov distribution, +implemented here. + +Formally, if /n/ observations are taken from a theoretical distribution /G(x)/, +and if /G/[sub /n/](/x/) represents the empirical CDF of those /n/ observations, +then the test statistic + +[equation kolmogorov_smirnov_test_statistic] [/ D_n = \sup_x|G(x)-\hat{G}(x)| ] + +will be distributed according to a Kolmogorov-Smirnov distribution parameterized by /n/. + +The exact form of a Kolmogorov-Smirnov distribution is the subject of a large, +decades-old literature.[footnote +[@https://www.jstatsoft.org/article/view/v039i11 Simard, R. and L'Ecuyer, P. +(2011) "Computing the Two-Sided Kolmogorov-Smirnov Distribution". Journal of +Statistical Software, vol. 39, no. 11.]] In the interest of simplicity, Boost +implements the first-order, limiting form of this distribution (the same form +originally identified by Kolmogorov[footnote Kolmogorov A (1933). "Sulla +determinazione empirica di una legge di distribuzione". G. Ist. Ital. Attuari. +4: 83–91.]), namely + +[equation kolmogorov_smirnov_distribution] +[/ \lim_{n \to \infty} F_n(x/\sqrt{n})=1+2\sum_{k=1}^\infty (-1)^k e^{-2k^2x^2} ] + +Note that while the exact distribution only has support over \[0, 1\], this +limiting form has positive mass above unity, particlularly for small /n/. The +following graph illustrations how the distribution changes for different values +of /n/: + +[graph kolmogorov_smirnov_pdf] + +[h4 Member Functions] + + kolmogorov_smirnov_distribution(RealType n); + +Constructs a Kolmogorov-Smirnov distribution with /n/ observations. + +Requires n > 0, otherwise calls __domain_error. + + RealType number_of_observations()const; + +Returns the parameter /n/ from which this object was constructed. + +[h4 Non-member Accessors] + +All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] +that are generic to all distributions are supported: __usual_accessors. + +The domain of the random variable is \[0, +[infin]\]. + +[h4 Accuracy] + +The CDF of the Kolmogorov-Smirnov distribution is implemented in terms of the +fourth [link math_toolkit.jacobi_theta.jacobi_theta_overview Jacobi Theta +function]; please refer to the accuracy ULP plots for that function. + +The PDF is implemented separately, and the following ULP plot illustrates its +accuracy: + +[graph kolmogorov_smirnov_pdf_ulp] + +Because PDF values are simply scaled out and up by the square root of /n/, the +above plot is representative for all values of /n/. Note that for present +purposes, "accuracy" refers to deviations from the limiting approximation, +rather than deviations from the exact distribution. + +[h4 Implementation] + +In the following table, /n/ is the number of observations, /x/ is the random variable, +[pi] is Archimedes' constant, and [zeta](3) is Apéry's constant. + +[table +[[Function][Implementation Notes]] +[[cdf][Using the relation: cdf = __jacobi_theta4tau(0, 2*x*x/[pi])]] +[[pdf][Using a manual derivative of the CDF]] +[[cdf complement][ +When x*x*n == 0: 1 + +When 2*x*x*n <= [pi]: 1 - __jacobi_theta4tau(0, 2*x*x*n/[pi]) + +When 2*x*x*n > [pi]: -__jacobi_theta4m1tau(0, 2*x*x*n/[pi])]] +[[quantile][Using a Newton-Raphson iteration]] +[[quantile from the complement][Using a Newton-Raphson iteration]] +[[mode][Using a run-time PDF maximizer]] +[[mean][sqrt([pi]/2) * ln(2) / sqrt(n)]] +[[variance][([pi][super 2]/12 - [pi]/2*ln[super 2](2))/n]] +[[skewness][(9/16*sqrt([pi]/2)*[zeta](3)/n[super 3/2] - 3 * mean * variance - mean[super 2] * variance) / (variance[super 3/2])]] +[[kurtosis][(7/720*[pi][super 4]/n[super 2] - 4 * mean * skewness * variance[super 3/2] - 6 * mean[super 2] * variance - mean[super 4]) / (variance[super 2])]] +] + +[endsect] [/section:kolmogorov_smirnov_dist Kolmogorov-Smirnov] diff --git a/doc/equations/kolmogorov_smirnov_distribution.svg b/doc/equations/kolmogorov_smirnov_distribution.svg new file mode 100644 index 0000000000..4233fe70d1 --- /dev/null +++ b/doc/equations/kolmogorov_smirnov_distribution.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/equations/kolmogorov_smirnov_test_statistic.svg b/doc/equations/kolmogorov_smirnov_test_statistic.svg new file mode 100644 index 0000000000..8c72cefa59 --- /dev/null +++ b/doc/equations/kolmogorov_smirnov_test_statistic.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/graphs/dist_graphs.cpp b/doc/graphs/dist_graphs.cpp index e324224198..611c357cae 100644 --- a/doc/graphs/dist_graphs.cpp +++ b/doc/graphs/dist_graphs.cpp @@ -471,12 +471,20 @@ int main() fisher_f_plotter.plot("F Distribution PDF", "fisher_f_pdf.svg"); distribution_plotter > - kolmogorov_smirnov_plotter; - kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(1), "n=1"); - kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(2), "n=2"); - kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(5), "n=5"); - kolmogorov_smirnov_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(10), "n=10"); - kolmogorov_smirnov_plotter.plot("Kolmogorov-Smirnov Distribution PDF", "kolmogorov_smirnov_pdf.svg"); + kolmogorov_smirnov_cdf_plotter(false); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(1), "n=1"); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(2), "n=2"); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(5), "n=5"); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(10), "n=10"); + kolmogorov_smirnov_cdf_plotter.plot("Kolmogorov-Smirnov Distribution CDF", "kolmogorov_smirnov_cdf.svg"); + + distribution_plotter > + kolmogorov_smirnov_pdf_plotter; + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(1), "n=1"); + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(2), "n=2"); + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(5), "n=5"); + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(10), "n=10"); + kolmogorov_smirnov_pdf_plotter.plot("Kolmogorov-Smirnov Distribution PDF", "kolmogorov_smirnov_pdf.svg"); distribution_plotter > lognormal_plotter; diff --git a/doc/graphs/kolmogorov_smirnov_pdf_ulp.svg b/doc/graphs/kolmogorov_smirnov_pdf_ulp.svg new file mode 100644 index 0000000000..f1adef5dc2 --- /dev/null +++ b/doc/graphs/kolmogorov_smirnov_pdf_ulp.svg @@ -0,0 +1,2551 @@ + + + +Kolmogorov-Smirnov PDF (N=10) ULP plot at float precision + + + + +-75 + +-50 + +-25 + +0 + +25 + +50 + +75 + +100 + +0.1 + +0.2 + +0.3 + +0.4 + +0.5 + +0.6 + +0.7 + +0.8 + +0.9 + +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/html/dist.html b/doc/html/dist.html index 6dfd1bd00c..4d84eb9867 100644 --- a/doc/html/dist.html +++ b/doc/html/dist.html @@ -169,6 +169,8 @@ Gamma Distribution
Inverse Gaussian (or Inverse Normal) Distribution
+
Kolmogorov-Smirnov + Distribution
Laplace Distribution
Logistic Distribution
diff --git a/doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html b/doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html new file mode 100644 index 0000000000..ff44b4c370 --- /dev/null +++ b/doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html @@ -0,0 +1,362 @@ + + + +Kolmogorov-Smirnov Distribution + + + + + + + + + + + + + + + +
Boost C++ LibrariesHomeLibrariesPeopleFAQMore
+
+
+PrevUpHomeNext +
+
+ + +
#include <boost/math/distributions/kolmogorov_smirnov.hpp>
+
namespace boost{ namespace math{
+
+template <class RealType = double,
+          class Policy   = policies::policy<> >
+class kolmogorov_smirnov_distribution;
+
+typedef kolmogorov_smirnov_distribution<> kolmogorov_smirnov;
+
+template <class RealType, class Policy>
+class kolmogorov_smirnov_distribution
+{
+public:
+   typedef RealType  value_type;
+   typedef Policy    policy_type;
+
+   // Constructor:
+   kolmogorov_smirnov_distribution(RealType n);
+
+   // Accessor to parameter:
+   RealType number_of_observations()const;
+
+}} // namespaces
+
+

+ The Kolmogorov-Smirnov test in statistics compares two empirical distributions, + or an empirical distribution against any theoretical distribution.[1] It makes use of a specific distribution which is informally + known in the literature as the Kolmogorv-Smirnov distribution, implemented + here. +

+

+ Formally, if n observations are taken from a theoretical + distribution G(x), and if Gn(x) + represents the empirical CDF of those n observations, + then the test statistic +

+
+

+ + +

+
+

+ will be distributed according to a Kolmogorov-Smirnov distribution parameterized + by n. +

+

+ The exact form of a Kolmogorov-Smirnov distribution is the subject of a + large, decades-old literature.[2] In the interest of simplicity, Boost implements the first-order, + limiting form of this distribution (the same form originally identified + by Kolmogorov[3]), namely +

+
+

+ + +

+
+

+ Note that while the exact distribution only has support over [0, 1], this + limiting form has positive mass above unity, particlularly for small n. + The following graph illustrations how the distribution changes for different + values of n: +

+
+

+ + +

+
+
+ + Member + Functions +
+
kolmogorov_smirnov_distribution(RealType n);
+
+

+ Constructs a Kolmogorov-Smirnov distribution with n + observations. +

+

+ Requires n > 0, otherwise calls domain_error. +

+
RealType number_of_observations()const;
+
+

+ Returns the parameter n from which this object was + constructed. +

+
+ + Non-member + Accessors +
+

+ All the usual non-member accessor + functions that are generic to all distributions are supported: + Cumulative Distribution Function, + Probability Density Function, + Quantile, Hazard Function, Cumulative Hazard Function, + mean, median, + mode, variance, + standard deviation, + skewness, kurtosis, kurtosis_excess, + range and support. +

+

+ The domain of the random variable is [0, +∞]. +

+
+ + Accuracy +
+

+ The CDF of the Kolmogorov-Smirnov distribution is implemented in terms + of the fourth Jacobi + Theta function; please refer to the accuracy ULP plots for that + function. +

+

+ The PDF is implemented separately, and the following ULP plot illustrates + its accuracy: +

+
+

+ + +

+
+

+ Because PDF values are simply scaled out and up by the square root of + n, the above plot is representative for all values + of n. Note that for present purposes, "accuracy" + refers to deviations from the limiting approximation, rather than deviations + from the exact distribution. +

+
+ + Implementation +
+

+ In the following table, n is the number of observations, + x is the random variable, π is Archimedes' constant, + and ζ(3) is Apéry's constant. +

+
+ ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

+ Function +

+
+

+ Implementation Notes +

+
+

+ cdf +

+
+

+ Using the relation: cdf = jacobi_theta4tau(0, + 2*x*x/π) +

+
+

+ pdf +

+
+

+ Using a manual derivative of the CDF +

+
+

+ cdf complement +

+
+

+ When x*x*n == 0: 1 +

+

+ When 2*x*x*n <= π: 1 - jacobi_theta4tau(0, + 2*x*x*n/π) +

+

+ When 2*x*x*n > π: -jacobi_theta4m1tau(0, + 2*x*x*n/π) +

+
+

+ quantile +

+
+

+ Using a Newton-Raphson iteration +

+
+

+ quantile from the complement +

+
+

+ Using a Newton-Raphson iteration +

+
+

+ mode +

+
+

+ Using a run-time PDF maximizer +

+
+

+ mean +

+
+

+ sqrt(π/2) * ln(2) / sqrt(n) +

+
+

+ variance +

+
+

+ (π2/12 - π/2*ln2(2))/n +

+
+

+ skewness +

+
+

+ (9/16*sqrt(π/2)*ζ(3)/n3/2 - 3 * mean * variance - mean2 * variance) + / (variance3/2) +

+
+

+ kurtosis +

+
+

+ (7/720*π4/n2 - 4 * mean * skewness * variance3/2 - 6 * mean2 * variance + - mean4) / (variance2) +

+
+
+
+

+ + +
+

[3] + Kolmogorov A (1933). "Sulla determinazione empirica di una legge + di distribuzione". G. Ist. Ital. Attuari. 4: 83–91. +

+
+
+
+ + + +
+
+
+PrevUpHomeNext +
+ + diff --git a/doc/html/standalone_HTML.manifest b/doc/html/standalone_HTML.manifest index 003b88ea9b..614cce4973 100644 --- a/doc/html/standalone_HTML.manifest +++ b/doc/html/standalone_HTML.manifest @@ -130,6 +130,7 @@ math_toolkit/dist_ref/dists/hypergeometric_dist.html math_toolkit/dist_ref/dists/inverse_chi_squared_dist.html math_toolkit/dist_ref/dists/inverse_gamma_dist.html math_toolkit/dist_ref/dists/inverse_gaussian_dist.html +math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html math_toolkit/dist_ref/dists/laplace_dist.html math_toolkit/dist_ref/dists/logistic_dist.html math_toolkit/dist_ref/dists/lognormal_dist.html diff --git a/doc/math.qbk b/doc/math.qbk index 984140d840..847b33e6eb 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -387,6 +387,7 @@ and use the function's name as the link text.] [def __inverse_gamma_distrib [link math_toolkit.dist_ref.dists.inverse_gamma_dist Inverse Gamma Distribution]] [def __inverse_gaussian_distrib [link math_toolkit.dist_ref.dists.inverse_gaussian_dist Inverse Gaussian Distribution]] [def __inverse_chi_squared_distrib [link math_toolkit.dist_ref.dists.inverse_chi_squared_dist Inverse chi squared Distribution]] +[def __kolmogorov_smirnov_distrib [link math_toolkit.dist_ref.dists.kolmogorov_smirnov Kolmogorov-Smirnov Distribution]] [def __laplace_distrib [link math_toolkit.dist_ref.dists.laplace_dist Laplace Distribution]] [def __logistic_distrib [link math_toolkit.dist_ref.dists.logistic_dist Logistic Distribution]] [def __lognormal_distrib [link math_toolkit.dist_ref.dists.lognormal_dist Log-normal Distribution]] From c1565d1dcd64c4fc9ca91a9c9c2403d0863e0922 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sun, 30 Aug 2020 22:30:21 -0400 Subject: [PATCH 12/14] Fix code comment (and kick off CI build) --- include/boost/math/distributions/kolmogorov_smirnov.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index da7fc690b1..959518b1fc 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -99,8 +99,8 @@ // kolmogorov_smirnov_dist<>(100, 4) // N=100, order=4, i.e. Pelz-Good formula // // The exact distribution could be added to the API with a special order -// parameter (e.g. 0 or INT_MAX), or a separate distribution type altogether -// (e.g. kolmogorov_smirnov_exact_dist). +// parameter (e.g. 0 or infinity), or a separate distribution type altogether +// (e.g. kolmogorov_smirnov_exact_distribution). // #ifndef BOOST_STATS_KOLMOGOROV_ASYMPTOTIC_HPP #define BOOST_STATS_KOLMOGOROV_ASYMPTOTIC_HPP From 30e325a7ad77a23912d58e97e008d1cc5cd1fdb8 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Wed, 2 Sep 2020 10:47:23 -0400 Subject: [PATCH 13/14] Make suggested changes [CI SKIP] --- doc/distributions/kolmogorov_smirnov.qbk | 2 +- .../math/distributions/kolmogorov_smirnov.hpp | 20 +++++++++---------- test/test_kolmogorov_smirnov.cpp | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/distributions/kolmogorov_smirnov.qbk b/doc/distributions/kolmogorov_smirnov.qbk index 8f854770ce..c557f6fc41 100644 --- a/doc/distributions/kolmogorov_smirnov.qbk +++ b/doc/distributions/kolmogorov_smirnov.qbk @@ -54,7 +54,7 @@ determinazione empirica di una legge di distribuzione". G. Ist. Ital. Attuari. [/ \lim_{n \to \infty} F_n(x/\sqrt{n})=1+2\sum_{k=1}^\infty (-1)^k e^{-2k^2x^2} ] Note that while the exact distribution only has support over \[0, 1\], this -limiting form has positive mass above unity, particlularly for small /n/. The +limiting form has positive mass above unity, particularly for small /n/. The following graph illustrations how the distribution changes for different values of /n/: diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp index 959518b1fc..1dc0a92ccf 100644 --- a/include/boost/math/distributions/kolmogorov_smirnov.hpp +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -84,8 +84,10 @@ // arrived at by trial and error. // // The mean and variance are implemented using simple closed-form expressions. -// The mode is calculated at run-time by maximizing the PDF. If you have an -// analytical solution for the mode, feel free to plop it in. +// Skewness and kurtosis use slightly more complicated closed-form expressions +// that involve the zeta function. The mode is calculated at run-time by +// maximizing the PDF. If you have an analytical solution for the mode, feel +// free to plop it in. // // The CDF and PDF could almost certainly be re-implemented and sped up using a // polynomial or rational approximation, since the only meaningful argument is @@ -102,8 +104,8 @@ // parameter (e.g. 0 or infinity), or a separate distribution type altogether // (e.g. kolmogorov_smirnov_exact_distribution). // -#ifndef BOOST_STATS_KOLMOGOROV_ASYMPTOTIC_HPP -#define BOOST_STATS_KOLMOGOROV_ASYMPTOTIC_HPP +#ifndef BOOST_MATH_DISTRIBUTIONS_KOLMOGOROV_SMIRNOV_HPP +#define BOOST_MATH_DISTRIBUTIONS_KOLMOGOROV_SMIRNOV_HPP #include #include @@ -113,8 +115,6 @@ #include // Newton-Raphson #include // For the mode -// #include - namespace boost { namespace math { namespace detail { @@ -194,21 +194,21 @@ template > typedef Policy policy_type; // Constructor - kolmogorov_smirnov_distribution( RealType n ) : n_obs(n) + kolmogorov_smirnov_distribution( RealType n ) : n_obs_(n) { RealType result; detail::check_df( - "boost::math::kolmogorov_smirnov_distribution<%1%>::kolmogorov_smirnov_distribution", n_obs, &result, Policy()); + "boost::math::kolmogorov_smirnov_distribution<%1%>::kolmogorov_smirnov_distribution", n_obs_, &result, Policy()); } RealType number_of_observations()const { - return n_obs; + return n_obs_; } private: - RealType n_obs; // positive integer + RealType n_obs_; // positive integer }; typedef kolmogorov_smirnov_distribution kolmogorov_k; // Convenience typedef for double version. diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp index 641a3478e5..fcbefc2f7e 100644 --- a/test/test_kolmogorov_smirnov.cpp +++ b/test/test_kolmogorov_smirnov.cpp @@ -13,8 +13,8 @@ #include #include -template // Any floating-point type RealType. -void test_spots(RealType T) +template // Any floating-point type RealType. +void test_spots(RealType) { using namespace boost::math; // Test quantiles, CDFs, and complements From a1fa5e4a76c83c7154e9786b052ba0ca8286db42 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Wed, 2 Sep 2020 13:02:55 -0400 Subject: [PATCH 14/14] Tweak testing logic for K-S mode [CI SKIP] 100*tol fails on multiprecision (I am told). sqrt(eps) should be larger and work better across precisions. --- test/test_kolmogorov_smirnov.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp index fcbefc2f7e..1d949fd6a6 100644 --- a/test/test_kolmogorov_smirnov.cpp +++ b/test/test_kolmogorov_smirnov.cpp @@ -56,8 +56,8 @@ void test_spots(RealType) // Confirm mode is at least a local minimum RealType mode = boost::math::mode(dist); - BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode - 100 * tol)); - BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode + 100 * tol)); + BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode - sqrt(eps))); + BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode + sqrt(eps))); // Test the moments - each one integrates the entire distribution quadrature::exp_sinh integrator;