diff --git a/NOTICE.txt b/NOTICE.txt index 82eff06994..9d626fa34a 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -15,3 +15,8 @@ This product contains a modified part of client_golang, distributed by prometheu * License: https://github.com/prometheus/client_golang/blob/master/LICENSE (Apache License v2.0) * Homepage: https://github.com/prometheus/client_golang + +This product contains a modified part of perf, distributed by golang: + + * License: https://github.com/golang/perf/blob/master/LICENSE + * Homepage: https://github.com/golang/perf diff --git a/pkg/app/piped/executor/analysis/mannwhitney/BUILD.bazel b/pkg/app/piped/executor/analysis/mannwhitney/BUILD.bazel new file mode 100644 index 0000000000..548284fa64 --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/BUILD.bazel @@ -0,0 +1,26 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "go_default_library", + srcs = [ + "alg.go", + "dist.go", + "mannwhitney.go", + "mathx.go", + "normaldist.go", + "udist.go", + ], + importpath = "github.com/pipe-cd/pipe/pkg/app/piped/executor/analysis/mannwhitney", + visibility = ["//visibility:public"], +) + +go_test( + name = "go_default_test", + size = "small", + srcs = [ + "mannwhitney_test.go", + "udist_test.go", + "util_test.go", + ], + embed = [":go_default_library"], +) diff --git a/pkg/app/piped/executor/analysis/mannwhitney/alg.go b/pkg/app/piped/executor/analysis/mannwhitney/alg.go new file mode 100644 index 0000000000..382f7a64f2 --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/alg.go @@ -0,0 +1,126 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mannwhitney + +// Miscellaneous helper algorithms + +import ( + "fmt" +) + +func maxint(a, b int) int { + if a > b { + return a + } + return b +} + +func minint(a, b int) int { + if a < b { + return a + } + return b +} + +func sumint(xs []int) int { + sum := 0 + for _, x := range xs { + sum += x + } + return sum +} + +// bisect returns an x in [low, high] such that |f(x)| <= tolerance +// using the bisection method. +// +// f(low) and f(high) must have opposite signs. +// +// If f does not have a root in this interval (e.g., it is +// discontiguous), this returns the X of the apparent discontinuity +// and false. 
+func bisect(f func(float64) float64, low, high, tolerance float64) (float64, bool) { + flow, fhigh := f(low), f(high) + if -tolerance <= flow && flow <= tolerance { + return low, true + } + if -tolerance <= fhigh && fhigh <= tolerance { + return high, true + } + if mathSign(flow) == mathSign(fhigh) { + panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%g f(%g)=%g", low, flow, high, fhigh)) + } + for { + mid := (high + low) / 2 + fmid := f(mid) + if -tolerance <= fmid && fmid <= tolerance { + return mid, true + } + if mid == high || mid == low { + return mid, false + } + if mathSign(fmid) == mathSign(flow) { + low = mid + flow = fmid + } else { + high = mid + fhigh = fmid + } + } +} + +// bisectBool implements the bisection method on a boolean function. +// It returns x1, x2 ∈ [low, high], x1 < x2 such that f(x1) != f(x2) +// and x2 - x1 <= xtol. +// +// If f(low) == f(high), it panics. +func bisectBool(f func(float64) bool, low, high, xtol float64) (x1, x2 float64) { + flow, fhigh := f(low), f(high) + if flow == fhigh { + panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%v f(%g)=%v", low, flow, high, fhigh)) + } + for { + if high-low <= xtol { + return low, high + } + mid := (high + low) / 2 + if mid == high || mid == low { + return low, high + } + fmid := f(mid) + if fmid == flow { + low = mid + flow = fmid + } else { + high = mid + fhigh = fmid + } + } +} + +// series returns the sum of the series f(0), f(1), ... +// +// This implementation is fast, but subject to round-off error. +func series(f func(float64) float64) float64 { + y, yp := 0.0, 1.0 + for n := 0.0; y != yp; n++ { + yp = y + y += f(n) + } + return y +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/dist.go b/pkg/app/piped/executor/analysis/mannwhitney/dist.go new file mode 100644 index 0000000000..226199b4eb --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/dist.go @@ -0,0 +1,224 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mannwhitney + +import "math/rand" + +// A DistCommon is a statistical distribution. DistCommon is a base +// interface provided by both continuous and discrete distributions. +type DistCommon interface { + // CDF returns the cumulative probability Pr[X <= x]. + // + // For continuous distributions, the CDF is the integral of + // the PDF from -inf to x. + // + // For discrete distributions, the CDF is the sum of the PMF + // at all defined points from -inf to x, inclusive. Note that + // the CDF of a discrete distribution is defined for the whole + // real line (unlike the PMF) but has discontinuities where + // the PMF is non-zero. + // + // The CDF is a monotonically increasing function and has a + // domain of all real numbers. 
If the distribution has bounded + // support, it has a range of [0, 1]; otherwise it has a range + // of (0, 1). Finally, CDF(-inf)==0 and CDF(inf)==1. + CDF(x float64) float64 + + // Bounds returns reasonable bounds for this distribution's + // PDF/PMF and CDF. The total weight outside of these bounds + // should be approximately 0. + // + // For a discrete distribution, both bounds are integer + // multiples of Step(). + // + // If this distribution has finite support, it returns exact + // bounds l, h such that CDF(l')=0 for all l' < l and + // CDF(h')=1 for all h' >= h. + Bounds() (float64, float64) +} + +// A Dist is a continuous statistical distribution. +type Dist interface { + DistCommon + + // PDF returns the value of the probability density function + // of this distribution at x. + PDF(x float64) float64 +} + +// A DiscreteDist is a discrete statistical distribution. +// +// Most discrete distributions are defined only at integral values of +// the random variable. However, some are defined at other intervals, +// so this interface takes a float64 value for the random variable. +// The probability mass function rounds down to the nearest defined +// point. Note that float64 values can exactly represent integer +// values between ±2**53, so this generally shouldn't be an issue for +// integer-valued distributions (likewise, for half-integer-valued +// distributions, float64 can exactly represent all values between +// ±2**52). +type DiscreteDist interface { + DistCommon + + // PMF returns the value of the probability mass function + // Pr[X = x'], where x' is x rounded down to the nearest + // defined point on the distribution. + // + // Note for implementers: for integer-valued distributions, + // round x using int(math.Floor(x)). Do not use int(x), since + // that truncates toward zero (unless all x <= 0 are handled + // the same). + PMF(x float64) float64 + + // Step returns s, where the distribution is defined for sℕ. + Step() float64 +} + +// TODO: Add a Support method for finite support distributions? Or +// maybe just another return value from Bounds indicating that the +// bounds are exact? + +// TODO: Plot method to return a pre-configured Plot object with +// reasonable bounds and an integral function? Have to distinguish +// PDF/CDF/InvCDF. Three methods? Argument? +// +// Doesn't have to be a method of Dist. Could be just a function that +// takes a Dist and uses Bounds. + +// InvCDF returns the inverse CDF function of the given distribution +// (also known as the quantile function or the percent point +// function). This is a function f such that f(dist.CDF(x)) == x. If +// dist.CDF is only weakly monotonic (that it, there are intervals +// over which it is constant) and y > 0, f returns the smallest x that +// satisfies this condition. In general, the inverse CDF is not +// well-defined for y==0, but for convenience if y==0, f returns the +// largest x that satisfies this condition. For distributions with +// infinite support both the largest and smallest x are -Inf; however, +// for distributions with finite support, this is the lower bound of +// the support. +// +// If y < 0 or y > 1, f returns NaN. +// +// If dist implements InvCDF(float64) float64, this returns that +// method. Otherwise, it returns a function that uses a generic +// numerical method to construct the inverse CDF at y by finding x +// such that dist.CDF(x) == y. This may have poor precision around +// points of discontinuity, including f(0) and f(1). 
+func InvCDF(dist DistCommon) func(y float64) (x float64) { + type invCDF interface { + InvCDF(float64) float64 + } + if dist, ok := dist.(invCDF); ok { + return dist.InvCDF + } + + // Otherwise, use a numerical algorithm. + // + // TODO: For discrete distributions, use the step size to + // inform this computation. + return func(y float64) (x float64) { + const almostInf = 1e100 + const xtol = 1e-16 + + if y < 0 || y > 1 { + return nan + } else if y == 0 { + l, _ := dist.Bounds() + if dist.CDF(l) == 0 { + // Finite support + return l + } else { + // Infinite support + return -inf + } + } else if y == 1 { + _, h := dist.Bounds() + if dist.CDF(h) == 1 { + // Finite support + return h + } else { + // Infinite support + return inf + } + } + + // Find loX, hiX for which cdf(loX) < y <= cdf(hiX). + var loX, loY, hiX, hiY float64 + x1, y1 := 0.0, dist.CDF(0) + xdelta := 1.0 + if y1 < y { + hiX, hiY = x1, y1 + for hiY < y && hiX != inf { + loX, loY, hiX = hiX, hiY, hiX+xdelta + hiY = dist.CDF(hiX) + xdelta *= 2 + } + } else { + loX, loY = x1, y1 + for y <= loY && loX != -inf { + hiX, hiY, loX = loX, loY, loX-xdelta + loY = dist.CDF(loX) + xdelta *= 2 + } + } + if loX == -inf { + return loX + } else if hiX == inf { + return hiX + } + + // Use bisection on the interval to find the smallest + // x at which cdf(x) <= y. + _, x = bisectBool(func(x float64) bool { + return dist.CDF(x) < y + }, loX, hiX, xtol) + return + } +} + +// Rand returns a random number generator that draws from the given +// distribution. The returned generator takes an optional source of +// randomness; if this is nil, it uses the default global source. +// +// If dist implements Rand(*rand.Rand) float64, Rand returns that +// method. Otherwise, it returns a generic generator based on dist's +// inverse CDF (which may in turn use an efficient implementation or a +// generic numerical implementation; see InvCDF). +func Rand(dist DistCommon) func(*rand.Rand) float64 { + type distRand interface { + Rand(*rand.Rand) float64 + } + if dist, ok := dist.(distRand); ok { + return dist.Rand + } + + // Otherwise, use a generic algorithm. + inv := InvCDF(dist) + return func(r *rand.Rand) float64 { + var y float64 + for y == 0 { + if r == nil { + y = rand.Float64() + } else { + y = r.Float64() + } + } + return inv(y) + } +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/mannwhitney.go b/pkg/app/piped/executor/analysis/mannwhitney/mannwhitney.go new file mode 100644 index 0000000000..a17a51f2a8 --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/mannwhitney.go @@ -0,0 +1,297 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mannwhitney + +import ( + "errors" + "math" + "sort" +) + +// A LocationHypothesis specifies the alternative hypothesis of a +// location test such as a t-test or a Mann-Whitney U-test. The +// default (zero) value is to test against the alternative hypothesis +// that they differ. +type LocationHypothesis int + +const ( + // LocationLess specifies the alternative hypothesis that the + // location of the first sample is less than the second. This + // is a one-tailed test. + LocationLess LocationHypothesis = -1 + + // LocationDiffers specifies the alternative hypothesis that + // the locations of the two samples are not equal. This is a + // two-tailed test. + LocationDiffers LocationHypothesis = 0 + + // LocationGreater specifies the alternative hypothesis that + // the location of the first sample is greater than the + // second. This is a one-tailed test. + LocationGreater LocationHypothesis = 1 +) + +var ( + inf = math.Inf(1) + nan = math.NaN() + + ErrSamplesEqual = errors.New("all samples are equal") + ErrSampleSize = errors.New("sample is too small") + ErrZeroVariance = errors.New("sample has zero variance") + ErrMismatchedSamples = errors.New("samples have different lengths") +) + +// A MannWhitneyUTestResult is the result of a Mann-Whitney U-test. +type MannWhitneyUTestResult struct { + // N1 and N2 are the sizes of the input samples. + N1, N2 int + + // U is the value of the Mann-Whitney U statistic for this + // test, generalized by counting ties as 0.5. + // + // Given the Cartesian product of the two samples, this is the + // number of pairs in which the value from the first sample is + // greater than the value of the second, plus 0.5 times the + // number of pairs where the values from the two samples are + // equal. Hence, U is always an integer multiple of 0.5 (it is + // a whole integer if there are no ties) in the range [0, N1*N2]. + // + // U statistics always come in pairs, depending on which + // sample is "first". The mirror U for the other sample can be + // calculated as N1*N2 - U. + // + // There are many equivalent statistics with slightly + // different definitions. The Wilcoxon (1945) W statistic + // (generalized for ties) is U + (N1(N1+1))/2. It is also + // common to use 2U to eliminate the half steps and Smid + // (1956) uses N1*N2 - 2U to additionally center the + // distribution. + U float64 + + // AltHypothesis specifies the alternative hypothesis tested + // by this test against the null hypothesis that there is no + // difference in the locations of the samples. + AltHypothesis LocationHypothesis + + // P is the p-value of the Mann-Whitney test for the given + // null hypothesis. + P float64 +} + +// MannWhitneyExactLimit gives the largest sample size for which the +// exact U distribution will be used for the Mann-Whitney U-test. +// +// Using the exact distribution is necessary for small sample sizes +// because the distribution is highly irregular. However, computing +// the distribution for large sample sizes is both computationally +// expensive and unnecessary because it quickly approaches a normal +// approximation. Computing the distribution for two 50 value samples +// takes a few milliseconds on a 2014 laptop. +var MannWhitneyExactLimit = 50 + +// MannWhitneyTiesExactLimit gives the largest sample size for which +// the exact U distribution will be used for the Mann-Whitney U-test +// in the presence of ties. +// +// Computing this distribution is more expensive than computing the +// distribution without ties, so this is set lower. 
Computing this +// distribution for two 25 value samples takes about ten milliseconds +// on a 2014 laptop. +var MannWhitneyTiesExactLimit = 25 + +// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null +// hypothesis that two samples come from the same population against +// the alternative hypothesis that one sample tends to have larger or +// smaller values than the other. +// +// This is similar to a t-test, but unlike the t-test, the +// Mann-Whitney U-test is non-parametric (it does not assume a normal +// distribution). It has very slightly lower efficiency than the +// t-test on normal distributions. +// +// Computing the exact U distribution is expensive for large sample +// sizes, so this uses a normal approximation for sample sizes larger +// than MannWhitneyExactLimit if there are no ties or +// MannWhitneyTiesExactLimit if there are ties. This normal +// approximation uses both the tie correction and the continuity +// correction. +// +// This can fail with ErrSampleSize if either sample is empty or +// ErrSamplesEqual if all sample values are equal. +// +// This is also known as a Mann-Whitney-Wilcoxon test and is +// equivalent to the Wilcoxon rank-sum test, though the Wilcoxon +// rank-sum test differs in nomenclature. +// +// [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of +// Whether one of Two Random Variables is Stochastically Larger than +// the Other". Annals of Mathematical Statistics 18 (1): 50–60. +// +// [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". +// Journal of the American Statistical Association 61 (315): 772-787. +func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) { + n1, n2 := len(x1), len(x2) + if n1 == 0 || n2 == 0 { + return nil, ErrSampleSize + } + + // Compute the U statistic and tie vector T. + x1 = append([]float64(nil), x1...) + x2 = append([]float64(nil), x2...) + sort.Float64s(x1) + sort.Float64s(x2) + merged, labels := labeledMerge(x1, x2) + + R1 := 0.0 + T, hasTies := []int{}, false + for i := 0; i < len(merged); { + rank1, nx1, v1 := i+1, 0, merged[i] + // Consume samples that tie this sample (including itself). + for ; i < len(merged) && merged[i] == v1; i++ { + if labels[i] == 1 { + nx1++ + } + } + // Assign all tied samples the average rank of the + // samples, where merged[0] has rank 1. + if nx1 != 0 { + rank := float64(i+rank1) / 2 + R1 += rank * float64(nx1) + } + T = append(T, i-rank1+1) + if i > rank1 { + hasTies = true + } + } + U1 := R1 - float64(n1*(n1+1))/2 + + // Compute the smaller of U1 and U2 + U2 := float64(n1*n2) - U1 + Usmall := math.Min(U1, U2) + + var p float64 + if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit || + hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit { + // Use exact U distribution. U1 will be an integer. + if len(T) == 1 { + // All values are equal. Test is meaningless. + return nil, ErrSamplesEqual + } + + dist := UDist{N1: n1, N2: n2, T: T} + switch alt { + case LocationDiffers: + if U1 == U2 { + // The distribution is symmetric about + // Usmall. Since the distribution is + // discrete, the CDF is discontinuous + // and if simply double CDF(Usmall), + // we'll double count the + // (non-infinitesimal) probability + // mass at Usmall. What we want is + // just the integral of the whole CDF, + // which is 1. 
+ p = 1 + } else { + p = dist.CDF(Usmall) * 2 + } + + case LocationLess: + p = dist.CDF(U1) + + case LocationGreater: + p = 1 - dist.CDF(U1-1) + } + } else { + // Use normal approximation (with tie and continuity + // correction). + t := tieCorrection(T) + N := float64(n1 + n2) + μ_U := float64(n1*n2) / 2 + σ_U := math.Sqrt(float64(n1*n2) * ((N + 1) - t/(N*(N-1))) / 12) + if σ_U == 0 { + return nil, ErrSamplesEqual + } + numer := U1 - μ_U + // Perform continuity correction. + switch alt { + case LocationDiffers: + numer -= mathSign(numer) * 0.5 + case LocationLess: + numer += 0.5 + case LocationGreater: + numer -= 0.5 + } + z := numer / σ_U + switch alt { + case LocationDiffers: + p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z)) + case LocationLess: + p = StdNormal.CDF(z) + case LocationGreater: + p = 1 - StdNormal.CDF(z) + } + } + + return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1, + AltHypothesis: alt, P: p}, nil +} + +// labeledMerge merges sorted lists x1 and x2 into sorted list merged. +// labels[i] is 1 or 2 depending on whether merged[i] is a value from +// x1 or x2, respectively. +func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) { + merged = make([]float64, len(x1)+len(x2)) + labels = make([]byte, len(x1)+len(x2)) + + i, j, o := 0, 0, 0 + for i < len(x1) && j < len(x2) { + if x1[i] < x2[j] { + merged[o] = x1[i] + labels[o] = 1 + i++ + } else { + merged[o] = x2[j] + labels[o] = 2 + j++ + } + o++ + } + for ; i < len(x1); i++ { + merged[o] = x1[i] + labels[o] = 1 + o++ + } + for ; j < len(x2); j++ { + merged[o] = x2[j] + labels[o] = 2 + o++ + } + return +} + +// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j) +// where t_j is the number of ties in the j'th rank. +func tieCorrection(ties []int) float64 { + t := 0 + for _, tie := range ties { + t += tie*tie*tie - tie + } + return float64(t) +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/mannwhitney_test.go b/pkg/app/piped/executor/analysis/mannwhitney/mannwhitney_test.go new file mode 100644 index 0000000000..6a63ecae71 --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/mannwhitney_test.go @@ -0,0 +1,94 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mannwhitney + +import "testing" + +func TestMannWhitneyUTest(t *testing.T) { + check := func(want, got *MannWhitneyUTestResult) { + if want.N1 != got.N1 || want.N2 != got.N2 || + !aeq(want.U, got.U) || + want.AltHypothesis != got.AltHypothesis || + !aeq(want.P, got.P) { + t.Errorf("want %+v, got %+v", want, got) + } + } + check3 := func(x1, x2 []float64, U float64, pless, pdiff, pgreater float64) { + want := &MannWhitneyUTestResult{N1: len(x1), N2: len(x2), U: U} + + want.AltHypothesis = LocationLess + want.P = pless + got, _ := MannWhitneyUTest(x1, x2, want.AltHypothesis) + check(want, got) + + want.AltHypothesis = LocationDiffers + want.P = pdiff + got, _ = MannWhitneyUTest(x1, x2, want.AltHypothesis) + check(want, got) + + want.AltHypothesis = LocationGreater + want.P = pgreater + got, _ = MannWhitneyUTest(x1, x2, want.AltHypothesis) + check(want, got) + } + + s1 := []float64{2, 1, 3, 5} + s2 := []float64{12, 11, 13, 15} + s3 := []float64{0, 4, 6, 7} // Interleaved with s1, but no ties + s4 := []float64{2, 2, 2, 2} + s5 := []float64{1, 1, 1, 1, 1} + + // Small sample, no ties + check3(s1, s2, 0, 0.014285714285714289, 0.028571428571428577, 1) + check3(s2, s1, 16, 1, 0.028571428571428577, 0.014285714285714289) + check3(s1, s3, 5, 0.24285714285714288, 0.485714285714285770, 0.8285714285714285) + + // Small sample, ties + // TODO: Check these against some other implementation. + check3(s1, s1, 8, 0.6285714285714286, 1, 0.6285714285714286) + check3(s1, s4, 10, 0.8571428571428571, 0.7142857142857143, 0.3571428571428571) + check3(s1, s5, 17.5, 1, 0, 0.04761904761904767) + + r, err := MannWhitneyUTest(s4, s4, LocationDiffers) + if err != ErrSamplesEqual { + t.Errorf("want ErrSamplesEqual, got %+v, %+v", r, err) + } + + // Large samples. + l1 := make([]float64, 500) + for i := range l1 { + l1[i] = float64(i * 2) + } + l2 := make([]float64, 600) + for i := range l2 { + l2[i] = float64(i*2 - 41) + } + l3 := append([]float64{}, l2...) + for i := 0; i < 30; i++ { + l3[i] = l1[i] + } + // For comparing with R's wilcox.test: + // l1 <- seq(0, 499)*2 + // l2 <- seq(0,599)*2-41 + // l3 <- l2; for (i in 1:30) { l3[i] = l1[i] } + + check3(l1, l2, 135250, 0.0024667680407086112, 0.0049335360814172224, 0.9975346930458906) + check3(l1, l1, 125000, 0.5000436801680628, 1, 0.5000436801680628) + check3(l1, l3, 134845, 0.0019351907119808942, 0.0038703814239617884, 0.9980659818257166) +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/mathx.go b/pkg/app/piped/executor/analysis/mannwhitney/mathx.go new file mode 100644 index 0000000000..8843b87663 --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/mathx.go @@ -0,0 +1,89 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mannwhitney + +import "math" + +// mathSign returns the sign of x: -1 if x < 0, 0 if x == 0, 1 if x > 0. +// If x is NaN, it returns NaN. +func mathSign(x float64) float64 { + if x == 0 { + return 0 + } else if x < 0 { + return -1 + } else if x > 0 { + return 1 + } + return nan +} + +const smallFactLimit = 20 // 20! => 62 bits +var smallFact [smallFactLimit + 1]int64 + +func init() { + smallFact[0] = 1 + fact := int64(1) + for n := int64(1); n <= smallFactLimit; n++ { + fact *= n + smallFact[n] = fact + } +} + +// mathChoose returns the binomial coefficient of n and k. +func mathChoose(n, k int) float64 { + if k == 0 || k == n { + return 1 + } + if k < 0 || n < k { + return 0 + } + if n <= smallFactLimit { // Implies k <= smallFactLimit + // It's faster to do several integer multiplications + // than it is to do an extra integer division. + // Remarkably, this is also faster than pre-computing + // Pascal's triangle (presumably because this is very + // cache efficient). + numer := int64(1) + for n1 := int64(n - (k - 1)); n1 <= int64(n); n1++ { + numer *= n1 + } + denom := smallFact[k] + return float64(numer / denom) + } + + return math.Exp(lchoose(n, k)) +} + +// mathLchoose returns math.Log(mathChoose(n, k)). +func mathLchoose(n, k int) float64 { + if k == 0 || k == n { + return 0 + } + if k < 0 || n < k { + return math.NaN() + } + return lchoose(n, k) +} + +func lchoose(n, k int) float64 { + a, _ := math.Lgamma(float64(n + 1)) + b, _ := math.Lgamma(float64(k + 1)) + c, _ := math.Lgamma(float64(n - k + 1)) + return a - b - c +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/normaldist.go b/pkg/app/piped/executor/analysis/mannwhitney/normaldist.go new file mode 100644 index 0000000000..20fc06e09f --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/normaldist.go @@ -0,0 +1,155 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mannwhitney + +import ( + "math" + "math/rand" +) + +// NormalDist is a normal (Gaussian) distribution with mean Mu and +// standard deviation Sigma. 
+type NormalDist struct { + Mu, Sigma float64 +} + +// StdNormal is the standard normal distribution (Mu = 0, Sigma = 1) +var StdNormal = NormalDist{0, 1} + +// 1/sqrt(2 * pi) +const invSqrt2Pi = 0.39894228040143267793994605993438186847585863116493465766592583 + +func (n NormalDist) PDF(x float64) float64 { + z := x - n.Mu + return math.Exp(-z*z/(2*n.Sigma*n.Sigma)) * invSqrt2Pi / n.Sigma +} + +func (n NormalDist) pdfEach(xs []float64) []float64 { + res := make([]float64, len(xs)) + if n.Mu == 0 && n.Sigma == 1 { + // Standard normal fast path + for i, x := range xs { + res[i] = math.Exp(-x*x/2) * invSqrt2Pi + } + } else { + a := -1 / (2 * n.Sigma * n.Sigma) + b := invSqrt2Pi / n.Sigma + for i, x := range xs { + z := x - n.Mu + res[i] = math.Exp(z*z*a) * b + } + } + return res +} + +func (n NormalDist) CDF(x float64) float64 { + return math.Erfc(-(x-n.Mu)/(n.Sigma*math.Sqrt2)) / 2 +} + +func (n NormalDist) cdfEach(xs []float64) []float64 { + res := make([]float64, len(xs)) + a := 1 / (n.Sigma * math.Sqrt2) + for i, x := range xs { + res[i] = math.Erfc(-(x-n.Mu)*a) / 2 + } + return res +} + +func (n NormalDist) InvCDF(p float64) (x float64) { + // This is based on Peter John Acklam's inverse normal CDF + // algorithm: http://home.online.no/~pjacklam/notes/invnorm/ + const ( + a1 = -3.969683028665376e+01 + a2 = 2.209460984245205e+02 + a3 = -2.759285104469687e+02 + a4 = 1.383577518672690e+02 + a5 = -3.066479806614716e+01 + a6 = 2.506628277459239e+00 + + b1 = -5.447609879822406e+01 + b2 = 1.615858368580409e+02 + b3 = -1.556989798598866e+02 + b4 = 6.680131188771972e+01 + b5 = -1.328068155288572e+01 + + c1 = -7.784894002430293e-03 + c2 = -3.223964580411365e-01 + c3 = -2.400758277161838e+00 + c4 = -2.549732539343734e+00 + c5 = 4.374664141464968e+00 + c6 = 2.938163982698783e+00 + + d1 = 7.784695709041462e-03 + d2 = 3.224671290700398e-01 + d3 = 2.445134137142996e+00 + d4 = 3.754408661907416e+00 + + plow = 0.02425 + phigh = 1 - plow + ) + + if p < 0 || p > 1 { + return nan + } else if p == 0 { + return -inf + } else if p == 1 { + return inf + } + + if p < plow { + // Rational approximation for lower region. + q := math.Sqrt(-2 * math.Log(p)) + x = (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) / + ((((d1*q+d2)*q+d3)*q+d4)*q + 1) + } else if phigh < p { + // Rational approximation for upper region. + q := math.Sqrt(-2 * math.Log(1-p)) + x = -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) / + ((((d1*q+d2)*q+d3)*q+d4)*q + 1) + } else { + // Rational approximation for central region. + q := p - 0.5 + r := q * q + x = (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r + a6) * q / + (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r + 1) + } + + // Refine approximation. + e := 0.5*math.Erfc(-x/math.Sqrt2) - p + u := e * math.Sqrt(2*math.Pi) * math.Exp(x*x/2) + x = x - u/(1+x*u/2) + + // Adjust from standard normal. + return x*n.Sigma + n.Mu +} + +func (n NormalDist) Rand(r *rand.Rand) float64 { + var x float64 + if r == nil { + x = rand.NormFloat64() + } else { + x = r.NormFloat64() + } + return x*n.Sigma + n.Mu +} + +func (n NormalDist) Bounds() (float64, float64) { + const stddevs = 3 + return n.Mu - stddevs*n.Sigma, n.Mu + stddevs*n.Sigma +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/udist.go b/pkg/app/piped/executor/analysis/mannwhitney/udist.go new file mode 100644 index 0000000000..c6b862ebbb --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/udist.go @@ -0,0 +1,401 @@ +// Copyright 2021 The PipeCD Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mannwhitney + +import ( + "math" +) + +// A UDist is the discrete probability distribution of the +// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2. +// +// The details of computing this distribution with no ties can be +// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of +// Whether one of Two Random Variables is Stochastically Larger than +// the Other". Annals of Mathematical Statistics 18 (1): 50–60. +// Computing this distribution in the presence of ties is described in +// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". +// Journal of the American Statistical Association 61 (315): 772-787 +// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney +// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: +// 805-813 (the former paper contains details that are glossed over in +// the latter paper but has mathematical typesetting issues, so it's +// easiest to get the context from the former paper and the details +// from the latter). +type UDist struct { + N1, N2 int + + // T is the count of the number of ties at each rank in the + // input distributions. T may be nil, in which case it is + // assumed there are no ties (which is equivalent to an M+N + // slice of 1s). It must be the case that Sum(T) == M+N. + T []int +} + +// hasTies returns true if d has any tied samples. +func (d UDist) hasTies() bool { + for _, t := range d.T { + if t > 1 { + return true + } + } + return false +} + +// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947 +// for values of U from 0 up to and including the U argument. +// +// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite +// fast for small values of N1 and N2. However, it does not handle ties. +func (d UDist) p(U int) []float64 { + // This is a dynamic programming implementation of the + // recursive recurrence definition given by Mann and Whitney: + // + // p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m) + // p_{n,m}(U) = 0 if U < 0 + // p_{0,m}(U) = p{n,0}(U) = 1 / nCr(m+n, n) if U = 0 + // = 0 if U > 0 + // + // (Note that there is a typo in the original paper. The first + // recursive application of p should be for U-m, not U-M.) + // + // Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only + // need to store one "plane" of the three dimensional space at + // a time. + // + // Furthermore, p_{n,m} = p_{m,n}, so we only construct values + // for n <= m and obtain the rest through symmetry. + // + // We organize the computed values of p as followed: + // + // n → N + // m * + // ↓ * * + // * * * + // * * * * + // * * * * + // M * * * * + // + // where each * is a slice indexed by U. 
The code below + // computes these left-to-right, top-to-bottom, so it only + // stores one row of this matrix at a time. Furthermore, + // computing an element in a given U slice only depends on the + // same and smaller values of U, so we can overwrite the U + // slice we're computing in place as long as we start with the + // largest value of U. Finally, even though the recurrence + // depends on (n,m) above the diagonal and we use symmetry to + // mirror those across the diagonal to (m,n), the mirrored + // indexes are always available in the current row, so this + // mirroring does not interfere with our ability to recycle + // state. + + N, M := d.N1, d.N2 + if N > M { + N, M = M, N + } + + memo := make([][]float64, N+1) + for n := range memo { + memo[n] = make([]float64, U+1) + } + + for m := 0; m <= M; m++ { + // Compute p_{0,m}. This is zero except for U=0. + memo[0][0] = 1 + + // Compute the remainder of this row. + nlim := N + if m < nlim { + nlim = m + } + for n := 1; n <= nlim; n++ { + lp := memo[n-1] // p_{n-1,m} + var rp []float64 + if n <= m-1 { + rp = memo[n] // p_{n,m-1} + } else { + rp = memo[m-1] // p{m-1,n} and m==n + } + + // For a given n,m, U is at most n*m. + // + // TODO: Actually, it's at most ⌈n*m/2⌉, but + // then we need to use more complex symmetries + // in the inner loop below. + ulim := n * m + if U < ulim { + ulim = U + } + + out := memo[n] // p_{n,m} + nplusm := float64(n + m) + for U1 := ulim; U1 >= 0; U1-- { + l := 0.0 + if U1-m >= 0 { + l = float64(n) * lp[U1-m] + } + r := float64(m) * rp[U1] + out[U1] = (l + r) / nplusm + } + } + } + return memo[N] +} + +type ukey struct { + n1 int // size of first sample + twoU int // 2*U statistic for this permutation +} + +// This computes the cumulative counts of the Mann-Whitney U +// distribution in the presence of ties using the computation from +// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney +// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: +// 805-813, with much guidance from appendix L of Klotz, A +// Computational Approach to Statistics. +// +// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the +// number of ranks (up to len(t)), n1 is the size of the first sample +// (up to the n1 argument), and U is the U statistic (up to the +// argument twoU/2). The value of an entry in the memo table is the +// number of permutations of a sample of size n1 in a ranking with tie +// vector t[:K] having a U statistic <= U. +func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 { + // Another candidate for a fast implementation is van de Wiel, + // "The split-up algorithm: a fast symbolic method for + // computing p-values of distribution-free statistics". This + // is what's used by R's coin package. It's a comparatively + // recent publication, so it's presumably faster (or perhaps + // just more general) than previous techniques, but I can't + // get my hands on the paper. + // + // TODO: ~40% of this function's time is spent in mapassign on + // the assignment lines in the two loops and another ~20% in + // map access and iteration. Improving map behavior or + // replacing the maps altogether with some other constant-time + // structure could double performance. + // + // TODO: The worst case for this function is when there are + // few ties. Yet the best case overall is when there are *no* + // ties. Can we get the best of both worlds? 
Use the fast + // algorithm for the most part when there are few ties and mix + // in the general algorithm just where we need it? That's + // certainly possible for sub-problems where t[:k] has no + // ties, but that doesn't help if t[0] has a tie but nothing + // else does. Is it possible to rearrange the ranks without + // messing up our computation of the U statistic for + // sub-problems? + + K := len(t) + + // Compute a coefficients. The a slice is indexed by k (a[0] + // is unused). + a := make([]int, K+1) + a[1] = t[0] + for k := 2; k <= K; k++ { + a[k] = a[k-1] + t[k-2] + t[k-1] + } + + // Create the memo table for the counts function, A. The A + // slice is indexed by k (A[0] is unused). + // + // In "The Mann Whitney Distribution Using Linked Lists", they + // use linked lists (*gasp*) for this, but within each K it's + // really just a memoization table, so it's faster to use a + // map. The outer structure is a slice indexed by k because we + // need to find all memo entries with certain values of k. + // + // TODO: The n1 and twoU values in the ukeys follow strict + // patterns. For each K value, the n1 values are every integer + // between two bounds. For each (K, n1) value, the twoU values + // are every integer multiple of a certain base between two + // bounds. It might be worth turning these into directly + // indexible slices. + A := make([]map[ukey]float64, K+1) + A[K] = map[ukey]float64{ukey{n1: n1, twoU: twoU}: 0} + + // Compute memo table (k, n1, twoU) triples from high K values + // to low K values. This drives the recurrence relation + // downward to figure out all of the needed argument triples. + // + // TODO: Is it possible to generate this table bottom-up? If + // so, this could be a pure dynamic programming algorithm and + // we could discard the K dimension. We could at least store + // the inputs in a more compact representation that replaces + // the twoU dimension with an interval and a step size (as + // suggested by Cheung, Klotz, not that they make it at all + // clear *why* they're suggesting this). + tsum := sumint(t) // always ∑ t[0:k] + for k := K - 1; k >= 2; k-- { + tsum -= t[k] + A[k] = make(map[ukey]float64) + + // Construct A[k] from A[k+1]. + for A_kplus1 := range A[k+1] { + rkLow := maxint(0, A_kplus1.n1-tsum) + rkHigh := minint(A_kplus1.n1, t[k]) + for rk := rkLow; rk <= rkHigh; rk++ { + twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk) + n1_k := A_kplus1.n1 - rk + if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) { + key := ukey{n1: n1_k, twoU: twoU_k} + A[k][key] = 0 + } + } + } + } + + // Fill counts in memo table from low K values to high K + // values. This unwinds the recurrence relation. + + // Start with K==2 base case. + // + // TODO: Later computations depend on these, but these don't + // depend on anything (including each other), so if K==2, we + // can skip the memo table altogether. + if K < 2 { + panic("K < 2") + } + N_2 := t[0] + t[1] + for A_2i := range A[2] { + Asum := 0.0 + r2Low := maxint(0, A_2i.n1-t[0]) + r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2 + for r2 := r2Low; r2 <= r2High; r2++ { + Asum += mathChoose(t[0], A_2i.n1-r2) * + mathChoose(t[1], r2) + } + A[2][A_2i] = Asum + } + + // Derive counts for the rest of the memo table. + tsum = t[0] // always ∑ t[0:k-1] + for k := 3; k <= K; k++ { + tsum += t[k-2] + + // Compute A[k] counts from A[k-1] counts. 
+ for A_ki := range A[k] { + Asum := 0.0 + rkLow := maxint(0, A_ki.n1-tsum) + rkHigh := minint(A_ki.n1, t[k-1]) + for rk := rkLow; rk <= rkHigh; rk++ { + twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk) + n1_kminus1 := A_ki.n1 - rk + x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}] + if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 { + x = mathChoose(tsum, n1_kminus1) + } + Asum += x * mathChoose(t[k-1], rk) + } + A[k][A_ki] = Asum + } + } + + return A +} + +func twoUmin(n1 int, t, a []int) int { + K := len(t) + twoU := -n1 * n1 + n1_k := n1 + for k := 1; k <= K; k++ { + twoU_k := minint(n1_k, t[k-1]) + twoU += twoU_k * a[k] + n1_k -= twoU_k + } + return twoU +} + +func twoUmax(n1 int, t, a []int) int { + K := len(t) + twoU := -n1 * n1 + n1_k := n1 + for k := K; k > 0; k-- { + twoU_k := minint(n1_k, t[k-1]) + twoU += twoU_k * a[k] + n1_k -= twoU_k + } + return twoU +} + +func (d UDist) PMF(U float64) float64 { + if U < 0 || U >= 0.5+float64(d.N1*d.N2) { + return 0 + } + + if d.hasTies() { + // makeUmemo computes the CDF directly. Take its + // difference to get the PMF. + p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}] + p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] + if !ok1 || !ok2 { + panic("makeUmemo did not return expected memoization table") + } + return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1) + } + + // There are no ties. Use the fast algorithm. U must be integral. + Ui := int(math.Floor(U)) + // TODO: Use symmetry to minimize U + return d.p(Ui)[Ui] +} + +func (d UDist) CDF(U float64) float64 { + if U < 0 { + return 0 + } else if U >= float64(d.N1*d.N2) { + return 1 + } + + if d.hasTies() { + // TODO: Minimize U? + p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] + if !ok { + panic("makeUmemo did not return expected memoization table") + } + return p / mathChoose(d.N1+d.N2, d.N1) + } + + // There are no ties. Use the fast algorithm. U must be integral. + Ui := int(math.Floor(U)) + // The distribution is symmetric around U = m * n / 2. Sum up + // whichever tail is smaller. + flip := Ui >= (d.N1*d.N2+1)/2 + if flip { + Ui = d.N1*d.N2 - Ui - 1 + } + pdfs := d.p(Ui) + p := 0.0 + for _, pdf := range pdfs[:Ui+1] { + p += pdf + } + if flip { + p = 1 - p + } + return p +} + +func (d UDist) Step() float64 { + return 0.5 +} + +func (d UDist) Bounds() (float64, float64) { + // TODO: More precise bounds when there are ties. + return 0, float64(d.N1 * d.N2) +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/udist_test.go b/pkg/app/piped/executor/analysis/mannwhitney/udist_test.go new file mode 100644 index 0000000000..a74bae4b1b --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/udist_test.go @@ -0,0 +1,336 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mannwhitney + +import ( + "fmt" + "math" + "testing" +) + +func aeqTable(a, b [][]float64) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if len(a[i]) != len(b[i]) { + return false + } + for j := range a[i] { + // "%f" precision + if math.Abs(a[i][j]-b[i][j]) >= 0.000001 { + return false + } + } + } + return true +} + +// U distribution for N=3 up to U=5. +var udist3 = [][]float64{ + // m=1 2 3 + {0.250000, 0.100000, 0.050000}, // U=0 + {0.500000, 0.200000, 0.100000}, // U=1 + {0.750000, 0.400000, 0.200000}, // U=2 + {1.000000, 0.600000, 0.350000}, // U=3 + {1.000000, 0.800000, 0.500000}, // U=4 + {1.000000, 0.900000, 0.650000}, // U=5 +} + +// U distribution for N=5 up to U=5. +var udist5 = [][]float64{ + // m=1 2 3 4 5 + {0.166667, 0.047619, 0.017857, 0.007937, 0.003968}, // U=0 + {0.333333, 0.095238, 0.035714, 0.015873, 0.007937}, // U=1 + {0.500000, 0.190476, 0.071429, 0.031746, 0.015873}, // U=2 + {0.666667, 0.285714, 0.125000, 0.055556, 0.027778}, // U=3 + {0.833333, 0.428571, 0.196429, 0.095238, 0.047619}, // U=4 + {1.000000, 0.571429, 0.285714, 0.142857, 0.075397}, // U=5 +} + +func TestUDist(t *testing.T) { + makeTable := func(n int) [][]float64 { + out := make([][]float64, 6) + for U := 0; U < 6; U++ { + out[U] = make([]float64, n) + for m := 1; m <= n; m++ { + out[U][m-1] = UDist{N1: m, N2: n}.CDF(float64(U)) + } + } + return out + } + fmtTable := func(a [][]float64) string { + out := fmt.Sprintf("%8s", "m=") + for m := 1; m <= len(a[0]); m++ { + out += fmt.Sprintf("%9d", m) + } + out += "\n" + + for U, row := range a { + out += fmt.Sprintf("U=%-6d", U) + for m := 1; m <= len(a[0]); m++ { + out += fmt.Sprintf(" %f", row[m-1]) + } + out += "\n" + } + return out + } + + // Compare against tables given in Mann, Whitney (1947). + got3 := makeTable(3) + if !aeqTable(got3, udist3) { + t.Errorf("For n=3, want:\n%sgot:\n%s", fmtTable(udist3), fmtTable(got3)) + } + + got5 := makeTable(5) + if !aeqTable(got5, udist5) { + t.Errorf("For n=5, want:\n%sgot:\n%s", fmtTable(udist5), fmtTable(got5)) + } +} + +func BenchmarkUDist(b *testing.B) { + for i := 0; i < b.N; i++ { + // R uses the exact distribution up to N=50. + // N*M/2=1250 is the hardest point to get the CDF for. + UDist{N1: 50, N2: 50}.CDF(1250) + } +} + +func TestUDistTies(t *testing.T) { + makeTable := func(m, N int, t []int, minx, maxx float64) [][]float64 { + out := [][]float64{} + dist := UDist{N1: m, N2: N - m, T: t} + for x := minx; x <= maxx; x += 0.5 { + // Convert x from uQt' to uQv'. + U := x - float64(m*m)/2 + P := dist.CDF(U) + if len(out) == 0 || !aeq(out[len(out)-1][1], P) { + out = append(out, []float64{x, P}) + } + } + return out + } + fmtTable := func(table [][]float64) string { + out := "" + for _, row := range table { + out += fmt.Sprintf("%5.1f %f\n", row[0], row[1]) + } + return out + } + + // Compare against Table 1 from Klotz (1966). 
+ got := makeTable(5, 10, []int{1, 1, 2, 1, 1, 2, 1, 1}, 12.5, 19.5) + want := [][]float64{ + {12.5, 0.003968}, {13.5, 0.007937}, + {15.0, 0.023810}, {16.5, 0.047619}, + {17.5, 0.071429}, {18.0, 0.087302}, + {19.0, 0.134921}, {19.5, 0.138889}, + } + if !aeqTable(got, want) { + t.Errorf("Want:\n%sgot:\n%s", fmtTable(want), fmtTable(got)) + } + + got = makeTable(10, 21, []int{6, 5, 4, 3, 2, 1}, 52, 87) + want = [][]float64{ + {52.0, 0.000014}, {56.5, 0.000128}, + {57.5, 0.000145}, {60.0, 0.000230}, + {61.0, 0.000400}, {62.0, 0.000740}, + {62.5, 0.000797}, {64.0, 0.000825}, + {64.5, 0.001165}, {65.5, 0.001477}, + {66.5, 0.002498}, {67.0, 0.002725}, + {67.5, 0.002895}, {68.0, 0.003150}, + {68.5, 0.003263}, {69.0, 0.003518}, + {69.5, 0.003603}, {70.0, 0.005648}, + {70.5, 0.005818}, {71.0, 0.006626}, + {71.5, 0.006796}, {72.0, 0.008157}, + {72.5, 0.009688}, {73.0, 0.009801}, + {73.5, 0.010430}, {74.0, 0.011111}, + {74.5, 0.014230}, {75.0, 0.014612}, + {75.5, 0.017249}, {76.0, 0.018307}, + {76.5, 0.020178}, {77.0, 0.022270}, + {77.5, 0.023189}, {78.0, 0.026931}, + {78.5, 0.028207}, {79.0, 0.029979}, + {79.5, 0.030931}, {80.0, 0.038969}, + {80.5, 0.043063}, {81.0, 0.044262}, + {81.5, 0.046389}, {82.0, 0.049581}, + {82.5, 0.056300}, {83.0, 0.058027}, + {83.5, 0.063669}, {84.0, 0.067454}, + {84.5, 0.074122}, {85.0, 0.077425}, + {85.5, 0.083498}, {86.0, 0.094079}, + {86.5, 0.096693}, {87.0, 0.101132}, + } + if !aeqTable(got, want) { + t.Errorf("Want:\n%sgot:\n%s", fmtTable(want), fmtTable(got)) + } + + got = makeTable(8, 16, []int{2, 2, 2, 2, 2, 2, 2, 2}, 32, 54) + want = [][]float64{ + {32.0, 0.000078}, {34.0, 0.000389}, + {36.0, 0.001088}, {38.0, 0.002642}, + {40.0, 0.005905}, {42.0, 0.011500}, + {44.0, 0.021057}, {46.0, 0.035664}, + {48.0, 0.057187}, {50.0, 0.086713}, + {52.0, 0.126263}, {54.0, 0.175369}, + } + if !aeqTable(got, want) { + t.Errorf("Want:\n%sgot:\n%s", fmtTable(want), fmtTable(got)) + } + + // Check remaining tables from Klotz against the reference + // implementation. + checkRef := func(n1 int, tie []int) { + wantPMF1, wantCDF1 := udistRef(n1, tie) + + dist := UDist{N1: n1, N2: sumint(tie) - n1, T: tie} + gotPMF, wantPMF := [][]float64{}, [][]float64{} + gotCDF, wantCDF := [][]float64{}, [][]float64{} + N := sumint(tie) + for U := 0.0; U <= float64(n1*(N-n1)); U += 0.5 { + gotPMF = append(gotPMF, []float64{U, dist.PMF(U)}) + gotCDF = append(gotCDF, []float64{U, dist.CDF(U)}) + wantPMF = append(wantPMF, []float64{U, wantPMF1[int(U*2)]}) + wantCDF = append(wantCDF, []float64{U, wantCDF1[int(U*2)]}) + } + if !aeqTable(wantPMF, gotPMF) { + t.Errorf("For PMF of n1=%v, t=%v, want:\n%sgot:\n%s", n1, tie, fmtTable(wantPMF), fmtTable(gotPMF)) + } + if !aeqTable(wantCDF, gotCDF) { + t.Errorf("For CDF of n1=%v, t=%v, want:\n%sgot:\n%s", n1, tie, fmtTable(wantCDF), fmtTable(gotCDF)) + } + } + checkRef(5, []int{1, 1, 2, 1, 1, 2, 1, 1}) + checkRef(5, []int{1, 1, 2, 1, 1, 1, 2, 1}) + checkRef(5, []int{1, 3, 1, 2, 1, 1, 1}) + checkRef(8, []int{1, 2, 1, 1, 1, 1, 2, 2, 1, 2}) + checkRef(12, []int{3, 3, 4, 3, 4, 5}) + checkRef(10, []int{1, 2, 3, 4, 5, 6}) +} + +func BenchmarkUDistTies(b *testing.B) { + // Worst case: just one tie. + n := 20 + t := make([]int, 2*n-1) + for i := range t { + t[i] = 1 + } + t[0] = 2 + + for i := 0; i < b.N; i++ { + UDist{N1: n, N2: n, T: t}.CDF(float64(n*n) / 2) + } +} + +func XTestPrintUmemo(t *testing.T) { + // Reproduce table from Cheung, Klotz. 
+ ties := []int{4, 5, 3, 4, 6} + printUmemo(makeUmemo(80, 10, ties), ties) +} + +// udistRef computes the PMF and CDF of the U distribution for two +// samples of sizes n1 and sum(t)-n1 with tie vector t. The returned +// pmf and cdf are indexed by 2*U. +// +// This uses the "graphical method" of Klotz (1966). It is very slow +// (Θ(∏ (t[i]+1)) = Ω(2^|t|)), but very correct, and hence useful as a +// reference for testing faster implementations. +func udistRef(n1 int, t []int) (pmf, cdf []float64) { + // Enumerate all u vectors for which 0 <= u_i <= t_i. Count + // the number of permutations of two samples of sizes n1 and + // sum(t)-n1 with tie vector t and accumulate these counts by + // their U statistics in count[2*U]. + counts := make([]int, 1+2*n1*(sumint(t)-n1)) + + u := make([]int, len(t)) + u[0] = -1 // Get enumeration started. +enumu: + for { + // Compute the next u vector. + u[0]++ + for i := 0; i < len(u) && u[i] > t[i]; i++ { + if i == len(u)-1 { + // All u vectors have been enumerated. + break enumu + } + // Carry. + u[i+1]++ + u[i] = 0 + } + + // Is this a legal u vector? + if sumint(u) != n1 { + // Klotz (1966) has a method for directly + // enumerating legal u vectors, but the point + // of this is to be correct, not fast. + continue + } + + // Compute 2*U statistic for this u vector. + twoU, vsum := 0, 0 + for i, u_i := range u { + v_i := t[i] - u_i + // U = U + vsum*u_i + u_i*v_i/2 + twoU += 2*vsum*u_i + u_i*v_i + vsum += v_i + } + + // Compute Π choose(t_i, u_i). This is the number of + // ways of permuting the input sample under u. + prod := 1 + for i, u_i := range u { + prod *= int(mathChoose(t[i], u_i) + 0.5) + } + + // Accumulate the permutations on this u path. + counts[twoU] += prod + + if false { + // Print a table in the form of Klotz's + // "direct enumeration" example. + // + // Convert 2U = 2UQV' to UQt' used in Klotz + // examples. + UQt := float64(twoU)/2 + float64(n1*n1)/2 + fmt.Printf("%+v %f %-2d\n", u, UQt, prod) + } + } + + // Convert counts into probabilities for PMF and CDF. + pmf = make([]float64, len(counts)) + cdf = make([]float64, len(counts)) + total := int(mathChoose(sumint(t), n1) + 0.5) + for i, count := range counts { + pmf[i] = float64(count) / float64(total) + if i > 0 { + cdf[i] = cdf[i-1] + } + cdf[i] += pmf[i] + } + return +} + +// printUmemo prints the output of makeUmemo for debugging. +func printUmemo(A []map[ukey]float64, t []int) { + fmt.Printf("K\tn1\t2*U\tpr\n") + for K := len(A) - 1; K >= 0; K-- { + for i, pr := range A[K] { + _, ref := udistRef(i.n1, t[:K]) + fmt.Printf("%v\t%v\t%v\t%v\t%v\n", K, i.n1, i.twoU, pr, ref[i.twoU]) + } + } +} diff --git a/pkg/app/piped/executor/analysis/mannwhitney/util_test.go b/pkg/app/piped/executor/analysis/mannwhitney/util_test.go new file mode 100644 index 0000000000..a4f6a8ce96 --- /dev/null +++ b/pkg/app/piped/executor/analysis/mannwhitney/util_test.go @@ -0,0 +1,131 @@ +// Copyright 2021 The PipeCD Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mannwhitney + +import ( + "fmt" + "math" + "sort" + "strings" + "testing" +) + +func testDiscreteCDF(t *testing.T, name string, dist DiscreteDist) { + // Build the expected CDF out of the PMF. + l, h := dist.Bounds() + s := dist.Step() + want := map[float64]float64{l - 0.1: 0, h: 1} + sum := 0.0 + for x := l; x < h; x += s { + sum += dist.PMF(x) + want[x] = sum + want[x+s/2] = sum + } + + testFunc(t, name, dist.CDF, want) +} + +func testInvCDF(t *testing.T, dist Dist, bounded bool) { + inv := InvCDF(dist) + name := fmt.Sprintf("InvCDF(%+v)", dist) + cdfName := fmt.Sprintf("CDF(%+v)", dist) + + // Test bounds. + vals := map[float64]float64{-0.01: nan, 1.01: nan} + if !bounded { + vals[0] = -inf + vals[1] = inf + } + testFunc(t, name, inv, vals) + + if bounded { + lo, hi := inv(0), inv(1) + vals := map[float64]float64{ + lo - 0.01: 0, lo: 0, + hi: 1, hi + 0.01: 1, + } + testFunc(t, cdfName, dist.CDF, vals) + if got := dist.CDF(lo + 0.01); !(got > 0) { + t.Errorf("%s(0)=%v, but %s(%v)=0", name, lo, cdfName, lo+0.01) + } + if got := dist.CDF(hi - 0.01); !(got < 1) { + t.Errorf("%s(1)=%v, but %s(%v)=1", name, hi, cdfName, hi-0.01) + } + } + + // Test points between. + vals = map[float64]float64{} + for _, p := range vecLinspace(0, 1, 11) { + if p == 0 || p == 1 { + continue + } + x := inv(p) + vals[x] = x + } + testFunc(t, fmt.Sprintf("InvCDF(CDF(%+v))", dist), + func(x float64) float64 { + return inv(dist.CDF(x)) + }, + vals) +} + +// aeq returns true if expect and got are equal to 8 significant +// figures (1 part in 100 million). +func aeq(expect, got float64) bool { + if expect < 0 && got < 0 { + expect, got = -expect, -got + } + return expect*0.99999999 <= got && got*0.99999999 <= expect +} + +func testFunc(t *testing.T, name string, f func(float64) float64, vals map[float64]float64) { + xs := make([]float64, 0, len(vals)) + for x := range vals { + xs = append(xs, x) + } + sort.Float64s(xs) + + for _, x := range xs { + want, got := vals[x], f(x) + if math.IsNaN(want) && math.IsNaN(got) || aeq(want, got) { + continue + } + var label string + if strings.Contains(name, "%v") { + label = fmt.Sprintf(name, x) + } else { + label = fmt.Sprintf("%s(%v)", name, x) + } + t.Errorf("want %s=%v, got %v", label, want, got) + } +} + +// vecLinspace returns num values spaced evenly between lo and hi, +// inclusive. If num is 1, this returns an array consisting of lo. +func vecLinspace(lo, hi float64, num int) []float64 { + res := make([]float64, num) + if num == 1 { + res[0] = lo + return res + } + for i := 0; i < num; i++ { + res[i] = lo + float64(i)*(hi-lo)/float64(num-1) + } + return res +}
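A minimal sketch of how the exported API added by this patch might be called from elsewhere in the pipe-cd/pipe module, assuming the import path declared in the BUILD.bazel file above. Only MannWhitneyUTest, LocationDiffers, the MannWhitneyUTestResult fields, and the exported error values come from this diff; the metric samples and the 0.05 threshold are illustrative assumptions, not values used by the analysis executor.

package main

import (
	"fmt"
	"log"

	"github.com/pipe-cd/pipe/pkg/app/piped/executor/analysis/mannwhitney"
)

func main() {
	// Hypothetical metric samples from a canary and a primary variant.
	canary := []float64{12.1, 11.8, 13.0, 12.4, 12.9}
	primary := []float64{10.2, 10.8, 9.9, 10.5, 10.1}

	// Two-tailed test: does either variant tend to have larger values?
	res, err := mannwhitney.MannWhitneyUTest(canary, primary, mannwhitney.LocationDiffers)
	if err != nil {
		// ErrSampleSize (an empty sample) or ErrSamplesEqual (all values equal).
		log.Fatal(err)
	}

	fmt.Printf("n1=%d n2=%d U=%.1f p=%.4f\n", res.N1, res.N2, res.U, res.P)

	// Illustrative threshold only; the real significance level would come
	// from the analysis stage configuration.
	const alpha = 0.05
	if res.P < alpha {
		fmt.Println("the two samples likely come from different populations")
	}
}

With samples this small and no ties, the exact U distribution is used (both sizes are below MannWhitneyExactLimit), so the reported p-value is exact rather than a normal approximation.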
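The tests above exercise the exact U distribution and the large-sample normal approximation separately; the sketch below, under the same import-path assumption, compares the two for one arbitrary tie-free case (sample sizes 3 and 5). The expected exact value 0.035714 is taken from the udist5 table in udist_test.go; the z formula mirrors the tie-free branch of MannWhitneyUTest.

package main

import (
	"fmt"
	"math"

	"github.com/pipe-cd/pipe/pkg/app/piped/executor/analysis/mannwhitney"
)

func main() {
	// Exact null distribution of U for two tie-free samples of sizes 3 and 5.
	d := mannwhitney.UDist{N1: 3, N2: 5}
	fmt.Printf("exact Pr[U <= 1] = %.6f\n", d.CDF(1)) // 0.035714 per the udist5 table

	// Normal approximation of the same lower-tail probability, with the
	// continuity correction used for LocationLess:
	// z = (U - n1*n2/2 + 0.5) / sqrt(n1*n2*(n1+n2+1)/12), p = Phi(z).
	n1, n2 := 3.0, 5.0
	z := (1 - n1*n2/2 + 0.5) / math.Sqrt(n1*n2*(n1+n2+1)/12)
	fmt.Printf("normal approx    = %.6f\n", mannwhitney.StdNormal.CDF(z))
}

Even at these tiny sizes the approximation is in the right neighborhood, which is why the package only switches to it above MannWhitneyExactLimit (or MannWhitneyTiesExactLimit when ties are present), where computing the exact distribution gets expensive.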
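dist.go's InvCDF helper either returns a distribution's own InvCDF method or falls back to a generic bisection over the CDF. A short sketch of both paths, again assuming the same import path; the concrete parameters are arbitrary.

package main

import (
	"fmt"

	"github.com/pipe-cd/pipe/pkg/app/piped/executor/analysis/mannwhitney"
)

func main() {
	// NormalDist implements InvCDF directly, so InvCDF returns that method.
	norm := mannwhitney.NormalDist{Mu: 10, Sigma: 2}
	q := mannwhitney.InvCDF(norm)
	fmt.Printf("95th percentile ~= %.4f\n", q(0.95)) // roughly 10 + 2*1.6449 = 13.29

	// UDist has no InvCDF method, so the generic bisection fallback is used.
	// Precision near the jumps of a discrete CDF is limited, as noted in the
	// InvCDF doc comment.
	qd := mannwhitney.InvCDF(mannwhitney.UDist{N1: 4, N2: 4})
	fmt.Printf("median of U      ~= %.1f\n", qd(0.5)) // about 8: the distribution is symmetric about N1*N2/2
}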