diff --git a/src/Core/Core.fsproj b/src/Core/Core.fsproj
index 8c0c5761..7073ae30 100644
--- a/src/Core/Core.fsproj
+++ b/src/Core/Core.fsproj
@@ -39,6 +39,7 @@
+
diff --git a/src/Core/RobustStats.fs b/src/Core/RobustStats.fs
new file mode 100644
index 00000000..48480820
--- /dev/null
+++ b/src/Core/RobustStats.fs
@@ -0,0 +1,125 @@
+namespace Zeta.Core
+
+open System
+
+
+/// **Robust statistical aggregation** — median plus median-absolute-
+/// deviation (MAD) with an outlier filter. The canonical operational
+/// shape for numeric-oracle aggregation proposed in Amara's 10th
+/// courier ferry (`docs/aurora/2026-04-23-amara-aurora-deep-research-
+/// report-10th-ferry.md`) — first graduation from the Amara-
+/// absorb-to-ship cadence (see the Otto-105 feedback memory
+/// `feedback_amara_contributions_must_operationalize_*_2026-04-24`).
+///
+/// **Why this shape** — the arithmetic mean inherits everything bad
+/// about every sample, including the ones that are wrong. The
+/// median survives just under half its inputs being adversarial.
+/// MAD is to the median what standard deviation is to the mean: a
+/// scale estimate that also survives outliers. The filter
+/// (`|x - median| <= 3 * max(MAD, epsilon)`) is the classical robust-
+/// aggregation move. It uses the raw MAD, so for Gaussian data the
+/// cut-off sits near 2 standard deviations (3 / 1.4826), not 3.
+/// `epsilon` is a degenerate-input floor: when MAD = 0 (more than
+/// half the samples equal the median) it keeps samples that differ
+/// from the median only by rounding noise instead of demanding
+/// exact equality.
+///
+/// **Relation to Zeta substrate** — this is a pure-function helper
+/// for downstream oracle / bullshit-detector / reputation-aggregation
+/// code; it does not depend on the Z-set algebra or the operator
+/// graph and does not need a streaming/incremental variant at this
+/// scale. If incremental-median is needed later, that's a separate
+/// module (t-digest / p-squared / HdrHistogram territory).
+///
+/// **Anti-consensus framing** — the implementation follows Amara's
+/// explicit rationale: *"agreement alone is not proof; what matters
+/// is independent, bounded, falsifiable convergence."* The robust
+/// aggregate reduces one mechanical failure mode — "a few loud
+/// outliers pull the mean" — without claiming it resolves
+/// independence-of-sources (that's `antiConsensusGate` territory,
+/// a separate graduation).
+[<RequireQualifiedAccess>]
+module RobustStats =
+
+    /// Degenerate-MAD floor used by `robustAggregate` when the
+    /// sample's MAD collapses to zero (more than half the values
+    /// equal the median, e.g. a constant or single-element sample).
+    /// `1e-9` matches Amara's 10th-ferry snippet; any positive floor
+    /// is fine.
+    [<Literal>]
+    let MadFloor = 1e-9
+
+    /// Ascending-sorted copy of `xs`; the caller's sequence is left
+    /// untouched because `Seq.toArray` always builds a fresh array.
+    let private sortedCopy (xs: double seq) : double[] =
+        let copy = Seq.toArray xs
+        Array.Sort(copy)
+        copy
+
+    /// Median of an array that is already sorted ascending. `None`
+    /// when the array is empty.
+    let private medianOfSorted (sorted: double[]) : double option =
+        let n = sorted.Length
+        if n = 0 then None
+        elif n % 2 = 1 then Some sorted.[n / 2]
+        else Some ((sorted.[n / 2 - 1] + sorted.[n / 2]) / 2.0)
+
+    /// Median of a sequence of `double`. Returns `None` on empty
+    /// input. Ties at even-length split to the arithmetic mean of
+    /// the two centre elements (the standard R-7 convention). The
+    /// input is sorted on a private copy and is never reordered.
+    /// Non-finite samples are not filtered here.
+    let median (xs: double seq) : double option =
+        xs |> sortedCopy |> medianOfSorted
+
+    /// Median-absolute-deviation around the sample's own median.
+    /// `None` on empty input. Uses the raw MAD definition (no
+    /// Gaussian-consistency scale factor `1.4826`; callers can apply
+    /// if they want standard-deviation-equivalent units).
+    let mad (xs: double seq) : double option =
+        // Materialise once: `xs` may be a lazy sequence, and it is
+        // walked twice (median, then deviations).
+        let arr = Seq.toArray xs
+        median arr
+        |> Option.bind (fun m ->
+            arr |> Array.map (fun x -> abs (x - m)) |> median)
+
+    /// **Robust aggregate** — drop outliers outside
+    /// `|x - median| <= 3 * max(MAD, MadFloor)`, then return the
+    /// median of the kept set.
+    ///
+    /// NaN and ±infinity samples are discarded before anything is
+    /// computed: they can never pass a finite threshold, and left in
+    /// they skew the median/MAD or turn the threshold into NaN (which
+    /// rejects every sample). Returns `None` when no finite sample
+    /// remains, which includes empty input.
+    ///
+    /// Amara's 10th-ferry F# snippet (preserved verbatim in
+    /// `docs/aurora/2026-04-23-amara-aurora-deep-research-report-
+    /// 10th-ferry.md` under §Prioritized implementation plan)
+    /// reproduced here against Zeta's `Array`-first shape:
+    ///
+    /// ```
+    /// let robustAggregate (xs: float list) =
+    ///     let median = Statistics.median xs
+    ///     let mad = Statistics.median (xs |> List.map (fun x -> abs (x - median)))
+    ///     let kept = xs |> List.filter (fun x -> abs (x - median) <= 3.0 * max mad 1e-9)
+    ///     Statistics.median kept
+    /// ```
+    let robustAggregate (xs: double seq) : double option =
+        // Sort once; the deviations and the kept set are derived from
+        // this array, so the kept set needs no second sort.
+        let sorted =
+            xs
+            |> Seq.filter (fun x -> not (Double.IsNaN x || Double.IsInfinity x))
+            |> sortedCopy
+        medianOfSorted sorted
+        |> Option.bind (fun centre ->
+            let deviations = sorted |> Array.map (fun x -> abs (x - centre))
+            Array.Sort(deviations)
+            medianOfSorted deviations
+            |> Option.bind (fun spread ->
+                let threshold = 3.0 * max spread MadFloor
+                sorted
+                |> Array.filter (fun x -> abs (x - centre) <= threshold)
+                |> medianOfSorted))
diff --git a/tests/Tests.FSharp/Algebra/RobustStats.Tests.fs b/tests/Tests.FSharp/Algebra/RobustStats.Tests.fs
new file mode 100644
index 00000000..2efff645
--- /dev/null
+++ b/tests/Tests.FSharp/Algebra/RobustStats.Tests.fs
@@ -0,0 +1,103 @@
+module Zeta.Tests.Algebra.RobustStatsTests
+
+open FsUnit.Xunit
+open global.Xunit
+open Zeta.Core
+
+
+// ─── Core: median on odd / even / empty ─────────
+
+[<Fact>]
+let ``median of empty sequence is None`` () =
+    RobustStats.median [] |> should equal (None: double option)
+
+[<Fact>]
+let ``median of single element returns that element`` () =
+    RobustStats.median [ 42.0 ] |> should equal (Some 42.0)
+
+[<Fact>]
+let ``median of odd-length sample picks middle element after sort`` () =
+    RobustStats.median [ 3.0; 1.0; 2.0 ] |> should equal (Some 2.0)
+
+[<Fact>]
+let ``median of even-length sample averages two centre elements`` () =
+    RobustStats.median [ 4.0; 2.0; 1.0; 3.0 ] |> should equal (Some 2.5)
+
+[<Fact>]
+let ``median does not reorder the caller's array`` () =
+    let xs = [| 3.0; 1.0; 2.0 |]
+    RobustStats.median xs |> should equal (Some 2.0)
+    List.ofArray xs |> should equal [ 3.0; 1.0; 2.0 ]
+
+
+// ─── MAD properties ─────────
+
+[<Fact>]
+let ``mad of empty sequence is None`` () =
+    RobustStats.mad [] |> should equal (None: double option)
+
+[<Fact>]
+let ``mad of constant sample is zero`` () =
+    RobustStats.mad [ 5.0; 5.0; 5.0; 5.0 ] |> should equal (Some 0.0)
+
+[<Fact>]
+let ``mad of 1 2 3 4 5 equals 1`` () =
+    // median = 3, deviations = 2,1,0,1,2, median of devs = 1.
+    RobustStats.mad [ 1.0; 2.0; 3.0; 4.0; 5.0 ] |> should equal (Some 1.0)
+
+
+// ─── robustAggregate: the load-bearing behaviour ─────────
+
+[<Fact>]
+let ``robustAggregate of empty sequence is None`` () =
+    RobustStats.robustAggregate [] |> should equal (None: double option)
+
+[<Fact>]
+let ``robustAggregate of single element returns that element`` () =
+    RobustStats.robustAggregate [ 7.0 ] |> should equal (Some 7.0)
+
+[<Fact>]
+let ``robustAggregate of constant sample returns the constant`` () =
+    // MAD = 0 here, so the threshold is 3 * MadFloor; every sample
+    // sits exactly on the median and is kept.
+    RobustStats.robustAggregate [ 5.0; 5.0; 5.0; 5.0; 5.0 ] |> should equal (Some 5.0)
+
+[<Fact>]
+let ``robustAggregate drops an outlier when MAD is zero`` () =
+    // median = 5, deviations = 0,0,0,0,95, MAD = 0; threshold =
+    // 3 * MadFloor, so only the 5s survive.
+    RobustStats.robustAggregate [ 5.0; 5.0; 5.0; 5.0; 100.0 ] |> should equal (Some 5.0)
+
+[<Fact>]
+let ``robustAggregate survives a single extreme outlier`` () =
+    // The mean of [1;2;3;4;5;1000] is 169.2 — a single adversarial
+    // sample has moved the answer beyond any legitimate reading. The
+    // robust aggregate discards the outlier and returns the median
+    // of the kept set.
+    let xs = [ 1.0; 2.0; 3.0; 4.0; 5.0; 1000.0 ]
+    let result = RobustStats.robustAggregate xs
+    // median = 3.5; MAD = 1.5; threshold = 4.5; 1000 is dropped;
+    // kept = [1;2;3;4;5]; median of kept = 3.
+    result |> should equal (Some 3.0)
+
+[<Fact>]
+let ``robustAggregate keeps values within three MAD of the median`` () =
+    // median = 3, MAD = 1, threshold = 3. Values 1..5 all satisfy
+    // |x - 3| <= 3; no outlier to drop. Kept-median = 3.
+    RobustStats.robustAggregate [ 1.0; 2.0; 3.0; 4.0; 5.0 ] |> should equal (Some 3.0)
+
+[<Fact>]
+let ``robustAggregate is unaffected by adding a mirrored outlier pair`` () =
+    // Symmetric extreme pair on both sides of the sample.
+    let baseline = RobustStats.robustAggregate [ 1.0; 2.0; 3.0; 4.0; 5.0 ]
+    let withOutliers = RobustStats.robustAggregate [ -1000.0; 1.0; 2.0; 3.0; 4.0; 5.0; 1000.0 ]
+    withOutliers |> should equal baseline
+
+[<Fact>]
+let ``robustAggregate ignores non-finite samples`` () =
+    // NaN / infinity are discarded up front, leaving [1;2]: median 1.5.
+    RobustStats.robustAggregate [ nan; 1.0; infinity; 2.0 ] |> should equal (Some 1.5)
+
+[<Fact>]
+let ``robustAggregate of only non-finite samples is None`` () =
+    RobustStats.robustAggregate [ nan; infinity; -infinity ] |> should equal (None: double option)
diff --git a/tests/Tests.FSharp/Tests.FSharp.fsproj b/tests/Tests.FSharp/Tests.FSharp.fsproj
index 875c577e..b65af635 100644
--- a/tests/Tests.FSharp/Tests.FSharp.fsproj
+++ b/tests/Tests.FSharp/Tests.FSharp.fsproj
@@ -17,6 +17,7 @@
+