From 2e28810156a556e0d492a2075a7fe06333f05433 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 16:55:43 +0000 Subject: [PATCH 01/21] Add KS test for WeightedIndex --- distr_test/tests/weighted.rs | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 distr_test/tests/weighted.rs diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs new file mode 100644 index 0000000000..0a3078ccd0 --- /dev/null +++ b/distr_test/tests/weighted.rs @@ -0,0 +1,37 @@ +// Copyright 2024 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +mod ks; +use ks::test_discrete; +use rand::distr::WeightedIndex; + +#[test] +fn weighted_index() { + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap(); + + let mut cdf = Vec::with_capacity(num); + let mut ac = 0.0; + for i in 0..num { + ac += weight(i as i64); + cdf.push(ac); + } + let frac = 1.0 / ac; + for x in &mut cdf { + *x = *x * frac; + } + + test_discrete(0, distr, |i| if i < 0 { 0.0 } else { cdf[i as usize] }); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} From eb1836bb343627d189371331115b6478f5f18feb Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:05:51 +0000 Subject: [PATCH 02/21] Add KS test for WeightedAliasIndex --- distr_test/Cargo.toml | 2 +- distr_test/tests/weighted.rs | 48 +++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/distr_test/Cargo.toml b/distr_test/Cargo.toml index 7f37853b3c..36314b37cf 100644 --- a/distr_test/Cargo.toml +++ b/distr_test/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" publish = false [dev-dependencies] -rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false } +rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] } rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] } num-traits = "0.2.19" # Special functions for testing distributions diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index 0a3078ccd0..fe31346bd2 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -9,24 +9,50 @@ mod ks; use ks::test_discrete; use rand::distr::WeightedIndex; +use rand_distr::WeightedAliasIndex; + +fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { + let mut cdf = Vec::with_capacity(num); + let mut ac = 0.0; + for i in 0..num { + ac += f(i as i64); + cdf.push(ac); + } + + let frac = 1.0 / ac; + for x in &mut cdf { + *x = *x * frac; + } + + move |i| { + if i < 0 { + 0.0 + } else { + cdf[i as usize] + } + } +} #[test] fn weighted_index() { fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap(); + test_discrete(0, distr, make_cdf(num, weight)); + } - let mut cdf = Vec::with_capacity(num); - let mut ac = 0.0; - for i in 0..num { - ac += weight(i as i64); - cdf.push(ac); - } - let frac = 1.0 / ac; - for x in &mut cdf { - *x = *x * frac; - } + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} - test_discrete(0, distr, |i| if i < 0 { 0.0 } else { cdf[i as usize] }); +#[test] +fn weighted_alias_index() { + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let weights = (0..num).map(|i| weight(i as i64)).collect(); + let distr = WeightedAliasIndex::new(weights).unwrap(); + test_discrete(0, distr, make_cdf(num, weight)); } test_weights(100, |_| 1.0); From a8ce256881a2733ee36162de9e8a3e4417bf3be3 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:06:58 +0000 Subject: [PATCH 03/21] Add KS test for WeightedTreeIndex --- distr_test/tests/weighted.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index fe31346bd2..fadbad620e 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -9,7 +9,7 @@ mod ks; use ks::test_discrete; use rand::distr::WeightedIndex; -use rand_distr::WeightedAliasIndex; +use rand_distr::{WeightedAliasIndex, WeightedTreeIndex}; fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { let mut cdf = Vec::with_capacity(num); @@ -61,3 +61,17 @@ fn weighted_alias_index() { test_weights(100, |i| (i as f64).powi(3)); test_weights(100, |i| 1.0 / ((i + 1) as f64)); } + +#[test] +fn weighted_tree_index() { + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap(); + test_discrete(0, distr, make_cdf(num, weight)); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} From 9e03a1571064cf5428675169cb95329c833ffd92 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:20:35 +0000 Subject: [PATCH 04/21] Add KS test for IndexedRandom::choose_weighted --- distr_test/tests/weighted.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index fadbad620e..d577535f11 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -8,7 +8,8 @@ mod ks; use ks::test_discrete; -use rand::distr::WeightedIndex; +use rand::distr::{Distribution, WeightedIndex}; +use rand::seq::IndexedRandom; use rand_distr::{WeightedAliasIndex, WeightedTreeIndex}; fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { @@ -75,3 +76,24 @@ fn weighted_tree_index() { test_weights(100, |i| (i as f64).powi(3)); test_weights(100, |i| 1.0 / ((i + 1) as f64)); } + +#[test] +fn choose_weighted_indexed() { + struct Adapter f64>(Vec, F); + impl f64> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + *IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap() + } + } + + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); + test_discrete(0, distr, make_cdf(num, |i| weight(i))); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} From 4b0a2969eba758f851fb95fd998a4c7914b8f245 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:26:14 +0000 Subject: [PATCH 05/21] Add KS test for IndexedRandom::choose_multiple_weighted (one element) --- distr_test/tests/weighted.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index d577535f11..8097533087 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -97,3 +97,27 @@ fn choose_weighted_indexed() { test_weights(100, |i| (i as f64).powi(3)); test_weights(100, |i| 1.0 / ((i + 1) as f64)); } + +#[test] +fn choose_one_weighted_indexed() { + struct Adapter f64>(Vec, F); + impl f64> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + *IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i)) + .unwrap() + .next() + .unwrap() + } + } + + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); + test_discrete(0, distr, make_cdf(num, |i| weight(i))); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} From 25842126ff396d7b1f1a5e90061833eb91561142 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:42:42 +0000 Subject: [PATCH 06/21] Add KS test for IndexedRandom::choose_multiple_weighted (two elements) Some failures --- distr_test/tests/weighted.rs | 48 ++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index 8097533087..d0628c37fa 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -121,3 +121,51 @@ fn choose_one_weighted_indexed() { test_weights(100, |i| (i as f64).powi(3)); test_weights(100, |i| 1.0 / ((i + 1) as f64)); } + +#[test] +fn choose_two_weighted_indexed() { + struct Adapter f64>(Vec, F); + impl f64> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + let mut iter = + IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i)) + .unwrap(); + let a = *iter.next().unwrap(); + let b = *iter.next().unwrap(); + assert!(iter.next().is_none()); + a * self.0.len() as i64 + b + } + } + + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); + + let pmf1 = (0..num).map(|i| weight(i as i64)).collect::>(); + let total: f64 = pmf1.iter().sum(); + let mut ac = 0.0; + let frac = total.powi(-2); + let mut cdf = Vec::with_capacity(num * num); + for a in 0..num { + for b in 0..num { + ac += pmf1[a] * pmf1[b]; + cdf.push(ac * frac); + } + } + + let cdf = |i| { + if i < 0 { + 0.0 + } else { + cdf[i as usize] + } + }; + + test_discrete(0, distr, cdf); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + // test_weights(100, |i| i as f64); + // test_weights(100, |i| (i as f64).powi(3)); + // test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} From 336ddbe8fe5b435a8f8fbbb29a49f1f3d7e519b2 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:48:48 +0000 Subject: [PATCH 07/21] Add KS test for IteratorRandom::choose --- distr_test/tests/weighted.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index d0628c37fa..8df9604399 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -9,7 +9,7 @@ mod ks; use ks::test_discrete; use rand::distr::{Distribution, WeightedIndex}; -use rand::seq::IndexedRandom; +use rand::seq::{IndexedRandom, IteratorRandom}; use rand_distr::{WeightedAliasIndex, WeightedTreeIndex}; fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { @@ -169,3 +169,16 @@ fn choose_two_weighted_indexed() { // test_weights(100, |i| (i as f64).powi(3)); // test_weights(100, |i| 1.0 / ((i + 1) as f64)); } + +#[test] +fn choose_iterator() { + struct Adapter(I); + impl> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + IteratorRandom::choose(self.0.clone(), rng).unwrap() + } + } + + let distr = Adapter((0..100).map(|i| i as i64)); + test_discrete(0, distr, make_cdf(100, |_| 1.0)); +} From 2ef212b19653de10d52a3f0dc288123843a0a18f Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 17:50:10 +0000 Subject: [PATCH 08/21] Add KS test for IteratorRandom::choose_stable --- distr_test/tests/weighted.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index 8df9604399..9adefb73e8 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -182,3 +182,16 @@ fn choose_iterator() { let distr = Adapter((0..100).map(|i| i as i64)); test_discrete(0, distr, make_cdf(100, |_| 1.0)); } + +#[test] +fn choose_stable_iterator() { + struct Adapter(I); + impl> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + IteratorRandom::choose_stable(self.0.clone(), rng).unwrap() + } + } + + let distr = Adapter((0..100).map(|i| i as i64)); + test_discrete(0, distr, make_cdf(100, |_| 1.0)); +} From a0908bc44a3220ba1baa56b7a5d502e5ab6063f7 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 19:10:49 +0000 Subject: [PATCH 09/21] Add KS test for IteratorRandom::choose_multiple_fill (two elements) --- distr_test/tests/weighted.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index 9adefb73e8..a4c74224cf 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -195,3 +195,26 @@ fn choose_stable_iterator() { let distr = Adapter((0..100).map(|i| i as i64)); test_discrete(0, distr, make_cdf(100, |_| 1.0)); } + +#[test] +fn choose_two_iterator() { + struct Adapter(I); + impl> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + let mut buf = [0; 2]; + IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf); + buf.sort_unstable(); + assert!(buf[0] < 99 && buf[1] >= 1); + let a = buf[0]; + 4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1 + } + } + + let distr = Adapter((0..100).map(|i| i as i64)); + + test_discrete( + 0, + distr, + |i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 }, + ); +} From 865aba2c4f4fe02a80bbb8d262973691994d23c2 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 18 Nov 2024 19:28:11 +0000 Subject: [PATCH 10/21] Fix cdf for choose_two_weighted_indexed --- distr_test/tests/weighted.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index a4c74224cf..cb89287556 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -130,9 +130,12 @@ fn choose_two_weighted_indexed() { let mut iter = IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i)) .unwrap(); - let a = *iter.next().unwrap(); - let b = *iter.next().unwrap(); + let mut a = *iter.next().unwrap(); + let mut b = *iter.next().unwrap(); assert!(iter.next().is_none()); + if b < a { + std::mem::swap(&mut a, &mut b); + } a * self.0.len() as i64 + b } } @@ -141,16 +144,21 @@ fn choose_two_weighted_indexed() { let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); let pmf1 = (0..num).map(|i| weight(i as i64)).collect::>(); - let total: f64 = pmf1.iter().sum(); + let sum: f64 = pmf1.iter().sum(); + let sum_sq: f64 = pmf1.iter().map(|x| x * x).sum(); + let frac = 2.0 / (sum * sum - sum_sq); + let mut ac = 0.0; - let frac = total.powi(-2); let mut cdf = Vec::with_capacity(num * num); for a in 0..num { for b in 0..num { - ac += pmf1[a] * pmf1[b]; + if a < b { + ac += pmf1[a] * pmf1[b]; + } cdf.push(ac * frac); } } + assert!((ac * frac - 1.0).abs() < 1e-9); let cdf = |i| { if i < 0 { From 16a16c6420c9e1569d7a277da0344b2e17329e46 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 10:09:34 +0000 Subject: [PATCH 11/21] More complex test for choose_multiple_weighted --- src/seq/slice.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/seq/slice.rs b/src/seq/slice.rs index 4144f91345..1fc10c0985 100644 --- a/src/seq/slice.rs +++ b/src/seq/slice.rs @@ -732,17 +732,17 @@ mod test { use super::*; // The theoretical probabilities of the different outcomes are: - // AB: 0.5 * 0.5 = 0.250 - // AC: 0.5 * 0.5 = 0.250 - // BA: 0.25 * 0.67 = 0.167 - // BC: 0.25 * 0.33 = 0.082 - // CA: 0.25 * 0.67 = 0.167 - // CB: 0.25 * 0.33 = 0.082 - let choices = [('a', 2), ('b', 1), ('c', 1)]; + // AB: 0.5 * 0.667 = 0.3333 + // AC: 0.5 * 0.333 = 0.1667 + // BA: 0.333 * 0.75 = 0.25 + // BC: 0.333 * 0.25 = 0.0833 + // CA: 0.167 * 0.6 = 0.1 + // CB: 0.167 * 0.4 = 0.0667 + let choices = [('a', 3), ('b', 2), ('c', 1)]; let mut rng = crate::test::rng(414); let mut results = [0i32; 3]; - let expected_results = [4167, 4167, 1666]; + let expected_results = [5833, 2667, 1500]; for _ in 0..10000 { let result = choices .choose_multiple_weighted(&mut rng, 2, |item| item.1) From 554d3311a32ce91305d644d2c4250fdf05f6699d Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 10:23:49 +0000 Subject: [PATCH 12/21] Fix calculated CDF in choose_two_weighted_indexed --- distr_test/tests/weighted.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index cb89287556..0d24c88909 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -145,20 +145,25 @@ fn choose_two_weighted_indexed() { let pmf1 = (0..num).map(|i| weight(i as i64)).collect::>(); let sum: f64 = pmf1.iter().sum(); - let sum_sq: f64 = pmf1.iter().map(|x| x * x).sum(); - let frac = 2.0 / (sum * sum - sum_sq); + let frac = 1.0 / sum; let mut ac = 0.0; let mut cdf = Vec::with_capacity(num * num); for a in 0..num { for b in 0..num { if a < b { - ac += pmf1[a] * pmf1[b]; + let pa = pmf1[a] * frac; + let pab = pa * pmf1[b] / (sum - pmf1[a]); + + let pb = pmf1[b] * frac; + let pba = pb * pmf1[a] / (sum - pmf1[b]); + + ac += pab + pba; } - cdf.push(ac * frac); + cdf.push(ac); } } - assert!((ac * frac - 1.0).abs() < 1e-9); + assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9); let cdf = |i| { if i < 0 { @@ -173,9 +178,9 @@ fn choose_two_weighted_indexed() { test_weights(100, |_| 1.0); test_weights(100, |i| ((i + 1) as f64).ln()); - // test_weights(100, |i| i as f64); - // test_weights(100, |i| (i as f64).powi(3)); - // test_weights(100, |i| 1.0 / ((i + 1) as f64)); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); } #[test] From d6459528c5f04e93dcbd9c241c1cd9166ff65ac9 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 10:32:29 +0000 Subject: [PATCH 13/21] Test and fix choose_multiple_weighted with very small probabilities: #1476 Also improves choose_two_weighted_indexed time by 23% (excluding new test) --- distr_test/tests/weighted.rs | 1 + src/seq/index.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index 0d24c88909..d617de2b77 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -181,6 +181,7 @@ fn choose_two_weighted_indexed() { test_weights(100, |i| i as f64); test_weights(100, |i| (i as f64).powi(3)); test_weights(100, |i| 1.0 / ((i + 1) as f64)); + test_weights(10, |i| ((i + 1) as f64).powi(-8)); } #[test] diff --git a/src/seq/index.rs b/src/seq/index.rs index 70231dde2c..f457e8a162 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -392,7 +392,7 @@ where while index < length { let weight = weight(index.as_usize()).into(); if weight > 0.0 { - let key = rng.random::().powf(1.0 / weight); + let key = rng.random::().ln() / weight; candidates.push(Element { index, key }); } else if !(weight >= 0.0) { return Err(WeightError::InvalidWeight); From b806b29812c05e8b705358b5fd73041e475c704b Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 10:44:00 +0000 Subject: [PATCH 14/21] sample_efraimidis_spirakis: keep at most amount candidates Approx 2% improvement to tests sampling 2 of 100 elements --- src/seq/index.rs | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/seq/index.rs b/src/seq/index.rs index f457e8a162..24baa47fcb 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -387,13 +387,18 @@ where impl Eq for Element {} - let mut candidates = Vec::with_capacity(length.as_usize()); + let mut candidates = Vec::with_capacity(amount.as_usize()); let mut index = N::zero(); while index < length { let weight = weight(index.as_usize()).into(); if weight > 0.0 { let key = rng.random::().ln() / weight; - candidates.push(Element { index, key }); + if candidates.len() < amount.as_usize() { + candidates.push(Element { index, key }); + } else if key > candidates[0].key { + candidates[0] = Element { index, key }; + } + candidates.sort_unstable(); } else if !(weight >= 0.0) { return Err(WeightError::InvalidWeight); } @@ -406,18 +411,7 @@ where return Err(WeightError::InsufficientNonZero); } - // Partially sort the array to find the `amount` elements with the greatest - // keys. Do this by using `select_nth_unstable` to put the elements with - // the *smallest* keys at the beginning of the list in `O(n)` time, which - // provides equivalent information about the elements with the *greatest* keys. - let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize()); - - let mut result: Vec = Vec::with_capacity(amount.as_usize()); - result.push(mid.index); - for element in greater { - result.push(element.index); - } - Ok(IndexVec::from(result)) + Ok(IndexVec::from(candidates.iter().map(|elt| elt.index).collect())) } /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's From 0f662b1d94180d715ad006f2ce2fed9dd22501db Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 11:00:15 +0000 Subject: [PATCH 15/21] sample_efraimidis_spirakis: use algorithm A-ExpJ This results in approx 18% faster tests choosing 2-in-100 items --- src/seq/index.rs | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/seq/index.rs b/src/seq/index.rs index 24baa47fcb..00b9cac047 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -389,28 +389,44 @@ where let mut candidates = Vec::with_capacity(amount.as_usize()); let mut index = N::zero(); - while index < length { + while index < length && candidates.len() < amount.as_usize() { let weight = weight(index.as_usize()).into(); if weight > 0.0 { let key = rng.random::().ln() / weight; - if candidates.len() < amount.as_usize() { - candidates.push(Element { index, key }); - } else if key > candidates[0].key { - candidates[0] = Element { index, key }; - } - candidates.sort_unstable(); + // let key = rng.random::().powf(1.0 / weight); + candidates.push(Element { index, key }); } else if !(weight >= 0.0) { return Err(WeightError::InvalidWeight); } index += N::one(); } + candidates.sort_unstable(); - let avail = candidates.len(); - if avail < amount.as_usize() { + if candidates.len() < amount.as_usize() { return Err(WeightError::InsufficientNonZero); } + let mut x = rng.random::().ln() / candidates[0].key; + while index < length { + let weight = weight(index.as_usize()).into(); + if weight > 0.0 { + x -= weight; + if x <= 0.0 { + let t = core::f64::consts::E.powf(candidates[0].key * weight); + let key = rng.random_range(t..1.0).ln() / weight; + candidates[0] = Element { index, key }; + candidates.sort_unstable(); + + x = rng.random::().ln() / candidates[0].key; + } + } else if !(weight >= 0.0) { + return Err(WeightError::InvalidWeight); + } + + index += N::one(); + } + Ok(IndexVec::from(candidates.iter().map(|elt| elt.index).collect())) } From 51c6fdd3071fc9aaf8788bcfae9ff68a7cad1ded Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 11:04:28 +0000 Subject: [PATCH 16/21] Rustfmt --- src/seq/index.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/seq/index.rs b/src/seq/index.rs index 00b9cac047..6f3695602a 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -427,7 +427,9 @@ where index += N::one(); } - Ok(IndexVec::from(candidates.iter().map(|elt| elt.index).collect())) + Ok(IndexVec::from( + candidates.iter().map(|elt| elt.index).collect(), + )) } /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's From a1f61ae0e63959927b8deda24d56fe21824abe25 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Nov 2024 11:06:44 +0000 Subject: [PATCH 17/21] Clippy --- distr_test/tests/weighted.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index d617de2b77..c73ab6d632 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -22,7 +22,7 @@ fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { let frac = 1.0 / ac; for x in &mut cdf { - *x = *x * frac; + *x *= frac; } move |i| { @@ -88,7 +88,7 @@ fn choose_weighted_indexed() { fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); - test_discrete(0, distr, make_cdf(num, |i| weight(i))); + test_discrete(0, distr, make_cdf(num, &weight)); } test_weights(100, |_| 1.0); @@ -112,7 +112,7 @@ fn choose_one_weighted_indexed() { fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); - test_discrete(0, distr, make_cdf(num, |i| weight(i))); + test_discrete(0, distr, make_cdf(num, &weight)); } test_weights(100, |_| 1.0); From 1c147a4bc3a512e5ef01d554c46a01497cb73d63 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Sat, 23 Nov 2024 15:35:45 +0000 Subject: [PATCH 18/21] Temp: extra test --- tests/choose_multiple_weighted.rs | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tests/choose_multiple_weighted.rs diff --git a/tests/choose_multiple_weighted.rs b/tests/choose_multiple_weighted.rs new file mode 100644 index 0000000000..23082e2cff --- /dev/null +++ b/tests/choose_multiple_weighted.rs @@ -0,0 +1,39 @@ +use rand::{prelude::IndexedRandom, rng}; + +#[test] +fn main() { + let values: Vec<(i32, f64)> = + vec![(1, 0.080), (2, 0.0078), (3, 0.012)] + .into_iter() + .map(|v| (v.0, f64::powf(v.1, 4.0))) + .collect(); + + for v in &values { + println!("value={} has weight={:.10}", v.0, v.1); + } + let weight_sum: f64 = values.iter().map(|(_, w)| w).sum(); + let weights: Vec = values.iter().map(|(_, w)| w / weight_sum).collect(); + println!("expected: {weights:?}"); + + let largest_weight = values[0].1; + + let mut results = [0u32; 9]; + + for _ in 0..100_000 { + let mut iter = values.choose_multiple_weighted(&mut rng(), 2, |a| a.1 / largest_weight).unwrap(); + let a = iter.next().unwrap().0; + let b = iter.next().unwrap().0; + assert!(iter.next().is_none()); + + let i = (a - 1) * 3 + b - 1; + results[i as usize] += 1; + } + + println!("got following results:"); + for i in 0..9 { + let i0 = i / 3 + 1; + let i1 = i % 3 + 1; + println!("({}, {}):\t{}", i0, i1, results[i]); + } + assert!(false); +} From ddb8f5f887bcdc082f5274744124f52de9bc09a7 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Mon, 25 Nov 2024 17:05:13 +0000 Subject: [PATCH 19/21] Review feedback --- distr_test/tests/weighted.rs | 1 + src/seq/index.rs | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs index c73ab6d632..cf87b3ee63 100644 --- a/distr_test/tests/weighted.rs +++ b/distr_test/tests/weighted.rs @@ -12,6 +12,7 @@ use rand::distr::{Distribution, WeightedIndex}; use rand::seq::{IndexedRandom, IteratorRandom}; use rand_distr::{WeightedAliasIndex, WeightedTreeIndex}; +/// Takes the unnormalized pdf and creates the cdf of a discrete distribution fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { let mut cdf = Vec::with_capacity(num); let mut ac = 0.0; diff --git a/src/seq/index.rs b/src/seq/index.rs index 6f3695602a..39ac708a67 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -333,8 +333,8 @@ where /// ordering). The weights are to be provided by the input function `weights`, /// which will be called once for each index. /// -/// This implementation uses the algorithm described by Efraimidis and Spirakis -/// in this paper: +/// This implementation is based on the algorithm A-ExpJ as found in +/// [Efraimidis and Spirakis, 2005](https://doi.org/10.1016/j.ipl.2005.11.003). /// It uses `O(length + amount)` space and `O(length)` time. /// /// Error cases: @@ -392,8 +392,9 @@ where while index < length && candidates.len() < amount.as_usize() { let weight = weight(index.as_usize()).into(); if weight > 0.0 { + // We use the log of the key used in A-ExpJ to improve precision + // for small weights: let key = rng.random::().ln() / weight; - // let key = rng.random::().powf(1.0 / weight); candidates.push(Element { index, key }); } else if !(weight >= 0.0) { return Err(WeightError::InvalidWeight); @@ -413,9 +414,11 @@ where if weight > 0.0 { x -= weight; if x <= 0.0 { - let t = core::f64::consts::E.powf(candidates[0].key * weight); + let t = (candidates[0].key * weight).exp(); let key = rng.random_range(t..1.0).ln() / weight; candidates[0] = Element { index, key }; + // TODO: consider using a binary tree instead of sorting at each + // step. This should be faster for some THRESHOLD < amount. candidates.sort_unstable(); x = rng.random::().ln() / candidates[0].key; From a039a7fdf9c61b54c9fc0704ebdb7b3c318e8ed8 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 26 Nov 2024 09:44:03 +0000 Subject: [PATCH 20/21] Remove extra test --- tests/choose_multiple_weighted.rs | 39 ------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 tests/choose_multiple_weighted.rs diff --git a/tests/choose_multiple_weighted.rs b/tests/choose_multiple_weighted.rs deleted file mode 100644 index 23082e2cff..0000000000 --- a/tests/choose_multiple_weighted.rs +++ /dev/null @@ -1,39 +0,0 @@ -use rand::{prelude::IndexedRandom, rng}; - -#[test] -fn main() { - let values: Vec<(i32, f64)> = - vec![(1, 0.080), (2, 0.0078), (3, 0.012)] - .into_iter() - .map(|v| (v.0, f64::powf(v.1, 4.0))) - .collect(); - - for v in &values { - println!("value={} has weight={:.10}", v.0, v.1); - } - let weight_sum: f64 = values.iter().map(|(_, w)| w).sum(); - let weights: Vec = values.iter().map(|(_, w)| w / weight_sum).collect(); - println!("expected: {weights:?}"); - - let largest_weight = values[0].1; - - let mut results = [0u32; 9]; - - for _ in 0..100_000 { - let mut iter = values.choose_multiple_weighted(&mut rng(), 2, |a| a.1 / largest_weight).unwrap(); - let a = iter.next().unwrap().0; - let b = iter.next().unwrap().0; - assert!(iter.next().is_none()); - - let i = (a - 1) * 3 + b - 1; - results[i as usize] += 1; - } - - println!("got following results:"); - for i in 0..9 { - let i0 = i / 3 + 1; - let i1 = i % 3 + 1; - println!("({}, {}):\t{}", i0, i1, results[i]); - } - assert!(false); -} From 831353ff55bb30dadadd64c383b102e69cbf28e0 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 26 Nov 2024 09:44:24 +0000 Subject: [PATCH 21/21] fn sample_efraimidis_spirakis: use BinaryHeap --- src/seq/index.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/seq/index.rs b/src/seq/index.rs index 39ac708a67..852bdac76c 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -354,7 +354,7 @@ where N: UInt, IndexVec: From>, { - use std::cmp::Ordering; + use std::{cmp::Ordering, collections::BinaryHeap}; if amount == N::zero() { return Ok(IndexVec::U32(Vec::new())); @@ -373,9 +373,9 @@ where impl Ord for Element { fn cmp(&self, other: &Self) -> Ordering { - // partial_cmp will always produce a value, - // because we check that the weights are not nan - self.key.partial_cmp(&other.key).unwrap() + // unwrap() should not panic since weights should not be NaN + // We reverse so that BinaryHeap::peek shows the smallest item + self.key.partial_cmp(&other.key).unwrap().reverse() } } @@ -387,7 +387,7 @@ where impl Eq for Element {} - let mut candidates = Vec::with_capacity(amount.as_usize()); + let mut candidates = BinaryHeap::with_capacity(amount.as_usize()); let mut index = N::zero(); while index < length && candidates.len() < amount.as_usize() { let weight = weight(index.as_usize()).into(); @@ -402,26 +402,23 @@ where index += N::one(); } - candidates.sort_unstable(); if candidates.len() < amount.as_usize() { return Err(WeightError::InsufficientNonZero); } - let mut x = rng.random::().ln() / candidates[0].key; + let mut x = rng.random::().ln() / candidates.peek().unwrap().key; while index < length { let weight = weight(index.as_usize()).into(); if weight > 0.0 { x -= weight; if x <= 0.0 { - let t = (candidates[0].key * weight).exp(); + let min_candidate = candidates.pop().unwrap(); + let t = (min_candidate.key * weight).exp(); let key = rng.random_range(t..1.0).ln() / weight; - candidates[0] = Element { index, key }; - // TODO: consider using a binary tree instead of sorting at each - // step. This should be faster for some THRESHOLD < amount. - candidates.sort_unstable(); + candidates.push(Element { index, key }); - x = rng.random::().ln() / candidates[0].key; + x = rng.random::().ln() / candidates.peek().unwrap().key; } } else if !(weight >= 0.0) { return Err(WeightError::InvalidWeight);