Add KS tests for weighted sampling #1530

Merged · 21 commits · Nov 26, 2024
2 changes: 1 addition & 1 deletion distr_test/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
publish = false

[dev-dependencies]
-rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false }
+rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] }
rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] }
num-traits = "0.2.19"
# Special functions for testing distributions
234 changes: 234 additions & 0 deletions distr_test/tests/weighted.rs
@@ -0,0 +1,234 @@
// Copyright 2024 Developers of the Rand project.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

mod ks;
use ks::test_discrete;
use rand::distr::{Distribution, WeightedIndex};
use rand::seq::{IndexedRandom, IteratorRandom};
use rand_distr::{WeightedAliasIndex, WeightedTreeIndex};

fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 {
let mut cdf = Vec::with_capacity(num);
let mut ac = 0.0;
for i in 0..num {
ac += f(i as i64);
cdf.push(ac);
}

let frac = 1.0 / ac;
for x in &mut cdf {
*x *= frac;
}

move |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
}
}

#[test]
fn weighted_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_alias_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let weights = (0..num).map(|i| weight(i as i64)).collect();
let distr = WeightedAliasIndex::new(weights).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_tree_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_one_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i))
.unwrap()
.next()
.unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_two_weighted_indexed() {
Collaborator:
This is probably more complex than needed, but looks correct.
It's probably worth implementing chi squared at some point, but this should also be quite sensitive.

Member Author:
> This is probably more complex than needed, but looks correct.

You mean the use of an Adapter? Yes, but I'd sooner do this than revise the KS test API (which is well adapted for other usages).

> It's probably worth implementing chi squared at some point, but this should also be quite sensitive.

A fair point.

Collaborator:
I mean using KS for these distributions (chi squared would be more straightforward); the Adapter I think is fine.
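
For reference, a chi-squared goodness-of-fit statistic for these discrete outcomes could be computed along the lines of the sketch below (a hypothetical helper, not part of this PR; the acceptance threshold would still have to come from the chi-squared distribution with k − 1 degrees of freedom):

// Hypothetical sketch only: Pearson's statistic, the sum over outcomes of
// (observed - expected)^2 / expected, where `observed` holds sample counts
// and `expected` holds the counts implied by the weights.
fn chi_squared_statistic(observed: &[u64], expected: &[f64]) -> f64 {
    observed
        .iter()
        .zip(expected)
        .map(|(&o, &e)| {
            let d = o as f64 - e;
            d * d / e
        })
        .sum()
}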

struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut iter =
IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i))
.unwrap();
let mut a = *iter.next().unwrap();
let mut b = *iter.next().unwrap();
assert!(iter.next().is_none());
if b < a {
std::mem::swap(&mut a, &mut b);
}
a * self.0.len() as i64 + b
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);

let pmf1 = (0..num).map(|i| weight(i as i64)).collect::<Vec<f64>>();
let sum: f64 = pmf1.iter().sum();
let frac = 1.0 / sum;

let mut ac = 0.0;
let mut cdf = Vec::with_capacity(num * num);
for a in 0..num {
for b in 0..num {
if a < b {
let pa = pmf1[a] * frac;
let pab = pa * pmf1[b] / (sum - pmf1[a]);

let pb = pmf1[b] * frac;
let pba = pb * pmf1[a] / (sum - pmf1[b]);

ac += pab + pba;
}
cdf.push(ac);
}
}
assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9);

let cdf = |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
};

test_discrete(0, distr, cdf);
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
test_weights(10, |i| ((i + 1) as f64).powi(-8));
}
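
As a worked check on the CDF construction above: for a < b the increment pab + pba is the probability of drawing the unordered pair {a, b} without replacement, P(a then b) + P(b then a) = p_a · w_b / (W − w_a) + p_b · w_a / (W − w_b), where w_i are the weights, W is their sum and p_i = w_i / W. The pairs are accumulated in (a, b) index order, matching the Adapter's a * len + b encoding.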

#[test]
fn choose_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_stable_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose_stable(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_two_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut buf = [0; 2];
IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf);
buf.sort_unstable();
assert!(buf[0] < 99 && buf[1] >= 1);
let a = buf[0];
4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1
}
}

let distr = Adapter((0..100).map(|i| i as i64));

test_discrete(
0,
distr,
|i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 },
);
}
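
The index arithmetic above maps each sorted pair (a, b) with a < b over 0..100 to a distinct value in 0..4950 = C(100, 2): pairs whose first element is less than a account for 4950 − (99 − a)(100 − a) / 2 values, and buf[1] − a − 1 picks the offset within first element a. With all 4950 pairs equally likely, the expected CDF is (i + 1) / 4950, which is exactly what the test checks.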
42 changes: 27 additions & 15 deletions src/seq/index.rs
@@ -387,37 +387,49 @@

impl<N> Eq for Element<N> {}

-let mut candidates = Vec::with_capacity(length.as_usize());
+let mut candidates = Vec::with_capacity(amount.as_usize());
let mut index = N::zero();
-while index < length {
+while index < length && candidates.len() < amount.as_usize() {
let weight = weight(index.as_usize()).into();
if weight > 0.0 {
-let key = rng.random::<f64>().powf(1.0 / weight);
+let key = rng.random::<f64>().ln() / weight;
+// let key = rng.random::<f64>().powf(1.0 / weight);
candidates.push(Element { index, key });
} else if !(weight >= 0.0) {
return Err(WeightError::InvalidWeight);
}

index += N::one();
}
+candidates.sort_unstable();

-let avail = candidates.len();
-if avail < amount.as_usize() {
+if candidates.len() < amount.as_usize() {
return Err(WeightError::InsufficientNonZero);
}

-// Partially sort the array to find the `amount` elements with the greatest
-// keys. Do this by using `select_nth_unstable` to put the elements with
-// the *smallest* keys at the beginning of the list in `O(n)` time, which
-// provides equivalent information about the elements with the *greatest* keys.
-let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize());
+let mut x = rng.random::<f64>().ln() / candidates[0].key;
+while index < length {
+let weight = weight(index.as_usize()).into();
+if weight > 0.0 {
+x -= weight;
+if x <= 0.0 {
+let t = core::f64::consts::E.powf(candidates[0].key * weight);
+let key = rng.random_range(t..1.0).ln() / weight;
+candidates[0] = Element { index, key };
+candidates.sort_unstable();
+
+x = rng.random::<f64>().ln() / candidates[0].key;
+}
+} else if !(weight >= 0.0) {
+return Err(WeightError::InvalidWeight);
+}

-let mut result: Vec<N> = Vec::with_capacity(amount.as_usize());
-result.push(mid.index);
-for element in greater {
-result.push(element.index);
+index += N::one();
}
-Ok(IndexVec::from(result))
+
+Ok(IndexVec::from(
+candidates.iter().map(|elt| elt.index).collect(),
+))
}
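
The rewritten loop follows the exponential-jumps form of weighted reservoir sampling (Efraimidis–Spirakis A-ExpJ), with keys kept in log space as ln(u) / w rather than u^(1/w). The sketch below shows the same idea over a plain weight slice; it assumes all weights are strictly positive and amount <= weights.len(), and the names and structure are illustrative rather than taken from the crate:

use rand::Rng;

// Illustrative sketch (not the diff above): weighted reservoir sampling with
// exponential jumps, keys stored in log space as ln(u) / w.
fn sample_weighted_sketch<R: Rng + ?Sized>(
    rng: &mut R,
    weights: &[f64],
    amount: usize,
) -> Vec<usize> {
    // Fill the reservoir with the first `amount` items and their log-keys.
    let mut reservoir: Vec<(usize, f64)> = weights[..amount]
        .iter()
        .enumerate()
        .map(|(i, &w)| (i, rng.random::<f64>().ln() / w))
        .collect();
    // Keep the minimum-key element at index 0 (it is the next to be evicted).
    reservoir.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());

    // Exponential jump: skip items while the accumulated weight stays below x.
    let mut x = rng.random::<f64>().ln() / reservoir[0].1;
    for (i, &w) in weights.iter().enumerate().skip(amount) {
        x -= w;
        if x <= 0.0 {
            // Replace the minimum-key element; the new key is drawn so that it
            // is guaranteed to exceed the old minimum key.
            let t = (reservoir[0].1 * w).exp();
            let key = rng.random_range(t..1.0).ln() / w;
            reservoir[0] = (i, key);
            reservoir.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            x = rng.random::<f64>().ln() / reservoir[0].1;
        }
    }
    reservoir.into_iter().map(|(i, _)| i).collect()
}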

/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
16 changes: 8 additions & 8 deletions src/seq/slice.rs
@@ -732,17 +732,17 @@ mod test {
use super::*;

// The theoretical probabilities of the different outcomes are:
-// AB: 0.5 * 0.5 = 0.250
-// AC: 0.5 * 0.5 = 0.250
-// BA: 0.25 * 0.67 = 0.167
-// BC: 0.25 * 0.33 = 0.082
-// CA: 0.25 * 0.67 = 0.167
-// CB: 0.25 * 0.33 = 0.082
-let choices = [('a', 2), ('b', 1), ('c', 1)];
+// AB: 0.5 * 0.667 = 0.3333
+// AC: 0.5 * 0.333 = 0.1667
+// BA: 0.333 * 0.75 = 0.25
+// BC: 0.333 * 0.25 = 0.0833
+// CA: 0.167 * 0.6 = 0.1
+// CB: 0.167 * 0.4 = 0.0667
+let choices = [('a', 3), ('b', 2), ('c', 1)];
let mut rng = crate::test::rng(414);

let mut results = [0i32; 3];
-let expected_results = [4167, 4167, 1666];
+let expected_results = [5833, 2667, 1500];
for _ in 0..10000 {
let result = choices
.choose_multiple_weighted(&mut rng, 2, |item| item.1)
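
As a cross-check on the updated table: the unordered-pair probabilities work out to {a, b} = 0.3333 + 0.25 = 0.5833, {a, c} = 0.1667 + 0.1 = 0.2667 and {b, c} = 0.0833 + 0.0667 = 0.15, which over 10000 trials lines up with the new expected counts [5833, 2667, 1500].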