From 2be90ab1d4ce61114124868dacbd68e400b22019 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 15 Dec 2025 17:09:44 -0800 Subject: [PATCH 1/5] ci: add memory and io benchmarks for building indices --- .github/workflows/ci-benchmarks.yml | 2 +- .../ci_benchmarks/benchmarks/test_indexing.py | 60 +++++++++++++++++++ python/python/lance/_datagen.py | 3 +- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 python/python/ci_benchmarks/benchmarks/test_indexing.py diff --git a/.github/workflows/ci-benchmarks.yml b/.github/workflows/ci-benchmarks.yml index ab1da5a121b..2eadf65dd0d 100644 --- a/.github/workflows/ci-benchmarks.yml +++ b/.github/workflows/ci-benchmarks.yml @@ -45,7 +45,7 @@ jobs: python -m venv venv source venv/bin/activate pip install maturin duckdb requests pytest pytest-benchmark - maturin develop --locked --release + maturin develop --locked --release --features datagen - name: Build memtest run: | source venv/bin/activate diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py new file mode 100644 index 00000000000..33368c38de4 --- /dev/null +++ b/python/python/ci_benchmarks/benchmarks/test_indexing.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors +from pathlib import Path + +import lance +import pyarrow as pa +import pytest +from lance._datagen import rand_batches + + +@pytest.mark.parametrize( + "data_type", [pa.int64(), pa.string()], ids=["int64", "string"] +) +@pytest.mark.parametrize("index_type", ["btree", "bitmap", "zonemap", "bloomfilter"]) +@pytest.mark.io_memory_benchmark() +def test_io_mem_build_scalar_index( + io_mem_benchmark, data_type: pa.DataType, index_type: str, tmp_path: Path +): + schema = pa.schema([pa.field("col", data_type)]) + # 100MB + data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024) + ds = lance.write_dataset(data, tmp_path) + + def build_index(ds): + ds.create_scalar_index("col", index_type, replace=True) + + io_mem_benchmark(build_index, ds, warmup=False) + + +@pytest.mark.parametrize("with_positions", [True, False]) +@pytest.mark.io_memory_benchmark() +def test_io_mem_build_fts(io_mem_benchmark, with_positions: bool, tmp_path: Path): + schema = pa.schema([pa.field("text", pa.string())]) + # 100MB + data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024) + ds = lance.write_dataset(data, tmp_path) + + def build_index(ds): + ds.create_scalar_index("text", "INVERTED", with_position=True, replace=True) + + io_mem_benchmark(build_index, ds, warmup=False) + + +@pytest.mark.io_memory_benchmark() +def test_io_mem_build_ivf_pq(io_mem_benchmark, tmp_path: Path): + schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 1024))]) + # 1GB + data = rand_batches(schema, num_batches=100, batch_size_bytes=10 * 1024 * 1024) + ds = lance.write_dataset(data, tmp_path) + + def build_index(ds): + ds.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=32, + num_sub_vectors=4, + replace=True, + ) + + io_mem_benchmark(build_index, ds, warmup=False) diff --git a/python/python/lance/_datagen.py b/python/python/lance/_datagen.py index 9c0e203cb77..b156066eca6 100644 --- a/python/python/lance/_datagen.py +++ b/python/python/lance/_datagen.py @@ -26,4 +26,5 @@ def rand_batches( raise NotImplementedError( "This version of lance was not built with the datagen feature" ) - return datagen.rand_batches(schema, num_batches, batch_size_bytes) + batch_iter = datagen.rand_batches(schema, num_batches, batch_size_bytes) + return pa.RecordBatchReader.from_batches(schema, batch_iter) From eccbc79ce321f0a4ad9ae4c7106ad11d69a464d4 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Dec 2025 10:11:16 -0800 Subject: [PATCH 2/5] Make FTS datagen easier. --- Cargo.lock | 1 + python/Cargo.lock | 1 + rust/lance-datagen/Cargo.toml | 1 + rust/lance-datagen/src/generator.rs | 57 ++++++++++++++++++++++++----- 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index be4fb9b25ad..12e6d7e735f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4600,6 +4600,7 @@ dependencies = [ "hex", "pprof", "rand 0.9.2", + "rand_distr 0.5.1", "rand_xoshiro", "random_word", ] diff --git a/python/Cargo.lock b/python/Cargo.lock index 43622d4d9df..d0bbee9d9d5 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4019,6 +4019,7 @@ dependencies = [ "half", "hex", "rand 0.9.2", + "rand_distr 0.5.1", "rand_xoshiro", "random_word", ] diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml index 2330d083f97..c192485b271 100644 --- a/rust/lance-datagen/Cargo.toml +++ b/rust/lance-datagen/Cargo.toml @@ -19,6 +19,7 @@ futures = { workspace = true } half = { workspace = true } hex = "0.4.3" rand = { workspace = true } +rand_distr = { workspace = true } rand_xoshiro = { workspace = true } random_word = { version = "0.5", features = ["en"] } diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index bc319c1ed2e..e4cf673069f 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc}; +use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc, sync::LazyLock}; use arrow::{ array::{ArrayData, AsArray, Float32Builder, GenericBinaryBuilder, GenericStringBuilder}, @@ -21,6 +21,7 @@ use arrow_array::{ use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef}; use futures::{stream::BoxStream, StreamExt}; use rand::{distr::Uniform, Rng, RngCore, SeedableRng}; +use rand_distr::Zipf; use random_word; use self::array::rand_with_distribution; @@ -1172,21 +1173,55 @@ impl ArrayGenerator for BinaryPrefixPlusCounterGenerator { } } -#[derive(Debug)] +// Common English stop words placed at the front to be sampled more frequently +const STOP_WORDS: &[&str] = &[ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with", +]; + +/// Word list with stop words at the front for Zipf sampling, computed once. +static SENTENCE_WORDS: LazyLock> = LazyLock::new(|| { + let all_words = random_word::all(random_word::Lang::En); + let mut words = Vec::with_capacity(STOP_WORDS.len() + all_words.len()); + words.extend(STOP_WORDS.iter().copied()); + words.extend( + all_words + .iter() + .filter(|w| !STOP_WORDS.contains(w)) + .copied(), + ); + words +}); + struct RandomSentenceGenerator { min_words: usize, max_words: usize, - words: &'static [&'static str], + /// Zipf distribution for word selection (favors lower indices) + zipf: Zipf, is_large: bool, } +impl std::fmt::Debug for RandomSentenceGenerator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RandomSentenceGenerator") + .field("min_words", &self.min_words) + .field("max_words", &self.max_words) + .field("num_words", &SENTENCE_WORDS.len()) + .field("is_large", &self.is_large) + .finish() + } +} + impl RandomSentenceGenerator { pub fn new(min_words: usize, max_words: usize, is_large: bool) -> Self { - let words = random_word::all(random_word::Lang::En); + // Zipf distribution with exponent ~1.0 approximates natural language + let zipf = Zipf::new(SENTENCE_WORDS.len() as f64, 1.0).unwrap(); + Self { min_words, max_words, - words, + zipf, is_large, } } @@ -1203,7 +1238,11 @@ impl ArrayGenerator for RandomSentenceGenerator { for _ in 0..length.0 { let num_words = rng.random_range(self.min_words..=self.max_words); let sentence: String = (0..num_words) - .map(|_| self.words[rng.random_range(0..self.words.len())]) + .map(|_| { + // Zipf returns 1-indexed values, subtract 1 for 0-indexed array + let idx = rng.sample(self.zipf) as usize - 1; + SENTENCE_WORDS[idx] + }) .collect::>() .join(" "); values.push(sentence); @@ -2931,9 +2970,9 @@ mod tests { assert_eq!( *genn.generate(RowCount::from(3), &mut rng).unwrap(), arrow_array::BinaryArray::from_iter_values([ - vec![234, 107], - vec![220, 152], - vec![21, 16, 184, 220] + vec![174, 178], + vec![64, 122, 207, 248], + vec![124, 3, 58] ]) ); } From 5ea53033c6d38d6382c7ed7651061cc6c1356f1b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Dec 2025 10:40:38 -0800 Subject: [PATCH 3/5] choose FTS --- .../ci_benchmarks/benchmarks/test_indexing.py | 8 ++++++- rust/lance-datagen/src/generator.rs | 22 +++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py index 33368c38de4..d0593c60c49 100644 --- a/python/python/ci_benchmarks/benchmarks/test_indexing.py +++ b/python/python/ci_benchmarks/benchmarks/test_indexing.py @@ -30,7 +30,13 @@ def build_index(ds): @pytest.mark.parametrize("with_positions", [True, False]) @pytest.mark.io_memory_benchmark() def test_io_mem_build_fts(io_mem_benchmark, with_positions: bool, tmp_path: Path): - schema = pa.schema([pa.field("text", pa.string())]) + schema = pa.schema( + [ + pa.field( + "text", pa.string(), metadata={"lance-datagen:content-type": "sentence"} + ) + ] + ) # 100MB data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024) ds = lance.write_dataset(data, tmp_path) diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index e4cf673069f..4df8c8f6554 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -2788,13 +2788,31 @@ pub fn gen_array(genn: Box) -> ArrayGeneratorBuilder { ArrayGeneratorBuilder::new(genn) } +/// Metadata key to specify content type for string generation. +/// Set to "sentence" to use the sentence generator with Zipf distribution. +pub const CONTENT_TYPE_KEY: &str = "lance-datagen:content-type"; + +/// Create a generator for a field, checking metadata for content type hints. +pub fn rand_field(field: &Field) -> Box { + if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) { + match (content_type.as_str(), field.data_type()) { + ("sentence", DataType::Utf8) => return array::random_sentence(1, 10, false), + ("sentence", DataType::LargeUtf8) => return array::random_sentence(1, 10, true), + _ => {} + } + } + array::rand_type(field.data_type()) +} + /// Create a BatchGeneratorBuilder with the given schema /// -/// You can add more columns or convert this into a reader immediately +/// You can add more columns or convert this into a reader immediately. +/// Fields with metadata `lance-datagen:content-type` = `"sentence"` will use +/// the sentence generator with Zipf distribution for more realistic text. pub fn rand(schema: &Schema) -> BatchGeneratorBuilder { let mut builder = BatchGeneratorBuilder::default(); for field in schema.fields() { - builder = builder.col(field.name(), array::rand_type(field.data_type())); + builder = builder.col(field.name(), rand_field(field)); } builder } From 1fdd85dcd83b6194c6bb84975edc552243d3e3a6 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Dec 2025 12:02:57 -0800 Subject: [PATCH 4/5] cardinality limit --- .../ci_benchmarks/benchmarks/test_indexing.py | 6 +- rust/lance-datagen/src/generator.rs | 116 ++++++++++++++++-- 2 files changed, 114 insertions(+), 8 deletions(-) diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py index d0593c60c49..8131fd41369 100644 --- a/python/python/ci_benchmarks/benchmarks/test_indexing.py +++ b/python/python/ci_benchmarks/benchmarks/test_indexing.py @@ -16,7 +16,11 @@ def test_io_mem_build_scalar_index( io_mem_benchmark, data_type: pa.DataType, index_type: str, tmp_path: Path ): - schema = pa.schema([pa.field("col", data_type)]) + metadata = None + if index_type == "bitmap": + metadata = {b"lance-datagen:cardinality": b"1000"} + schema = pa.schema([pa.field("col", data_type, metadata=metadata)]) + # 100MB data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024) ds = lance.write_dataset(data, tmp_path) diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index 4df8c8f6554..348576f2586 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -1569,6 +1569,72 @@ impl ArrayGenerator for DictionaryGener } } +/// Generator that produces low-cardinality data by generating a fixed set of +/// unique values and then randomly selecting from them. +struct LowCardinalityGenerator { + inner: Box, + cardinality: usize, + /// Cached unique values, generated on first call + unique_values: Option>, +} + +impl std::fmt::Debug for LowCardinalityGenerator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LowCardinalityGenerator") + .field("inner", &self.inner) + .field("cardinality", &self.cardinality) + .field("initialized", &self.unique_values.is_some()) + .finish() + } +} + +impl LowCardinalityGenerator { + fn new(inner: Box, cardinality: usize) -> Self { + Self { + inner, + cardinality, + unique_values: None, + } + } +} + +impl ArrayGenerator for LowCardinalityGenerator { + fn generate( + &mut self, + length: RowCount, + rng: &mut rand_xoshiro::Xoshiro256PlusPlus, + ) -> Result, ArrowError> { + // Generate unique values on first call + if self.unique_values.is_none() { + self.unique_values = Some( + self.inner + .generate(RowCount::from(self.cardinality as u64), rng)?, + ); + } + + let unique_values = self.unique_values.as_ref().unwrap(); + + // Generate random indices into the unique values + let indices: Vec = (0..length.0) + .map(|_| rng.random_range(0..self.cardinality)) + .collect(); + + // Use arrow's take to select values + let indices_array = + arrow_array::UInt32Array::from(indices.iter().map(|&i| i as u32).collect::>()); + arrow::compute::take(unique_values.as_ref(), &indices_array, None) + .map(|arr| arr as Arc) + } + + fn data_type(&self) -> &DataType { + self.inner.data_type() + } + + fn element_size_bytes(&self) -> Option { + self.inner.element_size_bytes() + } +} + #[derive(Debug)] struct RandomListGenerator { field: Arc, @@ -2776,6 +2842,17 @@ pub mod array { _ => unimplemented!(), } } + + /// Wraps a generator to produce low-cardinality data. + /// + /// Generates `cardinality` unique values on first call, then randomly + /// selects from them for all subsequent rows. + pub fn low_cardinality( + generator: Box, + cardinality: usize, + ) -> Box { + Box::new(LowCardinalityGenerator::new(generator, cardinality)) + } } /// Create a BatchGeneratorBuilder to start generating batch data @@ -2792,23 +2869,48 @@ pub fn gen_array(genn: Box) -> ArrayGeneratorBuilder { /// Set to "sentence" to use the sentence generator with Zipf distribution. pub const CONTENT_TYPE_KEY: &str = "lance-datagen:content-type"; +/// Metadata key to specify cardinality for low-cardinality data generation. +/// Set to a numeric string (e.g., "100") to limit unique values. +pub const CARDINALITY_KEY: &str = "lance-datagen:cardinality"; + /// Create a generator for a field, checking metadata for content type hints. +/// +/// Supported metadata keys: +/// - `lance-datagen:content-type`: Set to "sentence" for Utf8/LargeUtf8 fields +/// to use the sentence generator with Zipf distribution. +/// - `lance-datagen:cardinality`: Set to a number to limit unique values. +/// The generator will produce only that many unique values and randomly +/// select from them. pub fn rand_field(field: &Field) -> Box { - if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) { + let mut generator = if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) { match (content_type.as_str(), field.data_type()) { - ("sentence", DataType::Utf8) => return array::random_sentence(1, 10, false), - ("sentence", DataType::LargeUtf8) => return array::random_sentence(1, 10, true), - _ => {} + ("sentence", DataType::Utf8) => array::random_sentence(1, 10, false), + ("sentence", DataType::LargeUtf8) => array::random_sentence(1, 10, true), + _ => array::rand_type(field.data_type()), + } + } else { + array::rand_type(field.data_type()) + }; + + if let Some(cardinality_str) = field.metadata().get(CARDINALITY_KEY) { + if let Ok(cardinality) = cardinality_str.parse::() { + if cardinality > 0 { + generator = array::low_cardinality(generator, cardinality); + } } } - array::rand_type(field.data_type()) + + generator } /// Create a BatchGeneratorBuilder with the given schema /// /// You can add more columns or convert this into a reader immediately. -/// Fields with metadata `lance-datagen:content-type` = `"sentence"` will use -/// the sentence generator with Zipf distribution for more realistic text. +/// +/// Supported field metadata: +/// - `lance-datagen:content-type` = `"sentence"`: Use sentence generator with +/// Zipf distribution for more realistic text (Utf8/LargeUtf8 only). +/// - `lance-datagen:cardinality` = `""`: Limit to N unique values. pub fn rand(schema: &Schema) -> BatchGeneratorBuilder { let mut builder = BatchGeneratorBuilder::default(); for field in schema.fields() { From 90a0c4f611581d0342c93ff52811d3e8d9c858a2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Dec 2025 12:22:24 -0800 Subject: [PATCH 5/5] fix formatting --- python/python/ci_benchmarks/benchmark.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/python/ci_benchmarks/benchmark.py b/python/python/ci_benchmarks/benchmark.py index e735b8e382f..7d80596e305 100644 --- a/python/python/ci_benchmarks/benchmark.py +++ b/python/python/ci_benchmarks/benchmark.py @@ -69,6 +69,15 @@ def _format_bytes(num_bytes: int) -> str: return f"{num_bytes:.1f} PB" +def _format_count(count: int) -> str: + """Format a large count with commas.""" + for unit in ["", "K"]: + if abs(count) < 1000.0: + return f"{count:.1f} {unit}" + count /= 1000.0 + return f"{count:.1f} M" + + class IOMemoryBenchmark: """Benchmark fixture that tracks IO and memory during execution.""" @@ -204,14 +213,14 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): f"{'Read IOPS':>10} {'Read Bytes':>12} " f"{'Write IOPS':>10} {'Write Bytes':>12}" ) - terminalreporter.write_line("-" * (name_width + 72)) + terminalreporter.write_line("-" * (name_width + 76)) else: terminalreporter.write_line( f"{'Test':<{name_width}} " f"{'Read IOPS':>10} {'Read Bytes':>12} " f"{'Write IOPS':>10} {'Write Bytes':>12}" ) - terminalreporter.write_line("-" * (name_width + 50)) + terminalreporter.write_line("-" * (name_width + 52)) # Results sorted by read bytes (descending) sorted_results = sorted( @@ -224,7 +233,7 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): terminalreporter.write_line( f"{result.name:<{name_width}} " f"{_format_bytes(s.peak_bytes):>10} " - f"{s.total_allocations:>10,} " + f"{_format_count(s.total_allocations):>10} " f"{s.read_iops:>10,} " f"{_format_bytes(s.read_bytes):>12} " f"{s.write_iops:>10,} "