From 2be90ab1d4ce61114124868dacbd68e400b22019 Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Mon, 15 Dec 2025 17:09:44 -0800
Subject: [PATCH 1/5] ci: add memory and io benchmarks for building indices

---
 .github/workflows/ci-benchmarks.yml           |  2 +-
 .../ci_benchmarks/benchmarks/test_indexing.py | 60 +++++++++++++++++++
 python/python/lance/_datagen.py               |  3 +-
 3 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 python/python/ci_benchmarks/benchmarks/test_indexing.py

diff --git a/.github/workflows/ci-benchmarks.yml b/.github/workflows/ci-benchmarks.yml
index ab1da5a121b..2eadf65dd0d 100644
--- a/.github/workflows/ci-benchmarks.yml
+++ b/.github/workflows/ci-benchmarks.yml
@@ -45,7 +45,7 @@ jobs:
           python -m venv venv
           source venv/bin/activate
           pip install maturin duckdb requests pytest pytest-benchmark
-          maturin develop --locked --release
+          maturin develop --locked --release --features datagen
       - name: Build memtest
         run: |
           source venv/bin/activate
diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py
new file mode 100644
index 00000000000..33368c38de4
--- /dev/null
+++ b/python/python/ci_benchmarks/benchmarks/test_indexing.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The Lance Authors
+from pathlib import Path
+
+import lance
+import pyarrow as pa
+import pytest
+from lance._datagen import rand_batches
+
+
+@pytest.mark.parametrize(
+    "data_type", [pa.int64(), pa.string()], ids=["int64", "string"]
+)
+@pytest.mark.parametrize("index_type", ["btree", "bitmap", "zonemap", "bloomfilter"])
+@pytest.mark.io_memory_benchmark()
+def test_io_mem_build_scalar_index(
+    io_mem_benchmark, data_type: pa.DataType, index_type: str, tmp_path: Path
+):
+    schema = pa.schema([pa.field("col", data_type)])
+    # 100MB
+    data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
+    ds = lance.write_dataset(data, tmp_path)
+
+    def build_index(ds):
+        ds.create_scalar_index("col", index_type, replace=True)
+
+    io_mem_benchmark(build_index, ds, warmup=False)
+
+
+@pytest.mark.parametrize("with_positions", [True, False])
+@pytest.mark.io_memory_benchmark()
+def test_io_mem_build_fts(io_mem_benchmark, with_positions: bool, tmp_path: Path):
+    schema = pa.schema([pa.field("text", pa.string())])
+    # 100MB
+    data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
+    ds = lance.write_dataset(data, tmp_path)
+
+    def build_index(ds):
+        ds.create_scalar_index("text", "INVERTED", with_position=True, replace=True)
+
+    io_mem_benchmark(build_index, ds, warmup=False)
+
+
+@pytest.mark.io_memory_benchmark()
+def test_io_mem_build_ivf_pq(io_mem_benchmark, tmp_path: Path):
+    schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 1024))])
+    # 1GB
+    data = rand_batches(schema, num_batches=100, batch_size_bytes=10 * 1024 * 1024)
+    ds = lance.write_dataset(data, tmp_path)
+
+    def build_index(ds):
+        ds.create_index(
+            "vector",
+            index_type="IVF_PQ",
+            num_partitions=32,
+            num_sub_vectors=4,
+            replace=True,
+        )
+
+    io_mem_benchmark(build_index, ds, warmup=False)
diff --git a/python/python/lance/_datagen.py b/python/python/lance/_datagen.py
index 9c0e203cb77..b156066eca6 100644
--- a/python/python/lance/_datagen.py
+++ b/python/python/lance/_datagen.py
@@ -26,4 +26,5 @@ def rand_batches(
         raise NotImplementedError(
             "This version of lance was not built with the datagen feature"
         )
-    return datagen.rand_batches(schema, num_batches, batch_size_bytes)
+    batch_iter = datagen.rand_batches(schema, num_batches, batch_size_bytes)
+    return pa.RecordBatchReader.from_batches(schema, batch_iter)

From eccbc79ce321f0a4ad9ae4c7106ad11d69a464d4 Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Tue, 16 Dec 2025 10:11:16 -0800
Subject: [PATCH 2/5] Make FTS datagen easier.

---
 Cargo.lock                          |  1 +
 python/Cargo.lock                   |  1 +
 rust/lance-datagen/Cargo.toml       |  1 +
 rust/lance-datagen/src/generator.rs | 57 ++++++++++++++++++++++++-----
 4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index be4fb9b25ad..12e6d7e735f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4600,6 +4600,7 @@ dependencies = [
  "hex",
  "pprof",
  "rand 0.9.2",
+ "rand_distr 0.5.1",
  "rand_xoshiro",
  "random_word",
 ]
diff --git a/python/Cargo.lock b/python/Cargo.lock
index 43622d4d9df..d0bbee9d9d5 100644
--- a/python/Cargo.lock
+++ b/python/Cargo.lock
@@ -4019,6 +4019,7 @@ dependencies = [
  "half",
  "hex",
  "rand 0.9.2",
+ "rand_distr 0.5.1",
  "rand_xoshiro",
  "random_word",
 ]
diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml
index 2330d083f97..c192485b271 100644
--- a/rust/lance-datagen/Cargo.toml
+++ b/rust/lance-datagen/Cargo.toml
@@ -19,6 +19,7 @@ futures = { workspace = true }
 half = { workspace = true }
 hex = "0.4.3"
 rand = { workspace = true }
+rand_distr = { workspace = true }
 rand_xoshiro = { workspace = true }
 random_word = { version = "0.5", features = ["en"] }
 
diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs
index bc319c1ed2e..e4cf673069f 100644
--- a/rust/lance-datagen/src/generator.rs
+++ b/rust/lance-datagen/src/generator.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc};
+use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc, sync::LazyLock};
 
 use arrow::{
     array::{ArrayData, AsArray, Float32Builder, GenericBinaryBuilder, GenericStringBuilder},
@@ -21,6 +21,7 @@ use arrow_array::{
 use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
 use futures::{stream::BoxStream, StreamExt};
 use rand::{distr::Uniform, Rng, RngCore, SeedableRng};
+use rand_distr::Zipf;
 use random_word;
 
 use self::array::rand_with_distribution;
@@ -1172,21 +1173,55 @@ impl ArrayGenerator for BinaryPrefixPlusCounterGenerator {
     }
 }
 
-#[derive(Debug)]
+// Common English stop words placed at the front to be sampled more frequently
+const STOP_WORDS: &[&str] = &[
+    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with",
+];
+
+/// Word list with stop words at the front for Zipf sampling, computed once.
+static SENTENCE_WORDS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
+    let all_words = random_word::all(random_word::Lang::En);
+    let mut words = Vec::with_capacity(STOP_WORDS.len() + all_words.len());
+    words.extend(STOP_WORDS.iter().copied());
+    words.extend(
+        all_words
+            .iter()
+            .filter(|w| !STOP_WORDS.contains(w))
+            .copied(),
+    );
+    words
+});
+
 struct RandomSentenceGenerator {
     min_words: usize,
     max_words: usize,
-    words: &'static [&'static str],
+    /// Zipf distribution for word selection (favors lower indices)
+    zipf: Zipf<f64>,
     is_large: bool,
 }
 
+impl std::fmt::Debug for RandomSentenceGenerator {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RandomSentenceGenerator")
+            .field("min_words", &self.min_words)
+            .field("max_words", &self.max_words)
+            .field("num_words", &SENTENCE_WORDS.len())
+            .field("is_large", &self.is_large)
+            .finish()
+    }
+}
+
 impl RandomSentenceGenerator {
     pub fn new(min_words: usize, max_words: usize, is_large: bool) -> Self {
-        let words = random_word::all(random_word::Lang::En);
+        // Zipf distribution with exponent ~1.0 approximates natural language
+        let zipf = Zipf::new(SENTENCE_WORDS.len() as f64, 1.0).unwrap();
+
         Self {
             min_words,
             max_words,
-            words,
+            zipf,
             is_large,
         }
     }
@@ -1203,7 +1238,11 @@ impl ArrayGenerator for RandomSentenceGenerator {
         for _ in 0..length.0 {
             let num_words = rng.random_range(self.min_words..=self.max_words);
             let sentence: String = (0..num_words)
-                .map(|_| self.words[rng.random_range(0..self.words.len())])
+                .map(|_| {
+                    // Zipf returns 1-indexed values, subtract 1 for 0-indexed array
+                    let idx = rng.sample(self.zipf) as usize - 1;
+                    SENTENCE_WORDS[idx]
+                })
                 .collect::<Vec<_>>()
                 .join(" ");
             values.push(sentence);
@@ -2931,9 +2970,9 @@ mod tests {
         assert_eq!(
             *genn.generate(RowCount::from(3), &mut rng).unwrap(),
             arrow_array::BinaryArray::from_iter_values([
-                vec![234, 107],
-                vec![220, 152],
-                vec![21, 16, 184, 220]
+                vec![174, 178],
+                vec![64, 122, 207, 248],
+                vec![124, 3, 58]
             ])
         );
     }

From 5ea53033c6d38d6382c7ed7651061cc6c1356f1b Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Tue, 16 Dec 2025 10:40:38 -0800
Subject: [PATCH 3/5] choose FTS

---
 .../ci_benchmarks/benchmarks/test_indexing.py |  8 ++++++-
 rust/lance-datagen/src/generator.rs           | 22 +++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py
index 33368c38de4..d0593c60c49 100644
--- a/python/python/ci_benchmarks/benchmarks/test_indexing.py
+++ b/python/python/ci_benchmarks/benchmarks/test_indexing.py
@@ -30,7 +30,13 @@ def build_index(ds):
 @pytest.mark.parametrize("with_positions", [True, False])
 @pytest.mark.io_memory_benchmark()
 def test_io_mem_build_fts(io_mem_benchmark, with_positions: bool, tmp_path: Path):
-    schema = pa.schema([pa.field("text", pa.string())])
+    schema = pa.schema(
+        [
+            pa.field(
+                "text", pa.string(), metadata={"lance-datagen:content-type": "sentence"}
+            )
+        ]
+    )
     # 100MB
     data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
     ds = lance.write_dataset(data, tmp_path)
diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs
index e4cf673069f..4df8c8f6554 100644
--- a/rust/lance-datagen/src/generator.rs
+++ b/rust/lance-datagen/src/generator.rs
@@ -2788,13 +2788,31 @@ pub fn gen_array(genn: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
     ArrayGeneratorBuilder::new(genn)
 }
 
+/// Metadata key to specify content type for string generation.
+/// Set to "sentence" to use the sentence generator with Zipf distribution.
+pub const CONTENT_TYPE_KEY: &str = "lance-datagen:content-type";
+
+/// Create a generator for a field, checking metadata for content type hints.
+pub fn rand_field(field: &Field) -> Box<dyn ArrayGenerator> {
+    if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) {
+        match (content_type.as_str(), field.data_type()) {
+            ("sentence", DataType::Utf8) => return array::random_sentence(1, 10, false),
+            ("sentence", DataType::LargeUtf8) => return array::random_sentence(1, 10, true),
+            _ => {}
+        }
+    }
+    array::rand_type(field.data_type())
+}
+
 /// Create a BatchGeneratorBuilder with the given schema
 ///
-/// You can add more columns or convert this into a reader immediately
+/// You can add more columns or convert this into a reader immediately.
+/// Fields with metadata `lance-datagen:content-type` = `"sentence"` will use
+/// the sentence generator with Zipf distribution for more realistic text.
 pub fn rand(schema: &Schema) -> BatchGeneratorBuilder {
     let mut builder = BatchGeneratorBuilder::default();
     for field in schema.fields() {
-        builder = builder.col(field.name(), array::rand_type(field.data_type()));
+        builder = builder.col(field.name(), rand_field(field));
     }
     builder
 }

From 1fdd85dcd83b6194c6bb84975edc552243d3e3a6 Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Tue, 16 Dec 2025 12:02:57 -0800
Subject: [PATCH 4/5] cardinality limit

---
 .../ci_benchmarks/benchmarks/test_indexing.py |   6 +-
 rust/lance-datagen/src/generator.rs           | 116 ++++++++++++++++--
 2 files changed, 114 insertions(+), 8 deletions(-)

diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py
index d0593c60c49..8131fd41369 100644
--- a/python/python/ci_benchmarks/benchmarks/test_indexing.py
+++ b/python/python/ci_benchmarks/benchmarks/test_indexing.py
@@ -16,7 +16,11 @@
 def test_io_mem_build_scalar_index(
     io_mem_benchmark, data_type: pa.DataType, index_type: str, tmp_path: Path
 ):
-    schema = pa.schema([pa.field("col", data_type)])
+    metadata = None
+    if index_type == "bitmap":
+        metadata = {b"lance-datagen:cardinality": b"1000"}
+    schema = pa.schema([pa.field("col", data_type, metadata=metadata)])
+
     # 100MB
     data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
     ds = lance.write_dataset(data, tmp_path)
diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs
index 4df8c8f6554..348576f2586 100644
--- a/rust/lance-datagen/src/generator.rs
+++ b/rust/lance-datagen/src/generator.rs
@@ -1569,6 +1569,72 @@ impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGener
     }
 }
 
+/// Generator that produces low-cardinality data by generating a fixed set of
+/// unique values and then randomly selecting from them.
+struct LowCardinalityGenerator {
+    inner: Box<dyn ArrayGenerator>,
+    cardinality: usize,
+    /// Cached unique values, generated on first call
+    unique_values: Option<Arc<dyn Array>>,
+}
+
+impl std::fmt::Debug for LowCardinalityGenerator {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("LowCardinalityGenerator")
+            .field("inner", &self.inner)
+            .field("cardinality", &self.cardinality)
+            .field("initialized", &self.unique_values.is_some())
+            .finish()
+    }
+}
+
+impl LowCardinalityGenerator {
+    fn new(inner: Box<dyn ArrayGenerator>, cardinality: usize) -> Self {
+        Self {
+            inner,
+            cardinality,
+            unique_values: None,
+        }
+    }
+}
+
+impl ArrayGenerator for LowCardinalityGenerator {
+    fn generate(
+        &mut self,
+        length: RowCount,
+        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
+    ) -> Result<Arc<dyn Array>, ArrowError> {
+        // Generate unique values on first call
+        if self.unique_values.is_none() {
+            self.unique_values = Some(
+                self.inner
+                    .generate(RowCount::from(self.cardinality as u64), rng)?,
+            );
+        }
+
+        let unique_values = self.unique_values.as_ref().unwrap();
+
+        // Generate random indices into the unique values
+        let indices: Vec<usize> = (0..length.0)
+            .map(|_| rng.random_range(0..self.cardinality))
+            .collect();
+
+        // Use arrow's take to select values
+        let indices_array =
+            arrow_array::UInt32Array::from(indices.iter().map(|&i| i as u32).collect::<Vec<_>>());
+        arrow::compute::take(unique_values.as_ref(), &indices_array, None)
+            .map(|arr| arr as Arc<dyn Array>)
+    }
+
+    fn data_type(&self) -> &DataType {
+        self.inner.data_type()
+    }
+
+    fn element_size_bytes(&self) -> Option<ByteCount> {
+        self.inner.element_size_bytes()
+    }
+}
+
 #[derive(Debug)]
 struct RandomListGenerator {
     field: Arc<Field>,
@@ -2776,6 +2842,17 @@ pub mod array {
             _ => unimplemented!(),
         }
     }
+
+    /// Wraps a generator to produce low-cardinality data.
+    ///
+    /// Generates `cardinality` unique values on first call, then randomly
+    /// selects from them for all subsequent rows.
+    pub fn low_cardinality(
+        generator: Box<dyn ArrayGenerator>,
+        cardinality: usize,
+    ) -> Box<dyn ArrayGenerator> {
+        Box::new(LowCardinalityGenerator::new(generator, cardinality))
+    }
 }
 
 /// Create a BatchGeneratorBuilder to start generating batch data
@@ -2792,23 +2869,48 @@ pub fn gen_array(genn: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
 /// Set to "sentence" to use the sentence generator with Zipf distribution.
 pub const CONTENT_TYPE_KEY: &str = "lance-datagen:content-type";
 
+/// Metadata key to specify cardinality for low-cardinality data generation.
+/// Set to a numeric string (e.g., "100") to limit unique values.
+pub const CARDINALITY_KEY: &str = "lance-datagen:cardinality";
+
 /// Create a generator for a field, checking metadata for content type hints.
+///
+/// Supported metadata keys:
+/// - `lance-datagen:content-type`: Set to "sentence" for Utf8/LargeUtf8 fields
+///   to use the sentence generator with Zipf distribution.
+/// - `lance-datagen:cardinality`: Set to a number to limit unique values.
+///   The generator will produce only that many unique values and randomly
+///   select from them.
 pub fn rand_field(field: &Field) -> Box<dyn ArrayGenerator> {
-    if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) {
+    let mut generator = if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) {
         match (content_type.as_str(), field.data_type()) {
-            ("sentence", DataType::Utf8) => return array::random_sentence(1, 10, false),
-            ("sentence", DataType::LargeUtf8) => return array::random_sentence(1, 10, true),
-            _ => {}
+            ("sentence", DataType::Utf8) => array::random_sentence(1, 10, false),
+            ("sentence", DataType::LargeUtf8) => array::random_sentence(1, 10, true),
+            _ => array::rand_type(field.data_type()),
+        }
+    } else {
+        array::rand_type(field.data_type())
+    };
+
+    if let Some(cardinality_str) = field.metadata().get(CARDINALITY_KEY) {
+        if let Ok(cardinality) = cardinality_str.parse::<usize>() {
+            if cardinality > 0 {
+                generator = array::low_cardinality(generator, cardinality);
+            }
         }
     }
-    array::rand_type(field.data_type())
+
+    generator
 }
 
 /// Create a BatchGeneratorBuilder with the given schema
 ///
 /// You can add more columns or convert this into a reader immediately.
-/// Fields with metadata `lance-datagen:content-type` = `"sentence"` will use
-/// the sentence generator with Zipf distribution for more realistic text.
+///
+/// Supported field metadata:
+/// - `lance-datagen:content-type` = `"sentence"`: Use sentence generator with
+///   Zipf distribution for more realistic text (Utf8/LargeUtf8 only).
+/// - `lance-datagen:cardinality` = `"<number>"`: Limit to N unique values.
 pub fn rand(schema: &Schema) -> BatchGeneratorBuilder {
     let mut builder = BatchGeneratorBuilder::default();
     for field in schema.fields() {

From 90a0c4f611581d0342c93ff52811d3e8d9c858a2 Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Tue, 16 Dec 2025 12:22:24 -0800
Subject: [PATCH 5/5] fix formatting

---
 python/python/ci_benchmarks/benchmark.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/python/ci_benchmarks/benchmark.py b/python/python/ci_benchmarks/benchmark.py
index e735b8e382f..7d80596e305 100644
--- a/python/python/ci_benchmarks/benchmark.py
+++ b/python/python/ci_benchmarks/benchmark.py
@@ -69,6 +69,15 @@ def _format_bytes(num_bytes: int) -> str:
     return f"{num_bytes:.1f} PB"
 
 
+def _format_count(count: int) -> str:
+    """Format a large count with commas."""
+    for unit in ["", "K"]:
+        if abs(count) < 1000.0:
+            return f"{count:.1f} {unit}"
+        count /= 1000.0
+    return f"{count:.1f} M"
+
+
 class IOMemoryBenchmark:
     """Benchmark fixture that tracks IO and memory during execution."""
 
@@ -204,14 +213,14 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config):
             f"{'Read IOPS':>10}  {'Read Bytes':>12}  "
             f"{'Write IOPS':>10}  {'Write Bytes':>12}"
         )
-        terminalreporter.write_line("-" * (name_width + 72))
+        terminalreporter.write_line("-" * (name_width + 76))
     else:
         terminalreporter.write_line(
             f"{'Test':<{name_width}}  "
             f"{'Read IOPS':>10}  {'Read Bytes':>12}  "
             f"{'Write IOPS':>10}  {'Write Bytes':>12}"
         )
-        terminalreporter.write_line("-" * (name_width + 50))
+        terminalreporter.write_line("-" * (name_width + 52))
 
     # Results sorted by read bytes (descending)
     sorted_results = sorted(
@@ -224,7 +233,7 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config):
             terminalreporter.write_line(
                 f"{result.name:<{name_width}}  "
                 f"{_format_bytes(s.peak_bytes):>10}  "
-                f"{s.total_allocations:>10,}  "
+                f"{_format_count(s.total_allocations):>10}  "
                 f"{s.read_iops:>10,}  "
                 f"{_format_bytes(s.read_bytes):>12}  "
                 f"{s.write_iops:>10,}  "