lance-format · wjones127 · Dec 17, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.github/workflows/ci-benchmarks.yml b/.github/workflows/ci-benchmarks.yml
@@ -45,7 +45,7 @@ jobs:
           python -m venv venv
           source venv/bin/activate
           pip install maturin duckdb requests pytest pytest-benchmark
-          maturin develop --locked --release
+          maturin develop --locked --release --features datagen
       - name: Build memtest
         run: |
           source venv/bin/activate

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/python/Cargo.lock b/python/Cargo.lock
diff --git a/python/python/ci_benchmarks/benchmark.py b/python/python/ci_benchmarks/benchmark.py
@@ -69,6 +69,15 @@ def _format_bytes(num_bytes: int) -> str:
     return f"{num_bytes:.1f} PB"
 
 
+def _format_count(count: int) -> str:
+    """Format a count compactly with a K/M suffix (e.g. 1500 -> '1.5 K')."""
+    if abs(count) < 1000:
+        return str(count)  # small counts are exact; no decimals, no dangling space
+    if abs(count) < 1_000_000:
+        return f"{count / 1000.0:.1f} K"
+    return f"{count / 1_000_000.0:.1f} M"
+
+
 class IOMemoryBenchmark:
     """Benchmark fixture that tracks IO and memory during execution."""
 
@@ -204,14 +213,14 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config):
             f"{'Read IOPS':>10}  {'Read Bytes':>12}  "
             f"{'Write IOPS':>10}  {'Write Bytes':>12}"
         )
-        terminalreporter.write_line("-" * (name_width + 72))
+        terminalreporter.write_line("-" * (name_width + 76))
     else:
         terminalreporter.write_line(
             f"{'Test':<{name_width}}  "
             f"{'Read IOPS':>10}  {'Read Bytes':>12}  "
             f"{'Write IOPS':>10}  {'Write Bytes':>12}"
         )
-        terminalreporter.write_line("-" * (name_width + 50))
+        terminalreporter.write_line("-" * (name_width + 52))
 
     # Results sorted by read bytes (descending)
     sorted_results = sorted(
@@ -224,7 +233,7 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config):
             terminalreporter.write_line(
                 f"{result.name:<{name_width}}  "
                 f"{_format_bytes(s.peak_bytes):>10}  "
-                f"{s.total_allocations:>10,}  "
+                f"{_format_count(s.total_allocations):>10}  "
                 f"{s.read_iops:>10,}  "
                 f"{_format_bytes(s.read_bytes):>12}  "
                 f"{s.write_iops:>10,}  "

diff --git a/python/python/ci_benchmarks/benchmarks/test_indexing.py b/python/python/ci_benchmarks/benchmarks/test_indexing.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The Lance Authors
+from pathlib import Path
+
+import lance
+import pyarrow as pa
+import pytest
+from lance._datagen import rand_batches
+
+
+@pytest.mark.parametrize(
+    "data_type", [pa.int64(), pa.string()], ids=["int64", "string"]
+)
+@pytest.mark.parametrize("index_type", ["btree", "bitmap", "zonemap", "bloomfilter"])
+@pytest.mark.io_memory_benchmark()
+def test_io_mem_build_scalar_index(
+    io_mem_benchmark, data_type: pa.DataType, index_type: str, tmp_path: Path
+):
+    """Benchmark IO and memory of building a scalar index over ~100MB of data."""
+    is_bitmap = index_type == "bitmap"
+    field_meta = {b"lance-datagen:cardinality": b"1000"} if is_bitmap else None
+    schema = pa.schema([pa.field("col", data_type, metadata=field_meta)])
+
+    # 100 batches of 1 MiB each, i.e. roughly 100MB
+    batches = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
+    dataset = lance.write_dataset(batches, tmp_path)
+
+    def build_index(ds):
+        ds.create_scalar_index("col", index_type, replace=True)
+
+    io_mem_benchmark(build_index, dataset, warmup=False)
+
+
+@pytest.mark.parametrize("with_positions", [True, False])
+@pytest.mark.io_memory_benchmark()
+def test_io_mem_build_fts(io_mem_benchmark, with_positions: bool, tmp_path: Path):
+    """Benchmark IO and memory of building an FTS index, with/without positions."""
+    text_meta = {"lance-datagen:content-type": "sentence"}
+    schema = pa.schema([pa.field("text", pa.string(), metadata=text_meta)])
+    # 100 batches of 1 MiB each, i.e. roughly 100MB
+    data = rand_batches(schema, num_batches=100, batch_size_bytes=1024 * 1024)
+    ds = lance.write_dataset(data, tmp_path)
+
+    def build_index(ds):
+        # Forward the parametrized flag so the True and False cases actually
+        # build different index configurations.
+        ds.create_scalar_index(
+            "text", "INVERTED", with_position=with_positions, replace=True
+        )
+
+    io_mem_benchmark(build_index, ds, warmup=False)
+
+
+@pytest.mark.io_memory_benchmark()
+def test_io_mem_build_ivf_pq(io_mem_benchmark, tmp_path: Path):
+    """Benchmark IO and memory of building an IVF_PQ index over ~1GB of vectors."""
+    vector_type = pa.list_(pa.float32(), 1024)
+    schema = pa.schema([pa.field("vector", vector_type)])
+    # 100 batches of 10 MiB each, i.e. roughly 1GB
+    batches = rand_batches(schema, num_batches=100, batch_size_bytes=10 * 1024 * 1024)
+    dataset = lance.write_dataset(batches, tmp_path)
+
+    # Index settings are assembled up front so the callable handed to the
+    # benchmark fixture contains only the index-build call itself.
+    ivf_pq_params = {"num_partitions": 32, "num_sub_vectors": 4, "replace": True}
+
+    def build_index(ds):
+        ds.create_index("vector", index_type="IVF_PQ", **ivf_pq_params)
+
+    io_mem_benchmark(build_index, dataset, warmup=False)
diff --git a/python/python/lance/_datagen.py b/python/python/lance/_datagen.py
@@ -26,4 +26,5 @@ def rand_batches(
         raise NotImplementedError(
             "This version of lance was not built with the datagen feature"
         )
-    return datagen.rand_batches(schema, num_batches, batch_size_bytes)
+    batch_iter = datagen.rand_batches(schema, num_batches, batch_size_bytes)
+    return pa.RecordBatchReader.from_batches(schema, batch_iter)
diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml
@@ -19,6 +19,7 @@ futures = { workspace = true }
 half = { workspace = true }
 hex = "0.4.3"
 rand = { workspace = true }
+rand_distr = { workspace = true }
 rand_xoshiro = { workspace = true }
 random_word = { version = "0.5", features = ["en"] }