Merged
71 changes: 71 additions & 0 deletions docs/src/guide/performance.md
@@ -165,3 +165,74 @@ across the entire process. This limit is specified by the `LANCE_PROCESS_IO_THRE
The default is 128, which is more than enough for most workloads. You can increase this limit if you are working
with a high-throughput workload, or disable it entirely by setting it to zero. Note that disabling the limit can
often lead to excessive retries and timeouts from the object store.

## Indexes

Training and searching indexes can have unique requirements for compute and memory. This section provides some
guidance on what can be expected for different index types.

### BTree Index

The BTree index is a two-level structure that provides efficient range queries and sorted access.
It strikes a balance between a memory-hungry structure that holds all values in RAM and a disk-only
structure that is expensive to search.

Training a BTree index amounts to sorting the column. An [external sort](https://en.wikipedia.org/wiki/External_sorting) is used to constrain total memory usage to a reasonable amount. Updating a BTree index does not
require re-sorting the entire column: the new values are sorted and then merged with the existing sorted
values in linear time.
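The linear-time merge of sorted runs can be sketched as follows. This is a simplified illustration of the technique, not Lance's actual implementation; the values and row IDs are made up:

```python
import heapq

# Existing index: values already sorted, each paired with its row ID.
existing = [(10, 0), (20, 1), (30, 2)]

# New values arrive unsorted; only the new batch is sorted (O(m log m)).
new_rows = [(25, 3), (5, 4)]
new_rows.sort()

# Merge the two sorted runs in linear time, the same way an external
# sort merges sorted spill files from disk.
merged = list(heapq.merge(existing, new_rows))
```

The same merge step generalizes to many runs, which is how an external sort combines spill files that each fit in memory.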

#### Storage Requirements

The BTree index is essentially a sorted copy of the column, so its storage requirements are roughly the same as the
column's, plus an additional 4 bytes per value to store the row ID and a small lookup structure which
should be roughly 0.001% of the size of the column.
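Putting those figures together, a back-of-the-envelope size estimate looks like this (illustrative arithmetic only; the row count and value width are assumptions):

```python
# Rough storage estimate for a BTree index over an int64 column.
num_rows = 100_000_000
value_size = 8   # bytes per int64 value
row_id_size = 4  # additional bytes per value for the row ID

column_bytes = num_rows * value_size
index_bytes = column_bytes + num_rows * row_id_size
index_bytes += int(column_bytes * 0.00001)  # ~0.001% lookup structure
```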

#### Memory Requirements

Training a BTree index requires some RAM, but the current implementation spills to disk rather aggressively, so the
total memory usage is fairly low.

When searching a BTree index, the index is loaded into the index cache in pages. Each page contains 4096 values.
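Because each page holds 4096 sorted values, a small in-memory structure listing each page's first value is enough to route a query to the right page. A minimal sketch, assuming a hypothetical `page_boundaries` list (not Lance's actual layout):

```python
import bisect

PAGE_SIZE = 4096  # values per BTree page

# First value of each page; here pages cover values 0..16383.
page_boundaries = [0, 4096, 8192, 12288]

def page_for_value(v: int) -> int:
    # Last page whose first value is <= v; only that page
    # needs to be loaded from disk for an equality query.
    return bisect.bisect_right(page_boundaries, v) - 1

page = page_for_value(5000)
```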

#### Performance

The sort stage is the most expensive step in training a BTree index. The time complexity is O(n log n), where n is the number of rows in the column. At very large scales this can be a bottleneck, and a distributed sort may be necessary. Lance currently
has nothing built in for this, but work is underway to add this functionality. If you have the flexibility to do so, training an
index in parts as the data grows may be slightly more efficient than training the entire index at once.

When the BTree index is fully loaded into the index cache, the search time scales linearly with the number of rows that match the
query. When the BTree index is not fully loaded into the index cache, the search time is controlled by the number of pages
that need to be loaded from disk and the speed of storage. The `parts_loaded` metric in the execution metrics can tell you how many
pages were loaded from disk to satisfy a query.

### Bitmap Index

The Bitmap index is an inverted lookup table that stores a bitmap for each possible value in the column. These bitmaps are compressed and serialized as a [Roaring Bitmap](https://roaringbitmap.org/).

A bitmap index is currently trained by accumulating the column into a hash map from value to a vector of row IDs. Each value's
row-ID vector is then serialized into a bitmap and stored in a file.
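The accumulation step can be sketched as follows. Plain Python sets stand in for the compressed Roaring bitmaps, and the column data is made up for illustration:

```python
from collections import defaultdict

# Column values, one per row; the row ID is the position.
column = ["red", "blue", "red", "green", "red"]

# Training: accumulate a hash map from value -> row IDs.
posting_lists: dict[str, list[int]] = defaultdict(list)
for row_id, value in enumerate(column):
    posting_lists[value].append(row_id)

# Each posting list would then be serialized as a Roaring bitmap;
# a set stands in for the compressed bitmap here.
bitmaps = {value: set(rows) for value, rows in posting_lists.items()}
```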

#### Storage Requirements

The size of a bitmap index is difficult to calculate precisely, but it generally scales with the number of unique values in the
column: a separate bitmap is required for each value, and a single bitmap covering all rows compresses more efficiently than
many bitmaps each covering a small number of rows.

#### Memory Requirements

Since training a bitmap index requires collecting the values into a hash map, you will need at least 8 bytes of memory per row.
If you have many unique values, you will also need additional memory for the keys of the hash map. Training bitmap indexes
on large columns with many unique values can therefore be memory intensive.

When a bitmap index is searched, bitmaps are loaded into the session cache individually. The size of each bitmap depends on
the number of rows that match the corresponding value.
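Searching then reduces to loading the relevant bitmaps and combining them: an equality query loads a single bitmap, and an `IN` query unions one bitmap per requested value. A sketch, again with sets standing in for Roaring bitmaps:

```python
# Bitmaps produced by training (sets stand in for Roaring bitmaps).
bitmaps = {
    "red": {0, 2, 4},
    "blue": {1},
    "green": {3},
}

# Equality query: load exactly one bitmap.
eq_result = bitmaps["red"]

# IN query: union one bitmap per requested value.
in_result = set().union(*(bitmaps[v] for v in ["blue", "green"]))
```

This is why search cost scales with the number of values the query touches: a large range query would have to load and union one bitmap per value in the range.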

#### Performance

When the bitmap index is fully loaded into the index cache, the search time scales linearly with the number of values that the
query requires. This makes the bitmap index very fast for equality queries and very small ranges. Queries against large ranges are
currently extremely slow; the BTree index is much faster for large range queries.

When a bitmap index is not fully loaded into the index cache, the search time will be controlled by the number of bitmaps that
need to be loaded from disk and the speed of storage. The `parts_loaded` metric in the execution metrics can tell you how many
bitmaps were loaded from disk to satisfy a query.
87 changes: 77 additions & 10 deletions python/python/ci_benchmarks/benchmarks/test_search.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri
from ci_benchmarks.utils import wipe_os_cache

COLUMN_LABELS = ["bools", "normals"]
COLUMNS = [["bools"], ["normals"]]
@@ -38,24 +39,33 @@ def bench():
benchmark.pedantic(bench, rounds=1, iterations=1)


LARGE_IN_FILTER = (
"image_widths IN (" + ", ".join([str(i) for i in range(3990, 4100)]) + ")"
)

BTREE_FILTERS = [
None,
"image_widths = 3997",
"image_widths >= 3990 AND image_widths <= 3997",
"image_widths != 3997",
LARGE_IN_FILTER,
]
BTREE_FILTER_LABELS = [
None,
"equal",
"small_range",
"not_equal",
"large_in",
]


# These tests benchmark a variety of filtered read patterns
@pytest.mark.parametrize("filt", BTREE_FILTERS, ids=BTREE_FILTER_LABELS)
@pytest.mark.parametrize("payload", [None, "image_widths"], ids=["none", "integers"])
def test_eda_btree_search(benchmark, filt: str | None, payload: str | None):
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
def test_eda_btree_search(
benchmark, filt: str | None, payload: str | None, use_cache: bool
):
dataset_uri = get_dataset_uri("image_eda")
ds = lance.dataset(dataset_uri)

@@ -66,7 +76,8 @@ def test_eda_btree_search(benchmark, filt: str | None, payload: str | None):
columns = [payload]

def bench():
ds.to_table(
to_search = ds if use_cache else lance.dataset(dataset_uri)
to_search.to_table(
columns=columns,
filter=filt,
with_row_id=True,
@@ -80,45 +91,101 @@ def bench():
iterations = 100

# We warmup so we can test hot index performance
benchmark.pedantic(bench, warmup_rounds=1, rounds=1, iterations=iterations)
warmup_rounds = 1 if use_cache else 0

benchmark.pedantic(
bench, warmup_rounds=warmup_rounds, rounds=1, iterations=iterations
)


BASIC_LARGE_IN_FILTER = (
"row_number IN (" + ", ".join([str(i) for i in range(100000, 100100)]) + ")"
)
BASIC_BTREE_FILTERS = [
None,
"row_number = 100000",
"row_number != 100000",
"row_number >= 100000 AND row_number <= 100007",
BASIC_LARGE_IN_FILTER,
]

BASIC_BTREE_FILTER_LABELS = [
"none",
"equal",
"not_equal",
"small_range",
"large_in",
]


# Repeats the same test for the basic dataset which is easier to test with locally
# This benchmark is not part of the CI job as the EDA dataset is better for that
@pytest.mark.parametrize("filt", BASIC_BTREE_FILTERS, ids=BASIC_BTREE_FILTER_LABELS)
@pytest.mark.parametrize("payload", [None, "small_strings", "integers"])
def test_basic_btree_search(benchmark, filt: str | None, payload: str | None):
def do_basic_search(benchmark, filt: str | None, payload: str | None, use_cache: bool):
dataset_uri = get_dataset_uri("basic")
ds = lance.dataset(dataset_uri)

columns = []
if payload is not None:
columns = [payload]

def clear_cache():
wipe_os_cache(dataset_uri)

def bench():
ds.to_table(
to_search = ds if use_cache else lance.dataset(dataset_uri)
to_search.to_table(
columns=columns,
filter=filt,
with_row_id=True,
batch_size=32 * 1024,
)

benchmark.pedantic(bench, warmup_rounds=1, rounds=1, iterations=10)
setup = None if use_cache else clear_cache

warmup_rounds = 1 if use_cache else 0
benchmark.pedantic(
bench, warmup_rounds=warmup_rounds, rounds=10, iterations=1, setup=setup
)


# Repeats the same test for the basic dataset which is easier to test with locally
# This benchmark is not part of the CI job as the EDA dataset is better for that
@pytest.mark.parametrize("filt", BASIC_BTREE_FILTERS, ids=BASIC_BTREE_FILTER_LABELS)
@pytest.mark.parametrize("payload", [None, "small_strings", "integers"])
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
def test_basic_btree_search(
benchmark, filt: str | None, payload: str | None, use_cache: bool
):
do_basic_search(benchmark, filt, payload, use_cache)


BASIC_LARGE_IN_FILTER_BITMAP = (
"row_number_bitmap IN (" + ", ".join([str(i) for i in range(100000, 100100)]) + ")"
)
BASIC_BITMAP_FILTERS = [
None,
"row_number_bitmap = 100000",
"row_number_bitmap != 100000",
# "row_number_bitmap >= 100000 AND row_number_bitmap <= 100007",
# BASIC_LARGE_IN_FILTER_BITMAP,
]

BASIC_BITMAP_FILTER_LABELS = [
"none",
"equal",
"not_equal",
# "small_range",
# "large_in",
]


# Repeats the same test for the basic dataset which is easier to test with locally
# This benchmark is not part of the CI job as the EDA dataset is better for that
@pytest.mark.parametrize("filt", BASIC_BITMAP_FILTERS, ids=BASIC_BITMAP_FILTER_LABELS)
@pytest.mark.parametrize("payload", [None, "small_strings", "integers"])
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
def test_basic_bitmap_search(
benchmark, filt: str | None, payload: str | None, use_cache: bool
):
do_basic_search(benchmark, filt, payload, use_cache)


IOPS = 0.0
7 changes: 6 additions & 1 deletion python/python/ci_benchmarks/datagen/basic.py
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@
SCHEMA = pa.schema(
{
"row_number": pa.uint64(),
"row_number_bitmap": pa.uint64(),
"integers": pa.int64(),
"small_strings": pa.string(),
}
@@ -36,9 +37,12 @@ def _gen_data():
pa.array(
[batch_idx * ROWS_PER_BATCH + i for i in range(ROWS_PER_BATCH)]
),
pa.array(
[batch_idx * ROWS_PER_BATCH + i for i in range(ROWS_PER_BATCH)]
),
pa.array([f"payload_{i}" for i in range(ROWS_PER_BATCH)]),
],
names=["row_number", "integers", "small_strings"],
names=["row_number", "row_number_bitmap", "integers", "small_strings"],
)
yield batch

@@ -72,6 +76,7 @@ def _create(dataset_uri: str):
)
if ds.list_indices() == []:
ds.create_scalar_index("row_number", "BTREE")
ds.create_scalar_index("row_number_bitmap", "BITMAP")


def gen_basic():
33 changes: 33 additions & 0 deletions python/python/ci_benchmarks/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import os
from pathlib import Path


def wipe_os_cache(dataset_uri: str):
    if dataset_uri.startswith("/"):
        path = Path(dataset_uri)
    elif dataset_uri.startswith("file://"):
        path = Path(dataset_uri.removeprefix("file://"))
    else:
        # Remote URIs (e.g. s3://) have no local OS page cache to wipe
        return

    if not hasattr(os, "posix_fadvise"):
        raise NotImplementedError("posix_fadvise not available on this platform")

    for filepath in path.rglob("*"):
        # Skip directories, symlinks, and non-regular files
        if not filepath.is_file():
            continue

        with open(filepath, "rb") as f:
            # offset=0, length=0 means drop the entire file from the cache
            os.posix_fadvise(f.fileno(), 0, 0, os.POSIX_FADV_DONTNEED)
8 changes: 8 additions & 0 deletions rust/lance-index/Cargo.toml
Original file line number Diff line number Diff line change
@@ -144,5 +144,13 @@ harness = false
name = "rq"
harness = false

[[bench]]
name = "btree"
harness = false

[[bench]]
name = "bitmap"
harness = false

[lints]
workspace = true