Closed
Changes from all commits (16 commits)
1b01ac1
fix: various bugs to namespace access (#5996)
jackye1995 Feb 24, 2026
69a9f50
fix(encoding): handle empty rows in variable packed struct decode (#5…
Xuanwo Feb 24, 2026
349f237
docs: clarify v2.2 nested drop rollback risk (#5999)
Xuanwo Feb 24, 2026
ef3e0e8
docs: expand the FTS index doc explaining the training process and mu…
westonpace Feb 24, 2026
0cc1794
feat: add DeleteResult with num_deleted_rows (#6001)
wkalt Feb 24, 2026
798ad70
feat: introduce IncompatibleTransaction error (#6003)
wjones127 Feb 24, 2026
f79c4e5
fix: set namespace commit handler for LanceDataset.commit (#6002)
jackye1995 Feb 24, 2026
9b59454
fix: fast_search limits full text search to indexed fragments (#6006)
BubbleCal Feb 25, 2026
e982e28
fix: correctly calculate max visible level when a list has no def (#6…
westonpace Feb 25, 2026
8ef25ee
fix: fast_search should ignore any unindexed data for vector search (…
BubbleCal Feb 25, 2026
83fd60b
feat: compress complex all null (#4990)
yingjianwu98 Feb 25, 2026
c9559d6
feat(core): add Levenshtein-based suggestions to not-found errors in …
HemantSudarshan Feb 25, 2026
b027b00
refactor: use dict entries and encoded size instead of cardinality fo…
Xuanwo Feb 25, 2026
bcdef62
perf: speed up format 2.2 300% by spawning structural decode batch ta…
Xuanwo Feb 25, 2026
f6ba207
feat!: upgrade DataFusion dependency to 52.1.0 (#6015)
wjones127 Feb 25, 2026
d63c72d
fix: make overwrites retryable instead of compatible (#6014)
jackye1995 Feb 25, 2026
203 changes: 87 additions & 116 deletions Cargo.lock

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions Cargo.toml
@@ -112,26 +112,26 @@ criterion = { version = "0.5", features = [
 ] }
 crossbeam-queue = "0.3"
 crossbeam-skiplist = "0.1"
-datafusion = { version = "51.0.0", default-features = false, features = [
-    "nested_expressions",
-    "regex_expressions",
-    "unicode_expressions",
+datafusion = { version = "52.1.0", default-features = false, features = [
+    "crypto_expressions",
+    "datetime_expressions",
+    "encoding_expressions",
+    "nested_expressions",
+    "regex_expressions",
+    "sql",
+    "string_expressions",
+    "unicode_expressions",
 ] }
-datafusion-common = "51.0.0"
-datafusion-functions = { version = "51.0.0", features = ["regex_expressions"] }
-datafusion-sql = "51.0.0"
-datafusion-expr = "51.0.0"
-datafusion-ffi = "51.0.0"
-datafusion-execution = "51.0.0"
-datafusion-optimizer = "51.0.0"
-datafusion-physical-expr = "51.0.0"
-datafusion-physical-plan = "51.0.0"
-datafusion-substrait = "51.0.0"
+datafusion-common = "52.1.0"
+datafusion-functions = { version = "52.1.0", features = ["regex_expressions"] }
+datafusion-sql = "52.1.0"
+datafusion-expr = "52.1.0"
+datafusion-ffi = "52.1.0"
+datafusion-execution = "52.1.0"
+datafusion-optimizer = "52.1.0"
+datafusion-physical-expr = "52.1.0"
+datafusion-physical-plan = "52.1.0"
+datafusion-substrait = "52.1.0"
 deepsize = "0.2.0"
 dirs = "6.0.0"
 either = "1.0"
@@ -140,7 +140,7 @@ fsst = { version = "=3.0.0-rc.1", path = "./rust/compression/fsst" }
 futures = "0.3"
 geoarrow-array = "0.7"
 geoarrow-schema = "0.7"
-geodatafusion = "0.2.0"
+geodatafusion = "0.3.0"
 geo-traits = "0.3.0"
 geo-types = "0.7.16"
 http = "1.1.0"
36 changes: 17 additions & 19 deletions docs/src/format/table/index/index.md
@@ -1,12 +1,11 @@

# Indices in Lance

Lance supports three main categories of indices to accelerate data access: scalar
indices, vector indices, and system indices.

**Scalar indices** are traditional indices that speed up queries on scalar data types, such as
integers and strings. Examples include [B-trees](scalar/btree.md) and
-[full-text search indices](scalar/full_text.md). Typically, scalar indices receive a
+[full-text search indices](scalar/fts.md). Typically, scalar indices receive a
query predicate, such as equality or range conditions, and output a set of row addresses that
satisfy the predicate.

@@ -59,7 +58,7 @@ all rows in the fragments they cover, with one exception: if a fragment has dele
of index creation, the index segment is allowed to not contain the deleted rows. The fragments an index
covers are those recorded in the `fragment_bitmap` field.

Index segments together **do not** need to cover all fragments. This means an index isn't required to
be fully up-to-date. When this happens, engines can split their queries into indexed and unindexed
subplans and merge the results.

@@ -71,13 +70,13 @@ subplans and merge the results.

Consider the example dataset in the figure above:

- The dataset contains three fragments with ids 0, 1, 2. Fragment 1 has 10 deleted rows, indicated
  by the deletion file.
- There is an index called "id_idx", which has two segments: one covering fragments 0 and another covering
  fragment 1. Fragment 2 is not covered by the index. Queries using this index will need to query both
  segments and then scan fragment 2 directly. Additionally, when querying the segment covering fragment 1,
  the engine will need to filter out the 10 deleted rows.
- There is another index called "vec_idx", which has a single segment covering all three fragments.
Because it covers all fragments, queries using this index do not need to scan any fragments directly.
They do, however, need to filter out the 10 deleted rows from fragment 1.

@@ -99,13 +98,13 @@ Index segments are created and updated through a transactional process:
directory, where `{UUID}` is a newly generated unique identifier.

2. **Prepare the metadata**: Create an `IndexMetadata` message with:
- `uuid`: The newly generated UUID
- `name`: The index name (must match existing segments if adding to an existing index)
- `fields`: The column(s) being indexed
- `fragment_bitmap`: The set of fragment IDs covered by this segment
- `index_details`: Index-specific configuration and parameters
- `version`: The format version of this index type
- See the full protobuf definition in [table.proto](https://github.com/lance-format/lance/blob/main/protos/table.proto).

3. **Commit the transaction**: Write a new manifest that includes the new index segment
in its `IndexSection`. This is done atomically using the same transaction mechanism
@@ -149,13 +148,12 @@ When loading an index:

The `IndexMetadata` message contains important information about the index segment:

- `uuid`: the unique identifier of the index segment.
- `fields`: the column(s) the index is built on.
- `fragment_bitmap`: the set of fragment IDs covered by this index segment.
- `index_details`: a protobuf `Any` message that contains index-specific details, such as index type,
parameters, and storage format. This allows different index types to store their own metadata.
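As a sketch of what these fields carry, a simplified stand-in for the `IndexMetadata` message is shown below. The struct, field types, and path helper are illustrative, not the actual generated protobuf type; see table.proto for the real definition.

```rust
// Simplified stand-in for the IndexMetadata protobuf message described above.
// Field names follow the doc; the real type is generated from table.proto,
// and the real fragment_bitmap is a serialized roaring bitmap.
struct IndexMetadata {
    uuid: String,
    name: String,
    fields: Vec<u32>,
    fragment_bitmap: Vec<u32>,
    version: u32,
}

// Index segment files are written under _indices/{UUID}/ in the dataset dir.
fn index_dir(dataset_root: &str, uuid: &str) -> String {
    format!("{dataset_root}/_indices/{uuid}")
}

fn main() {
    let meta = IndexMetadata {
        uuid: "1b2f3c4d-0000-4000-8000-000000000000".into(), // hypothetical UUID
        name: "id_idx".into(),
        fields: vec![0],
        fragment_bitmap: vec![0, 1], // this segment covers fragments 0 and 1
        version: 1,
    };
    // Committing would atomically write a new manifest whose IndexSection
    // includes `meta`; here we only show where the segment files live.
    println!("{}", index_dir("/data/my_dataset", &meta.uuid));
}
```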


<details>
<summary>Full protobuf definitions</summary>

54 changes: 54 additions & 0 deletions docs/src/format/table/index/scalar/fts.md
@@ -18,6 +18,8 @@ The FTS index consists of multiple files storing the token dictionary, document
3. `invert.lance` - Compressed posting lists for each token
4. `metadata.lance` - Index metadata and configuration

An FTS index may contain multiple partitions. Each partition has its own set of token, document, and posting list files, prefixed with the partition ID (e.g. `part_0_tokens.lance`, `part_0_docs.lance`, `part_0_invert.lance`). The `metadata.lance` file lists all partition IDs in the index. At query time, every partition must be searched and the results combined to produce the final ranked output. Fewer partitions generally means better query performance, since each partition requires its own token dictionary lookup and posting list scan. The number of partitions is controlled by the training configuration -- specifically `LANCE_FTS_TARGET_SIZE` determines how large each merged partition can grow (see [Training Process](#training-process) for details).
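The naming scheme above can be sketched as a small helper; the function is illustrative only, not a Lance API:

```rust
// Given a partition ID listed in metadata.lance, derive the per-partition
// file names that a query must visit.
fn partition_files(part_id: u32) -> [String; 3] {
    [
        format!("part_{part_id}_tokens.lance"),
        format!("part_{part_id}_docs.lance"),
        format!("part_{part_id}_invert.lance"),
    ]
}

fn main() {
    // metadata.lance lists every partition; a query fans out over all of them
    // and combines the per-partition results into the final ranked output.
    for part_id in [0u32, 1] {
        for file in partition_files(part_id) {
            println!("{file}");
        }
    }
}
```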

### Token Dictionary File Schema

| Column | Type | Nullable | Description |
@@ -189,6 +191,58 @@ address.city:San
address.city:Francisco
```

## Training Process

Building an FTS index is a multi-phase pipeline: the source column is scanned, documents are tokenized in parallel, intermediate results are spilled to part files on disk, and the part files are merged into final output partitions.

### Phase 1: Tokenization

The input column is read as a stream of record batches and dispatched to a pool of tokenizer worker tasks. Each worker tokenizes documents independently, accumulating tokens, posting lists, and document metadata in memory.

When a worker's accumulated data reaches the partition size limit or the document count hits `u32::MAX`, it flushes the data to disk as a set of part files (`part_<id>_tokens.lance`, `part_<id>_invert.lance`, `part_<id>_docs.lance`). A single worker may produce multiple part files if it processes enough data.
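The flush decision described above can be sketched as follows; the struct and field names are illustrative, not the actual worker implementation:

```rust
// A tokenizer worker's in-memory accumulation state (illustrative).
struct WorkerBuffer {
    buffered_bytes: u64, // uncompressed size of tokens + postings + docs
    num_docs: u64,
}

// Spill to part files when the buffer hits the partition size limit or the
// document count reaches the u32 ID space.
fn should_flush(buf: &WorkerBuffer, partition_size_limit: u64) -> bool {
    buf.buffered_bytes >= partition_size_limit || buf.num_docs >= u32::MAX as u64
}

fn main() {
    let limit = 256 * 1024 * 1024; // default LANCE_FTS_PARTITION_SIZE: 256 MiB
    let buf = WorkerBuffer { buffered_bytes: 300 * 1024 * 1024, num_docs: 1_000 };
    // Over the size limit, so this worker would write part_<id>_* files now.
    assert!(should_flush(&buf, limit));
}
```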

### Phase 2: Merge

After all workers finish, the part files are merged into output partitions. Part files are streamed with bounded buffering so that not all data needs to be loaded into memory at once. For each part file, the token dictionaries are unified, document sets are concatenated, and posting lists are rewritten with adjusted IDs.

When a merged partition reaches the target size, it is written to the destination store and a new one is started. After all part files are consumed the final partition is flushed, and a `metadata.lance` file is written listing the partition IDs and index parameters.
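The cut-at-target-size policy can be sketched as a simple planning function over part-file sizes; sizes are uncompressed and the function is an illustration, not the actual merge implementation:

```rust
// Assign consecutive part files to output partitions, cutting a new
// partition whenever the current one reaches the target size.
fn plan_partitions(part_sizes: &[u64], target: u64) -> Vec<Vec<usize>> {
    let mut partitions: Vec<Vec<usize>> = vec![vec![]];
    let mut current = 0u64;
    for (i, &size) in part_sizes.iter().enumerate() {
        partitions.last_mut().unwrap().push(i);
        current += size;
        if current >= target {
            // Flush: write the partition to the destination store, start a new one.
            partitions.push(vec![]);
            current = 0;
        }
    }
    if partitions.last().unwrap().is_empty() {
        partitions.pop(); // the last flush already closed the final partition
    }
    partitions
}

fn main() {
    // Three 3 GiB part files merged with the default 4 GiB target yield two
    // output partitions; metadata.lance then lists both partition IDs.
    let gib = 1024u64 * 1024 * 1024;
    let plan = plan_partitions(&[3 * gib, 3 * gib, 3 * gib], 4 * gib);
    assert_eq!(plan.len(), 2);
}
```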

### Configuration

| Environment Variable | Default | Description |
|----------------------------|----------------------------------|-----------------------------------------------------------------------------------------------------------------------|
| `LANCE_FTS_NUM_SHARDS` | Number of compute-intensive CPUs | Number of parallel tokenizer worker tasks. Higher values increase indexing throughput but use more memory. |
| `LANCE_FTS_PARTITION_SIZE` | 256 (MiB) | Maximum uncompressed size of a worker's in-memory buffer before it is spilled to a part file. |
| `LANCE_FTS_TARGET_SIZE` | 4096 (MiB) | Target uncompressed size for merged output partitions. Fewer, larger partitions improve query performance. |
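These variables could be read as in the sketch below; the helper is illustrative, and the hard-coded defaults match the table above, with 8 standing in for the machine-dependent CPU-count default:

```rust
use std::env;

// Read an environment variable as u64, falling back to a default when the
// variable is unset or unparseable.
fn env_u64(name: &str, default: u64) -> u64 {
    env::var(name).ok().and_then(|v| v.parse().ok()).unwrap_or(default)
}

fn main() {
    let num_shards = env_u64("LANCE_FTS_NUM_SHARDS", 8); // real default: CPU count
    let partition_mib = env_u64("LANCE_FTS_PARTITION_SIZE", 256);
    let target_mib = env_u64("LANCE_FTS_TARGET_SIZE", 4096);
    println!("{num_shards} workers, spill at {partition_mib} MiB, merge to {target_mib} MiB");
}
```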

### Memory and Performance Considerations

Memory usage is primarily determined by two factors:

- **`LANCE_FTS_NUM_SHARDS`** -- Each worker holds an independent in-memory buffer. Peak memory is roughly `NUM_SHARDS * PARTITION_SIZE` plus the overhead of token dictionaries and posting list structures.
- **`LANCE_FTS_PARTITION_SIZE`** -- Larger values reduce the number of part files and make the merge phase cheaper. Smaller values reduce per-worker memory at the cost of more part files.
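With the documented defaults and an assumed 8-CPU machine, the rough estimate works out to:

```rust
fn main() {
    // Peak tokenization memory ≈ NUM_SHARDS * PARTITION_SIZE, counting only
    // the worker buffers; dictionaries and posting structures add overhead.
    let num_shards: u64 = 8; // assumed CPU count for this example
    let partition_size_mib: u64 = 256; // default LANCE_FTS_PARTITION_SIZE
    let peak_mib = num_shards * partition_size_mib;
    assert_eq!(peak_mib, 2048); // about 2 GiB of worker buffers
}
```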

Merge phase memory is bounded by the streaming approach: part files are loaded one at a time with a small concurrency buffer. The merged partition's in-memory size is bounded by `LANCE_FTS_TARGET_SIZE`.

Building an FTS index requires temporary disk space to store the part files generated during tokenization. The amount of temporary space depends heavily on whether position information is enabled. An index with `with_position: true` stores the position of every token occurrence in every document, which can easily require 10x the size of the original column or more in temporary disk space. An index without positions tends to be smaller than the original column and will typically need less than 2x the size of the column in total disk space.

Performance tips:

- Larger `LANCE_FTS_TARGET_SIZE` produces fewer output partitions, which is beneficial for query performance because queries must scan every partition's token dictionary. When memory allows, prefer fewer, larger partitions.
- `with_position: true` significantly increases index size because term positions are stored for every occurrence. Only enable it when phrase queries are needed.
- The ngram tokenizer generates many more tokens per document than word-level tokenizers, so expect larger index sizes and higher memory usage.

### Distributed Training

The FTS index supports distributed training where different worker nodes each index a subset of the data and the results are assembled afterward.

1. Each distributed worker is assigned a **fragment mask** (`(fragment_id as u64) << 32`) that is OR'd into the partition IDs it generates, ensuring globally unique IDs across workers.
2. Workers set `skip_merge: true` so they write their part files directly without running the merge phase.
3. Instead of a single `metadata.lance`, each worker writes per-partition metadata files named `part_<id>_metadata.lance`.
4. After all workers finish, a coordinator merges the metadata files: it collects all partition IDs, remaps them to a sequential range starting from 0 (renaming the corresponding data files), and writes the final unified `metadata.lance`.

This allows each worker to operate independently during the tokenization phase. Only the final metadata merge requires a single-node step, and it is lightweight since it only renames files and writes a small metadata file.
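The masking and remapping steps can be sketched as follows; the functions are illustrative, not the actual coordinator code:

```rust
// Each distributed worker ORs its fragment mask into the partition IDs it
// generates, making them globally unique across workers.
fn masked_partition_id(fragment_id: u32, local_part_id: u32) -> u64 {
    ((fragment_id as u64) << 32) | local_part_id as u64
}

// The coordinator collects all partition IDs and remaps them to a sequential
// range starting from 0; the (old_id, new_id) pairs drive the file renames.
fn remap_sequential(mut ids: Vec<u64>) -> Vec<(u64, u64)> {
    ids.sort_unstable();
    ids.into_iter().enumerate().map(|(new, old)| (old, new as u64)).collect()
}

fn main() {
    let worker_a = masked_partition_id(0, 0); // fragment 0, first partition
    let worker_b = masked_partition_id(1, 0); // fragment 1, first partition
    assert_ne!(worker_a, worker_b); // unique even with the same local ID
    let mapping = remap_sequential(vec![worker_b, worker_a]);
    assert_eq!(mapping, vec![(worker_a, 0), (worker_b, 1)]);
}
```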

## Accelerated Queries

Lance SDKs provide dedicated full text search APIs to leverage the FTS index capabilities.
2 changes: 1 addition & 1 deletion docs/src/format/table/index/system/mem_wal.md
@@ -9,4 +9,4 @@ For the complete specification, see:

- [MemWAL Index Overview](../../mem_wal.md#memwal-index) - Purpose and high-level description
- [MemWAL Index Details](../../mem_wal.md#memwal-index-details) - Storage format, schemas, and staleness handling
-- [MemWAL Index Builder](../../mem_wal.md#memwal-index-builder) - Background process and configuration updates
+- [MemWAL Implementation](../../mem_wal.md#implementation-expectation) - Implementation details and expectations