@@ -24,4 +24,3 @@ kv_cache_config:
     - 512
     - 512
     - 32768
-  enable_block_reuse: false
@@ -24,7 +24,6 @@ kv_cache_config:
     - 512
     - 512
     - 32768
-  enable_block_reuse: false
 
 cache_transceiver_config:
   backend: default
@@ -25,7 +25,6 @@ kv_cache_config:
     - 512
     - 512
     - 32768
-  enable_block_reuse: false
 
 cache_transceiver_config:
   backend: default
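The three YAML hunks above remove the explicit `enable_block_reuse: false` override from the Gemma 3 VSWA engine configs, letting KV block reuse fall back to the TensorRT-LLM default. For orientation, here is a minimal sketch of comparable settings through the TensorRT-LLM LLM API, assuming a recent `tensorrt_llm` build where `KvCacheConfig` exposes `max_attention_window` and `enable_block_reuse` (only the tail of the window list is visible in the hunks above):

```python
# Sketch only, not part of this change: comparable settings via the
# TensorRT-LLM LLM API. Field names assume a recent tensorrt_llm release.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    # Per-layer attention windows, cycled across the model's layers. Only the
    # last three entries ([..., 512, 512, 32768]) are visible in the hunks
    # above: Gemma 3 mixes 512-token sliding-window layers with global layers.
    max_attention_window=[512, 512, 32768],
    # The YAML change drops the explicit `false` override; recent releases
    # default enable_block_reuse to True, which KV routing with block reuse
    # relies on.
    enable_block_reuse=True,
)

llm = LLM(model="google/gemma-3-1b-it", kv_cache_config=kv_cache_config)
```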
components/backends/trtllm/gemma3_sliding_window_attention.md (24 additions, 4 deletions)
@@ -21,10 +21,11 @@ This guide demonstrates how to deploy google/gemma-3-1b-it with Variable Sliding
 VSWA is a mechanism in which a model’s layers alternate between multiple sliding window sizes. An example of this is Gemma 3, which incorporates both global attention layers and sliding window layers.
 
 ## Notes
-* To run Gemma 3 with VSWA, ensure that the container has TensorRT-LLM v1.0.0rc4 installed.
-
-## Limitation
-* The current KV event-based KV routing does not work well with VSWA. The Dynamo team is actively working on adding support to distinguish between events from different layer groups.
+* To run Gemma 3 with VSWA and KV routing with KV block reuse, ensure that the container is built with commit ID `c9eebcb4541d961ab390f0bd0a22e2c89f1bcc78` of TensorRT-LLM:
+```bash
+./container/build.sh --framework TENSORRTLLM --tensorrtllm-commit c9eebcb4541d961ab390f0bd0a22e2c89f1bcc78
+```
+* The 1.0.0rc4 release of TensorRT-LLM can also run Gemma 3 with VSWA, but KV block reuse cannot be enabled in that version.
 
 ### Aggregated Serving
 ```bash
@@ -35,6 +36,15 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
 ./launch/agg.sh
 ```
 
+### Aggregated Serving with KV Routing
+```bash
+cd $DYNAMO_HOME/components/backends/trtllm
+export MODEL_PATH=google/gemma-3-1b-it
+export SERVED_MODEL_NAME=$MODEL_PATH
+export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
+./launch/agg_router.sh
+```
+
 #### Disaggregated Serving
 ```bash
 cd $DYNAMO_HOME/components/backends/trtllm
@@ -44,3 +54,13 @@ export PREFILL_ENGINE_ARGS=engine_configs/gemma3/vswa_prefill.yaml
 export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml
 ./launch/disagg.sh
 ```
+
+#### Disaggregated Serving with KV Routing
+```bash
+cd $DYNAMO_HOME/components/backends/trtllm
+export MODEL_PATH=google/gemma-3-1b-it
+export SERVED_MODEL_NAME=$MODEL_PATH
+export PREFILL_ENGINE_ARGS=engine_configs/gemma3/vswa_prefill.yaml
+export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml
+./launch/disagg_router.sh
+```
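Once one of the launch scripts above is running, the deployment can be exercised end to end. A minimal sketch, assuming the Dynamo frontend's OpenAI-compatible endpoint on `localhost:8000` (the default in these examples); adjust the URL if your deployment differs:

```python
# Quick smoke test for any of the deployments above. Assumes the Dynamo
# frontend serves its OpenAI-compatible API on localhost:8000.
import json
import urllib.request

payload = {
    "model": "google/gemma-3-1b-it",  # must match SERVED_MODEL_NAME
    "messages": [{"role": "user", "content": "What is sliding window attention?"}],
    "max_tokens": 64,
}
request = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    body = json.load(response)
print(body["choices"][0]["message"]["content"])
```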
components/backends/trtllm/src/dynamo/trtllm/publisher.py (51 additions, 0 deletions)
@@ -117,6 +117,12 @@ def __init__(self, component, engine, kv_listener, worker_id, kv_block_size):
         self.kv_listener = kv_listener
         self.worker_id = worker_id
         self.kv_block_size = kv_block_size
+        self.max_window_size = None
+
+        # The first few KV events from the model engine are always "created" events.
+        # Use them to capture the model's max_window_size. Once the first non-"created"
+        # event arrives, this flag is set to False and "created" events stop being processed.
+        self.processing_initial_created_events = True
 
         # Needed by the events and metrics publishers
         self.metrics_publisher = None
@@ -289,9 +295,14 @@ async def _publish_kv_cache_events_task(self):
             events = self.engine.llm.get_kv_cache_events_async(timeout=5)
             async for event in events:
                 logging.debug(f"KV cache event received: {event}")
+                # Drop events that were not emitted by the global attention layers.
+                if self.should_drop_event(event):
+                    continue
+
                 event_id = event["event_id"]
                 data = event["data"]
                 if data["type"] == "stored":
+                    self.processing_initial_created_events = False
                     parent_hash = _to_signed_i64(data["parent_hash"])
                     token_ids = []
                     num_block_tokens = []
@@ -332,6 +343,7 @@ async def _publish_kv_cache_events_task(self):
                         parent_hash,
                     )
                 elif data["type"] == "removed":
+                    self.processing_initial_created_events = False
                     block_hashes = []
                     for block_hash in data["block_hashes"]:
                         block_hash = _to_signed_i64(block_hash)
@@ -347,6 +359,9 @@ async def _publish_kv_cache_events_task(self):
f"publish removed event: event_id: {event_id}, block_hashes: {block_hashes}"
)
self.kv_event_publisher.publish_removed(event_id, block_hashes)
elif data["type"] == "created" and self.processing_initial_created_events:
self.update_max_window_size(event)

return True

def start(self):
@@ -394,6 +409,42 @@ async def cleanup(self):
         if self.publish_kv_cache_events_thread.is_alive():
             logging.warning("KV cache events thread did not stop within timeout")
 
+    def update_max_window_size(self, event):
+        if "window_size" in event:
+            window_size = event["window_size"]
+            if self.max_window_size is None or window_size > self.max_window_size:
+                self.max_window_size = window_size
+                logging.debug(
+                    f"kv events max_window_size has been updated to {self.max_window_size}"
+                )
+
+    # The global attention layers emit KV events with the max_window_size.
+    # Only KV events carrying the max_window_size are kept, to preserve the
+    # accuracy of KV routing.
+    # TRTLLM emits "created" events at the very beginning, when it creates
+    # the KV cache, so those events can be used to identify the max_window_size
+    # of the global attention layers in the model engine.
+    def should_drop_event(self, event):
+        # There are two cases for KV event filtering:
+        #
+        # 1. "window_size" is NOT in the KV event:
+        #    "window_size" was only added to KV events recently, so older
+        #    versions of TRTLLM may not include it. In this case, assume all
+        #    events come from the global attention layers and drop nothing.
+        #
+        # 2. "window_size" is present in the KV event:
+        #    No events are dropped until all initial "created" events have been
+        #    processed, so that the max_window_size can be captured. After that,
+        #    only events whose window_size equals the max_window_size are
+        #    accepted, to ensure accurate routing.
+        if "window_size" not in event or self.processing_initial_created_events:
+            return False
+
+        if event["window_size"] != self.max_window_size:
+            return True
+
+        return False
+
 
 @asynccontextmanager
 async def get_publisher(component, engine, kv_listener, worker_id, kv_block_size):
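Taken together, the new publisher logic is: learn the largest window size from the initial "created" events, then publish only events from the global-attention (max-window) layers. A self-contained sketch of that behavior follows; the event payloads are hypothetical and carry only the fields the publisher actually reads (real events come from TRT-LLM's `get_kv_cache_events_async`):

```python
# Standalone sketch mirroring update_max_window_size / should_drop_event
# from the diff above, with hypothetical event payloads.
class MiniPublisher:
    def __init__(self):
        self.max_window_size = None
        self.processing_initial_created_events = True
        self.published = []

    def should_drop_event(self, event):
        # Same two cases as the publisher: no window_size (older TRT-LLM) or
        # still in the initial "created" phase -> keep everything.
        if "window_size" not in event or self.processing_initial_created_events:
            return False
        return event["window_size"] != self.max_window_size

    def handle(self, event):
        if self.should_drop_event(event):
            return
        data = event["data"]
        if data["type"] in ("stored", "removed"):
            # The first non-"created" event ends the initial phase.
            self.processing_initial_created_events = False
            self.published.append(event["event_id"])
        elif data["type"] == "created" and self.processing_initial_created_events:
            # "created" events (one per window size) establish the max.
            window_size = event.get("window_size")
            if window_size is not None and (
                self.max_window_size is None or window_size > self.max_window_size
            ):
                self.max_window_size = window_size


pub = MiniPublisher()
for event in [
    {"event_id": 0, "window_size": 512, "data": {"type": "created"}},
    {"event_id": 1, "window_size": 32768, "data": {"type": "created"}},
    {"event_id": 2, "window_size": 32768, "data": {"type": "stored"}},  # global: kept
    {"event_id": 3, "window_size": 512, "data": {"type": "stored"}},  # sliding: dropped
]:
    pub.handle(event)

print(pub.published)  # -> [2]
```

Note the ordering dependency the sketch makes visible: filtering only engages once the first non-"created" event flips the flag, so events without a `window_size` field (older TRT-LLM builds) are always treated as global-attention events.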