
Commit dfa8bc0

Merge branch 'NVIDIA:main' into qgai/deepseekeagle2models
2 parents e1f5058 + 523a17d commit dfa8bc0

13 files changed, +224 -74 lines changed

docs/source/features/kvcache.md

Lines changed: 2 additions & 0 deletions
@@ -70,6 +70,8 @@ Before a block is evicted from GPU memory, it can optionally be offloaded to host
 
 When offloading is enabled, the client can prevent specific blocks from being offloaded by toggling block priority. Blocks with a priority lower than a certain threshold are not offloaded; they are evicted directly from GPU memory to reduce traffic between the GPU and the host. This priority threshold is set with ```secondary_offload_min_priority```. The default value is 35, meaning any block with a priority lower than 35 will not be offloaded.
 
+Here is an [example](../../../examples/llm-api/llm_kv_cache_offloading.py) that shows how to enable host offloading.
+
 ### Partial Reuse
 
 Partial reuse of a block can happen when some but not all tokens are matched. It is enabled by default, but can be disabled by setting ```enable_partial_reuse``` to False.
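For orientation, here is a minimal sketch (not part of this commit) of how the offloading options described above might be combined through the LLM API. It assumes `secondary_offload_min_priority` is accepted by the same `KvCacheConfig` used in the new example below, which itself only sets `host_cache_size`.

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Sketch: offload evicted blocks to 1 GiB of host RAM, but skip offloading for
# blocks whose priority is below the documented default threshold of 35.
# `secondary_offload_min_priority` is an assumption here; the shipped example
# below only sets `host_cache_size`.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,            # allow matched blocks to be reused
    host_cache_size=1024**3,            # 1 GiB host (CPU) cache enables offloading
    secondary_offload_min_priority=35,  # lower-priority blocks are evicted, not offloaded
)

llm = LLM(model="Qwen/Qwen3-8B", kv_cache_config=kv_cache_config)
```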
examples/llm-api/llm_kv_cache_offloading.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
'''
This script demonstrates the effectiveness of KV cache host offloading in TensorRT-LLM.

**Scenario:**
The script simulates a scenario where the GPU's KV cache is severely limited,
while multiple requests with recurring prompts (like system prompts) are processed.

1. **Constrained GPU Cache:** The GPU KV cache is configured to be very small,
   only large enough to hold the state for a single request.
2. **Alternating Prompts:** Four requests are sent sequentially (batch size of 1)
   with two distinct prompts in an A, B, A, B pattern.
3. **Cache Eviction:** Due to the small GPU cache, processing prompt B will
   force the eviction of the cache generated for prompt A.

**Demonstration:**

* **Without Offloading (Default):**
    - When the first prompt 'A' is processed, its KV cache is stored on the GPU.
    - When prompt 'B' arrives, the cache manager needs space and discards the cache for 'A'.
    - When prompt 'A' is sent again, its cache must be recomputed from scratch.
    - **Expected Outcome:** The log will show `reused blocks: 0` and `cache hit rate: 0`.

* **With Offloading (`--enable_offloading`):**
    - When prompt 'B' arrives, the cache for 'A' is not discarded but is instead
      *offloaded* from the fast GPU VRAM to the slower (but larger) host CPU RAM.
    - When prompt 'A' is sent again, its KV cache is loaded back from host RAM
      to the GPU, which is significantly faster than recomputing it.
    - **Expected Outcome:** The log will show positive values for `reused blocks`
      and a non-zero `cache hit rate`, confirming that the cache was successfully
      reused from the host.

**How to Run & Verify:**

1. **Without Offloading:**
   ```bash
   TLLM_LOG_LEVEL=DEBUG python llm_kv_cache_offloading.py 2>&1 | tee offloading_disabled.log
   ```
   (Check the log for zero reuse)

2. **With Offloading:**
   ```bash
   TLLM_LOG_LEVEL=DEBUG python llm_kv_cache_offloading.py --enable_offloading 2>&1 | tee offloading_enabled.log
   ```
   (Check the log for non-zero reuse)
'''

import argparse

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig


def main(args):
    # Define two distinct prompts to simulate different requests or system prompts.
    prompt_a = (
        "Returns the per-iterations statistics computed since last call to this method. "
        "Contains at most iter_stats_max_iterations iterations.")
    prompt_b = ("Use for skipping decoding step for non generation model, "
                "and return the batch_output (such as mm_embeddings)")

    # Use a batch size of 1 to process requests sequentially, making the cache
    # eviction and reuse cycle easy to observe.
    max_batch_size = 1
    max_seq_len = 256

    # --- KV Cache Configuration ---
    # Set a small GPU KV cache size (in number of tokens). This is crucial for the demo,
    # as it's only large enough to hold the KV cache for a single request.
    kv_cache_max_tokens = 256
    # Define the size of a single cache block.
    kv_cache_page_size = 16
    # Enable a 1 GB host cache if offloading is requested, otherwise disable it (size 0).
    # This is the key toggle for the experiment.
    kv_cache_host_size = 1024**3 if args.enable_offloading else 0

    sampling_params = SamplingParams(max_tokens=max_seq_len)

    llm = LLM(
        model="Qwen/Qwen3-8B",
        max_batch_size=max_batch_size,
        max_seq_len=max_seq_len,
        kv_cache_config=KvCacheConfig(
            enable_block_reuse=True,  # Enable reuse of cached blocks
            max_tokens=kv_cache_max_tokens,  # Max tokens in GPU cache
            tokens_per_block=kv_cache_page_size,
            host_cache_size=kv_cache_host_size  # Host cache size for offloading
        ))

    # Process four requests sequentially using two distinct prompts (A, B, A, B).
    # This pattern is designed to showcase the cache eviction and reuse behavior.
    print("--- First Round ---")
    # 1. Process prompt A. Its cache is stored on the GPU.
    output_a = llm.generate(prompt_a, sampling_params)
    print(
        f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}"
    )
    # 2. Process prompt B. Its cache replaces/offloads A's cache.
    output_b = llm.generate(prompt_b, sampling_params)
    print(
        f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}"
    )

    print("\n--- Second Round ---")
    # 3. Process prompt A again.
    #    - Without offloading: Must recompute from scratch.
    #    - With offloading: Recovers cache from host RAM.
    output_a = llm.generate(prompt_a, sampling_params)
    print(
        f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}"
    )
    # 4. Process prompt B again.
    #    - Without offloading: Must recompute from scratch.
    #    - With offloading: Recovers cache from host RAM.
    output_b = llm.generate(prompt_b, sampling_params)
    print(
        f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}"
    )

    llm.shutdown()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        "A script to demonstrate the effectiveness of KV cache host offloading."
    )
    parser.add_argument('--enable_offloading',
                        action='store_true',
                        help='Enable host RAM for KV cache offloading.')
    args = parser.parse_args()
    main(args)

examples/scaffolding/contrib/TreeInference/run_mcts_example.py

Lines changed: 2 additions & 2 deletions
@@ -3,8 +3,8 @@
 
 import argparse
 
-from tensorrt_llm.scaffolding import (MCTSController,
-                                      NativeGenerationController, PRMController)
+from tensorrt_llm.scaffolding import NativeGenerationController, PRMController
+from tensorrt_llm.scaffolding.contrib.TreeInference import MCTSController
 from tensorrt_llm.scaffolding.scaffolding_llm import ScaffoldingLlm
 from tensorrt_llm.scaffolding.worker import TRTLLMWorker

examples/scaffolding/contrib/TreeInference/run_tot_example.py

Lines changed: 2 additions & 2 deletions
@@ -3,8 +3,8 @@
 
 import argparse
 
-from tensorrt_llm.scaffolding import (NativeGenerationController, PRMController,
-                                      TOTController)
+from tensorrt_llm.scaffolding import NativeGenerationController, PRMController
+from tensorrt_llm.scaffolding.contrib.TreeInference import TOTController
 from tensorrt_llm.scaffolding.scaffolding_llm import ScaffoldingLlm
 from tensorrt_llm.scaffolding.worker import TRTLLMWorker

tensorrt_llm/scaffolding/__init__.py

Lines changed: 0 additions & 5 deletions
@@ -1,8 +1,5 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
 from .benchmark import ScaffoldingBenchRequest, async_scaffolding_benchmark
-from .contrib.TreeInference.tree_controllers import (MCTSController,
-                                                     TOTController)
 from .controller import (BestOfNController, Controller, MajorityVoteController,
                          NativeGenerationController, NativeRewardController,
                          ParallelProcess, PRMController)
@@ -23,8 +20,6 @@
     "PRMController",
     "MajorityVoteController",
     "BestOfNController",
-    "MCTSController",
-    "TOTController",
     "Task",
     "GenerationTask",
     "RewardTask",
tensorrt_llm/scaffolding/contrib/TreeInference/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
from .tree_controllers import MCTSController, TOTController

__all__ = ["MCTSController", "TOTController"]
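For reference, after this refactor the tree-search controllers are imported from the contrib package, as the updated examples above do:

```python
# Core controllers remain in the top-level scaffolding package.
from tensorrt_llm.scaffolding import NativeGenerationController, PRMController
# MCTS/ToT controllers now come from the contrib TreeInference package.
from tensorrt_llm.scaffolding.contrib.TreeInference import MCTSController, TOTController
```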

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 36 additions & 0 deletions
@@ -2040,6 +2040,42 @@ def test_nvfp4_multi_gpus_corner_case(self):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_nvfp4_multi_gpus_corner_case(self):
+        """
+        Tests a corner case of the NVFP4 model.
+        When max_seq_len and max_num_tokens are set to the same value, there are not
+        enough KV blocks for the dummy requests used in CUDA graph warmup when the
+        py_executor is created before KV cache estimation. CUDA graph capture is then
+        triggered during KV cache estimation, which may cause errors.
+        More info in https://nvbugs/5485325.
+        """
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80,
+                                        dtype="fp8",
+                                        enable_block_reuse=False)
+        pytorch_config = dict(disable_overlap_scheduler=False,
+                              cuda_graph_config=CudaGraphConfig(
+                                  enable_padding=True, max_batch_size=1024),
+                              moe_config=MoeConfig(backend="TRTLLM"))
+
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=1)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
+                 tensor_parallel_size=8,
+                 pipeline_parallel_size=1,
+                 moe_expert_parallel_size=8,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=False,
+                 speculative_config=mtp_config,
+                 max_seq_len=5120,
+                 max_num_tokens=5120) as llm:
+
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_mpi_world_size(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 0 deletions
@@ -310,6 +310,7 @@ examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16] SKIP (https://nvbugs/5465143)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] SKIP (https://nvbugs/5465143)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
+examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] SKIP (https://nvbugs/5522332)
 accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5465143)
 accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5465143)
 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075)

tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py

Lines changed: 26 additions & 32 deletions
@@ -2,6 +2,7 @@
 import os
 from typing import Any, Dict, Optional
 
+import pytest
 import torch
 import torch.nn.functional as F
 from torch import nn
@@ -286,11 +287,12 @@ def generate_dynamic_shapes(max_batch_size, max_seq_len):
 
 
 def _hf_model_dir_or_hub_id(
-    hf_model_dir: str,
+    hf_model_subdir: str,
     hf_hub_id: str,
 ) -> str:
-    if os.path.isdir(hf_model_dir):
-        return hf_model_dir
+    llm_models_path = llm_models_root()
+    if llm_models_path and os.path.isdir((model_fullpath := llm_models_path / hf_model_subdir)):
+        return str(model_fullpath)
     else:
         return hf_hub_id
 
@@ -350,10 +352,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
 
 
 _SMALL_MODEL_CONFIGS = {
     "meta-llama/Meta-Llama-3.1-8B-Instruct": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct",
-            "meta-llama/Meta-Llama-3.1-8B-Instruct",
-        ),
+        "llm_models_subdir": "llama-3.1-model/Llama-3.1-8B-Instruct",
         "model_kwargs": {
             "num_hidden_layers": 1,
             "hidden_size": 64,
@@ -363,10 +362,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         },
     },
     "mistralai/Mixtral-8x7B-Instruct-v0.1": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/Mixtral-8x7B-Instruct-v0.1",
-            "mistralai/Mixtral-8x7B-Instruct-v0.1",
-        ),
+        "llm_models_subdir": "Mixtral-8x7B-Instruct-v0.1",
         "model_kwargs": {
             "num_hidden_layers": 2,
             "intermediate_size": 256,
@@ -377,10 +373,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         },
     },
     "Qwen/Qwen3-30B-A3B": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B",
-            "Qwen/Qwen3-30B-A3B",
-        ),
+        "llm_models_subdir": "Qwen3/Qwen3-30B-A3B",
         "model_kwargs": {
             "num_hidden_layers": 2,
             "intermediate_size": 256,
@@ -391,10 +384,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         },
     },
     "microsoft/Phi-3-mini-4k-instruct": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/Phi-3/Phi-3-mini-4k-instruct",
-            "microsoft/Phi-3-mini-4k-instruct",
-        ),
+        "llm_models_subdir": "Phi-3/Phi-3-mini-4k-instruct",
         "model_kwargs": {
             "num_hidden_layers": 2,
             "hidden_size": 128,
@@ -404,10 +394,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         },
     },
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/Llama-4-Scout-17B-16E-Instruct",
-            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        ),
+        "llm_models_subdir": "Llama-4-Scout-17B-16E-Instruct",
         "model_factory": "AutoModelForImageTextToText",
         "model_kwargs": {
             "text_config": {
@@ -426,10 +413,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         },
     },
     "deepseek-ai/DeepSeek-V3": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/DeepSeek-V3",
-            "deepseek-ai/DeepSeek-V3",
-        ),
+        "llm_models_subdir": "DeepSeek-V3",
         "model_kwargs": {
             "first_k_dense_replace": 1,
             "num_hidden_layers": 2,
@@ -448,16 +432,13 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         },
     },
     "Qwen/Qwen2.5-3B-Instruct": {
-        "model": _hf_model_dir_or_hub_id(
-            f"{llm_models_root()}/Qwen/Qwen2.5-3B-Instruct",
-            "Qwen/Qwen2.5-3B-Instruct",
-        ),
+        "llm_models_subdir": "Qwen2.5-3B-Instruct",
         "model_kwargs": {
             "num_hidden_layers": 2,
         },
     },
     "mistralai/Mistral-Small-3.1-24B-Instruct-2503": {
-        "model": f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503",
+        "llm_models_subdir": "Mistral-Small-3.1-24B-Instruct-2503",
         "model_factory": "Mistral3VLM",
         "compile_backend": "torch-simple",
         "model_kwargs": {
@@ -487,6 +468,9 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, Any]:
 
     llm_args = copy.deepcopy(_SMALL_MODEL_CONFIGS[model_hub_id])
 
+    # check if should use llm_models_root or hf_hub_id
+    llm_args["model"] = _hf_model_dir_or_hub_id(llm_args.pop("llm_models_subdir"), model_hub_id)
+
    # add some defaults to llm_args
    llm_args["skip_loading_weights"] = True  # No weight loading to speed up things
    llm_args["free_mem_ratio"] = 0.00  # we don't need the cache and it may cause OOM issues
@@ -507,3 +491,13 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, Any]:
     }
 
     return experiment_config
+
+
+def get_small_model_config_pytest_param(
+    model_hub_id: str, pytest_param_kwargs=None, **llm_args_kwargs
+):
+    return pytest.param(
+        get_small_model_config(model_hub_id, **llm_args_kwargs),
+        id=model_hub_id,
+        **(pytest_param_kwargs or {}),
+    )
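To make the new helper's role concrete, here is a hedged usage sketch (not from this commit): it mirrors the updated multi-GPU test below and additionally shows `pytest_param_kwargs` being forwarded to `pytest.param`, for example to attach a mark to a single case. The test name and the skip mark are purely illustrative.

```python
import pytest
from _model_test_utils import get_small_model_config_pytest_param


@pytest.mark.parametrize(
    "experiment_config",
    [
        # The test id becomes the model hub id.
        get_small_model_config_pytest_param(
            "meta-llama/Meta-Llama-3.1-8B-Instruct",
            attn_backend="flashinfer",
        ),
        # Illustrative only: pytest_param_kwargs is forwarded to pytest.param.
        get_small_model_config_pytest_param(
            "deepseek-ai/DeepSeek-V3",
            pytest_param_kwargs={"marks": pytest.mark.skip(reason="sketch only")},
        ),
    ],
)
def test_experiment_config_is_dict(experiment_config):
    # Each case receives the experiment-config dict built by get_small_model_config().
    assert isinstance(experiment_config, dict)
```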

tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py

Lines changed: 2 additions & 2 deletions
@@ -3,15 +3,15 @@
 from typing import Dict
 
 import pytest
-from _model_test_utils import get_small_model_config
+from _model_test_utils import get_small_model_config_pytest_param
 from build_and_run_ad import ExperimentConfig, main
 
 
 @pytest.mark.parametrize("world_size", [1, 2])
 @pytest.mark.parametrize(
     "experiment_config",
     [
-        get_small_model_config(
+        get_small_model_config_pytest_param(
             "meta-llama/Meta-Llama-3.1-8B-Instruct",
             attn_backend="flashinfer",
             compile_backend="torch-opt",
