2 changes: 1 addition & 1 deletion examples/llm-api/quickstart_advanced.py
@@ -149,7 +149,6 @@ def setup_llm(args, **kwargs):
kv_cache_config = KvCacheConfig(
enable_block_reuse=not args.disable_kv_cache_reuse,
free_gpu_memory_fraction=args.kv_cache_fraction,
dtype=args.kv_cache_dtype,
)

spec_decode_algo = args.spec_decode_algo.upper(
@@ -195,6 +194,7 @@ def setup_llm(args, **kwargs):
model=args.model_dir,
backend='pytorch',
disable_overlap_scheduler=args.disable_overlap_scheduler,
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_config=kv_cache_config,
attn_backend=args.attention_backend,
cuda_graph_config=cuda_graph_config,
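Taken together, the two hunks above move the dtype selection out of the KvCacheConfig constructor and into a top-level argument of the PyTorch-backend LLM. A minimal sketch of the resulting call, assuming the post-PR argument surface; the model path and the fp8 value are only placeholders, not taken from the quickstart:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# KvCacheConfig no longer carries the dtype after this change.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,
    free_gpu_memory_fraction=0.9,
)

# The dtype is now a top-level argument of the PyTorch-backend LLM.
llm = LLM(
    model="/path/to/model",           # hypothetical model directory
    backend="pytorch",
    kv_cache_dtype="fp8",             # e.g. "auto" or "fp8"
    kv_cache_config=kv_cache_config,
)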
12 changes: 4 additions & 8 deletions tensorrt_llm/bench/benchmark/utils/general.py
@@ -88,14 +88,12 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
enable_chunked_prefill = params.get("enable_chunked_prefill", False)

kv_cache_dtype = "auto"
kv_cache_config = {}
if extra_llm_api_options:
with open(extra_llm_api_options, 'r') as f:
llm_args_dict = yaml.safe_load(f)
kv_cache_config = llm_args_dict.get("kv_cache_config", {
"dtype": "auto",
})
kv_cache_dtype = kv_cache_config.get("dtype", "auto")

if "kv_cache_dtype" in llm_args_dict:
kv_cache_dtype = llm_args_dict["kv_cache_dtype"]

enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
enable_chunked_prefill)
@@ -160,11 +158,9 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
"max_batch_size": max_batch_size
}

kv_cache_config["dtype"] = kv_cache_dtype

pyt_options = {
"cuda_graph_config": cuda_graph_config,
"kv_cache_config": kv_cache_config,
"kv_cache_dtype": kv_cache_dtype,
}

backend = params.get("backend", "pytorch")
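For trtllm-bench users, the practical effect is that the dtype key in the extra LLM API options file moves from the nested kv_cache_config block to the top level. An illustrative sketch of the new lookup with assumed file contents (the fp8 value is only an example):

import yaml

# Old-style options file: dtype nested under kv_cache_config.
old_yaml = "kv_cache_config:\n  dtype: fp8\n"
# New-style options file: dtype as a top-level key, matching the updated parser above.
new_yaml = "kv_cache_dtype: fp8\n"

llm_args_dict = yaml.safe_load(new_yaml)
kv_cache_dtype = "auto"
if "kv_cache_dtype" in llm_args_dict:
    kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
print(kv_cache_dtype)  # fp8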
1 change: 1 addition & 0 deletions tensorrt_llm/bench/dataclasses/configuration.py
@@ -114,6 +114,7 @@ def get_pytorch_perf_config(self) -> PyTorchConfig:
def get_autodeploy_perf_config(self) -> Dict:
AutoDeployPerfConfig = dict
ad_config = AutoDeployPerfConfig()
ad_config["kv_cache_dtype"] = "auto"
ad_config["attn_backend"] = "flashinfer"
return ad_config

14 changes: 2 additions & 12 deletions tensorrt_llm/bench/dataclasses/reporting.py
@@ -11,7 +11,6 @@
from tensorrt_llm.bench.dataclasses.statistics import (BenchmarkStatistics,
PercentileStats,
RequestRecord)
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.logger import Logger
from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode

@@ -276,17 +275,8 @@ def get_statistics_dict(self) -> Dict[str, Any]:
model = self.rt_cfg.model_path or self.rt_cfg.model
model_config = ModelConfig.from_pretrained(model,
trust_remote_code=True)
kv_cache_config = self.kwargs.get("kv_cache_config",
KvCacheConfig())
if isinstance(kv_cache_config, KvCacheConfig):
kv_cache_dtype = kv_cache_config.dtype
elif isinstance(kv_cache_config, dict):
kv_cache_dtype = kv_cache_config.get("dtype", "auto")
else:
raise ValueError(
f"Invalid kv_cache_config type: {type(kv_cache_config)}.")

validate_and_set_kv_cache_quant(model_config, kv_cache_dtype)
validate_and_set_kv_cache_quant(model_config,
self.kwargs["kv_cache_dtype"])
Comment on lines +278 to +279
💡 Verification agent

🧩 Analysis chain

Simplification looks good, but verify kwargs consistency.

The direct access to self.kwargs["kv_cache_dtype"] simplifies the logic by removing the intermediate KvCacheConfig extraction. However, ensure that all callers consistently provide kv_cache_dtype in kwargs to avoid KeyError exceptions.


🏁 Script executed:

#!/bin/bash
# Description: Verify that kv_cache_dtype is consistently provided in kwargs across the codebase
# Expected: All ReportUtility instantiations should include kv_cache_dtype in kwargs

# Search for ReportUtility instantiations
echo "=== ReportUtility instantiations ==="
rg -A 10 -B 5 "ReportUtility\(" --type py

# Search for places where kwargs might be passed to ReportUtility
echo -e "\n=== Potential kwargs sources ==="
rg -A 5 -B 5 "kv_cache_dtype.*kwargs\|kwargs.*kv_cache_dtype" --type py

Length of output: 3274


🏁 Script executed:

#!/bin/bash
set -e

echo "=== ReportUtility context in low_latency.py ==="
rg -n -C 5 "report_utility" tensorrt_llm/bench/benchmark/low_latency.py || true

echo -e "\n=== 'kwargs' declarations/usages in low_latency.py ==="
rg -n "kwargs" tensorrt_llm/bench/benchmark/low_latency.py || true

echo -e "\n=== ReportUtility context in throughput.py ==="
rg -n -C 5 "report_utility" tensorrt_llm/bench/benchmark/throughput.py || true

echo -e "\n=== 'kwargs' declarations/usages in throughput.py ==="
rg -n "kwargs" tensorrt_llm/bench/benchmark/throughput.py || true

Length of output: 4469


🏁 Script executed:

#!/bin/bash
set -e

echo "=== Searching for get_llm_args definition and usage ==="
rg -n "get_llm_args" -t py

echo -e "\n=== Inspecting RuntimeConfig class for kv_cache_dtype ==="
rg -n -C 3 "class .*RuntimeConfig" -t py || true

Length of output: 1901


🏁 Script executed:

#!/bin/bash
set -e

echo "=== Extracting RuntimeConfig class and get_llm_args method ==="
rg -n -C 50 "class RuntimeConfig" tensorrt_llm/bench/dataclasses/configuration.py

echo -e "\n=== Extracting get_llm_args implementation ==="
rg -n -C 50 "def get_llm_args" tensorrt_llm/bench/dataclasses/configuration.py

echo -e "\n=== Searching for kv_cache_dtype in configuration ==="
rg -n "kv_cache_dtype" -t py

Length of output: 35393


🏁 Script executed:

#!/bin/bash
set -e

echo "=== validate_and_set_kv_cache_quant definition ==="
rg -n -C 20 "def validate_and_set_kv_cache_quant" tensorrt_llm/bench/benchmark/utils/general.py || true

Length of output: 224


Add kv_cache_dtype to kwargs before calling ReportUtility

The new call to

validate_and_set_kv_cache_quant(model_config, self.kwargs["kv_cache_dtype"])

will raise a KeyError unless every ReportUtility instantiation’s kwargs includes "kv_cache_dtype". Please update both benchmark entry-points:

• In tensorrt_llm/bench/benchmark/low_latency.py (around line 353):

     # merge runtime_config args
     kwargs = kwargs | runtime_config.get_llm_args()
     kwargs["backend"] = backend
+    # ensure kv_cache_dtype is passed through for reporting
+    kwargs["kv_cache_dtype"] = runtime_config.settings_config.get_kvcache_config().kv_cache_dtype
     report_utility = ReportUtility(statistics, metadata, runtime_config,
-                                   logger, kwargs, True)
+                                   logger, kwargs, True)

• In tensorrt_llm/bench/benchmark/throughput.py (around line 436):

     kwargs = kwargs | runtime_config.get_llm_args()
     kwargs["backend"] = backend
+    # ensure kv_cache_dtype is passed through for reporting
+    kwargs["kv_cache_dtype"] = runtime_config.settings_config.get_kvcache_config().kv_cache_dtype
     report_utility = ReportUtility(statistics, metadata, runtime_config,
-                                   logger, kwargs, streaming)
+                                   logger, kwargs, streaming)

With these additions, self.kwargs["kv_cache_dtype"] will always be defined and validate_and_set_kv_cache_quant will receive the expected dtype.

🤖 Prompt for AI Agents
In tensorrt_llm/bench/dataclasses/reporting.py at lines 278-279, the code
accesses self.kwargs["kv_cache_dtype"] which can cause a KeyError if
"kv_cache_dtype" is not present in kwargs. To fix this, update the benchmark
entry points in tensorrt_llm/bench/benchmark/low_latency.py around line 353 and
tensorrt_llm/bench/benchmark/throughput.py around line 436 to include
"kv_cache_dtype" in the kwargs passed to ReportUtility. This ensures
self.kwargs["kv_cache_dtype"] is always defined before calling
validate_and_set_kv_cache_quant.


stats_dict["engine"] |= {
"backend":
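If guaranteeing the key at every ReportUtility call site proves awkward, a defensive fallback inside reporting.py is an alternative to the reviewer's suggested fix. A sketch only, under the assumption that kwargs is the plain dict handed to ReportUtility; this is not part of the PR:

def resolve_kv_cache_dtype(kwargs: dict) -> str:
    # Fall back to "auto" when a caller did not forward the key,
    # instead of raising KeyError.
    return kwargs.get("kv_cache_dtype", "auto")

print(resolve_kv_cache_dtype({"backend": "pytorch"}))     # auto
print(resolve_kv_cache_dtype({"kv_cache_dtype": "fp8"}))  # fp8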
65 changes: 15 additions & 50 deletions tensorrt_llm/llmapi/llm_args.py
@@ -822,10 +822,6 @@ class KvCacheConfig(BaseModel, PybindMirror):
use_uvm: bool = Field(default=False,
description="Whether to use UVM for the KV cache.")

# This is a pure python field, not a pybind field. It is only for the Pytorch backend.
dtype: str = Field(default="auto",
description="The data type to use for the KV cache.")

def _to_pybind(self):
return _KvCacheConfig(
enable_block_reuse=self.enable_block_reuse,
@@ -1037,6 +1033,10 @@ class BaseLlmArgs(BaseModel):
lora_config: Optional[LoraConfig] = Field(
default=None, description="LoRA configuration for the model.")

# Quantization and calibration configurations
quant_config: Optional[QuantConfig] = Field(
default=None, description="Quantization config.", validate_default=True)

# Several options from ExecutorConfig, expanded here for less hierarchy
kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig,
description="KV cache config.")
@@ -1217,6 +1217,13 @@ def validate_dtype(cls, v, info):
raise RuntimeError("Pre SM 80 GPUs do not support bfloat16")
return v

@field_validator("quant_config", mode='before')
@classmethod
def validate_quant_config(cls, v, info):
if v is None:
v = QuantConfig()
return v

@field_validator("gpus_per_node", mode='before')
@classmethod
def validate_gpus_per_node(cls, v, info):
@@ -1668,10 +1675,6 @@ class TrtLlmArgs(BaseLlmArgs):
calib_config: Optional[CalibConfig] = Field(
default=None, description="Calibration config.", validate_default=True)

# Quantization and calibration configurations
quant_config: Optional[QuantConfig] = Field(
default=None, description="Quantization config.", validate_default=True)

embedding_parallel_mode: str = Field(
default='SHARDING_ALONG_VOCAB',
description="The embedding parallel mode.")
@@ -1709,13 +1712,6 @@ def init_calib_config(cls, v):
return CalibConfig()
return v

@field_validator("quant_config", mode='before')
@classmethod
def validate_quant_config(cls, v, info):
if v is None:
v = QuantConfig()
return v

@model_validator(mode="after")
def setup_embedding_parallel_mode(self):
if self.embedding_parallel_mode == 'NONE':
@@ -1760,11 +1756,6 @@ def validate_enable_build_cache(self):
f"Invalid build_cache_config: {self.enable_build_cache}")
return self

@model_validator(mode="after")
def validate_kv_cache_dtype(self):
assert self.kv_cache_config.dtype == "auto", "KvCacheConfig.dtype is not supported by the TensorRT backend."
return self


class LoadFormat(Enum):
AUTO = 0
@@ -1838,6 +1829,9 @@ class TorchLlmArgs(BaseLlmArgs):
"If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies."
)

kv_cache_dtype: str = Field(default="auto",
description="Data type for KV cache.")

enable_iter_perf_stats: bool = Field(
default=False, description="Enable iteration performance statistics.")

@@ -1903,19 +1897,6 @@ class TorchLlmArgs(BaseLlmArgs):
description="The format of the provided checkpoint.",
)

# PrivateVars
_quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

@property
def quant_config(self) -> QuantConfig:
if self._quant_config is None:
self._quant_config = QuantConfig()
return self._quant_config

@quant_config.setter
def quant_config(self, value: QuantConfig):
self._quant_config = value

# TODO: remove backend later
@field_validator('backend', mode='before')
def init_backend(cls, v):
@@ -2059,22 +2040,6 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':

return self

@model_validator(mode='after')
def sync_quant_config_with_kv_cache_config_dtype(self) -> 'TorchLlmArgs':
if self.kv_cache_config is None:
return self

assert self.quant_config is not None
if self.kv_cache_config.dtype == "auto":
return self
elif self.kv_cache_config.dtype == 'fp8':
self.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
else:
logger.warning(
f"Cannot sync quant_config.kv_cache_quant_algo with kv_cache_config.dtype of {self.kv_cache_config.dtype}, "
"please update the validator")
return self

# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
def get_pytorch_backend_config(self) -> "PyTorchConfig":
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
@@ -2098,7 +2063,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
moe_backend=self.moe_config.backend,
enable_mixed_sampler=self.enable_mixed_sampler,
enable_trtllm_sampler=self.enable_trtllm_sampler,
kv_cache_dtype=self.kv_cache_config.dtype,
kv_cache_dtype=self.kv_cache_dtype,
enable_iter_perf_stats=self.enable_iter_perf_stats,
enable_iter_req_stats=self.enable_iter_req_stats,
print_iter_log=self.print_iter_log,
3 changes: 0 additions & 3 deletions tensorrt_llm/llmapi/llm_utils.py
@@ -405,9 +405,6 @@ def _update_from_hf_quant_config(self) -> bool:
logger.info(f"Setting {key}={value} from HF quant config.")
setattr(quant_config, key, value)

# Update the quant_config in llm_args for pytorch
self.llm_args.quant_config = quant_config

return True

hf_config_path = f"{self._model_dir}/config.json"