wip: infer KV-cache quant algo as QuantAlgo.FP8 / QuantAlgo.INT8 enum members; clean up TODOs

danielafrimi · danielafrimi · commit 2facb5e86d18 · 2025-11-18T14:33:48.000Z
Signed-off-by: Daniel Afrimi <dafrimi@nvidia.com>
diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
@@ -25,6 +25,7 @@
 from ..logger import logger
 from ..mapping import Mapping
 from ..models.automodel import MODEL_MAP, AutoConfig, AutoModelForCausalLM
+from ..models.modeling_utils import QuantAlgo  # noqa: F401
 from ..models.modeling_utils import PretrainedConfig, QuantConfig
 from ..module import Module
 from .build_cache import (BuildCache, BuildCacheConfig, CachedStage,
diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py
@@ -235,11 +235,11 @@ def _infer_kv_cache_quant_algo_from_scheme(kv_scheme: dict) -> str | None:
         bits = kv_scheme.get("num_bits")
         dynamic = bool(kv_scheme.get("dynamic", False))
 
-        # todo add here all options...
+        # TODO(danielafrimi): verify that every supported kv_cache_scheme option is handled here.
         if kv_type == "float" and bits == 8 and not dynamic:
-            return QuantAlgo("FP8_BLOCK_SCALES")
+            return QuantAlgo.FP8
         if kv_type in ("int", "uint") and bits == 8:
-            return QuantAlgo("INT8")
+            return QuantAlgo.INT8
         return None
 
     def _map_new_to_legacy_args(self, hf_quant_config: dict) -> dict:
@@ -261,8 +261,7 @@ def _map_new_to_legacy_args(self, hf_quant_config: dict) -> dict:
                 hf_quant_config.get("ignore") or [])
 
         kv_scheme = hf_quant_config.get("kv_cache_scheme") or {}
-        kv_algo = QuantConfig._infer_kv_cache_quant_algo_from_scheme(
-            kv_scheme)  # todo check it
+        kv_algo = QuantConfig._infer_kv_cache_quant_algo_from_scheme(kv_scheme)
         if kv_algo is not None:
             qunatization_dict["kv_cache_quant_algo"] = kv_algo
 
@@ -273,7 +272,6 @@ def _map_new_to_legacy_args(self, hf_quant_config: dict) -> dict:
         if "symmetric" in hf_quant_config:
             qunatization_dict["zero_point"] = hf_quant_config["symmetric"]
 
-        # todo add here pre qunat scale and other keys....
         return qunatization_dict
 
     def _update_from_quant_config_json(self, path, moe_backend: str,