
Commit 7961672

FP8 Context MLA integration.
Signed-off-by: Yuxian Qiu <[email protected]>
1 parent: 219e955

8 files changed: +56 additions, -59 deletions


cpp/tensorrt_llm/common/attentionOp.cpp (1 addition & 7 deletions)

@@ -2570,8 +2570,7 @@ int AttentionOp::initialize() noexcept
     if (mIsMLAEnabled)
     {
         TLLM_CHECK_WITH_INFO(mEnableContextFMHA, "MLA(Deepseek v2) only support fmha");
-        TLLM_CHECK_WITH_INFO(
-            !mFP8ContextFMHA && !mDenseContextFMHA, "MLA(Deepseek v2) currently not support FP8 and dense fmha");
+        TLLM_CHECK_WITH_INFO(!mDenseContextFMHA, "MLA(Deepseek v2) currently not support dense fmha");
         TLLM_CHECK_WITH_INFO(
             mPagedKVCache && mUseKVCache && mRemovePadding, "MLA(Deepseek v2) only support paged kv cache");
         TLLM_CHECK_WITH_INFO(!mCrossAttention, "MLA(Deepseek v2) do not support cross attention right now");
@@ -2736,11 +2735,6 @@ int AttentionOp::initialize() noexcept
             qDataType = DATA_TYPE_E4M3;
             kvDataType = DATA_TYPE_E4M3;
         }
-        // When FP8 Context FMHA is enabled, the output data type needs to be E4M3.
-        if (mFP8ContextFMHA)
-        {
-            outputDataType = DATA_TYPE_E4M3;
-        }
 
         // Instantiate the mTllmGenFMHARunner used for MLA
         mTllmGenFMHARunner.reset(new TllmGenFmhaRunner(qDataType, kvDataType, outputDataType));

cpp/tensorrt_llm/common/attentionOp.h (7 additions & 7 deletions)

@@ -466,13 +466,13 @@ class AttentionOp
         (int8_t) mPositionEmbeddingType, mUseLognScaling, mRemovePadding, (int32_t) mMaskType,
         mBlockSparseParams.data(), mPagedKVCache, mTokensPerBlock, mKVCacheQuantMode.value(), mTpSize, mTpRank,
         mUnfuseQkvGemm, (int32_t) mType, mMaxContextLength, mQKVBiasEnabled, mCrossAttention, mMaxDistance,
-        mPosShiftEnabled, mPagedContextFMHA, mFP8ContextFMHA, mDenseContextFMHA, mHasFullAttentionMask,
-        mIsSpecDecodingEnabled, mUseSpecDecoding, mIsSpecDecTree, mSpecDecodingIsGenerationLengthVariable,
-        mSpecDecodingMaxGenerationLength, mIsMLAEnabled, mIsGenerationMLA, mUseGenFlashMLA, mMLAParams.data(),
-        mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin, mAttnTpSize, mAttnTpRank,
-        mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast, mEnableContextFMHA, mFMHAForceFP32Acc, mMultiBlockMode,
-        mEnableXQA, mUseKVCache, mSkipAttn, mFuseFp4Quant, mNbMultiBlockSemaphores,
-        mAttentionChunkSize.value_or(-1));
+        mPosShiftEnabled, mPagedContextFMHA, mFP8ContextFMHA, mFP8ContextMLA, mDenseContextFMHA,
+        mHasFullAttentionMask, mIsSpecDecodingEnabled, mUseSpecDecoding, mIsSpecDecTree,
+        mSpecDecodingIsGenerationLengthVariable, mSpecDecodingMaxGenerationLength, mIsMLAEnabled, mIsGenerationMLA,
+        mUseGenFlashMLA, mMLAParams.data(), mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads,
+        mNumKVHeadsOrigin, mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast,
+        mEnableContextFMHA, mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache, mSkipAttn, mFuseFp4Quant,
+        mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1));
     };
 
 private:

cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h (3 additions & 1 deletion)

@@ -541,7 +541,9 @@ class TllmGenFmhaKernel
         int numTokensPerPage = (!isPagedKv(params.mQkvLayout)) ? 0 : params.mNumTokensPerPage;
 
         // Debug info.
-        std::string info = "qkvLayout=" + std::to_string(static_cast<int>(params.mQkvLayout))
+        std::string info = "dtypeQ=" + std::to_string(static_cast<int>(mDtypeQ)) + ", dtypeKv="
+            + std::to_string(static_cast<int>(mDtypeKv)) + ", dtypeOut=" + std::to_string(static_cast<int>(mDtypeOut))
+            + ", sm=" + std::to_string(mSM) + ", qkvLayout=" + std::to_string(static_cast<int>(params.mQkvLayout))
             + ", maskType=" + std::to_string(static_cast<int>(selectKernelParams.mMaskType))
             + ", kernelType=" + std::to_string(static_cast<int>(kernelType))
             + ", tileScheduler=" + std::to_string(static_cast<int>(selectKernelParams.mTileScheduler))

cpp/tensorrt_llm/thop/attentionOp.cpp (11 additions & 9 deletions)

@@ -529,38 +529,38 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
     {
         if (is_fp8_out)
         {
-            runner.reset(new Runner<half, __nv_fp8_e4m3>());
+            runner = std::make_shared<Runner<half, __nv_fp8_e4m3>>();
         }
         else if (is_fp4_out)
         {
-            runner.reset(new Runner<half, __nv_fp4_e2m1>());
+            runner = std::make_shared<Runner<half, __nv_fp4_e2m1>>();
         }
         else
         {
            TLLM_CHECK(!out_dtype.has_value() || out_dtype.value() == torch::kFloat16);
-            runner.reset(new Runner<half>());
+            runner = std::make_shared<Runner<half>>();
         }
     }
     else if (dtype == nvinfer1::DataType::kFLOAT)
     {
         TLLM_CHECK(!out_dtype.has_value() || out_dtype.value() == torch::kFloat32);
-        runner.reset(new Runner<float>());
+        runner = std::make_shared<Runner<float>>();
     }
 #ifdef ENABLE_BF16
     else if (dtype == nvinfer1::DataType::kBF16)
     {
         if (is_fp8_out)
         {
-            runner.reset(new Runner<__nv_bfloat16, __nv_fp8_e4m3>());
+            runner = std::make_shared<Runner<__nv_bfloat16, __nv_fp8_e4m3>>();
         }
         else if (is_fp4_out)
         {
-            runner.reset(new Runner<__nv_bfloat16, __nv_fp4_e2m1>());
+            runner = std::make_shared<Runner<__nv_bfloat16, __nv_fp4_e2m1>>();
         }
         else
         {
             TLLM_CHECK(!out_dtype.has_value() || out_dtype.value() == torch::kBFloat16);
-            runner.reset(new Runner<__nv_bfloat16>());
+            runner = std::make_shared<Runner<__nv_bfloat16>>();
         }
     }
 #endif
@@ -578,13 +578,13 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
     auto op = std::make_shared<AttentionOp>();
     op->mType = dtype;
     op->mFMHAForceFP32Acc = dtype == nvinfer1::DataType::kBF16;
+    op->mKVCacheQuantMode = tensorrt_llm::common::QuantMode(uint32_t(quant_mode));
     op->mFP8ContextFMHA = is_fp8_out || is_fp4_out;
     op->mLayerIdx = layer_idx;
     op->mNumHeads = num_heads;
     op->mNumKVHeads = num_kv_heads;
     op->mHeadSize = head_size;
     op->mMaskType = static_cast<tensorrt_llm::kernels::AttentionMaskType>(int32_t(mask_type));
-    op->mKVCacheQuantMode = tensorrt_llm::common::QuantMode(uint32_t(quant_mode));
     op->mUseKVCache = use_kv_cache;
     op->mPagedKVCache = op->mPagedKVCache && use_kv_cache; // update mPagedKVCache based on use_kv_cache
     op->mTokensPerBlock = tokens_per_block.value_or(0);
@@ -627,7 +627,9 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
         static_cast<int>(v_head_dim.value()), static_cast<int>(predicted_tokens_per_seq),
         static_cast<int>(layer_num)};
 
-    op->mFP8ContextMLA = tensorrt_llm::common::getSMVersion() == 120 && op->mKVCacheQuantMode.hasFp8KvCache();
+    op->mFP8ContextMLA
+        = (tensorrt_llm::common::getSMVersion() == 100 || tensorrt_llm::common::getSMVersion() == 120)
+        && op->mKVCacheQuantMode.hasFp8KvCache();
     op->mIsGenerationMLA = head_size == op->mMLAParams.kv_lora_rank + op->mMLAParams.qk_rope_head_dim;
     op->mFP8GenerationMLA = op->mKVCacheQuantMode.hasFp8KvCache();
     // only enable flash mla on sm90 and head_size == 576 and tokens_per_block == 64
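
The last hunk above gates FP8 context MLA on both the GPU architecture and the KV-cache quantization mode. As a rough illustration only (not a TensorRT-LLM API), the same condition can be expressed in Python with torch.cuda.get_device_capability() standing in for getSMVersion(); the helper name is hypothetical:

import torch

def fp8_context_mla_enabled(has_fp8_kv_cache: bool) -> bool:
    # Mirrors the C++ gating above: SM 100 or SM 120, and an FP8 KV cache.
    major, minor = torch.cuda.get_device_capability()
    sm_version = 10 * major + minor
    return sm_version in (100, 120) and has_fp8_kv_cache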

tensorrt_llm/_torch/modules/attention.py (17 additions & 29 deletions)

@@ -295,6 +295,12 @@ def create_weights(self):
         # which could be modified after __init__
         self.attn.update_quant_config(self.quant_config)
 
+        self.o_proj.create_weights()
+        self.has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
+                                or self.o_proj.has_fp8_block_scales
+                                or self.o_proj.has_fp8_rowwise
+                                or self.o_proj.has_w4a8_nvfp4_fp8)
+
     def split_qkv(self, q, k=None, v=None):
         if k is None and v is None:
             q, k, v = q.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -314,12 +320,8 @@ def create_output(self, q: torch.Tensor):
         out_dtype = q.dtype
 
         if self.attn_backend == "TRTLLM":
-            has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
-                               or self.o_proj.has_fp8_block_scales
-                               or self.o_proj.has_fp8_rowwise
-                               or self.o_proj.has_w4a8_nvfp4_fp8)
-            if has_quant_scale and (self.attn.has_fp8_kv_cache
-                                    or self.attn.has_fp4_kv_cache):
+            if self.has_quant_scale and (self.attn.has_fp8_kv_cache
+                                         or self.attn.has_fp4_kv_cache):
                 out_dtype = torch.float8_e4m3fn
         output = q.new_empty([num_tokens, hidden_size], dtype=out_dtype)
         return output
@@ -350,11 +352,7 @@ def _attn_impl(
 
         out_scale = None
         out_scale_sf = None
-        has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
-                           or self.o_proj.has_fp8_block_scales
-                           or self.o_proj.has_fp8_rowwise
-                           or self.o_proj.has_w4a8_nvfp4_fp8)
-        if has_quant_scale:
+        if self.has_quant_scale:
             out_scale = self.o_proj.inv_input_scale
         if self.o_proj.has_nvfp4 and self.support_nvfp4_output and enable_attn_nvfp4_output:
             out_scale_sf = self.o_proj.input_scale
@@ -847,6 +845,9 @@ def create_weights(self):
         self.mha.update_quant_config(self.quant_config)
         self.mqa.update_quant_config(self.quant_config)
 
+        # Although we use FP8 MLA for context/generation phase, the output is still in BF16
+        self.out_scale = None
+
         # k_b_proj_trans's dtype must be consistent with self.kv_b_proj,
         # which can be modified after __init__
         has_fp8_block_scales = (
@@ -1050,17 +1051,14 @@ def forward_context_default(
                                  self.qk_rope_head_dim)
         k = k.view(-1, self.num_heads * self.qk_head_dim)
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Currently we use BF16 MHA for context phase
-
         attn_output = self.mha.forward(
             q,
             k,
             v,
             attn_metadata,
             attention_input_type=AttentionInputType.context_only,
             latent_cache=latent_cache,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             output=output,
         )
 
@@ -1115,9 +1113,6 @@ def forward_context_with_cached_kv(
         full_kv = None
         full_k_nope = None
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Currently we use BF16 MHA for context phase
-
         # latent_cache must be None to differentiate from normal context phase,
         # so that we can skip applying RoPE and appending KV cache inside attention op
         attn_output = self.mha.forward(
@@ -1127,7 +1122,7 @@ def forward_context_with_cached_kv(
             attn_metadata,
             attention_input_type=AttentionInputType.context_only,
             latent_cache=None,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             output=output,
         )
 
@@ -1217,7 +1212,6 @@ def forward_context_with_chunked_prefill(
                 loop_idx]
             attn_metadata.host_total_kv_lens[0] = total_ctx_chunked_tokens
 
-            out_scale = None
             # do not apply mask for attention within loop
             # latent_cache must be None to differentiate from normal context phase,
             # so that we can skip applying RoPE and appending KV cache inside attention op
@@ -1228,7 +1222,7 @@ def forward_context_with_chunked_prefill(
                 attn_metadata,
                 attention_input_type=AttentionInputType.context_only,
                 latent_cache=None,
-                out_scale=out_scale,
+                out_scale=self.out_scale,
                 attention_mask=PredefinedAttentionMask.FULL,
                 softmax_stats_tensor=self.temp_softmax_stats_tensor,
                 output=temp_attn_output,
@@ -1267,9 +1261,6 @@ def forward_context_with_chunked_prefill(
                                                     num_contexts].sum().item(
                                                     )
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Currently we use BF16 MHA for context phase
-
         # latent_cache must be None to differentiate from normal context phase,
         # so that we can skip applying RoPE and appending KV cache inside attention op
         temp_attn_output = self.mha.forward(
@@ -1279,7 +1270,7 @@ def forward_context_with_chunked_prefill(
             attn_metadata,
             attention_input_type=AttentionInputType.context_only,
             latent_cache=None,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             softmax_stats_tensor=self.temp_softmax_stats_tensor,
             output=temp_attn_output,
         )
@@ -1375,16 +1366,13 @@ def forward_generation(
             self.num_heads * (self.kv_lora_rank + self.qk_rope_head_dim)
         ])
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Although we use FP8 MLA for generation phase, the output is still in BF16
-
         attn_out_latent = self.mqa.forward(
             fused_q,
             None,
            None,
             attn_metadata,
             attention_input_type=AttentionInputType.generation_only,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             latent_cache=latent_cache,  # kvcache and k_pe
             q_pe=q_pe,  # used by `invokeMLARopeGeneration`
         )
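
With the quant-scale flags now cached once in create_weights(), the output-dtype choice in create_output() reduces to a single condition. A minimal standalone sketch, reusing the attribute names from the diff (the helper function itself is illustrative, not part of the module):

import torch

def choose_out_dtype(q: torch.Tensor, has_quant_scale: bool,
                     has_fp8_kv_cache: bool, has_fp4_kv_cache: bool) -> torch.dtype:
    # FP8 output is only requested when o_proj carries a quant scale and the
    # KV cache itself is FP8/FP4; otherwise keep the query dtype (typically BF16).
    if has_quant_scale and (has_fp8_kv_cache or has_fp4_kv_cache):
        return torch.float8_e4m3fn
    return q.dtype

For the MLA path, self.out_scale is set to None once in create_weights(), so the attention output stays in BF16 even when FP8 MLA kernels are used, as the comment added in the diff notes.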

tensorrt_llm/_torch/pyexecutor/model_engine.py (1 addition & 1 deletion)

@@ -1008,7 +1008,7 @@ def init_meta_tensor(t: torch.Tensor):
 
         except Exception:
             logger.info(
-                f"Fallback to regular model init: {traceback.format_exc(limit=1)}\n"
+                f"Fallback to regular model init: {traceback.format_exc(limit=10)}\n"
             )
             model = AutoModelForCausalLM.from_config(config)

tensorrt_llm/executor/worker.py (3 additions & 0 deletions)

@@ -512,6 +512,9 @@ def _deduce_max_tokens(request: GenerationRequest,
         else:
             # use max_tokens if can't deduce default_max_tokens
             return max_tokens
+        assert (
+            len(prompt_token_ids) <= executor_config.max_seq_len
+        ), f"`prompt_token_ids` length ({len(prompt_token_ids)}) is greater than `max_seq_len` ({executor_config.max_seq_len})"
         splited_prompt_len = int(len(prompt_token_ids) / cp_size)
         default_max_tokens = max_seq_len - splited_prompt_len - query_token_len
         if default_max_tokens <= 0:
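
The new assertion protects the default max-token deduction from prompts that already exceed the engine's sequence budget. A minimal sketch of that calculation with the same variable names as the diff; the standalone function and its signature are illustrative, not the worker's actual API:

def deduce_default_max_tokens(prompt_token_ids, max_seq_len: int,
                              cp_size: int = 1, query_token_len: int = 0) -> int:
    # Fail fast if the prompt alone is longer than max_seq_len.
    assert len(prompt_token_ids) <= max_seq_len, (
        f"`prompt_token_ids` length ({len(prompt_token_ids)}) is greater than "
        f"`max_seq_len` ({max_seq_len})")
    # The prompt is split across cp_size context-parallel ranks before budgeting.
    splited_prompt_len = int(len(prompt_token_ids) / cp_size)
    return max_seq_len - splited_prompt_len - query_token_len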

tests/integration/defs/accuracy/test_llm_api_pytorch.py (13 additions & 5 deletions)

@@ -1212,7 +1212,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @skip_no_hopper
+    @skip_pre_hopper
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False, False),
@@ -1236,6 +1236,8 @@ def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             torch_compile_config=torch_compile_config,
+            moe_config=MoeConfig(
+                backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS"),
         )
 
         if fp8kv:
@@ -1311,7 +1313,7 @@ def test_cute_dsl_fp8_block_scales(
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @pytest.mark.skip_device_not_contain(["H100"])
+    @skip_pre_hopper
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
@@ -1324,6 +1326,8 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
                 max_batch_size=512,
                 enable_padding=True,
             ),
+            moe_config=MoeConfig(
+                backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS"),
         )
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                  kv_cache_config=kv_cache_config,
@@ -1334,7 +1338,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
         task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
-    @skip_no_hopper
+    @skip_pre_hopper
     @parametrize_with_ids("mtp_nextn", [0, 2])
     @parametrize_with_ids("attention_dp", [False, True])
     def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
@@ -1346,6 +1350,8 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
         pytorch_config = dict(
             disable_overlap_scheduler=False,
             cuda_graph_config=CudaGraphConfig(enable_padding=True),
+            moe_config=MoeConfig(
+                backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS"),
         )
 
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
@@ -1359,7 +1365,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
         task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
-    @skip_no_hopper
+    @skip_pre_hopper
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False, False),
@@ -1388,6 +1394,8 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             torch_compile_config=torch_compile_config,
+            moe_config=MoeConfig(
+                backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS"),
         )
 
         if fp8kv:
@@ -1474,7 +1482,7 @@ def test_cute_dsl_fp8_block_scales_4gpus(
         task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
-    @pytest.mark.skip_device_not_contain(["H100", "H200"])
+    @skip_pre_hopper
     def test_fp8_block_scales_4gpus_static_eplb(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
 
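
The updated tests also pin the MoE backend by GPU generation instead of relying on the default. A hypothetical helper showing the same selection, with torch.cuda.get_device_capability() standing in for the get_sm_version() test utility:

import torch

def select_moe_backend() -> str:
    # Same condition as the MoeConfig(backend=...) arguments in the tests:
    # DEEPGEMM on SM >= 100, CUTLASS otherwise.
    major, minor = torch.cuda.get_device_capability()
    return "DEEPGEMM" if 10 * major + minor >= 100 else "CUTLASS"

This pairs with the decorator change from @skip_no_hopper to @skip_pre_hopper, which widens these tests from Hopper-only to Hopper and newer architectures.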
