Skip to content

Commit e09cebe

Browse files
committed
Fix build issues
Signed-off-by: djns99 <[email protected]>
1 parent 2580782 commit e09cebe

File tree

9 files changed

+63
-34
lines changed

9 files changed

+63
-34
lines changed

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ auto listAllTactics(MoeGemmId gemm_id)
5050
}
5151

5252
template <class BenchClass>
53-
void parseTacticToVectorID(nlohmann::json& tactic, std::vector<int>& tactic_ids)
53+
void parseTacticToVectorID(nlohmann::json& tactic, std::vector<int>& tactic_ids, MoeGemmId gemm_id)
5454
{
5555
if (tactic.is_number_integer())
5656
{
@@ -60,7 +60,7 @@ void parseTacticToVectorID(nlohmann::json& tactic, std::vector<int>& tactic_ids)
6060
{
6161
for (auto c : tactic)
6262
{
63-
parseTacticToVectorID<BenchClass>(c, tactic_ids);
63+
parseTacticToVectorID<BenchClass>(c, tactic_ids, gemm_id);
6464
}
6565
}
6666
else if (tactic.is_string())
@@ -69,7 +69,7 @@ void parseTacticToVectorID(nlohmann::json& tactic, std::vector<int>& tactic_ids)
6969
auto tactic_name = tactic.get<std::string>();
7070
if (tactic_name == "all")
7171
{
72-
auto all_tactics = listAllTactics<BenchClass>();
72+
auto all_tactics = listAllTactics<BenchClass>(gemm_id);
7373
tactic_ids.resize(all_tactics.size());
7474
std::iota(tactic_ids.begin(), tactic_ids.end(), 0);
7575
}
@@ -291,9 +291,14 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark)
291291
{
292292
printed = true;
293293
std::cerr << __PRETTY_FUNCTION__ << ": Valid Tactics are:\n";
294-
auto confs = listAllTactics<BenchClass>();
295-
for (auto c : confs)
296-
std::cerr << c.toString();
294+
for (auto gemm_id : {MoeGemmId::GEMM_1, MoeGemmId::GEMM_2})
295+
{
296+
std::cerr << "GEMM " << (int) gemm_id << ":\n";
297+
auto confs = listAllTactics<BenchClass>(gemm_id);
298+
for (auto c : confs)
299+
std::cerr << c.toString();
300+
std::cerr << std::endl;
301+
}
297302
}
298303

299304
continue;

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -432,7 +432,14 @@ struct CutlassGemmConfig
432432
bool enableCudaKernel = false;
433433
int sm_version = 80; // Use 80 as a catch all for <90
434434
bool is_tma_warp_specialized = false;
435-
bool is_finalize_fusion = false;
435+
436+
enum class EpilogueFusionType : int
437+
{
438+
NONE,
439+
FINALIZE
440+
};
441+
442+
EpilogueFusionType epilogue_fusion_type = EpilogueFusionType::NONE;
436443

437444
CutlassGemmConfig() = default;
438445

@@ -504,7 +511,7 @@ struct CutlassGemmConfig
504511
<< "\n\tcluster shape ID: " << (int) cluster_shape
505512
<< "\n\tmainloop sched: " << (int) mainloop_schedule << "\n\tepi sched: " << (int) epilogue_schedule
506513
<< "\n\tenable cuda kernel: " << (enableCudaKernel ? "true" : "false")
507-
<< "\n\tis_finalize_fusion: " << (is_finalize_fusion ? "true" : "false");
514+
<< "\n\tepilogue fusion type: " << (int) epilogue_fusion_type;
508515
}
509516
else if (tile_config_sm80 != tensorrt_llm::cutlass_extensions::CutlassTileConfig::ChooseWithHeuristic)
510517
{
@@ -536,7 +543,8 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
536543
<< ", mainloop_schedule_enum: " << int(config.mainloop_schedule)
537544
<< ", epilogue_schedule_enum: " << int(config.epilogue_schedule)
538545
<< ", cluster_shape_enum: " << int(config.cluster_shape)
539-
<< ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
546+
<< ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false")
547+
<< ", epilogue_fusion_type: " << int(config.epilogue_fusion_type);
540548
}
541549
else
542550
{

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -297,7 +297,13 @@ class MoeGemmRunner
297297
static std::vector<cutlass_extensions::CutlassGemmConfig> getAmpereConfigs(int sm);
298298

299299
[[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const;
300-
[[nodiscard]] bool supportsTmaWarpSpecialized() const;
300+
301+
[[nodiscard]] bool supportsTmaWarpSpecialized() const
302+
{
303+
return supportsTmaWarpSpecialized(sm_);
304+
}
305+
306+
[[nodiscard]] static bool supportsTmaWarpSpecialized(int sm);
301307
[[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config,
302308
ActivationType activation_type, int gemm_n, int gemm_k) const;
303309
[[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const;

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -600,8 +600,8 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
600600
gemm2_config_ = std::move(gemm2_config);
601601
}
602602

603-
static auto& addFinalizeFusionConfigs(
604-
std::vector<cutlass_extensions::CutlassGemmConfig>& configs, bool use_fused_finalize)
603+
static auto addFinalizeFusionConfigs(
604+
std::vector<cutlass_extensions::CutlassGemmConfig>&& configs, bool use_fused_finalize)
605605
{
606606
if (!use_fused_finalize)
607607
return configs;
@@ -612,22 +612,24 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
612612
if (configs[i].is_tma_warp_specialized)
613613
{
614614
configs.push_back(configs[i]);
615-
configs.back().is_finalize_fusion = true;
615+
configs.back().epilogue_fusion_type
616+
= cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE;
616617
}
617618
}
618619
return configs;
619620
}
620621

621622
std::vector<cutlass_extensions::CutlassGemmConfig> getTactics(MoeGemmId gemm_id) override
622623
{
623-
return addFinalizeFusionConfigs(
624+
return Self::addFinalizeFusionConfigs(
624625
moe_gemm_runner_.getConfigs(), gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused());
625626
}
626627

627628
static std::vector<cutlass_extensions::CutlassGemmConfig> getTactics(int sm, MoeGemmId gemm_id)
628629
{
629630
using RunnerType = decltype(moe_gemm_runner_);
630-
return RunnerType::getConfigs(sm, gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused(sm));
631+
return Self::addFinalizeFusionConfigs(
632+
RunnerType::getConfigs(sm), gemm_id == MoeGemmId::GEMM_2 && Self::mayHaveFinalizeFused(sm));
631633
}
632634

633635
void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf,

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2847,8 +2847,10 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
28472847
expert_first_token_offset_ = getWsPtr(int64_t{}, "expert_first_token_offset");
28482848

28492849
// We check if the provided config uses fused finalize and disable it if it does not
2850+
bool gemm2_using_finalize_fusion
2851+
= gemm2_config_->epilogue_fusion_type == cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE;
28502852
permuted_token_final_scales_
2851-
= gemm2_config_->using_fused_finalize ? getWsPtr(float{}, "permuted_token_final_scales") : nullptr;
2853+
= gemm2_using_finalize_fusion ? getWsPtr(float{}, "permuted_token_final_scales") : nullptr;
28522854

28532855
bool const is_gated_activation = isGatedActivation(activation_type);
28542856
bool const gemm1_using_fused_moe
@@ -4005,9 +4007,11 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
40054007

40064008
bool apply_bias = parallelism_config.tp_rank == 0;
40074009
auto* fc2_bias = apply_bias ? fc2_expert_biases : nullptr;
4010+
bool gemm2_using_finalize_fusion = gemm2_config_->epilogue_fusion_type
4011+
== cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE;
40084012
bool using_fused_finalize
4009-
= use_fused_finalize_ && gemm2_config_->is_finalize_fusion && !use_w4_groupwise && !use_lora;
4010-
TLLM_CHECK_WITH_INFO(using_fused_finalize == gemm2_config_->using_fused_finalize,
4013+
= use_fused_finalize_ && gemm2_using_finalize_fusion && !use_w4_groupwise && !use_lora;
4014+
TLLM_CHECK_WITH_INFO(using_fused_finalize == gemm2_using_finalize_fusion,
40114015
"GEMM2 tactic requests finalize fusion, but the runner is not configured to use it");
40124016
if (using_fused_finalize)
40134017
{

cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -290,13 +290,7 @@ class MoeGemmRunner
290290
static std::vector<cutlass_extensions::CutlassGemmConfig> getAmpereConfigs(int sm);
291291

292292
[[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const;
293-
294-
[[nodiscard]] bool supportsTmaWarpSpecialized() const
295-
{
296-
return supportsTmaWarpSpecialized(sm_);
297-
}
298-
299-
[[nodiscard]] static bool supportsTmaWarpSpecialized(int sm);
293+
[[nodiscard]] bool supportsTmaWarpSpecialized() const;
300294
[[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config,
301295
ActivationType activation_type, int gemm_n, int gemm_k) const;
302296
[[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const;

cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ namespace kernels = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE;
4343
using MoeMinLatencyParams = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::MoeMinLatencyParams;
4444
using MOEParallelismConfig = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::MOEParallelismConfig;
4545
using QuantParams = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::QuantParams;
46+
using MoeGemmId = CUTLASS_MOE_GEMM_NAMESPACE::MoeGemmId;
4647
using ActivationType = CUTLASS_MOE_GEMM_NAMESPACE::ActivationType;
4748
using ActivationParams = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::ActivationParams;
4849
using TmaWarpSpecializedGroupedGemmInput = CUTLASS_MOE_GEMM_NAMESPACE::TmaWarpSpecializedGroupedGemmInput;

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@ namespace common = tensorrt_llm::common;
4848
namespace kernels = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE;
4949
using ActivationParams = CUTLASS_MOE_GEMM_NAMESPACE::ActivationParams;
5050
using ActivationType = CUTLASS_MOE_GEMM_NAMESPACE::ActivationType;
51+
using MoeGemmId = CUTLASS_MOE_GEMM_NAMESPACE::MoeGemmId;
5152
// Always use public header as it is just utility functions and types
5253
using TmaWarpSpecializedGroupedGemmInput = tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput;
5354
using profiler_backend = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::GemmProfilerBackend;
@@ -571,9 +572,10 @@ class FusedMoeRunner : public torch::CustomClassHolder
571572
return std::make_tuple(output, num_active_experts_per_node, experts_to_token_score, active_expert_global_ids);
572573
}
573574

574-
int64_t getTacticNum(int gemm_idx)
575+
int64_t getTacticNum(int64_t const gemm_idx)
575576
{
576577
std::lock_guard<std::mutex> lock(mMutex);
578+
TORCH_CHECK(gemm_idx == 1 || gemm_idx == 2, "gemm_idx must be 1 or 2");
577579
return (gemm_idx == 1) ? mGemm1Profiles.size() : mGemm2Profiles.size();
578580
}
579581

cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1120,15 +1120,17 @@ protected:
11201120
auto selectTacticsForArch(int sm)
11211121
{
11221122
bool is_tma_warp_specialized = sm >= 90 && !INT_QUANT;
1123-
bool is_finalize_fusion = is_tma_warp_specialized && mUseFusedFinalize;
1123+
bool epilogue_fusion_type = is_tma_warp_specialized && mUseFusedFinalize
1124+
? cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE
1125+
: cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::NONE;
11241126
auto tactics1 = getFilteredConfigs(sm, MoeGemmId::GEMM_1);
11251127
auto tactics2 = getFilteredConfigs(sm, MoeGemmId::GEMM_2);
11261128
auto it1 = std::find_if(tactics1.begin(), tactics1.end(),
11271129
[is_tma_warp_specialized](auto& c) { return c.is_tma_warp_specialized == is_tma_warp_specialized; });
11281130
auto it2 = std::find_if(tactics2.begin(), tactics2.end(),
1129-
[is_tma_warp_specialized, is_finalize_fusion](auto& c) {
1131+
[is_tma_warp_specialized, epilogue_fusion_type](auto& c) {
11301132
return c.is_tma_warp_specialized == is_tma_warp_specialized
1131-
&& c.using_fused_finalize == is_finalize_fusion;
1133+
&& c.epilogue_fusion_type == epilogue_fusion_type;
11321134
});
11331135
if (it1 == tactics1.end() || it2 == tactics2.end())
11341136
{
@@ -1175,7 +1177,7 @@ protected:
11751177
if (!tactic1 || !tactic2)
11761178
{
11771179
int sm = getSMVersion();
1178-
std::tie(tactic1, tactic2) = selectTacticsForArch(sm, mUseFusedFinalize);
1180+
std::tie(tactic1, tactic2) = selectTacticsForArch(sm);
11791181
}
11801182
ASSERT_TRUE(tactic1.has_value());
11811183
ASSERT_TRUE(tactic2.has_value());
@@ -1637,8 +1639,9 @@ void MixtureOfExpertsTest<TypeParam_>::BasicPermuteTest(
16371639
auto [expected_experts, token_final_scales] = populateRouting(num_experts, num_tokens, k);
16381640
16391641
runMoEPermute(hidden_input, expected_experts, token_final_scales, hidden_size, num_experts, k);
1640-
bool should_be_deterministic
1641-
= !gemm2.is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
1642+
bool is_finalize_fusion
1643+
= gemm2.epilogue_fusion_type == cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE;
1644+
bool should_be_deterministic = !is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
16421645
if (should_be_deterministic && !mIsLongTest)
16431646
{
16441647
auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize);
@@ -1904,8 +1907,10 @@ void MixtureOfExpertsTest<TypeParam_>::ParallelismTest(
19041907
// Only need to init the inputs on the first iteration
19051908
runMoEPermute(hidden_input, expected_experts, token_final_scales, hidden_size, num_experts, k,
19061909
MOEParallelismConfig{tp_size, i, ep_size, j}, enable_alltoall);
1910+
bool is_finalize_fusion = gemm2.epilogue_fusion_type
1911+
== cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE;
19071912
bool should_be_deterministic
1908-
= !gemm2.is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
1913+
= !is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
19091914
if (should_be_deterministic && !mIsLongTest)
19101915
{
19111916
auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize);
@@ -1920,8 +1925,10 @@ void MixtureOfExpertsTest<TypeParam_>::ParallelismTest(
19201925
else
19211926
{
19221927
runMoEPermute(MOEParallelismConfig{tp_size, i, ep_size, j}, enable_alltoall);
1928+
bool is_finalize_fusion = gemm2.epilogue_fusion_type
1929+
== cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE;
19231930
bool should_be_deterministic
1924-
= !gemm2.is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
1931+
= !is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
19251932
if (should_be_deterministic && !mIsLongTest)
19261933
{
19271934
auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize);

0 commit comments

Comments (0)