Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 88 additions & 63 deletions csrc/trtllm_fused_moe_kernel_launcher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,26 @@ inline int32_t nextPowerOfTwo(float value) {
std::set<int32_t> computeSelectedTileN(std::vector<int32_t> const& supported_tile_nums,
int64_t const num_tokens, int64_t const top_k,
int64_t const num_local_experts) {
TVM_FFI_ICHECK(!supported_tile_nums.empty()) << "supported_tile_nums must not be empty.";
float const avg_tokens_per_expert = static_cast<float>(num_tokens * top_k) / num_local_experts;
// NOTE: This differs from Python AutoTuner bucketing:
// - AutoTuner maps raw num_tokens with last_positive_power_of_2 (round-down).
// - Here we map derived avg_tokens_per_expert and use nextPowerOfTwo (round-up).
// Because they round different quantities in different directions, cache bucket and runtime
// tile candidates can diverge; launcher-side tactic resolution handles that mismatch.
// assume supported_tile_nums is sorted
int32_t tile_tokens_dim = std::clamp(nextPowerOfTwo(avg_tokens_per_expert),
supported_tile_nums.front(), supported_tile_nums.back());
auto it = std::find(supported_tile_nums.begin(), supported_tile_nums.end(), tile_tokens_dim);

FLASHINFER_CHECK(
it != supported_tile_nums.end(), "computeSelectedTileN expected exact tile ", tile_tokens_dim,
" in supported_tile_nums (size=", supported_tile_nums.size(),
"). Please keep supported_tile_nums as a dense power-of-2 ladder for this launcher.");

// Candidate tile set centered on the heuristic tile.
// This function returns nearby candidates (not a single final tile):
// center, +1, +2, and -1 neighbors when available.
// Final tile choice is made later (autotuner-provided tile if valid, otherwise fallback policy).
std::set<int32_t> selected_tile_nums;
selected_tile_nums.insert(tile_tokens_dim);
if (std::next(it) != supported_tile_nums.end()) {
Expand All @@ -106,6 +120,37 @@ std::set<int32_t> computeSelectedTileN(std::vector<int32_t> const& supported_til
return selected_tile_nums;
}

int64_t selectDefaultTileN(std::vector<int32_t> const& supported_tile_nums,
int64_t const num_tokens, int64_t const top_k,
int64_t const num_local_experts) {
auto selected = computeSelectedTileN(supported_tile_nums, num_tokens, top_k, num_local_experts);
TVM_FFI_ICHECK(!selected.empty()) << "No selected tile_N candidates for current MoE input.";
return *selected.begin();
}

// Resolve the (tile_N, config) pair passed from Python side, applying fallback logic
// when tile_N is -1.
std::pair<int64_t, int64_t> resolveMoeTileAndConfig(Array<int64_t> const& config_index,
std::vector<int32_t> const& supported_tile_nums,
int64_t const num_tokens, int64_t const top_k,
int64_t const num_local_experts) {
// Python side convention: tactic is [tile_N, config]
TVM_FFI_ICHECK(config_index.size() == 2)
<< "Invalid tactic, expected to be [tile_N, config], but got array of size "
<< config_index.size();
const int64_t tile_N = config_index[0];
const int64_t config = config_index[1];

if (tile_N == -1 || config == -1) {
// Use fallback tactic
auto const default_tile_N =
selectDefaultTileN(supported_tile_nums, num_tokens, top_k, num_local_experts);
return {default_tile_N, -1};
}

return {tile_N, config};
}

class FusedMoeLauncher {
protected:
Optional<TensorView> routing_logits;
Expand Down Expand Up @@ -1632,13 +1677,14 @@ Array<Tensor> trtllm_bf16_moe(Optional<TensorView> const& routing_logits,
// Calculate supported tile sizes
std::vector<int32_t> mSupportedTileN(Bf16MoeLauncher::mSupportedTileNums.begin(),
Bf16MoeLauncher::mSupportedTileNums.end());
std::set<int32_t> selected_tile_nums =
computeSelectedTileN(mSupportedTileN, num_tokens, top_k, local_num_experts);
// Build launchers for ALL supported tiles (not just the computeSelectedTileN subset)
// so that autotuner-cached tactics always find their tile_N in the map.
// Launcher creation is cheap (no GPU allocation until run()), so this is safe.

// Create a map of launchers for each tile size
std::unordered_map<int32_t, std::unique_ptr<Bf16MoeLauncher>> launchers_map;

for (int32_t curr_tile_N : selected_tile_nums) {
for (int32_t curr_tile_N : mSupportedTileN) {
// Create MoE arguments for this launcher
auto args = std::make_unique<tensorrt_llm::kernels::trtllmgen_moe::MoE::MoERunnerArgs>();
args->num_tokens = num_tokens;
Expand Down Expand Up @@ -1666,17 +1712,14 @@ Array<Tensor> trtllm_bf16_moe(Optional<TensorView> const& routing_logits,
launchers_map[curr_tile_N] = std::move(launcher);
}

// Extract tile_N and config from moe_tactic
int64_t tile_N = moe_tactic[0];
int64_t config = moe_tactic[1];

// Handle default case
if (tile_N == -1 || config == -1) {
tile_N = *selected_tile_nums.begin();
}
auto const [tile_N, config] =
resolveMoeTileAndConfig(moe_tactic, mSupportedTileN, num_tokens, top_k, local_num_experts);

// Get the launcher for the selected tile_N
auto& selected_launcher = launchers_map.at(tile_N);
auto launcher_it = launchers_map.find(static_cast<int32_t>(tile_N));
FLASHINFER_CHECK(launcher_it != launchers_map.end(),
"Internal error: missing BF16 MoE launcher for tile_N=", tile_N);
auto& selected_launcher = launcher_it->second;

// Run the launcher - it will create its own runner internally
return selected_launcher->run(config, enable_pdl);
Expand Down Expand Up @@ -1724,13 +1767,12 @@ Array<Tensor> trtllm_fp8_per_tensor_scale_moe(
// Calculate supported tile sizes
std::vector<int32_t> mSupportedTileN(Fp8PerTensorLauncher::mSupportedTileNums.begin(),
Fp8PerTensorLauncher::mSupportedTileNums.end());
std::set<int32_t> selected_tile_nums =
computeSelectedTileN(mSupportedTileN, num_tokens, top_k, local_num_experts);
// Build launchers for ALL supported tiles so autotuner-cached tactics always find their tile_N.

// Create a map of launchers for each tile size
std::unordered_map<int32_t, std::unique_ptr<Fp8PerTensorLauncher>> launchers_map;

for (int32_t curr_tile_N : selected_tile_nums) {
for (int32_t curr_tile_N : mSupportedTileN) {
// Create MoE arguments for this launcher
auto args = std::make_unique<tensorrt_llm::kernels::trtllmgen_moe::MoE::MoERunnerArgs>();
args->num_tokens = num_tokens;
Expand Down Expand Up @@ -1758,17 +1800,14 @@ Array<Tensor> trtllm_fp8_per_tensor_scale_moe(
launchers_map[curr_tile_N] = std::move(launcher);
}

// Extract tile_N and config from config_index
int64_t tile_N = config_index[0];
int64_t config = config_index[1];

// Handle default case
if (tile_N == -1 || config == -1) {
tile_N = *selected_tile_nums.begin();
}
auto const [tile_N, config] =
resolveMoeTileAndConfig(config_index, mSupportedTileN, num_tokens, top_k, local_num_experts);

// Get the launcher for the selected tile_N
auto& selected_launcher = launchers_map.at(tile_N);
auto launcher_it = launchers_map.find(static_cast<int32_t>(tile_N));
FLASHINFER_CHECK(launcher_it != launchers_map.end(),
"Internal error: missing FP8 per-tensor MoE launcher for tile_N=", tile_N);
auto& selected_launcher = launcher_it->second;

// Run the launcher - it will create its own runner internally
return selected_launcher->run(config, enable_pdl, use_routing_scales_on_input);
Expand Down Expand Up @@ -1844,13 +1883,12 @@ Array<Tensor> trtllm_fp8_block_scale_moe(
auto const hidden_size = hidden_states.size(1);

auto supported_tile_nums = Fp8BlockScaleLauncher::getSupportedTileNums(quantization_type);
std::set<int32_t> selected_tile_nums =
computeSelectedTileN(supported_tile_nums, num_tokens, top_k, local_num_experts);
// Build launchers for ALL supported tiles so autotuner-cached tactics always find their tile_N.

// Create a map of launchers for each tile size
std::unordered_map<int32_t, std::unique_ptr<Fp8BlockScaleLauncher>> launchers_map;

for (int32_t curr_tile_N : selected_tile_nums) {
for (int32_t curr_tile_N : supported_tile_nums) {
// Create MoE arguments for this launcher
auto args = std::make_unique<tensorrt_llm::kernels::trtllmgen_moe::MoE::MoERunnerArgs>();
args->num_tokens = num_tokens;
Expand Down Expand Up @@ -1879,17 +1917,14 @@ Array<Tensor> trtllm_fp8_block_scale_moe(
launchers_map[curr_tile_N] = std::move(launcher);
}

// Extract tile_N and config from config_index
int64_t tile_N = config_index[0];
int64_t config = config_index[1];

// Handle default case
if (tile_N == -1 || config == -1) {
tile_N = *selected_tile_nums.begin();
}
auto const [tile_N, config] = resolveMoeTileAndConfig(config_index, supported_tile_nums,
num_tokens, top_k, local_num_experts);

// Get the launcher for the selected tile_N
auto& selected_launcher = launchers_map.at(tile_N);
auto launcher_it = launchers_map.find(static_cast<int32_t>(tile_N));
FLASHINFER_CHECK(launcher_it != launchers_map.end(),
"Internal error: missing FP8 block-scale MoE launcher for tile_N=", tile_N);
auto& selected_launcher = launcher_it->second;

// Run the launcher with DeepSeek FP8 enabled - it will create its own runner internally
return selected_launcher->run(
Expand Down Expand Up @@ -1985,13 +2020,12 @@ Array<Tensor> trtllm_fp4_block_scale_moe(

// Determine supported tile sizes
std::vector<int32_t> mSupportedTileN = FP4BlockScaleLauncher::getSupportedTileNums(mDtypeAct);
std::set<int32_t> selected_tile_nums =
computeSelectedTileN(mSupportedTileN, num_tokens, top_k, local_num_experts);
// Build launchers for ALL supported tiles so autotuner-cached tactics always find their tile_N.

// Create a map of launchers for each tile size
std::unordered_map<int32_t, std::unique_ptr<FP4BlockScaleLauncher>> launchers_map;

for (int32_t curr_tile_N : selected_tile_nums) {
for (int32_t curr_tile_N : mSupportedTileN) {
// Create MoE arguments for this launcher
auto args = std::make_unique<tensorrt_llm::kernels::trtllmgen_moe::MoE::MoERunnerArgs>();
args->num_tokens = num_tokens;
Expand Down Expand Up @@ -2023,18 +2057,14 @@ Array<Tensor> trtllm_fp4_block_scale_moe(
launchers_map[curr_tile_N] = std::move(launcher);
}

// Extract tile_N and config from config_index
int64_t tile_N = config_index[0];
int64_t config = config_index[1];

// Handle default case
if (tile_N == -1 || config == -1) {
tile_N = *selected_tile_nums.begin();
config = -1; // Let the runner choose default
}
auto const [tile_N, config] =
resolveMoeTileAndConfig(config_index, mSupportedTileN, num_tokens, top_k, local_num_experts);

// Get the launcher for the selected tile_N
auto& selected_launcher = launchers_map.at(tile_N);
auto launcher_it = launchers_map.find(static_cast<int32_t>(tile_N));
FLASHINFER_CHECK(launcher_it != launchers_map.end(),
"Internal error: missing FP4 block-scale MoE launcher for tile_N=", tile_N);
auto& selected_launcher = launcher_it->second;

// Run the launcher - it will create its own runner internally
return selected_launcher->run(config, enable_pdl);
Expand Down Expand Up @@ -2078,13 +2108,12 @@ Array<Tensor> trtllm_mxint4_block_scale_moe(
// Determine supported tile sizes
std::vector<int32_t> mSupportedTileN(MxInt4BlockScaleLauncher::mSupportedTileNums.begin(),
MxInt4BlockScaleLauncher::mSupportedTileNums.end());
std::set<int32_t> selected_tile_nums =
computeSelectedTileN(mSupportedTileN, num_tokens, top_k, local_num_experts);
// Build launchers for ALL supported tiles so autotuner-cached tactics always find their tile_N.

// Create a map of launchers for each tile size
std::unordered_map<int32_t, std::unique_ptr<MxInt4BlockScaleLauncher>> launchers_map;

for (int32_t curr_tile_N : selected_tile_nums) {
for (int32_t curr_tile_N : mSupportedTileN) {
// Create MoE arguments for this launcher
auto args = std::make_unique<tensorrt_llm::kernels::trtllmgen_moe::MoE::MoERunnerArgs>();
args->num_tokens = num_tokens;
Expand Down Expand Up @@ -2112,18 +2141,14 @@ Array<Tensor> trtllm_mxint4_block_scale_moe(
launchers_map[curr_tile_N] = std::move(launcher);
}

// Extract tile_N and config from config_index
int64_t tile_N = config_index[0];
int64_t config = config_index[1];

// Handle default case
if (tile_N == -1 || config == -1) {
tile_N = *selected_tile_nums.begin();
config = -1; // Let the runner choose default
}
auto const [tile_N, config] =
resolveMoeTileAndConfig(config_index, mSupportedTileN, num_tokens, top_k, local_num_experts);

// Get the launcher for the selected tile_N
auto& selected_launcher = launchers_map.at(tile_N);
auto launcher_it = launchers_map.find(static_cast<int32_t>(tile_N));
FLASHINFER_CHECK(launcher_it != launchers_map.end(),
"Internal error: missing MXINT4 block-scale MoE launcher for tile_N=", tile_N);
auto& selected_launcher = launcher_it->second;

// Run the launcher - it will create its own runner internally
return selected_launcher->run(config, enable_pdl);
Expand Down
9 changes: 5 additions & 4 deletions flashinfer/autotuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,11 +1037,12 @@ def _find_nearest_profile(
base_profile = list(list(shape) for shape in shapes)

for spec in tuning_config.dynamic_tensor_specs:
base_profile[spec.input_idx[0]][spec.dim_idx[0]] = (
spec.map_to_tuning_buckets(
base_profile[spec.input_idx[0]][spec.dim_idx[0]]
)
mapped_val = spec.map_to_tuning_buckets(
base_profile[spec.input_idx[0]][spec.dim_idx[0]]
)
# Apply the same mapped bucket to all linked dimensions in this spec.
for input_i, dim_i in zip(spec.input_idx, spec.dim_idx, strict=True):
base_profile[input_i][dim_i] = mapped_val

# associated dimensions dependent on other free dynamic dimensions, so assign -1 in the profile
for constraint_spec in tuning_config.constraint_specs:
Expand Down
Loading
Loading