tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py (10 additions & 1 deletion)
@@ -104,9 +104,18 @@ def trtllm_quant_fp8_linear(
     assert input_scale is not None
     input_fp8, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor(input, input_scale)

+    enable_cuda_core = False
+    if torch.cuda.is_available():
+        capability = torch.cuda.get_device_capability(torch.device("cuda:0"))
+        # enable cuda core for sm89 and sm120
+        enable_cuda_core = (capability[0] == 8 and capability[1] == 9) or (
+            capability[0] == 12 and capability[1] == 0
+        )
     # Use TensorRT-LLM FP8 scaled matrix multiply
     # Choose between CUDA core (for small M) and cuBLAS (for large M) implementations
-    if input_fp8.shape[0] <= 8:  # NOTE: this kernel works with n % 2 == 0 as well??
+    if (
+        input_fp8.shape[0] <= 8 and enable_cuda_core
+    ):  # NOTE: this kernel works with n % 2 == 0 as well??
         # Use CUDA core for small M dimension (better for small batch sizes)
         output = torch.ops.trtllm.cuda_scaled_mm(
             input_fp8,
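The gist of this hunk, as a standalone sketch (not the exact TRT-LLM helper): the CUDA-core FP8 GEMM path is now taken only when the batch dimension M is small and the device reports compute capability 8.9 or 12.0. The helper name and the single-device assumption below are illustrative only.

import torch

def _cuda_core_fp8_enabled(device: str = "cuda:0") -> bool:
    # CUDA-core FP8 path is restricted to sm89 and sm120 devices.
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(torch.device(device))
    return (major, minor) in ((8, 9), (12, 0))

# Dispatch then reads: use torch.ops.trtllm.cuda_scaled_mm only when
# input_fp8.shape[0] <= 8 and _cuda_core_fp8_enabled(); otherwise fall back to
# the cuBLAS-based implementation further down in the file (not shown in this hunk).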
tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py (29 additions & 0 deletions)
@@ -88,6 +88,34 @@ def _nemotron_h_block_forward(
     return hidden_states


+def _nemotron_h_topk_router_forward(self, hidden_states):
+    """
+    Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel.
+
+    This replaces the original forward method, which used pure PyTorch operations,
+    with a fused CUDA kernel that performs:
+    1. Sigmoid activation of logits
+    2. Group-based expert selection
+    3. Top-k selection within selected groups
+    4. Normalized weight computation
+    """
+    hidden_states = hidden_states.view(-1, self.config.hidden_size)
+    router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
+
+    # Use the fused noaux_tc_op kernel which applies sigmoid internally
+    # and performs group-based top-k selection with normalization
+    topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op(
+        router_logits,
+        self.e_score_correction_bias,
+        self.n_group,
+        self.topk_group,
+        self.top_k,
+        self.routed_scaling_factor,
+    )
+
+    return topk_indices, topk_weights
+
+
 # Note: we assume experts have no bias for now
 def _nemotron_h_moe_forward(self, hidden_states: torch.Tensor):
     """
@@ -138,6 +166,7 @@ def _nemotron_h_moe_forward(self, hidden_states: torch.Tensor):
     ],
     "NemotronHBlock": [("forward", _nemotron_h_block_forward)],
     "NemotronHMOE": [("forward", _nemotron_h_moe_forward)],
+    "NemotronHTopkRouter": [("forward", _nemotron_h_topk_router_forward)],
 }
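The machinery that consumes this patch table is outside the hunk. The sketch below only illustrates how such a registry is typically applied; the helper name and class lookup are assumptions, not the actual auto_deploy code.

def apply_forward_patches(classes_by_name, patch_table):
    """Replace the listed methods on each model class with the patched versions."""
    for cls_name, patches in patch_table.items():
        cls = classes_by_name.get(cls_name)
        if cls is None:
            continue  # model variant without this module type
        for method_name, replacement in patches:
            # e.g. NemotronHTopkRouter.forward -> _nemotron_h_topk_router_forward
            setattr(cls, method_name, replacement)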


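For reference, a rough pure-PyTorch sketch of what the fused noaux_tc_op kernel used in _nemotron_h_topk_router_forward above is expected to compute, in the style of DeepSeek-V3 no-aux-loss routing. The top-2-per-group score, the correction bias affecting only selection, and the final scaling step are assumptions here, not taken from this PR.

import torch

def topk_router_reference(router_logits, e_score_correction_bias,
                          n_group, topk_group, top_k, routed_scaling_factor):
    scores = router_logits.sigmoid()                              # 1. sigmoid activation of logits
    scores_for_choice = scores + e_score_correction_bias          # bias only influences selection
    num_tokens, num_experts = scores.shape
    grouped = scores_for_choice.view(num_tokens, n_group, -1)
    group_scores = grouped.topk(2, dim=-1).values.sum(dim=-1)     # 2. score each expert group
    group_idx = group_scores.topk(topk_group, dim=-1).indices     # keep the topk_group best groups
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1.0)
    expert_mask = group_mask.unsqueeze(-1).expand_as(grouped).reshape(num_tokens, num_experts)
    masked = scores_for_choice.masked_fill(expert_mask == 0, float("-inf"))
    topk_indices = masked.topk(top_k, dim=-1).indices             # 3. top-k within kept groups
    topk_weights = scores.gather(1, topk_indices)
    topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)  # 4. normalize
    return topk_weights * routed_scaling_factor, topk_indices     # same output order as noaux_tc_op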