Merged
Changes from all commits
21 commits
bbd3e4c
[https://nvbugs/5569713][fix] Disable fp8 deep gemm for EXAONE-4.0-32…
JunyiXu-nv Oct 21, 2025
a7cda1d
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more…
SimengLiu-nv Oct 22, 2025
67177a2
[https://nvbugs/5504095][fix] Unwaive test_user_specify_workspace cas…
nv-guomingz Oct 22, 2025
c44f102
[https://nvbugs/5546510][fix] Move torch.cuda.Stream out of torch com…
liji-nv Oct 22, 2025
8a1d383
[https://nvbugs/5565549][fix] unwaive test_disaggregated_spec_dec_bat…
bo-nv Oct 22, 2025
6e13411
[https://nvbugs/5575829][fix] Unwaive gpt-oss test (#8576)
LinPoly Oct 22, 2025
706dfb5
[https://nvbugs/5569754][fix] trtllm-llmapi-launch port conflict (#8…
Superjomn Oct 23, 2025
385cb92
[https://nvbugs/5582277][fix] rework DisaggPPTerminationHandler to fi…
reasonsolo Oct 23, 2025
b958574
[https://nvbugs/5575902][fix] set max_batch_size=1 to stabilize accur…
reasonsolo Oct 23, 2025
9c721e0
[https://nvbugs/5587456][fix] Remove multimodal test cases using TRT …
jieli-matrix Oct 24, 2025
14b6df1
[None][test] Clean cache for certain easily hang cases (#8619)
crazydemo Oct 24, 2025
95f676e
[https://nvbugs/5608489][fix] Fix output unpack issues for Llama3/4 N…
hyukn Oct 28, 2025
682e956
[https://nvbugs/5572320][fix] Ported test_ad_trtllm_bench.py from mai…
MrGeva Oct 28, 2025
17459b7
[https://nvbugs/5564465][fix] Overwrite only if default_max_tokens is…
LinPoly Oct 28, 2025
9bbb404
[https://nvbugs/5578175][fix] Fix block range index (#8470)
chuangz0 Oct 28, 2025
49961ca
[https://nvbugs/5601203] [fix]Restrict fp8 blockscale moe case (#8583)
VALLIS-NERIA Oct 29, 2025
5ceb732
[https://nvbugs/5575841] [test] Move test_moe.py to serial tests to i…
DomBrown Oct 30, 2025
a95598b
[https://nvbugs/5488118][fix] Unwaive passed tests (#8758)
liji-nv Oct 31, 2025
b75bf44
[None][infra] Remove invaild waived tests which not in release branch…
ZhanruiSunCh Oct 31, 2025
9010698
[https://nvbugs/5325296][fix] Enable relaxed acceptance test on Black…
Barry-Delaney Oct 31, 2025
7dec606
[None][chore] Update linter rules for mass integration
mikeiovine Nov 4, 2025
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1059,7 +1059,6 @@ common-files: &common_files |
tests/unittest/_torch/thop/parallel/test_logits_bitmask_op.py |
tests/unittest/_torch/thop/parallel/test_mamba_conv1d_op.py |
tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py |
tests/unittest/_torch/thop/parallel/test_moe.py |
tests/unittest/_torch/thop/parallel/test_noaux_tc.py |
tests/unittest/_torch/thop/parallel/test_scaled_mm.py |
tests/unittest/_torch/thop/parallel/test_selective_scan_op.py |
@@ -1071,6 +1070,7 @@ common-files: &common_files |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py |
tests/unittest/_torch/thop/serial/test_moe_alltoall.py |
tests/unittest/_torch/thop/serial/test_moe.py |
tests/unittest/api_stability/api_stability_core.py |
tests/unittest/api_stability/test_llm_api.py |
tests/unittest/bindings/binding_test_utils.py |
10 changes: 8 additions & 2 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2145,7 +2145,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
return 0;
}

auto const numCurrTokens = mSequences.at(req.mRequestId).getNumTokens();
auto const numCurrTokens = getSequence(req.mRequestId).getNumTokens();
auto const generatedTokens = numCurrTokens - req.getPromptLen();
auto const maxTokensToAddToKVCache = req.mMaxNewTokens - generatedTokens;
auto const tokensPerStep = req.getNumDraftTokens() + 1;
@@ -2409,7 +2409,13 @@ void KVCacheManager::addSequence(
void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
{
auto const requestId = llmRequest.mRequestId;
if (mSequences.find(requestId) != mSequences.end())
bool found = false;
{
// protect the mSequences
std::scoped_lock lock(mSequencesMtx);
found = mSequences.find(requestId) != mSequences.end();
}
if (found)
{
auto& sequence = getSequence(requestId);
if (mEnableBlockReuse && !llmRequest.isDummyRequest())
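Note on the change above: the mutex is held only for the membership check, and the follow-up work goes through getSequence(), which takes the lock itself. The Python sketch below is illustrative only and not part of the diff; the class and method names are hypothetical, it just mirrors the locking pattern.

import threading

class SequenceTableSketch:
    """Minimal stand-in for KVCacheManager's sequence bookkeeping."""

    def __init__(self):
        self._sequences = {}                     # request_id -> sequence state
        self._sequences_lock = threading.Lock()

    def get_sequence(self, request_id):
        # Accessor that locks internally, analogous to getSequence() above.
        with self._sequences_lock:
            return self._sequences[request_id]

    def store_context_blocks(self, request_id):
        # Hold the lock only for the membership check ...
        with self._sequences_lock:
            found = request_id in self._sequences
        # ... and do the longer-running work outside the critical section.
        if found:
            sequence = self.get_sequence(request_id)
            # store/reuse blocks for `sequence` here
            return sequence
        return None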
13 changes: 13 additions & 0 deletions jenkins/L0_Test.groovy
@@ -2209,6 +2209,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def noIsolateTests = false
def rerunFailed = false

echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

def extraInternalEnv = ""
def pytestTestTimeout = "3600"

// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
// Enable NCCL debug information for multi-GPU tests
extraInternalEnv += " NCCL_DEBUG=INFO"

def testDBList = renderTestDB(testList, llmSrc, stageName)

// Process shard test list and create separate files for regular and isolate tests
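The three additions above end up as plain environment variables on the test process: a TRT builder thread-pool cap, a C++ test timeout aligned with the pytest timeout, and NCCL debug logging. Below is a hedged Python sketch of an equivalent launcher; it is hypothetical (the real wiring stays in the Groovy stage) and assumes the pytest-timeout plugin is installed.

import os
import subprocess

# Stand-ins for the Jenkins variables used above.
TESTER_CORES = 8
PYTEST_TEST_TIMEOUT = "3600"

env = dict(os.environ)
# Cap the TRT engine-building thread pool so it does not take half the host cores.
env["__LUNOWUD"] = f"-thread_pool_size={TESTER_CORES}"
# Override the internal C++ test timeout to match the pytest timeout.
env["CPP_TEST_TIMEOUT_OVERRIDDEN"] = PYTEST_TEST_TIMEOUT
# Collect NCCL debug information for multi-GPU tests.
env["NCCL_DEBUG"] = "INFO"

subprocess.run(["pytest", f"--timeout={PYTEST_TEST_TIMEOUT}"], env=env, check=False)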
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1099,7 +1099,6 @@ exclude = [
"tests/unittest/_torch/thop/parallel/test_logits_bitmask_op.py",
"tests/unittest/_torch/thop/parallel/test_mamba_conv1d_op.py",
"tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py",
"tests/unittest/_torch/thop/parallel/test_moe.py",
"tests/unittest/_torch/thop/parallel/test_noaux_tc.py",
"tests/unittest/_torch/thop/parallel/test_scaled_mm.py",
"tests/unittest/_torch/thop/parallel/test_selective_scan_op.py",
@@ -1111,6 +1110,7 @@ exclude = [
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py",
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py",
"tests/unittest/_torch/thop/serial/test_moe_alltoall.py",
"tests/unittest/_torch/thop/serial/test_moe.py",
"tests/unittest/api_stability/api_stability_core.py",
"tests/unittest/api_stability/test_llm_api.py",
"tests/unittest/bindings/binding_test_utils.py",
12 changes: 4 additions & 8 deletions tensorrt_llm/_torch/compilation/backend.py
@@ -51,9 +51,7 @@ def __init__(
self.capture_num_tokens = sorted(capture_num_tokens or [])
self.piecewise_cuda_graph = enable_piecewise_cuda_graph
self.no_optimization = False
# We only need to create aux streams.
self.aux_streams = Backend.Streams(
[torch.cuda.Stream() for _ in range(max_num_streams - 1)])
self.num_streams = max_num_streams
self.events = Backend.Events()
inductor_config.enable_auto_functionalized_v2 = False

@@ -109,10 +107,8 @@ def optimize(
# Do not apply multi-stream if enable piecewise cuda graph or inductor
# For piecewise cuda graph, we will apply the multi-stream optimization in piecewise_optimizer
# For inductor, we do not control the passes inside inductor.
if len(
self.aux_streams
) > 0 and not self.piecewise_cuda_graph and not self.enable_inductor:
num_events = multi_stream_schedule(gm, len(self.aux_streams) + 1)
if self.num_streams > 1 and not self.piecewise_cuda_graph and not self.enable_inductor:
num_events = multi_stream_schedule(gm, self.num_streams)
self.generate_events(num_events)

gm.recompile()
@@ -125,7 +121,7 @@ def optimize(
self.input_num_tokens,
self.capture_num_tokens,
self._graph_pool_handle,
len(self.aux_streams) + 1,
self.num_streams,
)
self.generate_events(num_events)
return gm
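With this change the backend only records the requested stream count and no longer owns the CUDA streams (the companion change in model_engine.py further down creates them). The gating condition reduces to a comparison on num_streams; the minimal sketch below restates that decision with the scheduler stubbed out.

def multi_stream_schedule(gm, total_streams):
    # Stand-in for the real scheduler; returns how many sync events it created.
    return 0

def maybe_multi_stream(gm, num_streams, piecewise_cuda_graph, enable_inductor):
    """Apply multi-stream scheduling only on the path the backend controls.

    num_streams already counts the main stream plus the auxiliary ones, so the
    old expression len(aux_streams) + 1 becomes num_streams directly.
    """
    if num_streams > 1 and not piecewise_cuda_graph and not enable_inductor:
        return multi_stream_schedule(gm, num_streams)
    return 0  # no extra events needed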
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/distributed/communicator.py
@@ -405,8 +405,8 @@ def tp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
def pp_allgather(self, obj):
return self.pp_comm.allgather(obj)

def pp_gather(self, obj):
return self.pp_comm.gather(obj)
def pp_gather(self, obj, root=0):
return self.pp_comm.gather(obj, root=root)

def pp_broadcast(self, obj, root=0):
return self.pp_comm.bcast(obj, root)
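pp_gather now forwards an explicit root, matching mpi4py gather semantics: only the root rank receives the gathered list, every other rank gets None. A usage sketch follows; dist and its pp_rank/pp_size accessors are assumptions standing in for the surrounding communicator object.

# dist is a communicator wrapper exposing pp_gather(); names here are illustrative.
local_stats = {"generated_tokens": 128}

gathered = dist.pp_gather(local_stats, root=0)
if dist.pp_rank == 0:
    # Root rank: one entry per pipeline-parallel rank.
    assert len(gathered) == dist.pp_size
else:
    # Non-root ranks receive None from gather.
    assert gathered is None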
14 changes: 13 additions & 1 deletion tensorrt_llm/_torch/models/modeling_exaone4.py
@@ -5,6 +5,7 @@

from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.quantization import QuantAlgo

from ..attention_backend import AttentionMetadata
from ..attention_backend.interface import (PositionalEmbeddingParams,
@@ -54,7 +55,8 @@ class Exaone4Attention(QKNormRoPEAttention):
def __init__(self,
model_config: ModelConfig[Exaone4Config],
layer_idx: Optional[int] = None,
fuse_qk_norm_rope: bool = False):
fuse_qk_norm_rope: bool = False,
disable_deep_gemm: bool = False):
config = model_config.pretrained_config

self.attention_window_size = None
@@ -88,6 +90,7 @@ def __init__(self,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
disable_deep_gemm=disable_deep_gemm,
)

def forward(
@@ -128,9 +131,17 @@ def __init__(
self.is_quanted = model_config.quant_config and model_config.quant_config.quant_mode.has_any_quant(
)

disable_deep_gemm = False
quant_config = getattr(model_config, "quant_config", None)
if quant_config is not None:
# EXAONE4 fp8 has an illegal memory access issue with deep_gemm.
disable_deep_gemm = getattr(quant_config, "quant_algo",
None) == QuantAlgo.FP8_BLOCK_SCALES

self.self_attn = Exaone4Attention(
model_config,
layer_idx=layer_idx,
disable_deep_gemm=disable_deep_gemm,
)

self.mlp = GatedMLP(
@@ -140,6 +151,7 @@ def __init__(
dtype=config.torch_dtype,
config=model_config,
layer_idx=layer_idx,
disable_deep_gemm=disable_deep_gemm,
)

self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size,
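The detection added above is a single attribute chain on the model config; the resulting flag is then threaded through both the attention and the MLP constructors. A hedged sketch of the same check as a standalone helper (the helper name is hypothetical):

from tensorrt_llm.quantization import QuantAlgo

def should_disable_deep_gemm(model_config) -> bool:
    """EXAONE-4.0 FP8 block-scale checkpoints hit an illegal memory access in
    deep_gemm, so those layers fall back to the non-deep_gemm GEMM path."""
    quant_config = getattr(model_config, "quant_config", None)
    if quant_config is None:
        return False
    return getattr(quant_config, "quant_algo", None) == QuantAlgo.FP8_BLOCK_SCALES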
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/models/modeling_llama.py
@@ -600,7 +600,7 @@ def forward(
))

# Unpack the allreduce output
if self.next_attn is not None and self.is_nvfp4:
if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = allreduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
@@ -791,7 +791,7 @@ def forward(
scale=scale,
eps=self.next_layer_layernorm.variance_epsilon,
))
if self.next_attn is not None and self.is_nvfp4:
if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = all_reduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
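Both hunks replace the indirect check (next_attn is not None and is_nvfp4) with a check on the fusion op that was actually recorded, so the unpacking always matches how the fused allreduce packed its output. The schematic sketch below restates that branch; the enum value is passed in as a parameter rather than imported, to avoid guessing the import path.

def unpack_allreduce_output(allreduce_output, fusion_op, nvfp4_fusion_op):
    """Unpack according to the recorded fusion op.

    nvfp4_fusion_op stands for AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4;
    the FP4 path returns (quantized activations, scale factors, residual),
    every other path returns (hidden states, residual).
    """
    if fusion_op == nvfp4_fusion_op:
        act_fp4, act_sf, residual = allreduce_output
        hidden_states = (act_fp4, act_sf)  # wrapped in Fp4QuantizedTensor in the real code
    else:
        hidden_states, residual = allreduce_output
    return hidden_states, residual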
7 changes: 5 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -268,6 +268,10 @@ def __init__(
use_ub = not use_ub_for_nccl and (
torch_compile_enable_userbuffers
and self._init_userbuffers(self.model.config.hidden_size))
self.backend_num_streams = Backend.Streams([
torch.cuda.Stream()
for _ in range(torch_compile_max_num_streams - 1)
])
self._torch_compile_backend = Backend(
torch_compile_inductor_enabled,
enable_userbuffers=use_ub,
@@ -2658,8 +2662,7 @@ def model_forward(self, **kwargs):
if self._torch_compile_backend is not None:
# Register aux streams and events to model extra attrs.
# The streams and events are list which could be updated during compilation.
attrs["aux_streams"] = weakref.ref(
self._torch_compile_backend.aux_streams)
attrs["aux_streams"] = weakref.ref(self.backend_num_streams)
attrs["events"] = weakref.ref(self._torch_compile_backend.events)
attrs["global_stream"] = torch.cuda.current_stream()

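Here the engine owns the auxiliary CUDA streams and exposes them to the model through a weak reference, while the backend only receives a count. Plain Python lists cannot be weak-referenced, which is presumably why Backend.Streams is a list subclass. A small sketch of that detail (stream objects are stand-ins):

import weakref

class Streams(list):
    """list subclass so instances accept weak references (a bare list raises TypeError)."""

# Hypothetical stand-ins for torch.cuda.Stream objects.
aux_streams = Streams(object() for _ in range(3))  # max_num_streams - 1 auxiliary streams

attrs = {}
attrs["aux_streams"] = weakref.ref(aux_streams)

# Dereferencing returns the live list, or None once the owner drops it.
assert attrs["aux_streams"]() is aux_streams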