
Commit be99e97

waive more streamingLLM tests, assert if user enables streamingLLM
Signed-off-by: Haohang Huang <[email protected]>
1 parent 38f8db0 commit be99e97

5 files changed, 12 insertions(+), 3 deletions(-)


cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h

Lines changed: 2 additions & 1 deletion
@@ -1363,7 +1363,8 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
 #ifndef MMHA_USE_FP32_ACCUM_FOR_LOGITS
     if (sizeof(Tk) != 4)
     {
-        auto const max_timesteps = min(timestep, min(static_cast<unsigned>(cyclic_kv_cache_len), chunked_attention_size));
+        auto const max_timesteps
+            = min(timestep, min(static_cast<unsigned>(cyclic_kv_cache_len), chunked_attention_size));
         logits_smem_ += divUp(max_timesteps + 1, 4u) * 16;
     }
     Tk* logits_smem = reinterpret_cast<Tk*>(logits_smem_);

tensorrt_llm/commands/build.py

Lines changed: 2 additions & 2 deletions
@@ -349,8 +349,8 @@ def build_model(
     model_config.logits_dtype = logits_dtype

     architecture = model_config.architecture
-    assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \
-        "StreamingLLM is only supported in the llama model."
+    assert not build_config.plugin_config.streamingllm, \
+        "StreamingLLM is no longer supported because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes."
     assert not build_config.plugin_config.pp_reduce_scatter or architecture == "MixtralForCausalLM", \
         "PP reduce scatter is only supported in the mixtral model."

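For illustration, here is a minimal sketch of what the tightened check means for a user who enables StreamingLLM before building. The import paths and config attributes below are assumptions about the public tensorrt_llm Python API, not part of this diff; previously the assert only fired for non-Llama architectures, now it fires unconditionally.

from tensorrt_llm.builder import BuildConfig   # assumed import path
from tensorrt_llm.plugin import PluginConfig   # assumed import path

plugin_config = PluginConfig()
plugin_config.streamingllm = True              # user opts in to StreamingLLM

build_config = BuildConfig(plugin_config=plugin_config)

# Passing this build_config through the trtllm-build flow (build_model above) now raises:
#   AssertionError: StreamingLLM is no longer supported because attention sink cannot
#   work with the non-cyclic kv cache kernel & runtime changes.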
tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 0 deletions
@@ -227,6 +227,7 @@ examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-floa
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
 examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-build] SKIP (https://nvbugs/5247243)
 examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer] SKIP (https://nvbugs/5247243)
+examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] SKIP (https://nvbugs/5435714)
 test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075)
 examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488)
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043)

tests/unittest/llmapi/test_llm.py

Lines changed: 1 addition & 0 deletions
@@ -708,6 +708,7 @@ def test_generate_with_beam_search(llm_for_sampling_params: LLM):
     check_output(outputs, references)


+@pytest.mark.skip(reason="https://nvbugs/5435714")
 @force_ampere
 @pytest.mark.part0
 def test_generate_with_streaming_llm():

tests/unittest/trt/attention/test_gpt_attention_IFB.py

Lines changed: 6 additions & 0 deletions
@@ -206,6 +206,12 @@ def test_gpt_attention_IFB(self,
             pytest.skip("Beam search is not supported in this test yet")

         tokens_per_block = 128
+        streamingllm = sink_token_len > 0
+
+        if streamingllm:
+            pytest.skip(
+                "Waived for now because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes."
+            )

         remove_input_padding = True

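Both test changes rely on standard pytest skip mechanisms; a self-contained sketch is below. The parametrize values are illustrative, not taken from the real test, which derives the StreamingLLM case from its sink_token_len parameter (greater than zero means attention-sink tokens are requested).

import pytest

# Decorator-style skip, as added to test_generate_with_streaming_llm in test_llm.py:
@pytest.mark.skip(reason="https://nvbugs/5435714")
def test_streaming_llm_sketch():
    pass

# Runtime skip, as added to test_gpt_attention_IFB: detect the StreamingLLM
# configuration and bail out before any engine work is done.
@pytest.mark.parametrize("sink_token_len", [0, 4])  # illustrative values
def test_attention_sketch(sink_token_len):
    streamingllm = sink_token_len > 0
    if streamingllm:
        pytest.skip("Waived: attention sink cannot work with the non-cyclic "
                    "kv cache kernel & runtime changes.")
    # the real test would build and run the attention kernel here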