
Commit be99e97

waive more streamingLLM tests, assert if user enables streamingLLM
Signed-off-by: Haohang Huang <[email protected]>
1 parent 38f8db0 commit be99e97

5 files changed, 12 insertions(+), 3 deletions(-)


cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h

Lines changed: 2 additions & 1 deletion
@@ -1363,7 +1363,8 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
 #ifndef MMHA_USE_FP32_ACCUM_FOR_LOGITS
     if (sizeof(Tk) != 4)
     {
-        auto const max_timesteps = min(timestep, min(static_cast<unsigned>(cyclic_kv_cache_len), chunked_attention_size));
+        auto const max_timesteps
+            = min(timestep, min(static_cast<unsigned>(cyclic_kv_cache_len), chunked_attention_size));
         logits_smem_ += divUp(max_timesteps + 1, 4u) * 16;
     }
     Tk* logits_smem = reinterpret_cast<Tk*>(logits_smem_);

tensorrt_llm/commands/build.py

Lines changed: 2 additions & 2 deletions
@@ -349,8 +349,8 @@ def build_model(
     model_config.logits_dtype = logits_dtype

     architecture = model_config.architecture
-    assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \
-        "StreamingLLM is only supported in the llama model."
+    assert not build_config.plugin_config.streamingllm, \
+        "StreamingLLM is no longer supported because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes."
     assert not build_config.plugin_config.pp_reduce_scatter or architecture == "MixtralForCausalLM", \
         "PP reduce scatter is only supported in the mixtral model."

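For illustration, here is a minimal sketch of what the tightened check means for a user who enables StreamingLLM before building. The import paths and config attributes below are assumptions about the public tensorrt_llm Python API, not part of this diff; previously the assert only fired for non-Llama architectures, now it fires unconditionally.

from tensorrt_llm.builder import BuildConfig   # assumed import path
from tensorrt_llm.plugin import PluginConfig   # assumed import path

plugin_config = PluginConfig()
plugin_config.streamingllm = True              # user opts in to StreamingLLM

build_config = BuildConfig(plugin_config=plugin_config)

# Passing this build_config through the trtllm-build flow (build_model above) now raises:
#   AssertionError: StreamingLLM is no longer supported because attention sink cannot
#   work with the non-cyclic kv cache kernel & runtime changes.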
tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 0 deletions
@@ -227,6 +227,7 @@ examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-floa
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
 examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-build] SKIP (https://nvbugs/5247243)
 examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer] SKIP (https://nvbugs/5247243)
+examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] SKIP (https://nvbugs/5435714)
 test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075)
 examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488)
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043)

tests/unittest/llmapi/test_llm.py

Lines changed: 1 addition & 0 deletions
@@ -708,6 +708,7 @@ def test_generate_with_beam_search(llm_for_sampling_params: LLM):
     check_output(outputs, references)


+@pytest.mark.skip(reason="https://nvbugs/5435714")
 @force_ampere
 @pytest.mark.part0
 def test_generate_with_streaming_llm():

tests/unittest/trt/attention/test_gpt_attention_IFB.py

Lines changed: 6 additions & 0 deletions
@@ -206,6 +206,12 @@ def test_gpt_attention_IFB(self,
             pytest.skip("Beam search is not supported in this test yet")

         tokens_per_block = 128
+        streamingllm = sink_token_len > 0
+
+        if streamingllm:
+            pytest.skip(
+                "Waived for now because attention sink cannot work with the non-cyclic kv cache kernel & runtime changes."
+            )

         remove_input_padding = True

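Both test changes rely on standard pytest skip mechanisms; a self-contained sketch is below. The parametrize values are illustrative, not taken from the real test, which derives the StreamingLLM case from its sink_token_len parameter (greater than zero means attention-sink tokens are requested).

import pytest

# Decorator-style skip, as added to test_generate_with_streaming_llm in test_llm.py:
@pytest.mark.skip(reason="https://nvbugs/5435714")
def test_streaming_llm_sketch():
    pass

# Runtime skip, as added to test_gpt_attention_IFB: detect the StreamingLLM
# configuration and bail out before any engine work is done.
@pytest.mark.parametrize("sink_token_len", [0, 4])  # illustrative values
def test_attention_sketch(sink_token_len):
    streamingllm = sink_token_len > 0
    if streamingllm:
        pytest.skip("Waived: attention sink cannot work with the non-cyclic "
                    "kv cache kernel & runtime changes.")
    # the real test would build and run the attention kernel here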