From af0c7e24ba29c6371c8dcca6cce504529a75b682 Mon Sep 17 00:00:00 2001 From: haosdent Date: Tue, 12 May 2026 13:32:18 +0800 Subject: [PATCH] [CI/Build] Replace Test1 with a code prompt for starcoder2-3b bigcode/starcoder2-3b fails models/language/generation/test_common.py on the NVIDIA L4 GPU (SM 8.9). The failure is on Test1, an open-ended NL prompt fed to a code-completion model. After 8 matched tokens the model wanders into a Jupyter notebook markdown id and lands in a near-uniform digit-token logit region; HF and vLLM disagree, by ~1 logprob bit, on which digit lands in the top-5, so `output_id_0 in logprobs_elem_1` fails. Replace Test1 only for bigcode/starcoder2-3b with a code prompt that keeps the model on its training distribution and produces sharp top-1 logits at every position. Other models are unchanged. Verified locally on GB10: both parametrizations PASS in ~95s each. Signed-off-by: haosdent --- tests/models/language/generation/test_common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 6e5f1e328431..2a693603f023 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -143,6 +143,15 @@ def test_models( # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") + if model == "bigcode/starcoder2-3b": + # Replace example.txt's Test1 (an NL prompt) with a code prompt: + # starcoder2-3b is a code model, so NL prompts give near-uniform + # digit logits where HF<->vLLM bf16 drift can reorder top-K. + example_prompts = list(example_prompts) + example_prompts[1] = ( + "def add(a, b):\n return a + b\n\ndef sub(a, b):\n return a - " + ) + with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs