tests/quantization/test_fp8.py (41 additions, 0 deletions)
@@ -496,3 +496,44 @@ def check_layers(model):
            assert layer.self_attn.attn.kv_cache_dtype == expected

    llm.apply_model(check_layers)


@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.quant_model
def test_fp8_online_bias_model(vllm_runner, example_prompts):
    """Regression test: online FP8 on models with bias=True (e.g. Qwen2)."""
    from tests.models.utils import check_logprobs_close

    model = "Qwen/Qwen2-0.5B"
    max_model_len = 2048
    max_tokens = 32
    num_logprobs = 5

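    # Baseline: run the checkpoint unquantized (bf16) and collect reference logprobs.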
    with vllm_runner(
        model,
        max_model_len=max_model_len,
        enforce_eager=True,
    ) as vllm_model:
        baseline_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

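    # Same checkpoint with online FP8 quantization applied at load time.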
    with vllm_runner(
        model,
        max_model_len=max_model_len,
        enforce_eager=True,
        quantization="fp8",
    ) as vllm_model:
        test_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

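    # The FP8 run's logprobs should stay close to the bf16 baseline.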
    check_logprobs_close(
        outputs_0_lst=baseline_outputs,
        outputs_1_lst=test_outputs,
        name_0="bf16",
        name_1="fp8_online",
    )
vllm/model_executor/model_loader/reload/meta.py (3 additions, 0 deletions)
@@ -22,6 +22,9 @@
SKIP_MODULES: set[str] = {"HadamardTransform"}

SKIP_TENSORS: set[str] = {
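    # Bias tensors (linear "bias", fused-MoE "w13_bias"/"w2_bias"), added in this
    # change alongside the online FP8 bias=True regression test (test_fp8_online_bias_model).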
"bias",
"w13_bias",
"w2_bias",
"_expert_map",
"expert_mask",
"expert_global_to_physical",
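For context, a minimal sketch of the scenario the new regression test exercises: loading a bias=True checkpoint with online FP8 through vLLM's public LLM API. This is illustrative only, not part of the diff; the model name and settings mirror test_fp8_online_bias_model above, the prompt is made up, and an FP8-capable GPU is assumed.

from vllm import LLM, SamplingParams

# Qwen2-0.5B uses bias=True linear layers; quantization="fp8" requests
# online (on-the-fly) FP8 quantization of the bf16 checkpoint at load time.
llm = LLM(
    model="Qwen/Qwen2-0.5B",
    quantization="fp8",
    max_model_len=2048,
    enforce_eager=True,
)

# Greedy decoding, mirroring the deterministic settings used by the test.
params = SamplingParams(temperature=0.0, max_tokens=32)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)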