Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 38 additions & 34 deletions tests/fixtures/gpt_oss/integration_tests.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
"Roses are red, violets are red, red, red, red, red, red, red, red, red, red",
"How are you? Tell me the name of the president of the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
],
"quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
"Roses are red, violets are red, red, red, red, red, red, red, red, red, red",
"How are you? Tell me the name of the president of the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
],
"quantized=true|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Did not work"
Expand All @@ -14,12 +14,12 @@
"Did not work"
],
"quantized=true|model=120b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United\n\nI am an AI language model and do not have personal feelings or emotions. As for"
],
"quantized=true|model=120b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United\n\nI am an AI language model and do not have personal feelings or emotions. As for"
],
"quantized=true|model=120b|kernels=true|attn_impl=eager|mode=eval": [
"Did not work"
Expand All @@ -28,32 +28,36 @@
"Did not work"
],
"quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too.\nIt sounds like you're looking for",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
"Roses are red, violets, red, red, red, red, red, red, red, red, red, red",
"How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
],
"quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're looking for",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
"Roses are red, violets, red, red, red, red, red, red, red, red, red, red",
"How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
],
"quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Did not work"
"Roses are red, violets, or, or, or, or, or, or, or, or, or, or",
"How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
],
"quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Did not work"
"Roses are red, violets R, R, R, R, R, R, R, R, R, R,",
"How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
],
"quantized=true|model=20b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=true|model=20b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
Comment on lines 46 to 53
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

quantized seems to work fine

"quantized=true|model=20b|kernels=true|attn_impl=eager|mode=eval": [
"Did not work"
"Roses are red, violets are green, and the world is a beautiful place.\n\nIt sounds like you're sharing a poetic and",
"How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
],
"quantized=true|model=20b|kernels=true|attn_impl=eager|mode=train": [
"Did not work"
"Roses are red, violets are green, and the sky is blue.\n\nIt seems like you're sharing a playful and whimsical line",
"How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
Comment on lines 54 to +60
Copy link
Member Author

@IlyasMoutawwakil IlyasMoutawwakil Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kernels (megablocks) seem to be broken

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might also be a GPU diff

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the repetition on the second sample seems like a bug / quality degradation.

],
"quantized=false|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
Expand Down Expand Up @@ -88,35 +92,35 @@
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
],
"quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
"Roses are red, violets, vi, vi, vi, vi, vi, vi, vi, vi, vi, vi",
"How are you? Tell me the name of the president of the name of the president of the name of the president of the name of the president of the name"
],
"quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
"How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
"Roses are red, violets, vi, vi, vi, vi, vi, vi, vi, vi, vi, vi",
"How are you? Tell me the name of the president of the name of the president of the name of the president of the name of the president of the name"
],
Comment on lines 94 to 101
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kernels-community/vllm-flash-attn3 seems to be broken

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On which platform have you tested? Interestingly, I tested on A100 and I get your garbage output as well. Switching to H100 produces well-formed outputs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A100 as well! I guess it is a platform issue: the kernel is optimised for H100 but should work on A100 (cc @MekkCyber)

"quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
"Roses are red, violets, or, or, or, or, or, or, or, or, or, or",
"How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
],
"quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
"How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
"Roses are red, violets R, R, R, R, R, R, R, R, R, R,",
"How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
],
"quantized=false|model=20b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"Roses are red, violets are blue, I love you, and I love you too.\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=false|model=20b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
"How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
"Roses are red, violets are blue, I love you, and I love you too.\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
Comment on lines 110 to 117
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

non-quantized + no-kernels + eager attn seems to work fine

"quantized=false|model=20b|kernels=true|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
"Roses are red, violets are green, and the world is a beautiful place.\n\nIt sounds like you're sharing a poetic and",
"How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
],
"quantized=false|model=20b|kernels=true|attn_impl=eager|mode=train": [
"Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
"How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
"Roses are red, violets are green, and the sky is blue.\n\nIt seems like you're sharing a playful and whimsical line",
"How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
]
}
}
64 changes: 25 additions & 39 deletions tests/models/gpt_oss/test_modeling_gpt_oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@
if is_torch_available():
import torch

from transformers import (
GptOssModel,
)
from transformers import GptOssModel, Mxfp4Config

NUM_GPUS = torch.cuda.device_count()

Expand Down Expand Up @@ -131,7 +129,7 @@ def distributed_worker(quantized, model_size, kernels, attn_impl, mode):
"""This is the function that will be executed by torchrun workers."""
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
from transformers.testing_utils import torch_device

def generate_config_key(quantized, model, kernels, attn_impl, mode):
Expand All @@ -154,8 +152,9 @@ def generate_config_key(quantized, model, kernels, attn_impl, mode):
dtype="auto",
tp_plan="auto", # distributed inference
use_kernels=kernels,
attn_implementation=attn_impl,
quantization_config=Mxfp4Config(dequantize=not quantized),
).to(torch_device)
model.set_attn_implementation(attn_impl)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

# Inference
Expand Down Expand Up @@ -232,32 +231,6 @@ def setUp(self):
def tearDown(self):
cleanup(torch_device, gc_collect=True)

# ------------------------
# Non-distributed inference
# ------------------------
@staticmethod
def load_and_forward(model_id, attn_implementation, input_text, mode="eval", **pretrained_kwargs):
    """Load a causal LM, generate a short greedy continuation and return the decoded text.

    Args:
        model_id: Identifier passed to both ``AutoModelForCausalLM.from_pretrained``
            and ``AutoTokenizer.from_pretrained``.
        attn_implementation: Forwarded as ``attn_implementation`` when loading the model.
        input_text: Prompt(s) handed to the tokenizer (tokenized with ``padding=True``).
        mode: ``"train"`` puts the model in training mode; any other value puts it
            in evaluation mode.
        **pretrained_kwargs: Extra keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with special
        tokens skipped.
    """
    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation=attn_implementation,
        **pretrained_kwargs,
    )

    # Anything other than an explicit "train" request falls back to eval mode.
    if mode != "train":
        causal_lm.eval()
    else:
        causal_lm.train()

    # Left padding so every prompt in the batch ends where generation starts.
    tok = AutoTokenizer.from_pretrained(model_id, padding_side="left")
    encoded = tok(input_text, return_tensors="pt", padding=True).to(causal_lm.device)

    # do_sample=False with a fixed budget of 20 new tokens.
    generated_ids = causal_lm.generate(**encoded, max_new_tokens=20, do_sample=False)
    return tok.batch_decode(generated_ids, skip_special_tokens=True)

# ------------------------
# Distributed inference using inspect
# ------------------------
Expand Down Expand Up @@ -344,14 +317,26 @@ def run_distributed_test(quantized, model, kernels, attn_impl, mode):
@parameterized.expand(PARAMETERS)
def test_model_outputs(self, quantized, model, kernels, attn_impl, mode):
model_id = f"openai/gpt-oss-{model}"
output_texts = self.load_and_forward(
model_obj = AutoModelForCausalLM.from_pretrained(
model_id,
attn_impl,
self.input_text,
mode=mode,
dtype="auto",
device_map="auto",
use_kernels=kernels,
attn_implementation=attn_impl,
quantization_config=Mxfp4Config(dequantize=not quantized),
)

# Set the correct mode
if mode == "train":
model_obj.train()
else:
model_obj.eval()

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(model_obj.device)
output_ids = model_obj.generate(**inputs, max_new_tokens=20, do_sample=False)
output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Generate key to look up expected outputs
key = self.generate_config_key(quantized, model, kernels, attn_impl, mode)

Expand Down Expand Up @@ -422,10 +407,11 @@ def test_training_step(self, quantized, model, kernels, attn_impl, mode):

model_obj = AutoModelForCausalLM.from_pretrained(
model_id,
dtype=torch.bfloat16,
dtype="auto",
device_map="auto",
attn_implementation=attn_impl,
use_kernels=kernels,
attn_implementation=attn_impl,
quantization_config=Mxfp4Config(dequantize=True),
)
model_obj.train()

Expand Down Expand Up @@ -484,7 +470,7 @@ def test_model_matches_original_20b(self):

model = AutoModelForCausalLM.from_pretrained(
model_id,
dtype=torch.bfloat16,
dtype="auto",
device_map="auto",
attn_implementation="eager",
)
Expand Down Expand Up @@ -550,7 +536,7 @@ def test_model_matches_original_120b(self):

model = AutoModelForCausalLM.from_pretrained(
model_id,
dtype=torch.bfloat16,
dtype="auto",
device_map="auto",
attn_implementation="eager",
)
Expand Down