huggingface · IlyasMoutawwakil · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 16, 2026
diff --git a/tests/fixtures/gpt_oss/integration_tests.json b/tests/fixtures/gpt_oss/integration_tests.json
@@ -1,11 +1,11 @@
 {
   "quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
-    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
-    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+    "Roses are red, violets are red, red, red, red, red, red, red, red, red, red",
+    "How are you? Tell me the name of the president of the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
   ],
   "quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
-    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
-    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+    "Roses are red, violets are red, red, red, red, red, red, red, red, red, red",
+    "How are you? Tell me the name of the president of the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
   ],
   "quantized=true|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
     "Did not work"
@@ -14,12 +14,12 @@
     "Did not work"
   ],
   "quantized=true|model=120b|kernels=false|attn_impl=eager|mode=eval": [
-    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
-    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United\n\nI am an AI language model and do not have personal feelings or emotions. As for"
   ],
   "quantized=true|model=120b|kernels=false|attn_impl=eager|mode=train": [
-    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
-    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United\n\nI am an AI language model and do not have personal feelings or emotions. As for"
   ],
   "quantized=true|model=120b|kernels=true|attn_impl=eager|mode=eval": [
     "Did not work"
@@ -28,32 +28,36 @@
     "Did not work"
   ],
   "quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
-    "Roses are red, violets are blue, I love you, and I love you too.\nIt sounds like you're looking for",
-    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+    "Roses are red, violets, red, red, red, red, red, red, red, red, red, red",
+    "How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
   ],
   "quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
-    "Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're looking for",
-    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+    "Roses are red, violets, red, red, red, red, red, red, red, red, red, red",
+    "How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
   ],
   "quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
-    "Did not work"
+    "Roses are red, violets, or, or, or, or, or, or, or, or, or, or",
+    "How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
   ],
   "quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
-    "Did not work"
+    "Roses are red, violets R, R, R, R, R, R, R, R, R, R,",
+    "How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
   ],
   "quantized=true|model=20b|kernels=false|attn_impl=eager|mode=eval": [
-    "Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
     "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
   ],
   "quantized=true|model=20b|kernels=false|attn_impl=eager|mode=train": [
-    "Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
     "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
   ],
   "quantized=true|model=20b|kernels=true|attn_impl=eager|mode=eval": [
-    "Did not work"
+    "Roses are red, violets are green, and the world is a beautiful place.\n\nIt sounds like you're sharing a poetic and",
+    "How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
   ],
   "quantized=true|model=20b|kernels=true|attn_impl=eager|mode=train": [
-    "Did not work"
+    "Roses are red, violets are green, and the sky is blue.\n\nIt seems like you're sharing a playful and whimsical line",
+    "How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
   ],
   "quantized=false|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
     "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
@@ -88,35 +92,35 @@
     "How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
   ],
   "quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
-    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
-    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+    "Roses are red, violets, vi, vi, vi, vi, vi, vi, vi, vi, vi, vi",
+    "How are you? Tell me the name of the president of the name of the president of the name of the president of the name of the president of the name"
   ],
   "quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
-    "Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
-    "How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
+    "Roses are red, violets, vi, vi, vi, vi, vi, vi, vi, vi, vi, vi",
+    "How are you? Tell me the name of the president of the name of the president of the name of the president of the name of the president of the name"
   ],
   "quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
-    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
-    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+    "Roses are red, violets, or, or, or, or, or, or, or, or, or, or",
+    "How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
   ],
   "quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
-    "Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
-    "How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
+    "Roses are red, violets R, R, R, R, R, R, R, R, R, R,",
+    "How are you? Tell me the name of the president of the president of the president of the president of the president of the president of the president of the president"
   ],
   "quantized=false|model=20b|kernels=false|attn_impl=eager|mode=eval": [
-    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "Roses are red, violets are blue, I love you, and I love you too.\n\nRoses are red, vio",
     "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
   ],
   "quantized=false|model=20b|kernels=false|attn_impl=eager|mode=train": [
-    "Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
-    "How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
+    "Roses are red, violets are blue, I love you, and I love you too.\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
   ],
   "quantized=false|model=20b|kernels=true|attn_impl=eager|mode=eval": [
-    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
-    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+    "Roses are red, violets are green, and the world is a beautiful place.\n\nIt sounds like you're sharing a poetic and",
+    "How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
   ],
   "quantized=false|model=20b|kernels=true|attn_impl=eager|mode=train": [
-    "Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
-    "How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
+    "Roses are red, violets are green, and the sky is blue.\n\nIt seems like you're sharing a playful and whimsical line",
+    "How are you? Tell me the name of the president of the company. The president is the CEO. The president is the CEO. The president is the CEO"
   ]
-}
+}
diff --git a/tests/models/gpt_oss/test_modeling_gpt_oss.py b/tests/models/gpt_oss/test_modeling_gpt_oss.py
@@ -47,9 +47,7 @@
 if is_torch_available():
     import torch
 
-    from transformers import (
-        GptOssModel,
-    )
+    from transformers import GptOssModel, Mxfp4Config
 
     NUM_GPUS = torch.cuda.device_count()
 
@@ -131,7 +129,7 @@ def distributed_worker(quantized, model_size, kernels, attn_impl, mode):
     """This is the function that will be executed by torchrun workers."""
     import os
 
-    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
     from transformers.testing_utils import torch_device
 
     def generate_config_key(quantized, model, kernels, attn_impl, mode):
@@ -154,8 +152,9 @@ def generate_config_key(quantized, model, kernels, attn_impl, mode):
         dtype="auto",
         tp_plan="auto",  # distributed inference
         use_kernels=kernels,
+        attn_implementation=attn_impl,
+        quantization_config=Mxfp4Config(dequantize=not quantized),
     ).to(torch_device)
-    model.set_attn_implementation(attn_impl)
     tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
 
     # Inference
@@ -232,32 +231,6 @@ def setUp(self):
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
-    # ------------------------
-    # Non-distributed inference
-    # ------------------------
-    @staticmethod
-    def load_and_forward(model_id, attn_implementation, input_text, mode="eval", **pretrained_kwargs):
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            dtype=torch.bfloat16,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            **pretrained_kwargs,
-        )
-
-        # Set the correct mode
-        if mode == "train":
-            model.train()
-        else:
-            model.eval()
-
-        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
-
-        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(model.device)
-        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
-        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-        return output_text
-
     # ------------------------
     # Distributed inference using inspect
     # ------------------------
@@ -344,14 +317,26 @@ def run_distributed_test(quantized, model, kernels, attn_impl, mode):
     @parameterized.expand(PARAMETERS)
     def test_model_outputs(self, quantized, model, kernels, attn_impl, mode):
         model_id = f"openai/gpt-oss-{model}"
-        output_texts = self.load_and_forward(
+        model_obj = AutoModelForCausalLM.from_pretrained(
             model_id,
-            attn_impl,
-            self.input_text,
-            mode=mode,
+            dtype="auto",
+            device_map="auto",
             use_kernels=kernels,
+            attn_implementation=attn_impl,
+            quantization_config=Mxfp4Config(dequantize=not quantized),
         )
 
+        # Switch the model to train or eval mode, as selected by the `mode` test parameter
+        if mode == "train":
+            model_obj.train()
+        else:
+            model_obj.eval()
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
+        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(model_obj.device)
+        output_ids = model_obj.generate(**inputs, max_new_tokens=20, do_sample=False)
+        output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+
         # Generate key to look up expected outputs
         key = self.generate_config_key(quantized, model, kernels, attn_impl, mode)
 
@@ -422,10 +407,11 @@ def test_training_step(self, quantized, model, kernels, attn_impl, mode):
 
         model_obj = AutoModelForCausalLM.from_pretrained(
             model_id,
-            dtype=torch.bfloat16,
+            dtype="auto",
             device_map="auto",
-            attn_implementation=attn_impl,
             use_kernels=kernels,
+            attn_implementation=attn_impl,
+            quantization_config=Mxfp4Config(dequantize=True),
         )
         model_obj.train()
 
@@ -484,7 +470,7 @@ def test_model_matches_original_20b(self):
 
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
-            dtype=torch.bfloat16,
+            dtype="auto",
             device_map="auto",
             attn_implementation="eager",
         )
@@ -550,7 +536,7 @@ def test_model_matches_original_120b(self):
 
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
-            dtype=torch.bfloat16,
+            dtype="auto",
             device_map="auto",
             attn_implementation="eager",
         )