@@ -2822,8 +2822,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
28222822 extra_evaluator_kwargs = {
28232823 "fewshot_as_multiturn" : True ,
28242824 "apply_chat_template" : True ,
2825- "scores_filter" : "exact_match,flexible-extract" ,
2826- "MAX_OUTPUT_LEN" : 8192
28272825 }
28282826
28292827 MODEL_PATH = f"{ llm_models_root ()} /gpt_oss/gpt-oss-120b"
@@ -2837,7 +2835,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
28372835 (True , True ),
28382836 ])
28392837 def test_w4_1gpu (self , moe_backend , cuda_graph , overlap_scheduler , mocker ):
2840- pytest .skip ("https://nvbugs/5481087" )
2838+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2839+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2840+ {"scores_filter" : "exact_match,flexible-extract" })
28412841 if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE :
28422842 pytest .skip ("Triton kernels are not available" )
28432843
@@ -2855,7 +2855,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
28552855
28562856 with llm :
28572857 model_name = "GPT-OSS/MXFP4"
2858- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
28592858 task = GSM8K (model_name )
28602859 task .evaluate (llm ,
28612860 extra_evaluator_kwargs = self .extra_evaluator_kwargs )
@@ -2875,7 +2874,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
28752874 ids = ["tp4" , "ep4" , "dp4" ])
28762875 def test_w4_4gpus (self , moe_backend , tp_size , pp_size , ep_size ,
28772876 attention_dp , cuda_graph , overlap_scheduler , mocker ):
2878- pytest .skip ("https://nvbugs/5481087" )
2877+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2878+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2879+ {"scores_filter" : "exact_match,flexible-extract" })
28792880 if moe_backend == "TRITON" :
28802881 if not IS_TRITON_KERNELS_AVAILABLE :
28812882 pytest .skip ("Triton kernels are not available" )
@@ -2896,7 +2897,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
28962897 with llm :
28972898 model_name = "GPT-OSS/MXFP4"
28982899 task = GSM8K (model_name )
2899- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
29002900 task .evaluate (llm ,
29012901 extra_evaluator_kwargs = self .extra_evaluator_kwargs )
29022902
@@ -2908,6 +2908,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
29082908 ids = ["dp4" ])
29092909 def test_w4a16 (self , tp_size , pp_size , ep_size , attention_dp , cuda_graph ,
29102910 overlap_scheduler , monkeypatch , mocker ):
2911+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2912+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2913+ {"scores_filter" : "exact_match,flexible-extract" })
29112914 if not IS_TRITON_KERNELS_AVAILABLE :
29122915 pytest .skip ("Triton kernels are not available" )
29132916 monkeypatch .setenv ("OVERRIDE_QUANT_ALGO" , "W4A16_MXFP4" )
@@ -2927,7 +2930,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
29272930 with llm :
29282931 model_name = "GPT-OSS/BF16"
29292932 task = GSM8K (model_name )
2930- mocker .patch .object (GSM8K , {"MAX_OUTPUT_LEN" : 8192 })
29312933 task .evaluate (llm ,
29322934 extra_evaluator_kwargs = self .extra_evaluator_kwargs )
29332935