@@ -2131,42 +2131,7 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

-    def test_nvfp4_multi_gpus_corner_case(self):
-        """
-        This test covers a corner case of the NVFP4 model.
-        When max_seq_len and max_num_tokens are set to the same value, there are
-        not enough free KV blocks for the dummy requests during CUDA graph warmup
-        when creating the py_executor before estimating the KV cache. CUDA graph
-        capture is then triggered while estimating the KV cache, which may cause
-        errors. More info in https://nvbugs/5485325.
-        """
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80,
-                                        dtype="fp8",
-                                        enable_block_reuse=False)
-        pytorch_config = dict(disable_overlap_scheduler=False,
-                              cuda_graph_config=CudaGraphConfig(
-                                  enable_padding=True, max_batch_size=1024),
-                              moe_config=MoeConfig(backend="TRTLLM"))
-
-        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=1)
-        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
-                 tensor_parallel_size=8,
-                 pipeline_parallel_size=1,
-                 moe_expert_parallel_size=8,
-                 kv_cache_config=kv_cache_config,
-                 **pytorch_config,
-                 enable_attention_dp=False,
-                 speculative_config=mtp_config,
-                 max_seq_len=5120,
-                 max_num_tokens=5120) as llm:
-
-            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
+    @skip_pre_blackwell
     def test_nvfp4_multi_gpus_corner_case(self):
         """
         This test covers a corner case of the NVFP4 model.
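For reference, here is a standalone sketch of the configuration this corner-case test exercises, reusing the names from the hunk above (LLM, KvCacheConfig, CudaGraphConfig, MoeConfig, MTPDecodingConfig). The tensorrt_llm.llmapi import paths and the checkpoint path below are assumptions and may differ between TensorRT-LLM releases; setting max_seq_len equal to max_num_tokens is what starves the CUDA graph warmup of free KV blocks.

# Sketch only: import paths assumed from the TensorRT-LLM LLM API and may
# vary across releases.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig, MoeConfig,
                                 MTPDecodingConfig)

# FP8 KV cache without block reuse, matching the test above.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80,
                                dtype="fp8",
                                enable_block_reuse=False)

llm = LLM(
    "/models/DeepSeek-R1/DeepSeek-R1-FP4",  # hypothetical checkpoint path
    tensor_parallel_size=8,
    moe_expert_parallel_size=8,
    kv_cache_config=kv_cache_config,
    cuda_graph_config=CudaGraphConfig(enable_padding=True,
                                      max_batch_size=1024),
    moe_config=MoeConfig(backend="TRTLLM"),
    speculative_config=MTPDecodingConfig(num_nextn_predict_layers=1),
    # The corner case: max_seq_len == max_num_tokens leaves no spare KV
    # blocks for the dummy warmup requests, so CUDA graph capture is
    # deferred to the KV-cache estimation phase (https://nvbugs/5485325).
    max_seq_len=5120,
    max_num_tokens=5120,
)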