diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 2c50635d6ce..c27c2d2a0d1 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2131,42 +2131,7 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    def test_nvfp4_multi_gpus_corner_case(self):
-        """
-        This test is used to test the corner case of the NVFP4 model.
-        When using the same value for max_seq_len and max_num_tokens, there will be no
-        enough kv block for the dummy requests in CUDA graph warmup when creating
-        the py_executor before estimating kv cache. Then CUDA graph capture will be
-        triggered when estimating kv cache. This may cause some errors.
-        More info in https://nvbugs/5485325.
-        """
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80,
-                                        dtype="fp8",
-                                        enable_block_reuse=False)
-        pytorch_config = dict(disable_overlap_scheduler=False,
-                              cuda_graph_config=CudaGraphConfig(
-                                  enable_padding=True, max_batch_size=1024),
-                              moe_config=MoeConfig(backend="TRTLLM"))
-
-        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=1)
-        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
-                 tensor_parallel_size=8,
-                 pipeline_parallel_size=1,
-                 moe_expert_parallel_size=8,
-                 kv_cache_config=kv_cache_config,
-                 **pytorch_config,
-                 enable_attention_dp=False,
-                 speculative_config=mtp_config,
-                 max_seq_len=5120,
-                 max_num_tokens=5120) as llm:
-
-            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
+    @skip_pre_blackwell
     def test_nvfp4_multi_gpus_corner_case(self):
         """
         This test is used to test the corner case of the NVFP4 model.
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index d2d45b0c856..59606dd1f5f 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1637,6 +1637,7 @@ def test_openai_perf_metrics(llm_root, llm_venv):
         str(test_root / "_test_openai_perf_metrics.py")])
 
 
+@skip_pre_hopper
 def test_openai_chat_harmony(llm_root, llm_venv):
     test_root = unittest_path() / "llmapi" / "apps"
     llm_venv.run_cmd(
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 042544d9b33..67303b6d7f6 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -361,3 +361,4 @@ cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
 examples/test_nemotron_nas.py::test_nemotron_nano_8b_lora_torch[Llama-3.1-Nemotron-Nano-8B-v1] SKIP (https://nvbugs/5563469)
 unittest/bindings/test_executor_bindings.py::test_request_perf_metrics_draft SKIP (https://nvbugs/5565590)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5568676)
+test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] SKIP (https://nvbugs/5547437)