Merged
37 changes: 1 addition & 36 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2131,42 +2131,7 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

-    def test_nvfp4_multi_gpus_corner_case(self):
-        """
-        This test exercises a corner case of the NVFP4 model.
-        When max_seq_len and max_num_tokens are set to the same value, there
-        are not enough KV blocks for the dummy requests during CUDA graph
-        warmup when the py_executor is created, before the KV cache size is
-        estimated. CUDA graph capture is then triggered during KV cache
-        estimation, which may cause errors. More info: https://nvbugs/5485325.
-        """
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80,
-                                        dtype="fp8",
-                                        enable_block_reuse=False)
-        pytorch_config = dict(disable_overlap_scheduler=False,
-                              cuda_graph_config=CudaGraphConfig(
-                                  enable_padding=True, max_batch_size=1024),
-                              moe_config=MoeConfig(backend="TRTLLM"))
-
-        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=1)
-        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
-                 tensor_parallel_size=8,
-                 pipeline_parallel_size=1,
-                 moe_expert_parallel_size=8,
-                 kv_cache_config=kv_cache_config,
-                 **pytorch_config,
-                 enable_attention_dp=False,
-                 speculative_config=mtp_config,
-                 max_seq_len=5120,
-                 max_num_tokens=5120) as llm:
-
-            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
+    @skip_pre_blackwell
     def test_nvfp4_multi_gpus_corner_case(self):
         """
         This test exercises a corner case of the NVFP4 model.
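The @skip_pre_blackwell marker added above, like the @skip_pre_hopper marker in the test_e2e.py hunk below, gates a test on GPU compute capability. A minimal sketch of how such pytest markers are typically defined, assuming a torch-based capability probe (an assumption; the repository's actual helpers may differ):

import pytest
import torch

def _sm_version() -> int:
    # Map the (major, minor) CUDA compute capability to an SM number,
    # e.g. (9, 0) -> 90 (Hopper), (10, 0) -> 100 (Blackwell).
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor

# Hedged sketch, not the repo's actual definition. skipif evaluates its
# condition at collection time; checking torch.cuda.is_available() first
# short-circuits the capability query on CPU-only runners.
skip_pre_hopper = pytest.mark.skipif(
    not torch.cuda.is_available() or _sm_version() < 90,
    reason="requires compute capability 9.0 (Hopper) or newer")

skip_pre_blackwell = pytest.mark.skipif(
    not torch.cuda.is_available() or _sm_version() < 100,
    reason="requires compute capability 10.0 (Blackwell) or newer")

With markers like these, gated tests are still collected but are reported as skipped on older GPUs instead of failing.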
1 change: 1 addition & 0 deletions tests/integration/defs/test_e2e.py
@@ -1637,6 +1637,7 @@ def test_openai_perf_metrics(llm_root, llm_venv):
         str(test_root / "_test_openai_perf_metrics.py")])


+@skip_pre_hopper
 def test_openai_chat_harmony(llm_root, llm_venv):
     test_root = unittest_path() / "llmapi" / "apps"
     llm_venv.run_cmd(
1 change: 1 addition & 0 deletions tests/integration/test_lists/waives.txt
@@ -361,3 +361,4 @@ cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
 examples/test_nemotron_nas.py::test_nemotron_nano_8b_lora_torch[Llama-3.1-Nemotron-Nano-8B-v1] SKIP (https://nvbugs/5563469)
 unittest/bindings/test_executor_bindings.py::test_request_perf_metrics_draft SKIP (https://nvbugs/5565590)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5568676)
+test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] SKIP (https://nvbugs/5547437)