diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 56feccbd664..01897db969d 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -123,7 +123,7 @@ steps: timeout_in_minutes: 20 depends_on: image-build commands: - - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py + - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e08562ea6b1..773748c1737 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -56,7 +56,7 @@ steps: - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py + - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model - label: "Diffusion Tensor Parallelism Test" timeout_in_minutes: 20 diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py index 866c2c6a184..d66104aa135 100644 --- a/tests/e2e/offline_inference/test_sequence_parallel.py +++ b/tests/e2e/offline_inference/test_sequence_parallel.py @@ -145,7 +145,15 @@ def _run_inference( # SP configurations: (ulysses_degree, ring_degree, height, width, warmup, is_perf_test) # - warmup: whether to run warmup for this SP config # - is_perf_test: whether this is a performance test (show speedup metrics) -SP_CONFIGS = [ +SP_CONFIGS_L2 = [ + # Ulysses-2 - performance test + (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), + (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ring-2 - performance test + # Hybrid - correctness only + (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False), +] + +SP_CONFIGS_L3 = [ # Ulysses-2 - performance test (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ring-2 - 
performance test @@ -168,7 +176,7 @@ def _get_sp_mode(ulysses_degree: int, ring_degree: int) -> str: @pytest.mark.core_model @pytest.mark.diffusion @pytest.mark.parallel -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 2, "rocm": 2}) @pytest.mark.parametrize("model_name", MODELS) def test_sp_correctness(model_name: str): """Test that SP inference produces correct outputs and measure performance. @@ -191,7 +199,130 @@ def test_sp_correctness(model_name: str): print(f"Available GPUs: {device_count}") print("=" * 70) - for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS: + for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS_L2: + sp_size = ulysses_degree * ring_degree + sp_mode = _get_sp_mode(ulysses_degree, ring_degree) + + if device_count < sp_size: + print(f"\n[{sp_mode}] SKIPPED (requires {sp_size} GPUs)") + continue + + # Determine baseline warmup: only for default size (performance tests) + cache_key = (height, width) + baseline_warmup = height == DEFAULT_HEIGHT and width == DEFAULT_WIDTH + + # Get or compute baseline for this (height, width) + if cache_key not in baseline_cache: + print(f"\n--- Running baseline {height}x{width} (warmup={baseline_warmup}) ---") + baseline = _run_inference( + model_name, + torch.bfloat16, + "sdpa", + height=height, + width=width, + warmup=baseline_warmup, + ) + assert len(baseline.images) == 1 + baseline_cache[cache_key] = baseline + print(f"[baseline] {height}x{width}: {baseline.elapsed_ms:.0f}ms") + else: + baseline = baseline_cache[cache_key] + + # Run SP + print(f"\n--- Running {sp_mode} (warmup={sp_warmup}) ---") + sp_result = _run_inference( + model_name, + torch.bfloat16, + "sdpa", + ulysses_degree=ulysses_degree, + ring_degree=ring_degree, + height=height, + width=width, + warmup=sp_warmup, + ) + assert len(sp_result.images) == 1 + + # Compare 
outputs (correctness) + mean_diff, max_diff = _diff_metrics(baseline.images[0], sp_result.images[0]) + + # Build result entry + result = { + "mode": sp_mode, + "sp_size": sp_size, + "height": height, + "width": width, + "baseline_ms": baseline.elapsed_ms, + "sp_ms": sp_result.elapsed_ms, + "mean_diff": mean_diff, + "max_diff": max_diff, + "is_perf_test": is_perf_test, + } + results.append(result) + + # Output based on test type + if is_perf_test: + speedup = baseline.elapsed_ms / sp_result.elapsed_ms if sp_result.elapsed_ms > 0 else 0 + result["speedup"] = speedup + print( + f"[{sp_mode}] {sp_size} GPUs | " + f"baseline: {baseline.elapsed_ms:.0f}ms, sp: {sp_result.elapsed_ms:.0f}ms, " + f"speedup: {speedup:.2f}x" + ) + else: + print(f"[{sp_mode}] {sp_size} GPUs | sp: {sp_result.elapsed_ms:.0f}ms (correctness only)") + + print(f"[{sp_mode}] diff: mean={mean_diff:.6e}, max={max_diff:.6e}") + + # Assert correctness + assert mean_diff <= DIFF_MEAN_THRESHOLD and max_diff <= DIFF_MAX_THRESHOLD, ( + f"[{sp_mode}] SP output differs from baseline: mean={mean_diff:.6e}, max={max_diff:.6e}" + ) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + print(f"{'Mode':<15} {'GPUs':<6} {'Size':<10} {'Baseline':<12} {'SP':<12} {'Speedup':<10} {'Status'}") + print("-" * 70) + for r in results: + speedup_str = f"{r['speedup']:.2f}x" if r.get("speedup") else "N/A" + baseline_str = f"{r['baseline_ms']:.0f}ms" if r["is_perf_test"] else "N/A" + status = "PASS" if r["mean_diff"] <= DIFF_MEAN_THRESHOLD else "FAIL" + print( + f"{r['mode']:<15} {r['sp_size']:<6} {r['height']}x{r['width']:<5} " + f"{baseline_str:<12} {r['sp_ms']:.0f}ms{'':<7} {speedup_str:<10} {status}" + ) + print("=" * 70) + + +# TODO: After PR#1272 is merged, add markers +# @pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.parallel +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) +@pytest.mark.parametrize("model_name", MODELS) +def 
test_sp_correctness_advanced(model_name: str): + """Test that SP inference produces correct outputs and measure performance. + + Runs baseline once per unique (height, width), then tests all SP configs. + + Note: Run with `pytest -v -s` to see detailed output. + """ + device_count = current_omni_platform.get_device_count() + + # Cache baseline results by (height, width) + # Key: (height, width), Value: InferenceResult of the baseline run at that size + baseline_cache: dict[tuple[int, int], InferenceResult] = {} + + # Collect results for summary + results: list[dict] = [] + + print("\n" + "=" * 70) + print(f"Sequence Parallel Test - Model: {model_name}") + print(f"Available GPUs: {device_count}") + print("=" * 70) + + for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS_L3: sp_size = ulysses_degree * ring_degree sp_mode = _get_sp_mode(ulysses_degree, ring_degree)