-
Notifications
You must be signed in to change notification settings - Fork 1.2k
[Feature] Support DiT Layerwise (Blockwise) CPU Offloading #858
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 27 commits
fdbd01c
d79975c
2df31a0
62028f9
5331795
b6036f3
d42d51c
0a07f86
cfe3699
3792ea3
b401bef
8163f96
8ab72f4
6e844a2
75626f7
8b07c12
51d5810
82ee767
9f86fcc
308981d
3c4bbc6
e7afde9
2d8257e
ba6d840
99b0d39
0faac2d
faa4e2d
52a13df
b3ccb2a
5b0bffe
c1297e7
2025760
4e0e6ac
5155c5e
976bd42
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
hsliuustc0106 marked this conversation as resolved.
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
import sys
from pathlib import Path

import pytest
import torch
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory

# ruff: noqa: E402
# Extend sys.path *before* importing the in-repo packages below; previously the
# insertion ran after `tests.utils` / `vllm_omni` had already been imported, so
# it could not help resolve them.
# REPO_ROOT is two levels above the directory containing this file
# (assumed to be the repository root -- confirm against the file's location).
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from tests.utils import GPUMemoryMonitor
from vllm_omni import Omni
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform

# Models exercised by the parametrized tests in this module.
models = ["Wan-AI/Wan2.2-T2V-A14B-Diffusers"]
|
|
||
|
|
||
def run_inference(
    model_name: str,
    layerwise_offload: bool = False,
    num_gpu_layers: int = 1,
    num_inference_steps: int = 3,
) -> float:
    """Run one small text-to-video generation and return the monitored peak GPU memory.

    A ``GPUMemoryMonitor`` is started before the ``Omni`` engine is built and
    is read after ``generate`` returns, so the reported peak covers both model
    construction and generation.

    Args:
        model_name: Model identifier passed to ``Omni(model=...)``.
        layerwise_offload: Forwarded as ``enable_layerwise_offload``.
        num_gpu_layers: Forwarded as ``layerwise_num_gpu_layers``.
        num_inference_steps: Forwarded to the sampling params.

    Returns:
        The monitor's ``peak_used_mb`` value read right after generation.

    Raises:
        Any exception raised while building the engine or generating is
        propagated unchanged; the monitor is stopped first.
    """
    # For now, only support on GPU, so apply torch.cuda operations here
    # NPU / ROCm platforms are expected to be detected and skipped this test function
    torch.cuda.empty_cache()
    device_index = torch.cuda.current_device()
    monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
    monitor.start()

    try:
        m = Omni(
            model=model_name,
            enable_layerwise_offload=layerwise_offload,
            layerwise_num_gpu_layers=num_gpu_layers,
            boundary_ratio=0.875,
            flow_shift=5.0,
        )

        torch.cuda.reset_peak_memory_stats(device=device_index)

        # Refer to tests/e2e/offline_inference/test_t2v_model.py
        # Use minimal settings for testing
        height = 480
        width = 640
        num_frames = 5

        m.generate(
            "A cat sitting on a table",
            OmniDiffusionSamplingParams(
                height=height,
                width=width,
                generator=torch.Generator("cuda").manual_seed(42),
                guidance_scale=1.0,
                num_inference_steps=num_inference_steps,
                num_frames=num_frames,
            ),
        )

        # Read the peak before the monitor is stopped (the ``finally`` clause
        # runs after this expression has been evaluated).
        return monitor.peak_used_mb
    finally:
        # Stop the monitor even when engine construction or generation fails,
        # so a failing run does not leave the started monitor running.
        monitor.stop()
|
|
||
|
|
||
@pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
@pytest.mark.parametrize("model_name", models)
def test_layerwise_offload_diffusion_model(model_name: str):
    """Test that layerwise offloading reduces GPU memory usage.

    Runs the same short generation twice -- once without offloading as the
    baseline and once with layerwise offloading keeping one layer on GPU --
    and asserts that the offloaded run's peak GPU memory is lower than the
    baseline by at least the expected per-model margin.
    """
    # Expected minimum savings in MB, per model. Review feedback on this test:
    # with only one layer resident on GPU, a 2.5 GB margin is far too weak.
    # Reported measurement for Wan2.2-T2V-A14B: 28080 MB with offload vs
    # 78934 MB without, hence the 45000 MB expectation. Models not listed fall
    # back to the 2500 MB margin used by the CPU offload test.
    saved_memory_mb_by_model = {"Wan-AI/Wan2.2-T2V-A14B-Diffusers": 45000}
    expected_saved_mb = saved_memory_mb_by_model.get(model_name, 2500)

    try:
        # Run without layerwise offloading (baseline)
        no_offload_peak_memory = run_inference(model_name, layerwise_offload=False)
        cleanup_dist_env_and_memory()

        # Run with layerwise offloading (1 layer on GPU)
        layerwise_offload_peak_memory = run_inference(model_name, layerwise_offload=True, num_gpu_layers=1)
    except Exception as exc:
        # Surface the underlying error in the failure message instead of hiding it.
        pytest.fail(f"Inference failed: {exc!r}")
    finally:
        # Release distributed state and GPU memory on every path so a failed or
        # finished run does not skew the measurements of the next test.
        cleanup_dist_env_and_memory()

    print(f"Layerwise offload peak memory (1 GPU layer): {layerwise_offload_peak_memory} MB")
    print(f"No offload peak memory: {no_offload_peak_memory} MB")

    # Verify that layerwise offloading significantly reduces memory usage
    assert layerwise_offload_peak_memory + expected_saved_mb < no_offload_peak_memory, (
        f"Layerwise offload peak memory {layerwise_offload_peak_memory} MB "
        f"should be significantly less than no offload peak memory {no_offload_peak_memory} MB"
    )
|
|
||
|
|
||
@pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
@pytest.mark.parametrize("model_name", models)
def test_layerwise_offload_multiple_gpu_layers(model_name: str):
    """Test layerwise offloading with multiple GPU layers.

    Runs the same short generation with 1 and then 2 layers kept on GPU and
    asserts that the 2-layer run has the higher peak GPU memory. The comparison
    against a fully GPU-resident model is not made here.
    """
    # NOTE(review): the 1-layer run duplicates the offloaded run in
    # test_layerwise_offload_diffusion_model; reviewers suggested merging the
    # two tests to save test time.
    try:
        # Run with 1 GPU layer
        one_layer_peak = run_inference(model_name, layerwise_offload=True, num_gpu_layers=1)
        cleanup_dist_env_and_memory()

        # Run with 2 GPU layers
        two_layers_peak = run_inference(model_name, layerwise_offload=True, num_gpu_layers=2)
    except Exception as exc:
        # Surface the underlying error in the failure message instead of hiding it.
        pytest.fail(f"Inference failed: {exc!r}")
    finally:
        # Release distributed state and GPU memory on every path, including
        # after the last run and after a failure.
        cleanup_dist_env_and_memory()

    print(f"Layerwise offload peak memory (1 GPU layer): {one_layer_peak} MB")
    print(f"Layerwise offload peak memory (2 GPU layers): {two_layers_peak} MB")

    # Verify that 2 GPU layers uses more memory than 1 GPU layer.
    # Only the ordering is checked; no upper bound on the increase is asserted.
    assert one_layer_peak < two_layers_peak, (
        f"1 GPU layer peak {one_layer_peak} MB should be < 2 GPU layers peak {two_layers_peak} MB"
    )
Uh oh!
There was an error while loading. Please reload this page.