Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,6 @@ repos:
language: python
types: [python]
additional_dependencies: [regex]
# prevent use torch.cuda APIs
- id: check-torch-cuda-call
name: "Prevent new 'torch.cuda' APIs call"
entry: python tools/pre_commit/check_torch_cuda.py
language: python
types: [python]
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/pre_commit/validate_config.py
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_topk_topp.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.cuda.reset_peak_memory_stats()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
gc.collect()


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def clear_triton_cache():

# Clear CUDA memory cache
if torch.cuda.is_available():
torch.accelerator.empty_cache()
torch.cuda.empty_cache()

# Try to clear Triton's runtime cache
try:
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_reshape_and_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def run_cuda_benchmark(n_iters: int) -> float:

# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.accelerator.empty_cache()
torch.cuda.empty_cache()

return lat

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_reshape_and_cache_flash.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def run_cuda_benchmark(n_iters: int) -> float:

# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.accelerator.empty_cache()
torch.cuda.empty_cache()

return lat

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def main():
# Clean up the GPU memory for the next test
del engine
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/rlhf_colocate.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def get_size(p: torch.Tensor) -> int:
s.close()
del buffer
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()


# Ray manages four GPUs.
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/rlhf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
socket.close()
del buffer
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()

def report_device_id(self) -> str:
from vllm.platforms import current_platform
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_dynamic_shapes_compilation.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_dynamic_shapes_compilation(
# Clean up GPU memory
del model
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
torch.cuda.synchronize()
print("GPU memory cleared")

Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1542,7 +1542,7 @@ def clean_gpu_memory_between_tests():

# Clean up GPU memory after the test
if torch.cuda.is_available():
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
gc.collect()


Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_tensorizer_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

def _cleanup():
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()


@pytest.fixture(autouse=True)
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/mamba/test_causal_conv1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_causal_conv1d_varlen(
batch, with_padding, dim, seqlen, width, has_bias, silu_activation, itype
):
device = "cuda"
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/moe/test_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ def test_mixtral_moe(
requires_grad=False,
)
torch.cuda.synchronize()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()

# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref):
finally:
del model
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()


def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, model_ref):
Expand All @@ -200,7 +200,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, model_ref)
finally:
del model
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
Expand Down Expand Up @@ -283,7 +283,7 @@ def test_vllm_tensorized_model_has_same_outputs(
model_ref, vllm_runner, tmp_path, model_path
):
gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
config = TensorizerConfig(tensorizer_uri=str(model_path))
args = EngineArgs(model=model_ref)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_gc():
del llm

gc.collect()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()

# The memory allocated for model and KV cache should be released.
# The memory allocated for PyTorch and others should be less than 50MB.
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/e2e/test_async_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_no_sync_with_spec_decode(
assert len(outputs[0].outputs[0].text) > 0

del llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

sync_tracker.assert_no_sync()
4 changes: 2 additions & 2 deletions tests/v1/e2e/test_lora_with_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def test_batch_inference_correctness(
prompts, sampling_params, lora_request=lora_request
)
del ref_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

lora_spec_llm = LLM(
Expand Down Expand Up @@ -135,5 +135,5 @@ def test_batch_inference_correctness(
print(f"match ratio: {matches}/{len(ref_outputs)}")
assert matches > int(0.90 * len(ref_outputs))
del lora_spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
4 changes: 2 additions & 2 deletions tests/v1/e2e/test_mamba_prefix_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def _run_ref_mamba_state_worker():
torch.save(cpu_state_ref, "mamba_kv_cache_dict_ref.pth")
mamba_kv_cache_dict.clear()
del engine
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
except Exception:
traceback.print_exc()
Expand Down Expand Up @@ -805,5 +805,5 @@ def test_mamba_prefix_cache(monkeypatch: pytest.MonkeyPatch):
check_mamba_state_equal(mamba_state_ref, mamba_kv_cache_dict, keys_to_check)
mamba_kv_cache_dict.clear()
del engine
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
18 changes: 9 additions & 9 deletions tests/v1/e2e/test_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def test_ngram_and_suffix_correctness(
)
evaluate_llm_for_gsm8k(spec_llm)
del spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()


Expand Down Expand Up @@ -240,7 +240,7 @@ def test_suffix_decoding_acceptance(
assert last_accept_rate > 0.80

del spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()


Expand Down Expand Up @@ -307,14 +307,14 @@ def test_speculators_model_integration(
verifier_model = spec_llm.llm_engine.vllm_config.model_config.model

del spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

# Second run: Reference without speculative decoding
ref_llm = LLM(model=verifier_model, max_model_len=4096)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

# Compare outputs
Expand Down Expand Up @@ -410,7 +410,7 @@ def _run_eagle_correctness(
)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

spec_llm = LLM(
Expand Down Expand Up @@ -445,7 +445,7 @@ def _run_eagle_correctness(

assert matches > int(0.6 * len(ref_outputs))
del spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()


Expand Down Expand Up @@ -715,7 +715,7 @@ def test_mtp_correctness(
ref_llm, expected_accuracy_threshold=expected_accuracy_threshold
)
del ref_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

spec_llm = LLM(
Expand Down Expand Up @@ -747,7 +747,7 @@ def test_mtp_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(MTP_SIMILARITY_RATE * len(ref_outputs))
del spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()


Expand Down Expand Up @@ -952,7 +952,7 @@ def assert_draft_model_correctness(args: ArgsTest):
)

del spec_llm # CLEANUP
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

print(
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/entrypoints/llm/test_struct_output_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,7 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
# Free memory as soon as possible as failed assertions
# will short circuit and not free up memory
del llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

for index, output in enumerate(outputs):
Expand Down
6 changes: 3 additions & 3 deletions tests/v1/sample/test_logprobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
assert positive_values > 0
finally:
del llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()


Expand Down Expand Up @@ -1065,7 +1065,7 @@ def test_spec_decode_logprobs(
for logprobs in output.logprobs:
ref_logprobs.extend(logprobs.values())
del ref_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

# Run spec decode LLM.
Expand Down Expand Up @@ -1095,7 +1095,7 @@ def test_spec_decode_logprobs(
for logprobs in output.logprobs:
spec_logprobs.extend(logprobs.values())
del spec_llm
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

# Per-token logprobs are expected to be the same.
Expand Down
43 changes: 0 additions & 43 deletions tools/pre_commit/check_torch_cuda.py

This file was deleted.

4 changes: 1 addition & 3 deletions vllm/compilation/cuda_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
# therefore, we only run gc for the first graph,
# and disable gc for the rest of the graphs.
stack.enter_context(patch("gc.collect", lambda: None))
stack.enter_context(
patch("torch.accelerator.empty_cache", lambda: None)
)
stack.enter_context(patch("torch.cuda.empty_cache", lambda: None))

if self.graph_pool is not None:
set_graph_pool_id(self.graph_pool)
Expand Down
2 changes: 1 addition & 1 deletion vllm/distributed/elastic_ep/elastic_execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ def switch_and_prepare(self) -> None:

gc.collect()
torch.cuda.synchronize()
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
unlock_workspace()
self.worker.compile_or_warm_up_model()
lock_workspace()
Expand Down
Loading
Loading