diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index d121106334cb..5a71601d94da 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -38,20 +38,20 @@
 FP4_DTYPE = torch.uint8


-def empty_bf16(*args, **kwargs):
-    return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda")
+def empty_bf16(*args, device=current_platform.device_type, **kwargs):
+    return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device=device)


-def empty_fp32(*args, **kwargs):
-    return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda")
+def empty_fp32(*args, device=current_platform.device_type, **kwargs):
+    return torch.empty(*args, **kwargs, dtype=torch.float32, device=device)


-def empty_i32(*args, **kwargs):
-    return torch.empty(*args, **kwargs, dtype=torch.int32, device="cuda")
+def empty_i32(*args, device=current_platform.device_type, **kwargs):
+    return torch.empty(*args, **kwargs, dtype=torch.int32, device=device)


-def empty_i64(*args, **kwargs):
-    return torch.empty(*args, **kwargs, dtype=torch.int64, device="cuda")
+def empty_i64(*args, device=current_platform.device_type, **kwargs):
+    return torch.empty(*args, **kwargs, dtype=torch.int64, device=device)


 RMS_OP = torch.ops._C.rms_norm.default
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index 971bd2005a23..90c9afc2697f 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -32,7 +32,7 @@ def is_supported(

         try:
             import aiter  # noqa: F401 # deliberately attempt to import aiter
-        except Exception:
+        except (ImportError, ModuleNotFoundError):
             return (
                 False,
                 "AiterScaledMMLinearKernel requires `aiter` which is not "
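
For context, here is a minimal sketch of the behavior these two hunks introduce, assuming `current_platform` comes from `vllm.platforms` as it does elsewhere in vLLM; the names `demo_empty_bf16` and `probe_aiter` are hypothetical, used here for illustration only and not part of the patch:

```python
# Minimal sketch (not part of the patch) of what the two hunks change,
# assuming vllm.platforms.current_platform is importable; demo_empty_bf16
# and probe_aiter are hypothetical names for illustration only.
from typing import Optional

import torch

from vllm.platforms import current_platform


def demo_empty_bf16(*args, device=current_platform.device_type, **kwargs):
    # The default device is now the detected platform's device string rather
    # than a hard-coded "cuda"; callers may still pass device= explicitly.
    # Note the default is evaluated once, at function definition time.
    return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device=device)


def probe_aiter() -> tuple[bool, Optional[str]]:
    # Mirrors the narrowed exception handling: only failures to import are
    # treated as "aiter unavailable"; any other error raised while importing
    # aiter now propagates instead of being silently swallowed.
    try:
        import aiter  # noqa: F401
    except (ImportError, ModuleNotFoundError):
        return False, "`aiter` is not installed"
    return True, None
```

Since `ModuleNotFoundError` is a subclass of `ImportError`, catching `ImportError` alone would behave identically; listing both simply makes the intent explicit.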