diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index 6447a33838d7..c18546ce45d6 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -154,7 +154,7 @@ def run_tests(
 
     with monkeypatch.context() as m:
         # lock matmul precision to full FP32 (IEEE)
-        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
+        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
         # m.setenv("VLLM_BATCH_INVARIANT", "1")
         outputs: list[tuple[str, list, list]] = []
         for n, (
diff --git a/vllm/envs.py b/vllm/envs.py
index 1d4128d74b95..dadb8c8a231c 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -75,7 +75,7 @@
     VLLM_MEDIA_CONNECTOR: str = "http"
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.9"
-    VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
+    VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -459,13 +459,11 @@ def get_vllm_port() -> int | None:
     "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
     or "12.9",
     # Controls PyTorch float32 matmul precision mode within vLLM workers.
-    # Accepted values:
-    # - "ieee" (default): force full IEEE FP32 matmul precision.
-    # - "tf32": enable TensorFloat32-based fast matmul.
+    # Valid options mirror torch.set_float32_matmul_precision
     "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
         "VLLM_FLOAT32_MATMUL_PRECISION",
-        "ieee",
-        ["ieee", "tf32"],
+        "highest",
+        ["highest", "high", "medium"],
         case_sensitive=False,
     ),
     # Maximum number of compilation jobs to run in parallel.
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index c8441c09b2f9..fd4ee596c30e 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -84,7 +84,7 @@ def __init__(
 
         # configure float32 matmul precision according to vLLM env.
         precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
-        torch.backends.cuda.matmul.fp32_precision = precision
+        torch.set_float32_matmul_precision(precision)
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing