From 1602b7bcec6f1ba190aacb0a4ee47bd4b69fce5b Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 19 Jun 2025 18:51:55 +0000 Subject: [PATCH 1/6] add emulation support Signed-off-by: Dipika Sikka --- .../compressed_tensors/compressed_tensors.py | 4 +++- .../schemes/compressed_tensors_w4a4_nvfp4.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index e5702c871cc9..c1763326bfd2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os from contextlib import suppress from typing import Any, Literal, Optional, cast @@ -374,7 +375,8 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp4a4_nvfp4(weight_quant, input_quant): - if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(): + if CompressedTensorsW4A4Fp4.cutlass_fp4_supported( + ) or os.environ.get("USE_NVFP4_CT_EMULATIONS") == "1": return CompressedTensorsW4A4Fp4() else: logger.warning_once( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 32718972a627..6a688cc30c87 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import os from typing import Callable, Optional import torch @@ -9,6 +10,8 @@ from vllm.logger import init_logger from 
vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501 + run_nvfp4_emulations) from vllm.model_executor.parameter import (GroupQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) @@ -18,6 +21,8 @@ __all__ = ["CompressedTensorsW4A4Fp4"] +USE_NVFP4_CT_EMULATIONS = os.environ.get("USE_NVFP4_CT_EMULATIONS", '0') + class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): @@ -129,6 +134,14 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + if USE_NVFP4_CT_EMULATIONS == "1": + return run_nvfp4_emulations( + x=x, + input_global_scale=layer.input_global_scale, + weight=layer.weight, + weight_scale_swizzles=layer.weight_scale_swizzled, + weight_global_scale=layer.weight_global_scale) + output_dtype = x.dtype output_shape = [x.shape[0], layer.weight.shape[0]] From 3c537c0e390c0c8d1cc330773243e8691b61478f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 19 Jun 2025 19:08:48 +0000 Subject: [PATCH 2/6] fix Signed-off-by: Dipika Sikka --- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- .../schemes/compressed_tensors_w4a4_nvfp4.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c1763326bfd2..1c65b26b6d7a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -376,7 +376,7 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp4a4_nvfp4(weight_quant, input_quant): if CompressedTensorsW4A4Fp4.cutlass_fp4_supported( - ) or os.environ.get("USE_NVFP4_CT_EMULATIONS") == "1": + ) or 
os.environ.get("USE_NVFP4_CT_EMULATIONS", "0") == "1": return CompressedTensorsW4A4Fp4() else: logger.warning_once( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 6a688cc30c87..2938258a9130 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -31,6 +31,8 @@ def __init__(self): @classmethod def get_min_capability(cls) -> int: + if USE_NVFP4_CT_EMULATIONS == "1": + return 80 return 100 @classmethod @@ -135,11 +137,12 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_NVFP4_CT_EMULATIONS == "1": + print("running emulations") return run_nvfp4_emulations( x=x, input_global_scale=layer.input_global_scale, weight=layer.weight, - weight_scale_swizzles=layer.weight_scale_swizzled, + weight_scale_swizzled=layer.weight_scale_swizzled, weight_global_scale=layer.weight_global_scale) output_dtype = x.dtype From c968c62e4a70f380e6a3ea910b65099b8edc0237 Mon Sep 17 00:00:00 2001 From: Dipika Date: Thu, 19 Jun 2025 15:22:07 -0400 Subject: [PATCH 3/6] remove print Signed-off-by: Dipika --- .../compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 2938258a9130..38e8c4dd6756 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -137,7 +137,6 @@ def apply_weights(self, bias: 
Optional[torch.Tensor] = None) -> torch.Tensor: if USE_NVFP4_CT_EMULATIONS == "1": - print("running emulations") return run_nvfp4_emulations( x=x, input_global_scale=layer.input_global_scale, From 095b4c8fe565a6f6e1481aa3fdd15b907f69a47f Mon Sep 17 00:00:00 2001 From: Dipika Date: Thu, 19 Jun 2025 15:28:47 -0400 Subject: [PATCH 4/6] add bias Signed-off-by: Dipika --- .../schemes/compressed_tensors_w4a4_nvfp4.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 38e8c4dd6756..108422f18e69 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -137,12 +137,15 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: if USE_NVFP4_CT_EMULATIONS == "1": - return run_nvfp4_emulations( + out = run_nvfp4_emulations( x=x, input_global_scale=layer.input_global_scale, weight=layer.weight, weight_scale_swizzled=layer.weight_scale_swizzled, weight_global_scale=layer.weight_global_scale) + if bias is not None: + out = out + bias + return out output_dtype = x.dtype output_shape = [x.shape[0], layer.weight.shape[0]] From 00bfb898622e848142155917ac502cac4ab03d90 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 20 Jun 2025 00:13:45 +0000 Subject: [PATCH 5/6] use envs Signed-off-by: Dipika Sikka --- vllm/envs.py | 7 +++++++ .../quantization/compressed_tensors/compressed_tensors.py | 4 ++-- .../schemes/compressed_tensors_w4a4_nvfp4.py | 8 +++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 01d8d8a2d2e0..c6a57bd82356 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -133,6 +133,7 @@ VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: 
int = 300 VLLM_KV_CACHE_LAYOUT: Optional[str] = None VLLM_COMPUTE_NANS_IN_LOGITS: bool = False + VLLM_USE_NVFP4_CT_EMULATIONS: bool = False def get_default_cache_root(): @@ -918,6 +919,12 @@ def get_vllm_port() -> Optional[int]: # or bad hardware but it may add compute overhead. "VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))), + + # Controls whether or not emulations are used for NVFP4 + # generations on machines < 100 for compressed-tensors + # models + "VLLM_USE_NVFP4_CT_EMULATIONS": + lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))) } # --8<-- [end:env-vars-definition] diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 1c65b26b6d7a..d21abb2741a2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os from contextlib import suppress from typing import Any, Literal, Optional, cast @@ -14,6 +13,7 @@ QuantizationType) from pydantic import BaseModel +import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, @@ -376,7 +376,7 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp4a4_nvfp4(weight_quant, input_quant): if CompressedTensorsW4A4Fp4.cutlass_fp4_supported( - ) or os.environ.get("USE_NVFP4_CT_EMULATIONS", "0") == "1": + ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS: return CompressedTensorsW4A4Fp4() else: logger.warning_once( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 108422f18e69..ec1d4a6c0efa 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import os from typing import Callable, Optional import torch from torch.nn.parameter import Parameter +import vllm.envs as envs from vllm._custom_ops import (cutlass_scaled_fp4_mm, cutlass_scaled_mm_supports_fp4, scaled_fp4_quant) from vllm.logger import init_logger @@ -21,8 +21,6 @@ __all__ = ["CompressedTensorsW4A4Fp4"] -USE_NVFP4_CT_EMULATIONS = os.environ.get("USE_NVFP4_CT_EMULATIONS", '0') - class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): @@ -31,7 +29,7 @@ def __init__(self): @classmethod def get_min_capability(cls) -> int: - if USE_NVFP4_CT_EMULATIONS == "1": + if envs.VLLM_USE_NVFP4_CT_EMULATIONS: return 80 return 100 @@ -136,7 +134,7 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if USE_NVFP4_CT_EMULATIONS == "1": + if envs.VLLM_USE_NVFP4_CT_EMULATIONS: out = run_nvfp4_emulations( x=x, input_global_scale=layer.input_global_scale, From d6018a74d866a6681fc025309345d63385378aac Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 24 Jun 2025 19:47:41 +0000 Subject: [PATCH 6/6] style Signed-off-by: Dipika Sikka --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index c6a57bd82356..cb4b54307033 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -919,7 +919,7 @@ def get_vllm_port() -> Optional[int]: # or bad hardware but it may add compute overhead. 
"VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))), - + # Controls whether or not emulations are used for NVFP4 # generations on machines < 100 for compressed-tensors # models