From 1602b7bcec6f1ba190aacb0a4ee47bd4b69fce5b Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Thu, 19 Jun 2025 18:51:55 +0000
Subject: [PATCH 1/6] add emulation support

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 .../compressed_tensors/compressed_tensors.py        |  4 +++-
 .../schemes/compressed_tensors_w4a4_nvfp4.py        | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index e5702c871cc9..c1763326bfd2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 from contextlib import suppress
 from typing import Any, Literal, Optional, cast
 
@@ -374,7 +375,8 @@ def _get_scheme_from_parts(
 
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
-                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
+                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
+                ) or os.environ.get("USE_NVFP4_CT_EMULATIONS") == "1":
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 32718972a627..6a688cc30c87 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+import os
 from typing import Callable, Optional
 
 import torch
@@ -9,6 +10,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+    run_nvfp4_emulations)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
@@ -18,6 +21,8 @@
 
 __all__ = ["CompressedTensorsW4A4Fp4"]
 
+USE_NVFP4_CT_EMULATIONS = os.environ.get("USE_NVFP4_CT_EMULATIONS", '0')
+
 
 class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
 
@@ -129,6 +134,14 @@ def apply_weights(self,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
+        if USE_NVFP4_CT_EMULATIONS == "1":
+            return run_nvfp4_emulations(
+                x=x,
+                input_global_scale=layer.input_global_scale,
+                weight=layer.weight,
+                weight_scale_swizzles=layer.weight_scale_swizzled,
+                weight_global_scale=layer.weight_global_scale)
+
         output_dtype = x.dtype
         output_shape = [x.shape[0], layer.weight.shape[0]]
 

From 3c537c0e390c0c8d1cc330773243e8691b61478f Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Thu, 19 Jun 2025 19:08:48 +0000
Subject: [PATCH 2/6] fix

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 .../quantization/compressed_tensors/compressed_tensors.py    | 2 +-
 .../schemes/compressed_tensors_w4a4_nvfp4.py                 | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c1763326bfd2..1c65b26b6d7a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -376,7 +376,7 @@ def _get_scheme_from_parts(
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
                 if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
-                ) or os.environ.get("USE_NVFP4_CT_EMULATIONS") == "1":
+                ) or os.environ.get("USE_NVFP4_CT_EMULATIONS", "0") == "1":
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 6a688cc30c87..2938258a9130 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -31,6 +31,8 @@ def __init__(self):
 
     @classmethod
     def get_min_capability(cls) -> int:
+        if USE_NVFP4_CT_EMULATIONS == "1":
+            return 80
         return 100
 
     @classmethod
@@ -135,11 +137,12 @@ def apply_weights(self,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         if USE_NVFP4_CT_EMULATIONS == "1":
+            print("running emulations")
             return run_nvfp4_emulations(
                 x=x,
                 input_global_scale=layer.input_global_scale,
                 weight=layer.weight,
-                weight_scale_swizzles=layer.weight_scale_swizzled,
+                weight_scale_swizzled=layer.weight_scale_swizzled,
                 weight_global_scale=layer.weight_global_scale)
 
         output_dtype = x.dtype

From c968c62e4a70f380e6a3ea910b65099b8edc0237 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Thu, 19 Jun 2025 15:22:07 -0400
Subject: [PATCH 3/6] remove print

Signed-off-by: Dipika <dipikasikka1@gmail.com>
---
 .../compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 2938258a9130..38e8c4dd6756 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -137,7 +137,6 @@ def apply_weights(self,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         if USE_NVFP4_CT_EMULATIONS == "1":
-            print("running emulations")
             return run_nvfp4_emulations(
                 x=x,
                 input_global_scale=layer.input_global_scale,

From 095b4c8fe565a6f6e1481aa3fdd15b907f69a47f Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Thu, 19 Jun 2025 15:28:47 -0400
Subject: [PATCH 4/6] add bias

Signed-off-by: Dipika <dipikasikka1@gmail.com>
---
 .../schemes/compressed_tensors_w4a4_nvfp4.py                 | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 38e8c4dd6756..108422f18e69 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -137,12 +137,15 @@ def apply_weights(self,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         if USE_NVFP4_CT_EMULATIONS == "1":
-            return run_nvfp4_emulations(
+            out = run_nvfp4_emulations(
                 x=x,
                 input_global_scale=layer.input_global_scale,
                 weight=layer.weight,
                 weight_scale_swizzled=layer.weight_scale_swizzled,
                 weight_global_scale=layer.weight_global_scale)
+            if bias is not None:
+                out = out + bias
+            return out
 
         output_dtype = x.dtype
         output_shape = [x.shape[0], layer.weight.shape[0]]

From 00bfb898622e848142155917ac502cac4ab03d90 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Fri, 20 Jun 2025 00:13:45 +0000
Subject: [PATCH 5/6] use envs

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 vllm/envs.py                                              | 7 +++++++
 .../quantization/compressed_tensors/compressed_tensors.py | 4 ++--
 .../schemes/compressed_tensors_w4a4_nvfp4.py              | 8 +++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 01d8d8a2d2e0..c6a57bd82356 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -133,6 +133,7 @@
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
     VLLM_KV_CACHE_LAYOUT: Optional[str] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
+    VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
 
 
 def get_default_cache_root():
@@ -918,6 +919,12 @@ def get_vllm_port() -> Optional[int]:
     # or bad hardware but it may add compute overhead.
     "VLLM_COMPUTE_NANS_IN_LOGITS":
     lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
+    
+    # Controls whether or not emulations are used for NVFP4
+    # generations on machines < 100 for compressed-tensors
+    # models
+    "VLLM_USE_NVFP4_CT_EMULATIONS":
+    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
 }
 
 # --8<-- [end:env-vars-definition]
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 1c65b26b6d7a..d21abb2741a2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
 from contextlib import suppress
 from typing import Any, Literal, Optional, cast
 
@@ -14,6 +13,7 @@
                                              QuantizationType)
 from pydantic import BaseModel
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -376,7 +376,7 @@ def _get_scheme_from_parts(
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
                 if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
-                ) or os.environ.get("USE_NVFP4_CT_EMULATIONS", "0") == "1":
+                ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 108422f18e69..ec1d4a6c0efa 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
-import os
 from typing import Callable, Optional
 
 import torch
 from torch.nn.parameter import Parameter
 
+import vllm.envs as envs
 from vllm._custom_ops import (cutlass_scaled_fp4_mm,
                               cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
 from vllm.logger import init_logger
@@ -21,8 +21,6 @@
 
 __all__ = ["CompressedTensorsW4A4Fp4"]
 
-USE_NVFP4_CT_EMULATIONS = os.environ.get("USE_NVFP4_CT_EMULATIONS", '0')
-
 
 class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
 
@@ -31,7 +29,7 @@ def __init__(self):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        if USE_NVFP4_CT_EMULATIONS == "1":
+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
             return 80
         return 100
 
@@ -136,7 +134,7 @@ def apply_weights(self,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
-        if USE_NVFP4_CT_EMULATIONS == "1":
+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
             out = run_nvfp4_emulations(
                 x=x,
                 input_global_scale=layer.input_global_scale,

From d6018a74d866a6681fc025309345d63385378aac Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Tue, 24 Jun 2025 19:47:41 +0000
Subject: [PATCH 6/6] style

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 vllm/envs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index c6a57bd82356..cb4b54307033 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -919,7 +919,7 @@ def get_vllm_port() -> Optional[int]:
     # or bad hardware but it may add compute overhead.
     "VLLM_COMPUTE_NANS_IN_LOGITS":
     lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
-    
+
     # Controls whether or not emulations are used for NVFP4
     # generations on machines < 100 for compressed-tensors
     # models