From d9d68bfdfb66e674dd9c990a2e466e5efd32bb1a Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 15 Apr 2025 17:39:06 +0300
Subject: [PATCH 1/6] add flag to disable mark scales as const

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index bb18d4564a9f..b614f76abcf9 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -837,6 +837,10 @@ def load_model(self) -> None:
             with HabanaMemoryProfiler() as m_inc:
                 from neural_compressor.torch.quantization import (
                     FP8Config, convert, prepare)
+
+                disable_mark_scales_scales_as_const = os.getenv(
+                    "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false"
+                ) in ("1", "true")
                 config = FP8Config.from_json_file(
                     os.getenv("QUANT_CONFIG", ""))
                 self._inc_preprocess_(self.model, config)
@@ -844,8 +848,10 @@ def load_model(self) -> None:
                     self.model = prepare(self.model, config)
                 elif config.quantize:
                     self.model = convert(self.model, config)
-                htcore.hpu_initialize(self.model,
-                                      mark_only_scales_as_const=True)
+                if not disable_mark_scales_scales_as_const:
+                    htcore.hpu_initialize(
+                        self.model, mark_only_scales_as_const=True
+                    )
                 torch.distributed.barrier()
                 self.inc_initialized_successfully = True
                 logger.info("Preparing model with INC took %s",

From 7991e532222140e3d158b4c56f8e43e89fe241f0 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 15 Apr 2025 17:39:58 +0300
Subject: [PATCH 2/6] call barrier if needed

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index b614f76abcf9..28e0992aa31f 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -852,7 +852,8 @@ def load_model(self) -> None:
                     htcore.hpu_initialize(
                         self.model, mark_only_scales_as_const=True
                     )
-                torch.distributed.barrier()
+                if torch.distributed.is_initialized():
+                    torch.distributed.barrier()
                 self.inc_initialized_successfully = True
                 logger.info("Preparing model with INC took %s",
                             m_inc.get_summary_string())

From b2ec7ee2971135dff85f7155b5fa1a9df8b7e037 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 15 Apr 2025 17:46:16 +0300
Subject: [PATCH 3/6] fix typo

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 28e0992aa31f..cc029262a9d3 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -838,7 +838,7 @@ def load_model(self) -> None:
                 from neural_compressor.torch.quantization import (
                     FP8Config, convert, prepare)
 
-                disable_mark_scales_scales_as_const = os.getenv(
+                disable_mark_scales_as_const = os.getenv(
                     "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false"
                 ) in ("1", "true")
                 config = FP8Config.from_json_file(
@@ -848,7 +848,7 @@ def load_model(self) -> None:
                     self.model = prepare(self.model, config)
                 elif config.quantize:
                     self.model = convert(self.model, config)
-                if not disable_mark_scales_scales_as_const:
+                if not disable_mark_scales_as_const:
                     htcore.hpu_initialize(
                         self.model, mark_only_scales_as_const=True
                     )

From d79ab1cd775c914994bc8383e874c5b18a5af076 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Wed, 16 Apr 2025 12:26:30 +0300
Subject: [PATCH 4/6] fix pre-commit

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index cc029262a9d3..96e810dd05b2 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -839,8 +839,8 @@ def load_model(self) -> None:
                     FP8Config, convert, prepare)
 
                 disable_mark_scales_as_const = os.getenv(
-                    "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false"
-                ) in ("1", "true")
+                    "VLLM_DISABLE_MARK_SCALES_AS_CONST",
+                    "false") in ("1", "true")
                 config = FP8Config.from_json_file(
                     os.getenv("QUANT_CONFIG", ""))
                 self._inc_preprocess_(self.model, config)
@@ -849,9 +849,8 @@ def load_model(self) -> None:
                 elif config.quantize:
                     self.model = convert(self.model, config)
                 if not disable_mark_scales_as_const:
-                    htcore.hpu_initialize(
-                        self.model, mark_only_scales_as_const=True
-                    )
+                    htcore.hpu_initialize(self.model,
+                                          mark_only_scales_as_const=True)
                 if torch.distributed.is_initialized():
                     torch.distributed.barrier()
                 self.inc_initialized_successfully = True

From db7e4bc45da2f5e3ebd71874af107fbd8c1897c7 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Wed, 16 Apr 2025 15:24:11 +0300
Subject: [PATCH 5/6] updated

Signed-off-by: Yi Liu
---
 README_GAUDI.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README_GAUDI.md b/README_GAUDI.md
index a137ba21696e..84c80dc60bac 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -409,6 +409,10 @@ measurements for a given model. The quantization configuration is used during in
 > If you are prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which is time-consuming.
 However, disabling this feature in production environments is not recommended, as it can lead to a significant performance decrease.
 
+> [!TIP]
+> If you are benchmarking an FP8 model with `scale_format=const`, setting `VLLM_DISABLE_MARK_SCALES_AS_CONST=true` can help speed up the warmup stage.
+
+
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
 > - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.

From 236161401ad7bd5648e7feaf3b0a40c51ebcc52b Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Wed, 16 Apr 2025 15:52:40 +0300
Subject: [PATCH 6/6] fix pre-commit

Signed-off-by: Yi Liu
---
 README_GAUDI.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README_GAUDI.md b/README_GAUDI.md
index 84c80dc60bac..20dd1a296657 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -412,7 +412,6 @@ However, disabling this feature in production environments is not recommended, a
 > [!TIP]
 > If you are benchmarking an FP8 model with `scale_format=const`, setting `VLLM_DISABLE_MARK_SCALES_AS_CONST=true` can help speed up the warmup stage.
 
-
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
 > - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
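
Note (supplementary, not part of the patch series): a minimal standalone sketch of the environment-flag check the series introduces. The helper function name below is hypothetical; the parsing mirrors the check added in patch 1/6 (variable name fixed in patch 3/6). The comparison is case-sensitive, so only the literal values "1" and "true" skip the const-marking step.

    import os

    def disable_mark_scales_as_const() -> bool:
        # Mirrors the check in load_model(): hpu_initialize(...,
        # mark_only_scales_as_const=True) is skipped only for "1" or "true".
        return os.getenv("VLLM_DISABLE_MARK_SCALES_AS_CONST",
                         "false") in ("1", "true")

    if __name__ == "__main__":
        os.environ["VLLM_DISABLE_MARK_SCALES_AS_CONST"] = "true"
        assert disable_mark_scales_as_const()
        os.environ["VLLM_DISABLE_MARK_SCALES_AS_CONST"] = "True"
        assert not disable_mark_scales_as_const()  # parsing is case-sensitive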