From d9d68bfdfb66e674dd9c990a2e466e5efd32bb1a Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 15 Apr 2025 17:39:06 +0300
Subject: [PATCH 1/6] add flag to disable mark scales as const

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index bb18d4564a9f..b614f76abcf9 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -837,6 +837,10 @@ def load_model(self) -> None:
             with HabanaMemoryProfiler() as m_inc:
                 from neural_compressor.torch.quantization import (
                     FP8Config, convert, prepare)
+
+                disable_mark_scales_scales_as_const = os.getenv(
+                    "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false"
+                ) in ("1", "true")
                 config = FP8Config.from_json_file(
                     os.getenv("QUANT_CONFIG", ""))
                 self._inc_preprocess_(self.model, config)
@@ -844,8 +848,10 @@ def load_model(self) -> None:
                     self.model = prepare(self.model, config)
                 elif config.quantize:
                     self.model = convert(self.model, config)
-                htcore.hpu_initialize(self.model,
-                                      mark_only_scales_as_const=True)
+                if not disable_mark_scales_scales_as_const:
+                    htcore.hpu_initialize(
+                        self.model, mark_only_scales_as_const=True
+                    )
                 torch.distributed.barrier()
                 self.inc_initialized_successfully = True
                 logger.info("Preparing model with INC took %s",

From 7991e532222140e3d158b4c56f8e43e89fe241f0 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 15 Apr 2025 17:39:58 +0300
Subject: [PATCH 2/6] call barrier if needed

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index b614f76abcf9..28e0992aa31f 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -852,7 +852,8 @@ def load_model(self) -> None:
                     htcore.hpu_initialize(
                         self.model, mark_only_scales_as_const=True
                     )
-                torch.distributed.barrier()
+                if torch.distributed.is_initialized():
+                    torch.distributed.barrier()
                 self.inc_initialized_successfully = True
                 logger.info("Preparing model with INC took %s",
                             m_inc.get_summary_string())

From b2ec7ee2971135dff85f7155b5fa1a9df8b7e037 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 15 Apr 2025 17:46:16 +0300
Subject: [PATCH 3/6] fix typo

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 28e0992aa31f..cc029262a9d3 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -838,7 +838,7 @@ def load_model(self) -> None:
                 from neural_compressor.torch.quantization import (
                     FP8Config, convert, prepare)
 
-                disable_mark_scales_scales_as_const = os.getenv(
+                disable_mark_scales_as_const = os.getenv(
                     "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false"
                 ) in ("1", "true")
                 config = FP8Config.from_json_file(
@@ -848,7 +848,7 @@ def load_model(self) -> None:
                     self.model = prepare(self.model, config)
                 elif config.quantize:
                     self.model = convert(self.model, config)
-                if not disable_mark_scales_scales_as_const:
+                if not disable_mark_scales_as_const:
                     htcore.hpu_initialize(
                         self.model, mark_only_scales_as_const=True
                     )

From d79ab1cd775c914994bc8383e874c5b18a5af076 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Wed, 16 Apr 2025 12:26:30 +0300
Subject: [PATCH 4/6] fix pre-commit

Signed-off-by: Yi Liu
---
 vllm/worker/hpu_model_runner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index cc029262a9d3..96e810dd05b2 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -839,8 +839,8 @@ def load_model(self) -> None:
                     FP8Config, convert, prepare)
 
                 disable_mark_scales_as_const = os.getenv(
-                    "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false"
-                ) in ("1", "true")
+                    "VLLM_DISABLE_MARK_SCALES_AS_CONST",
+                    "false") in ("1", "true")
                 config = FP8Config.from_json_file(
                     os.getenv("QUANT_CONFIG", ""))
                 self._inc_preprocess_(self.model, config)
@@ -849,9 +849,8 @@ def load_model(self) -> None:
                 elif config.quantize:
                     self.model = convert(self.model, config)
                 if not disable_mark_scales_as_const:
-                    htcore.hpu_initialize(
-                        self.model, mark_only_scales_as_const=True
-                    )
+                    htcore.hpu_initialize(self.model,
+                                          mark_only_scales_as_const=True)
                 if torch.distributed.is_initialized():
                     torch.distributed.barrier()
                 self.inc_initialized_successfully = True

From db7e4bc45da2f5e3ebd71874af107fbd8c1897c7 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Wed, 16 Apr 2025 15:24:11 +0300
Subject: [PATCH 5/6] updated

Signed-off-by: Yi Liu
---
 README_GAUDI.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README_GAUDI.md b/README_GAUDI.md
index a137ba21696e..84c80dc60bac 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -409,6 +409,10 @@ measurements for a given model. The quantization configuration is used during in
 > If you are prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which is time-consuming.
 However, disabling this feature in production environments is not recommended, as it can lead to a significant performance decrease.
 
+> [!TIP]
+> If you are benchmarking an FP8 model with `scale_format=const`, setting `VLLM_DISABLE_MARK_SCALES_AS_CONST=true` can help speed up the warmup stage.
+
+
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
 > - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.

From 236161401ad7bd5648e7feaf3b0a40c51ebcc52b Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Wed, 16 Apr 2025 15:52:40 +0300
Subject: [PATCH 6/6] fix pre-commit

Signed-off-by: Yi Liu
---
 README_GAUDI.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README_GAUDI.md b/README_GAUDI.md
index 84c80dc60bac..20dd1a296657 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -412,7 +412,6 @@ However, disabling this feature in production environments is not recommended, a
 > [!TIP]
 > If you are benchmarking an FP8 model with `scale_format=const`, setting `VLLM_DISABLE_MARK_SCALES_AS_CONST=true` can help speed up the warmup stage.
 
-
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
 > - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
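
Note (supplementary, not part of the patch series): a minimal standalone sketch of the environment-flag check the series introduces. The helper function name below is hypothetical; the parsing mirrors the check added in patch 1/6 (variable name fixed in patch 3/6). The comparison is case-sensitive, so only the literal values "1" and "true" skip the const-marking step.

    import os

    def disable_mark_scales_as_const() -> bool:
        # Mirrors the check in load_model(): hpu_initialize(...,
        # mark_only_scales_as_const=True) is skipped only for "1" or "true".
        return os.getenv("VLLM_DISABLE_MARK_SCALES_AS_CONST",
                         "false") in ("1", "true")

    if __name__ == "__main__":
        os.environ["VLLM_DISABLE_MARK_SCALES_AS_CONST"] = "true"
        assert disable_mark_scales_as_const()
        os.environ["VLLM_DISABLE_MARK_SCALES_AS_CONST"] = "True"
        assert not disable_mark_scales_as_const()  # parsing is case-sensitive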