diff --git a/README_GAUDI.md b/README_GAUDI.md
index a137ba21696e..20dd1a296657 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -409,6 +409,9 @@ measurements for a given model. The quantization configuration is used during in
 
 > If you are prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which is time-consuming. However, disabling this feature in production environments is not recommended, as it can lead to a significant performance decrease.
 
+> [!TIP]
+> If you are benchmarking an FP8 model with `scale_format=const`, setting `VLLM_DISABLE_MARK_SCALES_AS_CONST=true` can help speed up the warmup stage.
+
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
 > - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index fa7e7b29e5e0..f9d93adabea0 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -825,6 +825,10 @@ def load_model(self) -> None:
                 with HabanaMemoryProfiler() as m_inc:
                     from neural_compressor.torch.quantization import (
                         FP8Config, convert, prepare)
+
+                    disable_mark_scales_as_const = os.getenv(
+                        "VLLM_DISABLE_MARK_SCALES_AS_CONST",
+                        "false") in ("1", "true")
                     config = FP8Config.from_json_file(
                         os.getenv("QUANT_CONFIG", ""))
                     self._inc_preprocess_(self.model, config)
@@ -832,9 +836,11 @@
                         self.model = prepare(self.model, config)
                     elif config.quantize:
                         self.model = convert(self.model, config)
-                    htcore.hpu_initialize(self.model,
-                                          mark_only_scales_as_const=True)
-                    torch.distributed.barrier()
+                    if not disable_mark_scales_as_const:
+                        htcore.hpu_initialize(self.model,
+                                              mark_only_scales_as_const=True)
+                    if torch.distributed.is_initialized():
+                        torch.distributed.barrier()
                     self.inc_initialized_successfully = True
                     logger.info("Preparing model with INC took %s",
                                 m_inc.get_summary_string())
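
For reviewers, here is a minimal standalone sketch of the control flow this patch introduces. `finalize_inc_model` is a hypothetical helper name used only for illustration, and the Habana-specific `htcore.hpu_initialize` call is stubbed out so the snippet runs without an HPU; the env-var parsing and the guarded barrier mirror the patch itself.

```python
# Standalone sketch of the gating logic added in this patch.
# `finalize_inc_model` is a hypothetical name; on Gaudi the stubbed branch
# would call htcore.hpu_initialize(model, mark_only_scales_as_const=True).
import os

import torch.distributed


def finalize_inc_model(model):
    # Matches the patch: only the lowercase strings "1" and "true" enable
    # the flag, so values such as "True" or "yes" leave const-marking on.
    disable_mark_scales_as_const = os.getenv(
        "VLLM_DISABLE_MARK_SCALES_AS_CONST", "false") in ("1", "true")

    if not disable_mark_scales_as_const:
        # Habana-specific; stubbed so this sketch runs off-Gaudi:
        #   htcore.hpu_initialize(model, mark_only_scales_as_const=True)
        pass

    # Guarding the barrier lets single-process runs (no process group)
    # reach this point without initializing torch.distributed.
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    return model
```

In practice the flag is set in the environment before launch (e.g. `VLLM_DISABLE_MARK_SCALES_AS_CONST=true` on the server command line), and the `is_initialized()` guard additionally makes the INC path usable in single-process runs where no distributed process group exists.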