diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py index b45d0f542c24..d3dfaed0cefd 100644 --- a/vllm/worker/hpu_enc_dec_model_runner.py +++ b/vllm/worker/hpu_enc_dec_model_runner.py @@ -42,7 +42,7 @@ class HpuModelAdapterEncoderDecoder(HpuModelAdapter): def __init__(self, model, vllm_config, layer_names, is_causal): - super().__init__(model, vllm_config, layer_names, False) + super().__init__(model, vllm_config, layer_names, is_causal) # We only wrap the language model in HPU graph because some Ops in # vision model will fallback to CPU and cause the graph building fail. diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index c3ef8b19892e..dcd7c9b87001 100755 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -92,7 +92,6 @@ def __init__( ModelRunnerClass = HPUPoolingModelRunner elif is_encoder_decoder_model: ModelRunnerClass = HPUEncoderDecoderModelRunner - is_causal = False self.model_runner: HPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=self.cache_config.cache_dtype,