diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh index 8225926295..0cba7bec7d 100644 --- a/tests/full_tests/ci_gsm8k_tests.sh +++ b/tests/full_tests/ci_gsm8k_tests.sh @@ -99,7 +99,7 @@ run_qwen3_compressed_tensor_dynamic_scaling_test() { # QWEN3 FP8 + MOE compressed tensor + dynamic scaling run_qwen3_moe_compressed_tensor_dynamic_scaling_test() { echo "➡️ Testing Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 + moe + compressed-tensor + dynamic scaling..." - HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 --trust-remote-code + HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 --trust-remote-code --max-model-len 131072 echo "✅ Test with Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 + moe + compressed-tensor + dynamic scaling successful." } diff --git a/tests/full_tests/ci_perf_tests.sh b/tests/full_tests/ci_perf_tests.sh index fb94a19565..2066572eea 100644 --- a/tests/full_tests/ci_perf_tests.sh +++ b/tests/full_tests/ci_perf_tests.sh @@ -37,4 +37,4 @@ vllm bench throughput \ --dataset_path ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_name sharegpt \ --num-prompts 1000 \ - --max-model-len 32768 + --max-model-len 16384 diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index c9488523e7..ed650597af 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -786,14 +786,14 @@ def __init__( self.use_hpu_graph = not self.model_config.enforce_eager self.max_batch_size = self.scheduler_config.max_num_seqs self.max_num_seqs = self.scheduler_config.max_num_seqs + self.max_cudagraph_capture_size = self.vllm_config.compilation_config.max_cudagraph_capture_size if prompt_profile_cfg: self.max_prefill_batch_size = prompt_profile_cfg[0] else: self.max_prefill_batch_size = with_default(get_config().VLLM_PROMPT_BS_BUCKET_MAX, 1) self.seen_configs: set = set() - self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - self.max_graph_capture_tokens = self.vllm_config.compilation_config.max_cudagraph_capture_size if \ - self.vllm_config.compilation_config.max_cudagraph_capture_size is not None else self.max_num_batched_tokens + self.max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens self.use_prefix_caching = (self.vllm_config.cache_config.enable_prefix_caching) self.bucketing_manager = HPUBucketingManager() max_num_prefill_seqs = self.max_num_seqs if self.use_merged_prefill \ @@ -2603,9 +2603,7 @@ def _execute_model_generic(self, additional_kwargs = {} if htorch.utils.internal.is_lazy(): use_graphs = self._use_graphs() - # skip HPU graphs for long prefills - if seq_len > 1 and \ - batch_size * (seq_len + num_blocks * self.block_size) > self.max_graph_capture_tokens: + if self.max_cudagraph_capture_size is not None and batch_size * seq_len > self.max_cudagraph_capture_size: use_graphs = False additional_kwargs.update({"bypass_hpu_graphs": not use_graphs}) else: