Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/full_tests/ci_gsm8k_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ run_qwen3_compressed_tensor_dynamic_scaling_test() {
# QWEN3 FP8 + MOE compressed tensor + dynamic scaling
run_qwen3_moe_compressed_tensor_dynamic_scaling_test() {
echo "➡️ Testing Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 + moe + compressed-tensor + dynamic scaling..."
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 --trust-remote-code
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 --trust-remote-code --max-model-len 131072
echo "✅ Test with Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 + moe + compressed-tensor + dynamic scaling successful."
}

Expand Down
2 changes: 1 addition & 1 deletion tests/full_tests/ci_perf_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ vllm bench throughput \
--dataset_path ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 1000 \
--max-model-len 32768
--max-model-len 16384
10 changes: 4 additions & 6 deletions vllm_gaudi/v1/worker/hpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,14 +786,14 @@ def __init__(
self.use_hpu_graph = not self.model_config.enforce_eager
self.max_batch_size = self.scheduler_config.max_num_seqs
self.max_num_seqs = self.scheduler_config.max_num_seqs
self.max_cudagraph_capture_size = self.vllm_config.compilation_config.max_cudagraph_capture_size
Comment thread
adobrzyn marked this conversation as resolved.
if prompt_profile_cfg:
self.max_prefill_batch_size = prompt_profile_cfg[0]
else:
self.max_prefill_batch_size = with_default(get_config().VLLM_PROMPT_BS_BUCKET_MAX, 1)
self.seen_configs: set = set()
self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
self.max_graph_capture_tokens = self.vllm_config.compilation_config.max_cudagraph_capture_size if \
self.vllm_config.compilation_config.max_cudagraph_capture_size is not None else self.max_num_batched_tokens
self.max_num_batched_tokens = \
self.scheduler_config.max_num_batched_tokens
self.use_prefix_caching = (self.vllm_config.cache_config.enable_prefix_caching)
self.bucketing_manager = HPUBucketingManager()
max_num_prefill_seqs = self.max_num_seqs if self.use_merged_prefill \
Expand Down Expand Up @@ -2603,9 +2603,7 @@ def _execute_model_generic(self,
additional_kwargs = {}
if htorch.utils.internal.is_lazy():
use_graphs = self._use_graphs()
Comment thread
adobrzyn marked this conversation as resolved.
# skip HPU graphs for long prefills
if seq_len > 1 and \
batch_size * (seq_len + num_blocks * self.block_size) > self.max_graph_capture_tokens:
if self.max_cudagraph_capture_size is not None and batch_size * seq_len > self.max_cudagraph_capture_size:
use_graphs = False
additional_kwargs.update({"bypass_hpu_graphs": not use_graphs})
else:
Expand Down