From c52dbfd5645b603ca2efb48ddb60a4d1a255ed55 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 28 Oct 2025 06:27:45 +0200 Subject: [PATCH 1/9] Initial Commit GPT-OSS Signed-off-by: Himangshu Lahkar --- tests/unit_tests/sinks/test_gpt_oss.py | 73 +++++++++++ vllm_gaudi/attention/backends/hpu_attn.py | 39 +++++- vllm_gaudi/extension/ops.py | 148 +++++++++++++++++----- vllm_gaudi/extension/utils.py | 42 +++++- vllm_gaudi/ops/hpu_fused_moe.py | 23 +++- vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +- 6 files changed, 277 insertions(+), 50 deletions(-) create mode 100644 tests/unit_tests/sinks/test_gpt_oss.py diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py new file mode 100644 index 0000000000..2c33c29dae --- /dev/null +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -0,0 +1,73 @@ +import os +import sys +import vllm +from vllm.entrypoints.llm import LLM +import numpy as np + +RUN_20B_MODEL = True # Set to False to run the 120B model instead +MODEL_PATH = "lmsys/gpt-oss-20b-BF16" +MODEL_PATH_120 = "lmsys/gpt-oss-120b-BF16" +# reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L397 +original_output = "Roses are red, violets are blue, I love you, and I love you too.\n\nRoses are red, vio" +# reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L462 +original_output_120 = "Roses are red, violets are blue,\nI am a language model, not a human being" + +def do_sample(llm: LLM, original_output: str, rtol: float, atol: float, max_num_seqs:int) -> list[str]: + prompts = [ + "Roses are red, violets", + ] * max_num_seqs + + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=20, + logprobs=1 if not PT_PROFILE else None,) + outputs = llm.generate( + prompts, + sampling_params) + + if not PT_PROFILE: + # Print the outputs. + generated_texts: list[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + assert prompts[0]+generated_texts[0] == original_output, "Generated text does not match the expected output." + return generated_texts + + + + +expected_output = [ + "are blue, I love you, and I love you too.\n\nRoses are red, vio" # noqa: E501 +] + + +def _test_gpt_oss(): + """Main function that sets up and runs the prompt processing.""" + if RUN_20B_MODEL: + llm = LLM(MODEL_PATH, + max_num_seqs=8 if not PT_PROFILE else max_num_seqs, + dtype='bfloat16', + enforce_eager=True, + max_model_len=512, + max_num_batched_tokens=2048, + tensor_parallel_size=1, + ) + generated_texts = do_sample(llm, original_output=original_output, rtol=1e-01, atol=1e-01, max_num_seqs=1) + else: + llm = LLM(MODEL_PATH_120, + max_num_seqs=8, + dtype='bfloat16', + enforce_eager=False, + max_model_len=512, + max_num_batched_tokens=2048, + tensor_parallel_size=4, + ) + generated_texts = do_sample(llm, original_output=original_output_120, rtol=1e-01, atol=1e-01, max_num_seqs=1) + assert generated_texts == expected_output + + +def test_gpt_oss_1x(): + _test_gpt_oss() diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index 206ab6bc69..d1724c9ee2 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -167,6 +167,7 @@ def __init__( qk_head_dim: int, v_head_dim: int, kv_b_proj: ColumnParallelLinear, + sinks: Optional[torch.Tensor] = None, **kwargs, ) -> None: torch.nn.Module.__init__(self) @@ -218,6 +219,13 @@ def __init__( "encoder/decoder cross-attention " "are not implemented for " "TritonMLAImpl") + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}." + ) def forward( self, @@ -389,6 +397,7 @@ def __init__( attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, + sinks: Optional[torch.Tensor] = None, ) -> None: super(AttentionImpl, self).__init__() if kv_sharing_target_layer_name is not None: @@ -453,6 +462,13 @@ def __init__( raise NotImplementedError("Encoder self-attention " "is not implemented for " "HPUAttentionImpl") + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}." + ) def _maybe_init_alibi_biases( self, @@ -534,6 +550,12 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. + if key.dtype != key_cache.dtype: + key = key.to(key_cache.dtype) + if value.dtype != value_cache.dtype: + value = value.to(value_cache.dtype) + if query.dtype != key.dtype: + query = query.to(key.dtype) key_cache = self.k_cache(key, key_cache, slot_mapping) value_cache = self.v_cache(value, value_cache, slot_mapping) @@ -570,13 +592,17 @@ def forward( common_args = self.common_attention_args(block_list, key_cache, value_cache, attn_metadata.block_size) + if self.sliding_window and hasattr(attn_metadata, + 'window_attn_bias') and attn_metadata.window_attn_bias is not None \ + and self.prefill_impl == 'naive_impl': + attn_bias = attn_metadata.window_attn_bias if self.sliding_window: - if hasattr(attn_metadata, 'window_attn_bias') and attn_metadata.window_attn_bias is not None: - attn_bias = attn_metadata.window_attn_bias - else: - attn_bias = None - window_size = (self.sliding_window, 0) - common_args['window_size'] = window_size + # TODO - change 128 to proper window size + window_size = ( + 128, + 0, + ) + common_args["window_size"] = window_size out = ops.prompt_attention(impl=self.prefill_impl, query=query.view(query_shape), @@ -641,6 +667,7 @@ def common_attention_args(self, block_list=None, key_cache=None, value_cache=Non 'key_cache': key_cache, 'value_cache': value_cache, 'block_size': block_size, + "sinks": self.sinks, } def forward_encoder_decoder( diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index e9cf98bcb2..a03ad545e3 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -52,8 +52,8 @@ def block2batch(tensor, block_mapping, matmul_op=torch.matmul): return b2b_impl(tensor, block_mapping.t(), matmul_op) -def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, batch_size, matmul_av_op, batch2block_matmul_op, - block2batch_matmul_op): +def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, block_size, batch_size, + matmul_av_op, batch2block_matmul_op, block2batch_matmul_op): # When fp32_softmax is enabled attn is left in fp32 after Q@K # We can return to native dtype after we renormalize and calculate the adjustments if block_bias is not None and attn.dtype != block_bias.dtype: @@ -67,11 +67,27 @@ def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, batch_siz if block_bias is not None: attn.add_(block_bias) block_max = attn.amax(dim=-1, keepdim=True) + if sink is not None: + block_max = torch.maximum(block_max, sink) attn = attn.sub(block_max) attn = attn.exp() if attn.dtype == torch.float32: attn = attn.to(value.dtype) - block_sums = attn.sum(dim=-1, keepdim=True) + attn_shape = attn.shape + block_sums = attn.view(-1,attn_shape[-1]).sum(dim=-1, keepdim=True) + attn_shape = list(attn_shape) + attn_shape[-1] = 1 + block_sums = block_sums.view(attn_shape) + if sink is not None: + attn_sink = sink.sub(block_max) + attn_sink = attn_sink.exp() + if attn_sink.dtype == torch.float32: + attn_sink = attn_sink.to(value.dtype) + #TODO: Removing this .sum and using attn_sink directly + #results in wrong output which does not make sense. + #Looks like a Synapse issue, need to investigate further. + block_sums_sink = attn_sink.sum(dim=-1, keepdim=True) + block_sums = block_sums + block_sums_sink attn = matmul_av_op(attn, value) if get_config().fused_block_softmax_adjustment: out_shape = list(attn.shape[:3]) + [1] * (attn.dim() - 3) @@ -154,9 +170,9 @@ def flat_pa_mla(query, key_cache, value_cache, block_list, block_mapping, block_ return attn -def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias, block_groups, block_size, scale, - matmul_qk_op, position_bias, matmul_av_op, batch2block_matmul_op, block2batch_matmul_op, keys_fetch_func, - values_fetch_func, **ignored_args): +def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias, block_groups, block_size, scale, + matmul_qk_op, position_bias, matmul_av_op, batch2block_matmul_op, block2batch_matmul_op, keys_fetch_func, + values_fetch_func, sinks, **ignored_args): batch_size, _, hidden_size = query.shape _, kv_heads, head_size = key_cache.shape q_heads = hidden_size // head_size @@ -166,6 +182,13 @@ def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias key = keys_fetch_func(key_cache.unflatten(0, (-1, block_size)), block_list).transpose(1, 2) value = values_fetch_func(value_cache.unflatten(0, (-1, block_size)), block_list).transpose(1, 2) block_bias = block_bias.view(key.size(0), 1, 1, -1) + sink = None + if sinks is not None: + sinks = sinks.reshape(sinks.shape[0], 1) + sink = sinks.reshape(1, sinks.shape[0], 1, sinks.shape[1]) + sink = sink.expand(query.shape[0], -1, query.shape[-2], -1) + if kv_heads != q_heads: + sink = sink.unflatten(1, (kv_heads, -1)) if kv_heads != q_heads: query = query.unflatten(1, (kv_heads, -1)) key = key.unflatten(1, (kv_heads, 1)) @@ -187,11 +210,13 @@ def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias attn = attn.to(dtype=position_bias.dtype) attn.add_(position_bias.unsqueeze(-2)) - attn = pipelined_pa(attn, + attn = pipelined_pa(attn, value, block_bias, block_groups, block_mapping, + sink, + block_size, batch_size=batch_size, matmul_av_op=matmul_av_op, batch2block_matmul_op=batch2block_matmul_op, @@ -250,6 +275,7 @@ def _naive_prompt_attention(query: torch.Tensor, matmul_qk_op=torch.matmul, softmax_op=torch.softmax, matmul_av_op=torch.matmul, + sinks: Optional[torch.Tensor] = None, **ignored_args) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) @@ -281,10 +307,19 @@ def _naive_prompt_attention(query: torch.Tensor, if attn_weights.dtype != attn_bias.dtype: attn_bias = attn_bias.to(dtype=attn_weights.dtype) attn_weights.add_(attn_bias) + if sinks is not None: + sink = sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1) + if query_heads != kv_heads: + sink = sink.unflatten(1, (kv_heads, -1)) + combined_logits = torch.cat([attn_weights, sink], dim=-1) + combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values + attn_weights = combined_logits if get_config().fp32_softmax: attn_weights = torch.softmax(attn_weights, dim=-1) else: attn_weights = softmax_op(attn_weights, dim=-1) + if sinks is not None: + attn_weights = attn_weights[..., :-1] attn_weights = attn_weights.to(query.dtype) attn_weights = matmul_av_op(attn_weights, value) @@ -303,6 +338,7 @@ def _fsdpa_prompt_attention(query: torch.Tensor, attn_bias: Optional[torch.Tensor] = None, valid_seq_lengths: Optional[torch.Tensor] = None, window_size: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, **ignored_args) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) @@ -319,15 +355,23 @@ def _fsdpa_prompt_attention(query: torch.Tensor, # TODO: causal + attn_bias is not yet supported is_causal = False valid_seq_lengths = None + # TODO - remove this once fsdpa op support fast mode for sliding window + if window_size is not None: + #causal window sdpa kernel only supports softmax None + softmax_mode = 'None' + args = [query, key, value, attn_bias, 0.0, is_causal, + scale, softmax_mode, recompute_mode, + valid_seq_lengths, padding_side] + args += [window_size] if window_size else [None] + # use sinks in fsdpa + if sinks is not None: + args += [sinks] - args = [ - query, key, value, attn_bias, 0.0, is_causal, scale, softmax_mode, recompute_mode, valid_seq_lengths, - padding_side - ] - args += [window_size] if window_size else [] attn_weights = fsdpa_op(*args) - attn_weights = attn_weights.transpose(1, 2) + if sinks is not None: + # TODO - check if we can remove this + htcore.mark_step() return attn_weights @@ -447,19 +491,23 @@ def __init__(self): def set_weight(self, w): self.weight = w + def set_bias(self, b): + self.bias = b + def forward(self, state, expert_id, w): raise NotImplementedError() class VllmMixtureOfExpertsOp(torch.nn.Module): - def __init__(self, num_total_experts, experts_min: int = 0, experts_max: int = 8): + def __init__(self, num_total_experts, experts_min: int = 0, experts_max: int = 8, bias = None): super().__init__() self.w13_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) self.w2_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) self.num_experts = num_total_experts self.experts_min = experts_min self.experts_max = experts_max + self.bias = bias if MAX_EXPERTS_PER_SLICE > 0: max_expert_per_slice = MAX_EXPERTS_PER_SLICE @@ -476,29 +524,64 @@ def forward(self, hidden_states, expert_routing_table, router_weights, permuted_ w2_list = [self.w2_list[i].weight.squeeze() for i in experts_range] if self.moe_n_slice == 1: - return torch.ops.hpu.mixture_of_experts(hidden_states=hidden_states, - expert_routing_table=expert_routing_table, - router_weights=router_weights, - w12=w1_list, - w3=w2_list, - permuted_weights=permuted_weights, - activation=activation, - experts_min=self.experts_min, - experts_max=self.experts_max) + if self.bias is not None: + w1_bias_list = [self.w13_list[i].bias.squeeze() for i in experts_range] + w2_bias_list = [self.w2_list[i].bias.squeeze() for i in experts_range] + return torch.ops.hpu.mixture_of_experts.bias_fused_weights( + hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + w12_bias = w1_bias_list, + w3_bias = w2_bias_list, + permuted_weights=permuted_weights, + experts_min=self.experts_min, + experts_max=self.experts_max) + else: + return torch.ops.hpu.mixture_of_experts( + hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + permuted_weights=permuted_weights, + activation=activation, + experts_min=self.experts_min, + experts_max=self.experts_max) + for i in range(self.moe_n_slice): w1_list_slice = w1_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] w2_list_slice = w2_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] min_expert = self.experts_min + i * self.num_expert_per_group max_expert = min_expert + self.num_expert_per_group - 1 - slice_final_hidden_states = torch.ops.hpu.mixture_of_experts(hidden_states=hidden_states, - expert_routing_table=expert_routing_table, - router_weights=router_weights, - w12=w1_list_slice, - w3=w2_list_slice, - permuted_weights=permuted_weights, - activation=activation, - experts_min=min_expert, - experts_max=max_expert) + if self.bias is not None: + w1_bias_list = [self.w13_list[i].bias.squeeze() for i in experts_range] + w2_bias_list = [self.w2_list[i].bias.squeeze() for i in experts_range] + w1_bias_list_slice = w1_bias_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] + w2_bias_list_slice = w2_bias_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] + slice_final_hidden_states = torch.ops.hpu.mixture_of_experts.bias_fused_weights( + hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + w12_bias = w1_bias_list_slice, + w3_bias = w2_bias_list_slice, + permuted_weights=permuted_weights, + experts_min=self.experts_min, + experts_max=self.experts_max) + else: + slice_final_hidden_states = torch.ops.hpu.mixture_of_experts( + hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list_slice, + w3=w2_list_slice, + permuted_weights=permuted_weights, + activation=activation, + experts_min=min_expert, + experts_max=max_expert) if i == 0: final_hidden_states = slice_final_hidden_states else: @@ -507,6 +590,7 @@ def forward(self, hidden_states, expert_routing_table, router_weights, permuted_ return final_hidden_states + class DynamicFusedMOE(torch.nn.Module): def __init__(self, num_total_experts): diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index e5630c0ff4..0f13896866 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -149,15 +149,43 @@ def forward( valid_sequence_lengths, padding_side="left", window_size=None, + sinks=None, ): - if window_size is not None: - return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode, - recompute_mode, valid_sequence_lengths, padding_side, False, False, - window_size) + if window_size: + return self._hpu_kernel_fsdpa.apply( + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + softmax_mode, + recompute_mode, + valid_sequence_lengths, + padding_side, + False, + False, + window_size, + sinks) else: - return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode, - recompute_mode, valid_sequence_lengths, padding_side) - + return self._hpu_kernel_fsdpa.apply( + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + softmax_mode, + recompute_mode, + valid_sequence_lengths, + padding_side, + False, + False, + (-1,-1), + sinks + ) def pad_list(input, target_len, val_generator): padding = target_len - len(input) diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py index 41b7a62ce2..70749245d4 100644 --- a/vllm_gaudi/ops/hpu_fused_moe.py +++ b/vllm_gaudi/ops/hpu_fused_moe.py @@ -2,6 +2,7 @@ import torch import vllm +from vllm.config import get_current_vllm_config from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod) from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp) @@ -13,23 +14,29 @@ class HPUUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) torch.hpu.synchronize() - + vllm_config = get_current_vllm_config() + self.model_type = vllm_config.model_config.hf_config.model_type def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) # custom handling for HPU num_experts = layer.local_num_experts ep_shift = layer.ep_rank * num_experts + has_bias = hasattr(layer, 'w13_bias') and hasattr(layer, 'w2_bias') experts_min, experts_max = ep_shift, num_experts + ep_shift - 1 layer.moe_op = VllmMixtureOfExpertsOp( num_experts, experts_min, experts_max, + bias=has_bias, ) for expert_id in range(layer.local_num_experts): layer.moe_op.w13_list[expert_id].set_weight(layer.w13_weight.data[expert_id]) layer.moe_op.w2_list[expert_id].set_weight(layer.w2_weight.data[expert_id]) + if has_bias: + layer.moe_op.w13_list[expert_id].set_bias(layer.w13_bias.data[expert_id]) + layer.moe_op.w2_list[expert_id].set_bias(layer.w2_bias.data[expert_id]) def forward_oot( self, @@ -66,9 +73,17 @@ def forward_oot( e_score_correction_bias=e_score_correction_bias) else: import torch.nn.functional as F - topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32) - topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) - topk_weights /= topk_weights.sum(dim=-1, keepdim=True) + if self.model_type in ["gpt_oss"]: + topk_weights, topk_ids = torch.topk(router_logits, + top_k, + dim=-1) + topk_weights = F.softmax(topk_weights, + dim=-1, + dtype=torch.float32) + else: + topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32) + topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) + topk_weights /= topk_weights.sum(dim=-1, keepdim=True) topk_weights = topk_weights.to(x.dtype) topk_ids = topk_ids.view(*x.shape[:-1], -1) topk_weights = topk_weights.view(*x.shape[:-1], -1) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 38e8fc3a71..312a97f463 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -47,7 +47,7 @@ from vllm.sampling_params import SamplingType from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.utils import LayerBlockType -from vllm.utils.math_utils import cdiv +from vllm.utils import cdiv from vllm.utils.platform_utils import is_pin_memory_available from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.import_utils import LazyLoader From bc3d7042a3c7a66997fb49a1946e7fd48bb067af Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 28 Oct 2025 06:37:22 +0200 Subject: [PATCH 2/9] Update Formatting Signed-off-by: Himangshu Lahkar --- tests/unit_tests/sinks/test_gpt_oss.py | 62 +++++++------- vllm_gaudi/attention/backends/hpu_attn.py | 22 +++-- vllm_gaudi/extension/ops.py | 99 +++++++++++------------ vllm_gaudi/extension/utils.py | 40 ++------- vllm_gaudi/ops/hpu_fused_moe.py | 9 +-- 5 files changed, 97 insertions(+), 135 deletions(-) diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py index 2c33c29dae..cf26d42e05 100644 --- a/tests/unit_tests/sinks/test_gpt_oss.py +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -1,8 +1,5 @@ -import os -import sys import vllm from vllm.entrypoints.llm import LLM -import numpy as np RUN_20B_MODEL = True # Set to False to run the 120B model instead MODEL_PATH = "lmsys/gpt-oss-20b-BF16" @@ -12,17 +9,18 @@ # reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L462 original_output_120 = "Roses are red, violets are blue,\nI am a language model, not a human being" -def do_sample(llm: LLM, original_output: str, rtol: float, atol: float, max_num_seqs:int) -> list[str]: + +def do_sample(llm: LLM, original_output: str, rtol: float, atol: float, max_num_seqs: int) -> list[str]: prompts = [ - "Roses are red, violets", - ] * max_num_seqs + "Roses are red, violets", + ] * max_num_seqs - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=20, - logprobs=1 if not PT_PROFILE else None,) - outputs = llm.generate( - prompts, - sampling_params) + sampling_params = vllm.SamplingParams( + temperature=0, + max_tokens=20, + logprobs=1 if not PT_PROFILE else None, + ) + outputs = llm.generate(prompts, sampling_params) if not PT_PROFILE: # Print the outputs. @@ -33,38 +31,38 @@ def do_sample(llm: LLM, original_output: str, rtol: float, atol: float, max_num_ generated_texts.append(generated_text) print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert prompts[0]+generated_texts[0] == original_output, "Generated text does not match the expected output." + assert prompts[0] + generated_texts[0] == original_output, "Generated text does not match the expected output." return generated_texts - - expected_output = [ - "are blue, I love you, and I love you too.\n\nRoses are red, vio" # noqa: E501 + "are blue, I love you, and I love you too.\n\nRoses are red, vio" # noqa: E501 ] def _test_gpt_oss(): """Main function that sets up and runs the prompt processing.""" if RUN_20B_MODEL: - llm = LLM(MODEL_PATH, - max_num_seqs=8 if not PT_PROFILE else max_num_seqs, - dtype='bfloat16', - enforce_eager=True, - max_model_len=512, - max_num_batched_tokens=2048, - tensor_parallel_size=1, - ) + llm = LLM( + MODEL_PATH, + max_num_seqs=8 if not PT_PROFILE else max_num_seqs, + dtype='bfloat16', + enforce_eager=True, + max_model_len=512, + max_num_batched_tokens=2048, + tensor_parallel_size=1, + ) generated_texts = do_sample(llm, original_output=original_output, rtol=1e-01, atol=1e-01, max_num_seqs=1) else: - llm = LLM(MODEL_PATH_120, - max_num_seqs=8, - dtype='bfloat16', - enforce_eager=False, - max_model_len=512, - max_num_batched_tokens=2048, - tensor_parallel_size=4, - ) + llm = LLM( + MODEL_PATH_120, + max_num_seqs=8, + dtype='bfloat16', + enforce_eager=False, + max_model_len=512, + max_num_batched_tokens=2048, + tensor_parallel_size=4, + ) generated_texts = do_sample(llm, original_output=original_output_120, rtol=1e-01, atol=1e-01, max_num_seqs=1) assert generated_texts == expected_output diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index d1724c9ee2..ac0f6355c0 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -221,11 +221,9 @@ def __init__( "TritonMLAImpl") self.sinks = sinks if sinks is not None: - assert sinks.shape[0] == num_heads, ( - "Sinks must have the same number of heads as the number of " - f"heads in the layer. Sinks shape: {sinks.shape}, " - f"num_heads: {num_heads}." - ) + assert sinks.shape[0] == num_heads, ("Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}.") def forward( self, @@ -464,11 +462,9 @@ def __init__( "HPUAttentionImpl") self.sinks = sinks if sinks is not None: - assert sinks.shape[0] == num_heads, ( - "Sinks must have the same number of heads as the number of " - f"heads in the layer. Sinks shape: {sinks.shape}, " - f"num_heads: {num_heads}." - ) + assert sinks.shape[0] == num_heads, ("Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}.") def _maybe_init_alibi_biases( self, @@ -599,9 +595,9 @@ def forward( if self.sliding_window: # TODO - change 128 to proper window size window_size = ( - 128, - 0, - ) + 128, + 0, + ) common_args["window_size"] = window_size out = ops.prompt_attention(impl=self.prefill_impl, diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index a03ad545e3..be561ccd07 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -52,8 +52,8 @@ def block2batch(tensor, block_mapping, matmul_op=torch.matmul): return b2b_impl(tensor, block_mapping.t(), matmul_op) -def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, block_size, batch_size, - matmul_av_op, batch2block_matmul_op, block2batch_matmul_op): +def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, block_size, batch_size, matmul_av_op, + batch2block_matmul_op, block2batch_matmul_op): # When fp32_softmax is enabled attn is left in fp32 after Q@K # We can return to native dtype after we renormalize and calculate the adjustments if block_bias is not None and attn.dtype != block_bias.dtype: @@ -74,7 +74,7 @@ def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, blo if attn.dtype == torch.float32: attn = attn.to(value.dtype) attn_shape = attn.shape - block_sums = attn.view(-1,attn_shape[-1]).sum(dim=-1, keepdim=True) + block_sums = attn.view(-1, attn_shape[-1]).sum(dim=-1, keepdim=True) attn_shape = list(attn_shape) attn_shape[-1] = 1 block_sums = block_sums.view(attn_shape) @@ -84,7 +84,7 @@ def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, blo if attn_sink.dtype == torch.float32: attn_sink = attn_sink.to(value.dtype) #TODO: Removing this .sum and using attn_sink directly - #results in wrong output which does not make sense. + #results in wrong output which does not make sense. #Looks like a Synapse issue, need to investigate further. block_sums_sink = attn_sink.sum(dim=-1, keepdim=True) block_sums = block_sums + block_sums_sink @@ -170,8 +170,8 @@ def flat_pa_mla(query, key_cache, value_cache, block_list, block_mapping, block_ return attn -def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias, block_groups, block_size, scale, - matmul_qk_op, position_bias, matmul_av_op, batch2block_matmul_op, block2batch_matmul_op, keys_fetch_func, +def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias, block_groups, block_size, scale, + matmul_qk_op, position_bias, matmul_av_op, batch2block_matmul_op, block2batch_matmul_op, keys_fetch_func, values_fetch_func, sinks, **ignored_args): batch_size, _, hidden_size = query.shape _, kv_heads, head_size = key_cache.shape @@ -210,13 +210,13 @@ def flat_pa(query, key_cache, value_cache, block_list, block_mapping, block_bias attn = attn.to(dtype=position_bias.dtype) attn.add_(position_bias.unsqueeze(-2)) - attn = pipelined_pa(attn, + attn = pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, - block_size, + block_size, batch_size=batch_size, matmul_av_op=matmul_av_op, batch2block_matmul_op=batch2block_matmul_op, @@ -359,9 +359,10 @@ def _fsdpa_prompt_attention(query: torch.Tensor, if window_size is not None: #causal window sdpa kernel only supports softmax None softmax_mode = 'None' - args = [query, key, value, attn_bias, 0.0, is_causal, - scale, softmax_mode, recompute_mode, - valid_seq_lengths, padding_side] + args = [ + query, key, value, attn_bias, 0.0, is_causal, scale, softmax_mode, recompute_mode, valid_seq_lengths, + padding_side + ] args += [window_size] if window_size else [None] # use sinks in fsdpa if sinks is not None: @@ -500,7 +501,7 @@ def forward(self, state, expert_id, w): class VllmMixtureOfExpertsOp(torch.nn.Module): - def __init__(self, num_total_experts, experts_min: int = 0, experts_max: int = 8, bias = None): + def __init__(self, num_total_experts, experts_min: int = 0, experts_max: int = 8, bias=None): super().__init__() self.w13_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) self.w2_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) @@ -526,30 +527,28 @@ def forward(self, hidden_states, expert_routing_table, router_weights, permuted_ if self.moe_n_slice == 1: if self.bias is not None: w1_bias_list = [self.w13_list[i].bias.squeeze() for i in experts_range] - w2_bias_list = [self.w2_list[i].bias.squeeze() for i in experts_range] - return torch.ops.hpu.mixture_of_experts.bias_fused_weights( - hidden_states=hidden_states, - expert_routing_table=expert_routing_table, - router_weights=router_weights, - w12=w1_list, - w3=w2_list, - w12_bias = w1_bias_list, - w3_bias = w2_bias_list, - permuted_weights=permuted_weights, - experts_min=self.experts_min, - experts_max=self.experts_max) + w2_bias_list = [self.w2_list[i].bias.squeeze() for i in experts_range] + return torch.ops.hpu.mixture_of_experts.bias_fused_weights(hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + w12_bias=w1_bias_list, + w3_bias=w2_bias_list, + permuted_weights=permuted_weights, + experts_min=self.experts_min, + experts_max=self.experts_max) else: - return torch.ops.hpu.mixture_of_experts( - hidden_states=hidden_states, - expert_routing_table=expert_routing_table, - router_weights=router_weights, - w12=w1_list, - w3=w2_list, - permuted_weights=permuted_weights, - activation=activation, - experts_min=self.experts_min, - experts_max=self.experts_max) - + return torch.ops.hpu.mixture_of_experts(hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + permuted_weights=permuted_weights, + activation=activation, + experts_min=self.experts_min, + experts_max=self.experts_max) + for i in range(self.moe_n_slice): w1_list_slice = w1_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] w2_list_slice = w2_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] @@ -557,31 +556,30 @@ def forward(self, hidden_states, expert_routing_table, router_weights, permuted_ max_expert = min_expert + self.num_expert_per_group - 1 if self.bias is not None: w1_bias_list = [self.w13_list[i].bias.squeeze() for i in experts_range] - w2_bias_list = [self.w2_list[i].bias.squeeze() for i in experts_range] + w2_bias_list = [self.w2_list[i].bias.squeeze() for i in experts_range] w1_bias_list_slice = w1_bias_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] - w2_bias_list_slice = w2_bias_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] + w2_bias_list_slice = w2_bias_list[i * self.num_expert_per_group:(i + 1) * self.num_expert_per_group] slice_final_hidden_states = torch.ops.hpu.mixture_of_experts.bias_fused_weights( hidden_states=hidden_states, expert_routing_table=expert_routing_table, router_weights=router_weights, w12=w1_list, w3=w2_list, - w12_bias = w1_bias_list_slice, - w3_bias = w2_bias_list_slice, + w12_bias=w1_bias_list_slice, + w3_bias=w2_bias_list_slice, permuted_weights=permuted_weights, experts_min=self.experts_min, experts_max=self.experts_max) - else: - slice_final_hidden_states = torch.ops.hpu.mixture_of_experts( - hidden_states=hidden_states, - expert_routing_table=expert_routing_table, - router_weights=router_weights, - w12=w1_list_slice, - w3=w2_list_slice, - permuted_weights=permuted_weights, - activation=activation, - experts_min=min_expert, - experts_max=max_expert) + else: + slice_final_hidden_states = torch.ops.hpu.mixture_of_experts(hidden_states=hidden_states, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list_slice, + w3=w2_list_slice, + permuted_weights=permuted_weights, + activation=activation, + experts_min=min_expert, + experts_max=max_expert) if i == 0: final_hidden_states = slice_final_hidden_states else: @@ -590,7 +588,6 @@ def forward(self, hidden_states, expert_routing_table, router_weights, permuted_ return final_hidden_states - class DynamicFusedMOE(torch.nn.Module): def __init__(self, num_total_experts): diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 0f13896866..0439ff70da 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -152,40 +152,14 @@ def forward( sinks=None, ): if window_size: - return self._hpu_kernel_fsdpa.apply( - query, - key, - value, - attn_mask, - dropout_p, - is_causal, - scale, - softmax_mode, - recompute_mode, - valid_sequence_lengths, - padding_side, - False, - False, - window_size, - sinks) + return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode, + recompute_mode, valid_sequence_lengths, padding_side, False, False, + window_size, sinks) else: - return self._hpu_kernel_fsdpa.apply( - query, - key, - value, - attn_mask, - dropout_p, - is_causal, - scale, - softmax_mode, - recompute_mode, - valid_sequence_lengths, - padding_side, - False, - False, - (-1,-1), - sinks - ) + return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode, + recompute_mode, valid_sequence_lengths, padding_side, False, False, + (-1, -1), sinks) + def pad_list(input, target_len, val_generator): padding = target_len - len(input) diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py index 70749245d4..803fab3c41 100644 --- a/vllm_gaudi/ops/hpu_fused_moe.py +++ b/vllm_gaudi/ops/hpu_fused_moe.py @@ -16,6 +16,7 @@ def __init__(self, *args, **kwargs): torch.hpu.synchronize() vllm_config = get_current_vllm_config() self.model_type = vllm_config.model_config.hf_config.model_type + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) # custom handling for HPU @@ -74,12 +75,8 @@ def forward_oot( else: import torch.nn.functional as F if self.model_type in ["gpt_oss"]: - topk_weights, topk_ids = torch.topk(router_logits, - top_k, - dim=-1) - topk_weights = F.softmax(topk_weights, - dim=-1, - dtype=torch.float32) + topk_weights, topk_ids = torch.topk(router_logits, top_k, dim=-1) + topk_weights = F.softmax(topk_weights, dim=-1, dtype=torch.float32) else: topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32) topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) From f3e25530f2acf2ba2e6419f6aa81fb087c0d66b4 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 28 Oct 2025 06:41:06 +0200 Subject: [PATCH 3/9] Update Test Case Signed-off-by: Himangshu Lahkar --- tests/unit_tests/sinks/test_gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py index cf26d42e05..54d7699563 100644 --- a/tests/unit_tests/sinks/test_gpt_oss.py +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -45,7 +45,7 @@ def _test_gpt_oss(): if RUN_20B_MODEL: llm = LLM( MODEL_PATH, - max_num_seqs=8 if not PT_PROFILE else max_num_seqs, + max_num_seqs=8, dtype='bfloat16', enforce_eager=True, max_model_len=512, From 1d35ae920b3d017473a97e3ad368a8b3797bfa37 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 28 Oct 2025 06:44:11 +0200 Subject: [PATCH 4/9] Remove unused variable from test Signed-off-by: Himangshu Lahkar --- tests/unit_tests/sinks/test_gpt_oss.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py index 54d7699563..f0b0acf817 100644 --- a/tests/unit_tests/sinks/test_gpt_oss.py +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -18,21 +18,19 @@ def do_sample(llm: LLM, original_output: str, rtol: float, atol: float, max_num_ sampling_params = vllm.SamplingParams( temperature=0, max_tokens=20, - logprobs=1 if not PT_PROFILE else None, ) outputs = llm.generate(prompts, sampling_params) - if not PT_PROFILE: - # Print the outputs. - generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + # Print the outputs. + generated_texts: list[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert prompts[0] + generated_texts[0] == original_output, "Generated text does not match the expected output." - return generated_texts + assert prompts[0] + generated_texts[0] == original_output, "Generated text does not match the expected output." + return generated_texts expected_output = [ From a350ae9d39d38a375d5acbcb2cf43efba64fbb95 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 28 Oct 2025 07:01:24 +0200 Subject: [PATCH 5/9] Update model_runner Signed-off-by: Himangshu Lahkar --- vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 312a97f463..38e8fc3a71 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -47,7 +47,7 @@ from vllm.sampling_params import SamplingType from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.utils import LayerBlockType -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import is_pin_memory_available from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.import_utils import LazyLoader From 192841668ba31044fe40a693dae08ff140299120 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 28 Oct 2025 07:19:08 +0200 Subject: [PATCH 6/9] Set FUSED_SDPA to 0 for test Signed-off-by: Himangshu Lahkar --- tests/unit_tests/sinks/test_gpt_oss.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py index f0b0acf817..306908c81e 100644 --- a/tests/unit_tests/sinks/test_gpt_oss.py +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -1,4 +1,5 @@ import vllm +import os from vllm.entrypoints.llm import LLM RUN_20B_MODEL = True # Set to False to run the 120B model instead @@ -66,4 +67,6 @@ def _test_gpt_oss(): def test_gpt_oss_1x(): + os.environ['VLLM_PROMPT_USE_FUSEDSDPA'] = '0' _test_gpt_oss() + os.environ['VLLM_PROMPT_USE_FUSEDSDPA'] = '1' From 2a2968ad02a6a8b352db9a99f945375e91da35a0 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 10 Nov 2025 08:18:09 +0200 Subject: [PATCH 7/9] Set window_size for fsdpa based on sliding_window Signed-off-by: Himangshu Lahkar --- tests/unit_tests/sinks/test_gpt_oss.py | 16 +++++++++++++--- vllm_gaudi/attention/backends/hpu_attn.py | 11 +++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py index 306908c81e..5230831a6f 100644 --- a/tests/unit_tests/sinks/test_gpt_oss.py +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -6,7 +6,7 @@ MODEL_PATH = "lmsys/gpt-oss-20b-BF16" MODEL_PATH_120 = "lmsys/gpt-oss-120b-BF16" # reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L397 -original_output = "Roses are red, violets are blue, I love you, and I love you too.\n\nRoses are red, vio" +original_output = "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio" # reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L462 original_output_120 = "Roses are red, violets are blue,\nI am a language model, not a human being" @@ -67,6 +67,16 @@ def _test_gpt_oss(): def test_gpt_oss_1x(): - os.environ['VLLM_PROMPT_USE_FUSEDSDPA'] = '0' + os.environ['PT_HPU_ENABLE_FUSED_SDPA_SINK'] = '1' + os.environ['PT_HPU_QKV_SLICE_SEQ_LEN_THLD'] = '64' + os.environ['PT_HPU_SDPA_BR_FACTOR'] = '64' + os.environ['PT_HPU_SDPA_BC_FACTOR'] = '64' + os.environ['PT_HPU_SDPA_QKV_SLICE_MODE_FWD'] = '1' + os.environ['VLLM_FUSEDSDPA_SLIDE_THLD'] = '128' _test_gpt_oss() - os.environ['VLLM_PROMPT_USE_FUSEDSDPA'] = '1' + os.environ['PT_HPU_ENABLE_FUSED_SDPA_SINK'] = '0' + os.environ['PT_HPU_QKV_SLICE_SEQ_LEN_THLD'] = '1024' + os.environ['PT_HPU_SDPA_BR_FACTOR'] = '1024' + os.environ['PT_HPU_SDPA_BC_FACTOR'] = '1024' + os.environ['PT_HPU_SDPA_QKV_SLICE_MODE_FWD'] = '0' + os.environ['VLLM_FUSEDSDPA_SLIDE_THLD'] = '8192' diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index 481d67ff45..cef052178e 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -589,15 +589,10 @@ def forward( common_args = self.common_attention_args(block_list, key_cache, value_cache, attn_metadata.block_size) if self.sliding_window and hasattr(attn_metadata, - 'window_attn_bias') and attn_metadata.window_attn_bias is not None \ - and self.prefill_impl == 'naive_impl': + 'window_attn_bias') and attn_metadata.window_attn_bias is not None: attn_bias = attn_metadata.window_attn_bias - if self.sliding_window: - # TODO - change 128 to proper window size - window_size = ( - 128, - 0, - ) + elif self.sliding_window: + window_size = (self.sliding_window, 0) common_args["window_size"] = window_size out = ops.prompt_attention(impl=self.prefill_impl, From 051a0c0e5f76961c171d95ad58fa836084f83223 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Thu, 20 Nov 2025 06:47:38 +0200 Subject: [PATCH 8/9] Update block calculation for decode Signed-off-by: Himangshu Lahkar --- test_gpt_oss_offline.py | 134 +++++++++++++++++++++++ tests/unit_tests/sinks/test_gpt_oss.py | 2 +- vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +- 3 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 test_gpt_oss_offline.py diff --git a/test_gpt_oss_offline.py b/test_gpt_oss_offline.py new file mode 100644 index 0000000000..20a239f63d --- /dev/null +++ b/test_gpt_oss_offline.py @@ -0,0 +1,134 @@ +import os +import sys +import vllm +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.entrypoints.llm import LLM +import numpy as np + +RUN_20B_MODEL = True # Set to False to run the 120B model instead +MODEL_PATH = "lmsys/gpt-oss-20b-BF16" +MODEL_PATH_120 = "lmsys/gpt-oss-120b-BF16" +# reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L397 +original_output = "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio" +# reference https://github.com/huggingface/transformers/blob/68eb1a9a6353911f491b1c8139eb73d052a8e9b9/tests/models/gpt_oss/test_modeling_gpt_oss.py#L462 +original_output_120 = "Roses are red, violets are blue,\nI am a language model, not a human being" +original_logprobs = [ + -0.037353515625, + -0.08154296875, + -1.21875, + -1.953125, + -2.234375, + -0.96875, + -1.546875, + -1.640625, + -0.93359375, + -1.609375, + -1.625, + -0.85546875, + -1.7265625, + ] +original_logprobs_120 = [ + -0.90234375, + -0.66015625, + -1.546875, + -2.703125, + -2.078125, + -1.21875, + -2.484375, + -0.031982421875, + -0.84765625, + -1.890625, + -0.1923828125, + -2.046875, + -1.65625, + ] + + +def do_sample(llm: LLM, original_output: str, original_logprobs: list[float], rtol: float, atol: float, max_num_seqs:int) -> list[str]: + prompts = [ + "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", + ] * max_num_seqs + + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=512, + logprobs=1 if not PT_PROFILE else None,) + outputs = llm.generate( + prompts, + sampling_params) + + if not PT_PROFILE: + # Print the outputs. + generated_texts: list[str] = [] + logprobs: list[float] = [] + for output in outputs: + for probs in output.outputs[0].logprobs: + logprobs.append(list(probs.values())[0].logprob) + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # assert prompts[0]+generated_texts[0] == original_output, "Generated text does not match the expected output." + # assert np.allclose(np.array(logprobs[:-1]),np.array(original_logprobs),rtol=rtol, atol=atol), "Logprobs do not match the expected values." + return generated_texts + else: + generated_texts: list[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +if __name__ == "__main__": + DEFAULT_MAX_NUM_SEQS = 1 + max_num_seqs = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_MAX_NUM_SEQS + # Enable PyTorch profiling when PT_PROFILE env var is set to one of the values (1,true,yes,on) + _pt_profile_env = os.getenv("PT_PROFILE", "0") + PT_PROFILE = _pt_profile_env.lower() in ("1", "true", "yes", "on") + + if RUN_20B_MODEL: + llm = LLM(MODEL_PATH, + max_num_seqs=8 if not PT_PROFILE else max_num_seqs, + dtype='bfloat16', + enforce_eager=True, + max_model_len=512, + max_num_batched_tokens=2048, + tensor_parallel_size=1, + ) + if PT_PROFILE: + import torch + schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) + activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU] + _profiler = torch.profiler.profile( + schedule=schedule, + activities=activities, + on_trace_ready=torch.profiler.tensorboard_trace_handler("./"), + record_shapes=False, + with_stack=False, + ) + _profiler.start() + do_sample(llm, original_output=original_output, + original_logprobs=original_logprobs, rtol=1e-01, atol=1e-01, max_num_seqs=max_num_seqs) + _profiler.step() + do_sample(llm, original_output=original_output, + original_logprobs=original_logprobs, rtol=1e-01, atol=1e-01, max_num_seqs=max_num_seqs) + _profiler.step() + do_sample(llm, original_output=original_output, + original_logprobs=original_logprobs, rtol=1e-01, atol=1e-01, max_num_seqs=max_num_seqs) + _profiler.step() + _profiler.stop() + else: + do_sample(llm, original_output=original_output, + original_logprobs=original_logprobs, rtol=1e-01, atol=1e-01, max_num_seqs=max_num_seqs) + + else: + llm = LLM(MODEL_PATH_120, + max_num_seqs=8, + dtype='bfloat16', + enforce_eager=False, + max_model_len=512, + max_num_batched_tokens=2048, + tensor_parallel_size=4, + ) + do_sample(llm, original_output=original_output_120, + original_logprobs=original_logprobs_120, rtol=1e-01, atol=3e-01, max_num_seqs=max_num_seqs) diff --git a/tests/unit_tests/sinks/test_gpt_oss.py b/tests/unit_tests/sinks/test_gpt_oss.py index 5230831a6f..b90ff79553 100644 --- a/tests/unit_tests/sinks/test_gpt_oss.py +++ b/tests/unit_tests/sinks/test_gpt_oss.py @@ -72,7 +72,7 @@ def test_gpt_oss_1x(): os.environ['PT_HPU_SDPA_BR_FACTOR'] = '64' os.environ['PT_HPU_SDPA_BC_FACTOR'] = '64' os.environ['PT_HPU_SDPA_QKV_SLICE_MODE_FWD'] = '1' - os.environ['VLLM_FUSEDSDPA_SLIDE_THLD'] = '128' + os.environ['VLLM_FUSEDSDPA_SLIDE_THLD'] = '0' _test_gpt_oss() os.environ['PT_HPU_ENABLE_FUSED_SDPA_SINK'] = '0' os.environ['PT_HPU_QKV_SLICE_SEQ_LEN_THLD'] = '1024' diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 70203c91bd..417dbe7a7b 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -2059,7 +2059,7 @@ def _create_decode_input_data(self, ) if self.interleaved_sliding_window and self.sliding_window > 0: - sliding_block_size = (self.sliding_window // self.block_size) + sliding_block_size = (self.sliding_window // self.block_size) + 1 window_block_tables = [block_table[-sliding_block_size:] for block_table in block_tables_list] window_block_list, window_block_groups, window_block_usage = \ self.get_habana_paged_attn_buffers( From d4eee4de8ffdf178c5dffa901ce5f768b739a608 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Thu, 20 Nov 2025 06:50:28 +0200 Subject: [PATCH 9/9] Update pipelined_pa signature Signed-off-by: Himangshu Lahkar --- vllm_gaudi/extension/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index 2f843845d9..1a61167c1c 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -61,8 +61,8 @@ def matmul_shape(lhs, rhs): return result -def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, batch_size, matmul_av_op, batch2block_matmul_op, - block2batch_matmul_op): +def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, sink, block_size, batch_size, matmul_av_op, + batch2block_matmul_op, block2batch_matmul_op): # When fp32_softmax is enabled attn is left in fp32 after Q@K # We can return to native dtype after we renormalize and calculate the adjustments if block_bias is not None and attn.dtype != block_bias.dtype: