Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,7 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
Expand Down
11 changes: 11 additions & 0 deletions tests/compile/distributed/test_fusions_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,17 @@ class ModelBackendTestCase(NamedTuple):
async_tp=96, # MLP is MoE, half the fusions of dense
),
),
ModelBackendTestCase(
model_name="openai/gpt-oss-20b",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
matches=Matches(
attention_fusion=0,
allreduce_fusion=49,
sequence_parallel=49,
async_tp=48,
),
),
Comment on lines +114 to +124
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ProExpertProg @mgoin Added the gpt-oss-20b e2e fusion test.
I also ran the new test on main, where it fails as expected:

>   assert int(log_matches[0]) == matches.allreduce_fusion
      ^^^^^^^^^^^^^^^^^
E   AssertionError: assert 25 == 49
E    +  where 25 = int('25')
E    +  and   49 = Matches(attention_fusion=0, allreduce_fusion=49, sequence_parallel=49, async_tp=48).allreduce_fusion

]

elif current_platform.is_rocm():
Expand Down
2 changes: 1 addition & 1 deletion vllm/distributed/device_communicators/symm_mem.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def all_reduce(
return None
if out is None:
out = torch.empty_like(inp)
self.buffer[: inp.numel()].copy_(inp.view(-1))
self.buffer[: inp.numel()].copy_(inp.reshape(-1))

# Determine which algorithm to use
use_multimem = False
Expand Down
17 changes: 11 additions & 6 deletions vllm/model_executor/layers/fused_moe/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,6 +1690,10 @@ def forward_native(
)

def reduce_output(states: torch.Tensor) -> torch.Tensor:
# Slice before all_reduce to enable possible fusion
if self.hidden_size != og_hidden_states:
states = states[..., :og_hidden_states]

if (
not self.is_sequence_parallel
and not self.use_dp_chunking
Expand All @@ -1712,11 +1716,12 @@ def reduce_output(states: torch.Tensor) -> torch.Tensor:
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(fused_output, tuple)
fused_output, zero_expert_result = fused_output
return (reduce_output(fused_output) + zero_expert_result)[
..., :og_hidden_states
]
return (
reduce_output(fused_output)
+ zero_expert_result[..., :og_hidden_states]
)
else:
return reduce_output(fused_output)[..., :og_hidden_states]
return reduce_output(fused_output)
else:
if current_platform.is_tpu():
# TODO: Once the OOM issue for the TPU backend is resolved, we
Expand All @@ -1729,8 +1734,8 @@ def reduce_output(states: torch.Tensor) -> torch.Tensor:
hidden_states, router_logits, self.layer_name
)
return (
reduce_output(shared_output)[..., :og_hidden_states],
reduce_output(fused_output)[..., :og_hidden_states],
reduce_output(shared_output),
reduce_output(fused_output),
)

def forward_cuda(
Expand Down