Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
58cde5c
[ROCm][CI] Skip trtllm kvfp8 dequant tests on ROCm (#37330)
AndreasKaratzas Mar 18, 2026
f174000
[Perf] Enable dual stream execution of input projection for Qwen3 (#3…
xyang16 Mar 18, 2026
a0dd199
[Hardware][TPU] Add supports_async_scheduling() method to Executor in…
gxd3 Mar 18, 2026
8b63257
[ROCm][CI] Add ROCM_EXTRA_ARGS to audio_in_video test server fixture …
AndreasKaratzas Mar 18, 2026
ce2ef42
[CI] Stabilize test_cpu_offloading by waiting for async offload befor…
AndreasKaratzas Mar 18, 2026
0e95916
[responsesAPI] parser.extract_response_outputs can take in token IDs …
qandrew Mar 18, 2026
86b7e3c
[XPU] skip unsupported ut and update test_nixl_connector (#37179)
zhenwei-intel Mar 18, 2026
fcf0687
[kv_offload+HMA][0/N]: Support block-level preemption handling (#34805)
orozery Mar 18, 2026
2618012
[Bugfix] Avoid OpenMP thread reallocation in CPU torch compile (#37391)
bigPYJ1151 Mar 18, 2026
8c31f47
[LoRA] Make LoRA respect `language_model_only` (#37375)
jeejeelee Mar 18, 2026
fad09e8
fix(glm47): improve tool call parsing and content normalization (#37386)
karanb192 Mar 18, 2026
47a1f11
[docs] Add docs for new RL flows (#36188)
hao-aaron Mar 18, 2026
eaf7c9b
[CI] Fix PaddleOCR-VL HF test failure due to create_causal_mask API r…
AndreasKaratzas Mar 18, 2026
b322b19
[Build] Bump python openai version (#32316)
chaunceyjiang Mar 18, 2026
17c47fb
[Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy (#37322)
elvircrn Mar 18, 2026
cef1f30
[Model] Enable LoRA support for tower and connector in H2OVL (#31696)
shwetha-s-poojary Mar 18, 2026
98b09dd
[NIXL][Bugfix] metrics & testing minor bug (#36051)
andylolu2 Mar 18, 2026
918b789
[Bugfix] Fix base64 JPEG video frames returning empty metadata (#37301)
he-yufeng Mar 18, 2026
525f2ee
[kv_offload+HMA][6/N]: Split offloading_connector.py (#37405)
orozery Mar 18, 2026
99267c2
[2/3] Refactor InternVL-based processors (#37324)
DarkLight1337 Mar 18, 2026
de1a86b
elastic_ep: Fix stateless group port races (#36330)
itayalroy Mar 18, 2026
c373b5c
[Log] Reduce duplicate log (#37313)
yewentao256 Mar 18, 2026
296839a
[Perf] Eliminate padding and slicing op for GPT-OSS with Flashinfer M…
elvischenv Mar 18, 2026
1780839
standardize load_weights using AutoWeightsLoader for kimi_linear and …
XLiu-2000 Mar 18, 2026
b1169d7
[Kernel] Add gpt-oss Router GEMM kernel (#37205)
xyang16 Mar 18, 2026
c9d838f
Adding deterministic lora benchmarking to vLLM Bench (#36057)
RonaldBXu Mar 18, 2026
39bfb57
Add API docs link if the CLI arg is a config class (#37432)
hmellor Mar 18, 2026
5dd8df0
[kv_offload+HMA][2/N]: Support multiple KV groups in GPULoadStoreSpec…
orozery Mar 18, 2026
0ef7f79
[Perf] Add tuned triton moe config for Qwen3.5 H200, 9.9% E2E through…
yewentao256 Mar 18, 2026
f3732bd
[Misc] Clean up model registry (#37457)
DarkLight1337 Mar 18, 2026
7476d14
[Model] Remove unnecessary processor definition for Nemotron Parse (#…
DarkLight1337 Mar 18, 2026
70b81c4
[bugfix][async scheduling] fix extra cuda context in device 0 with EP…
youkaichao Mar 18, 2026
738d0a2
[Bugfix] Fix incorrect use of merge_size in Qwen3-VL video timestamp …
cnyvfang Mar 18, 2026
5ce2d10
Fix models which use `layer_type_validation` for Transformers v5 (#37…
hmellor Mar 18, 2026
6e49140
[SSM/Mamba] N-1 prefill for P/D disaggregation
ZhanqiuHu Mar 17, 2026
172e7e6
Refactor: cleaner call-site guards and docstrings
ZhanqiuHu Mar 17, 2026
f1c4cb6
Fix ruff SIM102: collapse nested if statements
ZhanqiuHu Mar 17, 2026
2130daf
Use _has_mamba instead of _is_hma_required for N-1 logic
ZhanqiuHu Mar 17, 2026
3a18e8e
Rename _hma_ helpers to _mamba_ for clarity
ZhanqiuHu Mar 17, 2026
f29f1b6
Add comment explaining _p_side_truncated preemption guard
ZhanqiuHu Mar 17, 2026
458ca60
add test cases
ZhanqiuHu Mar 18, 2026
17f996f
handle prompt embeddings
ZhanqiuHu Mar 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,16 @@ docker run \
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
pytest -v -s v1/test_serial_utils.py
'
10 changes: 5 additions & 5 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1573,7 +1573,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
Expand Down Expand Up @@ -1615,7 +1615,7 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
# NEW rlhf examples
- pushd ../examples/offline_inference/new_weight_syncing
- pushd ../examples/rl
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
Expand Down Expand Up @@ -2660,7 +2660,7 @@ steps:
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
Expand Down Expand Up @@ -3325,7 +3325,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
Expand Down Expand Up @@ -3367,7 +3367,7 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
# NEW rlhf examples
- pushd ../examples/offline_inference/new_weight_syncing
- pushd ../examples/rl
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
Expand Down
29 changes: 12 additions & 17 deletions .buildkite/test_areas/distributed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,41 +82,36 @@ steps:

- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
working_dir: "/vllm-workspace"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
- tests/examples/offline_inference/data_parallel.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- python3 examples/offline_inference/data_parallel.py --enforce-eager
# rlhf examples
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py

- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
Expand Down
3 changes: 1 addition & 2 deletions .buildkite/test_areas/expert_parallelism.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ steps:

- label: Elastic EP Scaling Test
timeout_in_minutes: 20
device: b200
optional: true
device: h100
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu"
"csrc/moe/gpt_oss_router_gemm.cu"
"csrc/moe/router_gemm.cu")
endif()

Expand Down
42 changes: 33 additions & 9 deletions benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,17 +750,20 @@ def get_weight_block_size_safety(config, default_value=None):


def get_model_params(config):
if config.architectures[0] == "DbrxForCausalLM":
architectures = getattr(config, "architectures", None) or [type(config).__name__]
architecture = architectures[0]

if architecture == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
elif architecture == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
elif architecture in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
Expand All @@ -774,7 +777,7 @@ def get_model_params(config):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
elif architecture in (
"Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM",
"Qwen3NextForCausalLM",
Expand All @@ -783,23 +786,27 @@ def get_model_params(config):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
elif architecture in (
"Qwen3VLMoeForConditionalGeneration",
"Qwen3_5MoeForConditionalGeneration",
"Qwen3_5MoeTextConfig",
):
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
elif architecture == "HunYuanMoEV1ForCausalLM":
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
elif architecture == "Qwen3OmniMoeForConditionalGeneration":
E = config.thinker_config.text_config.num_experts
topk = config.thinker_config.text_config.num_experts_per_tok
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
hidden_size = config.thinker_config.text_config.hidden_size
elif config.architectures[0] == "PixtralForConditionalGeneration":
elif architecture == "PixtralForConditionalGeneration":
# Pixtral can contain different LLM architectures,
# recurse to get their parameters
return get_model_params(config.get_text_config())
Expand All @@ -814,6 +821,23 @@ def get_model_params(config):
return E, topk, intermediate_size, hidden_size


def resolve_dtype(config) -> torch.dtype:
    """Choose the activation dtype for the MoE benchmark.

    Resolution order:
      1. float16 on ROCm (the tuned configs there target fp16),
      2. an explicit ``dtype`` on the top-level HF config,
      3. an explicit ``dtype`` on the nested text config (multimodal models),
      4. bfloat16 as the final fallback.
    """
    # ROCm path always benchmarks in fp16.
    if current_platform.is_rocm():
        return torch.float16

    # Prefer a dtype declared directly on the config.
    direct = getattr(config, "dtype", None)
    if direct is not None:
        return direct

    # Multimodal configs may carry the dtype on their text sub-config.
    if hasattr(config, "get_text_config"):
        nested = getattr(config.get_text_config(), "dtype", None)
        if nested is not None:
            return nested

    return torch.bfloat16


def get_quantization_group_size(config) -> int | None:
"""Extract the quantization group size from the HF model config.

Expand Down Expand Up @@ -861,7 +885,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
dtype = resolve_dtype(config)
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"
Expand Down
134 changes: 134 additions & 0 deletions benchmarks/kernels/benchmark_router_gemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
import torch.nn.functional as F

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser

# Dimensions supported by the DSV3 specialized kernel
DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
DSV3_SUPPORTED_HIDDEN_SIZES = [7168]

# Dimensions supported by the gpt-oss specialized kernel
GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]


def get_batch_size_range(max_batch_size):
    """Return all powers of two from 1 up to and including *max_batch_size*.

    The previous implementation iterated ``range(14)`` and therefore silently
    capped the largest batch size at 2**13 (8192) even when a larger
    ``--max-batch-size`` was requested; this version honors any positive limit.
    Returns an empty list when ``max_batch_size`` is less than 1.
    """
    sizes = []
    batch = 1
    while batch <= max_batch_size:
        sizes.append(batch)
        batch *= 2
    return sizes


def get_model_params(config):
    """Read ``(num_experts, hidden_size)`` out of a HF model config.

    Only DeepSeek-V2/V3/V3.2 and gpt-oss style configs are recognized;
    anything else raises ``ValueError``.
    """
    deepseek_archs = (
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
    )
    arch = config.architectures[0]

    if arch in deepseek_archs:
        # DeepSeek configs name the routed-expert count ``n_routed_experts``.
        return config.n_routed_experts, config.hidden_size
    if arch in ("GptOssForCausalLM",):
        # gpt-oss uses the ``num_local_experts`` field instead.
        return config.num_local_experts, config.hidden_size

    raise ValueError(f"Unsupported architecture: {config.architectures}")


def get_benchmark(model, max_batch_size, trust_remote_code):
    """Build a triton perf_report benchmark comparing router-GEMM providers.

    Sweeps power-of-two batch sizes up to *max_batch_size* and, for each,
    times a PyTorch ``F.linear`` baseline against the specialized vLLM
    router-GEMM custom ops, reporting TFLOPs. The returned object is a
    triton benchmark; call ``.run(print_data=True)`` to execute it.
    """
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["batch_size"],
            x_vals=get_batch_size_range(max_batch_size),
            x_log=False,
            line_arg="provider",
            line_vals=[
                "torch",
                "vllm",
            ],
            line_names=["PyTorch", "vLLM"],
            styles=([("blue", "-"), ("red", "-")]),
            ylabel="TFLOPs",
            plot_name=f"{model} router gemm throughput",
            args={},
        )
    )
    def benchmark(batch_size, provider):
        # Re-resolve model dims per invocation; get_config is expected to be
        # cheap/cached — NOTE(review): confirm it caches, else hoist this.
        config = get_config(model=model, trust_remote_code=trust_remote_code)
        num_experts, hidden_size = get_model_params(config)

        # Router GEMM operands: activations (B, H), router weight (E, H),
        # optional per-expert bias (E,). bf16 matches the kernels' dtype.
        mat_a = torch.randn(
            (batch_size, hidden_size), dtype=torch.bfloat16, device="cuda"
        ).contiguous()
        mat_b = torch.randn(
            (num_experts, hidden_size), dtype=torch.bfloat16, device="cuda"
        ).contiguous()
        bias = torch.randn(
            num_experts, dtype=torch.bfloat16, device="cuda"
        ).contiguous()

        # The specialized kernels only exist for Hopper (SM90) and the
        # Blackwell family (SM100), and only for specific (E, H) shapes.
        is_hopper_or_blackwell = current_platform.is_device_capability(
            90
        ) or current_platform.is_device_capability_family(100)
        allow_dsv3_router_gemm = (
            is_hopper_or_blackwell
            and num_experts in DSV3_SUPPORTED_NUM_EXPERTS
            and hidden_size in DSV3_SUPPORTED_HIDDEN_SIZES
        )
        allow_gpt_oss_router_gemm = (
            is_hopper_or_blackwell
            and num_experts in GPT_OSS_SUPPORTED_NUM_EXPERTS
            and hidden_size in GPT_OSS_SUPPORTED_HIDDEN_SIZES
        )

        # Only the gpt-oss kernel applies a bias, so the torch baseline
        # mirrors that to keep the FLOP comparison fair.
        has_bias = False
        if allow_gpt_oss_router_gemm:
            has_bias = True

        # Median plus 20th/80th percentiles for the reported spread.
        quantiles = [0.5, 0.2, 0.8]

        if provider == "torch":

            def runner():
                if has_bias:
                    F.linear(mat_a, mat_b, bias)
                else:
                    F.linear(mat_a, mat_b)
        elif provider == "vllm":

            def runner():
                if allow_dsv3_router_gemm:
                    ops.dsv3_router_gemm(mat_a, mat_b, torch.bfloat16)
                elif allow_gpt_oss_router_gemm:
                    ops.gpt_oss_router_gemm(mat_a, mat_b, bias)
                else:
                    raise ValueError("Unsupported router gemm")

        # CUDA-graph timing amortizes launch overhead for these tiny GEMMs.
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            runner, quantiles=quantiles
        )

        def tflops(t_ms):
            # 2 * B * H * E multiply-adds for the (B,H) x (H,E) GEMM.
            flops = 2 * batch_size * hidden_size * num_experts
            return flops / (t_ms * 1e-3) / 1e12

        # min time -> max throughput, hence the (ms, max_ms, min_ms) order.
        return tflops(ms), tflops(max_ms), tflops(min_ms)

    return benchmark


if __name__ == "__main__":
    # CLI entry point: pick a model, an upper batch-size bound, and whether
    # remote config code may be executed when fetching the HF config.
    parser = FlexibleArgumentParser()
    parser.add_argument("--model", type=str, default="openai/gpt-oss-20b")
    parser.add_argument("--max-batch-size", default=16, type=int)
    parser.add_argument("--trust-remote-code", action="store_true")
    args = parser.parse_args()

    # Build the triton perf_report benchmark for the requested model.
    benchmark = get_benchmark(args.model, args.max_batch_size, args.trust_remote_code)
    # Run the sweep and print the TFLOPs table to stdout.
    benchmark.run(print_data=True)
Loading
Loading