Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
58cde5c
[ROCm][CI] Skip trtllm kvfp8 dequant tests on ROCm (#37330)
AndreasKaratzas Mar 18, 2026
f174000
[Perf] Enable dual stream execution of input projection for Qwen3 (#3…
xyang16 Mar 18, 2026
a0dd199
[Hardware][TPU] Add supports_async_scheduling() method to Executor in…
gxd3 Mar 18, 2026
8b63257
[ROCm][CI] Add ROCM_EXTRA_ARGS to audio_in_video test server fixture …
AndreasKaratzas Mar 18, 2026
ce2ef42
[CI] Stabilize test_cpu_offloading by waiting for async offload befor…
AndreasKaratzas Mar 18, 2026
0e95916
[responsesAPI] parser.extract_response_outputs can take in token IDs …
qandrew Mar 18, 2026
86b7e3c
[XPU] skip unsupported ut and update test_nixl_connector (#37179)
zhenwei-intel Mar 18, 2026
fcf0687
[kv_offload+HMA][0/N]: Support block-level preemption handling (#34805)
orozery Mar 18, 2026
2618012
[Bugfix] Avoid OpenMP thread reallocation in CPU torch compile (#37391)
bigPYJ1151 Mar 18, 2026
8c31f47
[LoRA] Make LoRA respect `language_model_only` (#37375)
jeejeelee Mar 18, 2026
fad09e8
fix(glm47): improve tool call parsing and content normalization (#37386)
karanb192 Mar 18, 2026
47a1f11
[docs] Add docs for new RL flows (#36188)
hao-aaron Mar 18, 2026
eaf7c9b
[CI] Fix PaddleOCR-VL HF test failure due to create_causal_mask API r…
AndreasKaratzas Mar 18, 2026
b322b19
[Build] Bump python openai version (#32316)
chaunceyjiang Mar 18, 2026
17c47fb
[Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy (#37322)
elvircrn Mar 18, 2026
cef1f30
[Model] Enable LoRA support for tower and connector in H2OVL (#31696)
shwetha-s-poojary Mar 18, 2026
98b09dd
[NIXL][Bugfix] metrics & testing minor bug (#36051)
andylolu2 Mar 18, 2026
918b789
[Bugfix] Fix base64 JPEG video frames returning empty metadata (#37301)
he-yufeng Mar 18, 2026
525f2ee
[kv_offload+HMA][6/N]: Split offloading_connector.py (#37405)
orozery Mar 18, 2026
99267c2
[2/3] Refactor InternVL-based processors (#37324)
DarkLight1337 Mar 18, 2026
de1a86b
elastic_ep: Fix stateless group port races (#36330)
itayalroy Mar 18, 2026
c373b5c
[Log] Reduce duplicate log (#37313)
yewentao256 Mar 18, 2026
296839a
[Perf] Eliminate padding and slicing op for GPT-OSS with Flashinfer M…
elvischenv Mar 18, 2026
1780839
standardize load_weights using AutoWeightsLoader for kimi_linear and …
XLiu-2000 Mar 18, 2026
b1169d7
[Kernel] Add gpt-oss Router GEMM kernel (#37205)
xyang16 Mar 18, 2026
c9d838f
Adding deterministic lora benchmarking to vLLM Bench (#36057)
RonaldBXu Mar 18, 2026
39bfb57
Add API docs link if the CLI arg is a config class (#37432)
hmellor Mar 18, 2026
5dd8df0
[kv_offload+HMA][2/N]: Support multiple KV groups in GPULoadStoreSpec…
orozery Mar 18, 2026
0ef7f79
[Perf] Add tuned triton moe config for Qwen3.5 H200, 9.9% E2E through…
yewentao256 Mar 18, 2026
f3732bd
[Misc] Clean up model registry (#37457)
DarkLight1337 Mar 18, 2026
7476d14
[Model] Remove unnecessary processor definition for Nemotron Parse (#…
DarkLight1337 Mar 18, 2026
70b81c4
[bugfix][async scheduling] fix extra cuda context in device 0 with EP…
youkaichao Mar 18, 2026
738d0a2
[Bugfix] Fix incorrect use of merge_size in Qwen3-VL video timestamp …
cnyvfang Mar 18, 2026
5ce2d10
Fix models which use `layer_type_validation` for Transformers v5 (#37…
hmellor Mar 18, 2026
6e49140
[SSM/Mamba] N-1 prefill for P/D disaggregation
ZhanqiuHu Mar 17, 2026
172e7e6
Refactor: cleaner call-site guards and docstrings
ZhanqiuHu Mar 17, 2026
f1c4cb6
Fix ruff SIM102: collapse nested if statements
ZhanqiuHu Mar 17, 2026
2130daf
Use _has_mamba instead of _is_hma_required for N-1 logic
ZhanqiuHu Mar 17, 2026
3a18e8e
Rename _hma_ helpers to _mamba_ for clarity
ZhanqiuHu Mar 17, 2026
f29f1b6
Add comment explaining _p_side_truncated preemption guard
ZhanqiuHu Mar 17, 2026
458ca60
add test cases
ZhanqiuHu Mar 18, 2026
17f996f
handle prompt embeddings
ZhanqiuHu Mar 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,16 @@ docker run \
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
pytest -v -s v1/test_serial_utils.py
'
10 changes: 5 additions & 5 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1573,7 +1573,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
Expand Down Expand Up @@ -1615,7 +1615,7 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
# NEW rlhf examples
- pushd ../examples/offline_inference/new_weight_syncing
- pushd ../examples/rl
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
Expand Down Expand Up @@ -2660,7 +2660,7 @@ steps:
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
Expand Down Expand Up @@ -3325,7 +3325,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
Expand Down Expand Up @@ -3367,7 +3367,7 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
# NEW rlhf examples
- pushd ../examples/offline_inference/new_weight_syncing
- pushd ../examples/rl
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
Expand Down
29 changes: 12 additions & 17 deletions .buildkite/test_areas/distributed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,41 +82,36 @@ steps:

- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
working_dir: "/vllm-workspace"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- examples/rl/
- tests/examples/offline_inference/data_parallel.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- python3 examples/offline_inference/data_parallel.py --enforce-eager
# rlhf examples
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py

- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
Expand Down
3 changes: 1 addition & 2 deletions .buildkite/test_areas/expert_parallelism.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ steps:

- label: Elastic EP Scaling Test
timeout_in_minutes: 20
device: b200
optional: true
device: h100
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu"
"csrc/moe/gpt_oss_router_gemm.cu"
"csrc/moe/router_gemm.cu")
endif()

Expand Down
42 changes: 33 additions & 9 deletions benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,17 +750,20 @@ def get_weight_block_size_safety(config, default_value=None):


def get_model_params(config):
if config.architectures[0] == "DbrxForCausalLM":
architectures = getattr(config, "architectures", None) or [type(config).__name__]
architecture = architectures[0]

if architecture == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
elif architecture == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
elif architecture in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
Expand All @@ -774,7 +777,7 @@ def get_model_params(config):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
elif architecture in (
"Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM",
"Qwen3NextForCausalLM",
Expand All @@ -783,23 +786,27 @@ def get_model_params(config):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
elif architecture in (
"Qwen3VLMoeForConditionalGeneration",
"Qwen3_5MoeForConditionalGeneration",
"Qwen3_5MoeTextConfig",
):
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
elif architecture == "HunYuanMoEV1ForCausalLM":
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
elif architecture == "Qwen3OmniMoeForConditionalGeneration":
E = config.thinker_config.text_config.num_experts
topk = config.thinker_config.text_config.num_experts_per_tok
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
hidden_size = config.thinker_config.text_config.hidden_size
elif config.architectures[0] == "PixtralForConditionalGeneration":
elif architecture == "PixtralForConditionalGeneration":
# Pixtral can contain different LLM architectures,
# recurse to get their parameters
return get_model_params(config.get_text_config())
Expand All @@ -814,6 +821,23 @@ def get_model_params(config):
return E, topk, intermediate_size, hidden_size


def resolve_dtype(config) -> torch.dtype:
    """Choose the activation dtype for the MoE benchmark.

    Resolution order:
      1. float16 on ROCm (the tuned configs there target fp16),
      2. an explicit ``dtype`` on the top-level HF config,
      3. an explicit ``dtype`` on the nested text config (multimodal models),
      4. bfloat16 as the final fallback.
    """
    # ROCm path always benchmarks in fp16.
    if current_platform.is_rocm():
        return torch.float16

    # Prefer a dtype declared directly on the config.
    direct = getattr(config, "dtype", None)
    if direct is not None:
        return direct

    # Multimodal configs may carry the dtype on their text sub-config.
    if hasattr(config, "get_text_config"):
        nested = getattr(config.get_text_config(), "dtype", None)
        if nested is not None:
            return nested

    return torch.bfloat16


def get_quantization_group_size(config) -> int | None:
"""Extract the quantization group size from the HF model config.

Expand Down Expand Up @@ -861,7 +885,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
dtype = resolve_dtype(config)
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"
Expand Down
134 changes: 134 additions & 0 deletions benchmarks/kernels/benchmark_router_gemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
import torch.nn.functional as F

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser

# Dimensions supported by the DSV3 specialized kernel
DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
DSV3_SUPPORTED_HIDDEN_SIZES = [7168]

# Dimensions supported by the gpt-oss specialized kernel
GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]


def get_batch_size_range(max_batch_size):
    """Return all powers of two from 1 up to and including *max_batch_size*.

    The previous implementation iterated ``range(14)`` and therefore silently
    capped the largest batch size at 2**13 (8192) even when a larger
    ``--max-batch-size`` was requested; this version honors any positive limit.
    Returns an empty list when ``max_batch_size`` is less than 1.
    """
    sizes = []
    batch = 1
    while batch <= max_batch_size:
        sizes.append(batch)
        batch *= 2
    return sizes


def get_model_params(config):
    """Read ``(num_experts, hidden_size)`` out of a HF model config.

    Only DeepSeek-V2/V3/V3.2 and gpt-oss style configs are recognized;
    anything else raises ``ValueError``.
    """
    deepseek_archs = (
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
    )
    arch = config.architectures[0]

    if arch in deepseek_archs:
        # DeepSeek configs name the routed-expert count ``n_routed_experts``.
        return config.n_routed_experts, config.hidden_size
    if arch in ("GptOssForCausalLM",):
        # gpt-oss uses the ``num_local_experts`` field instead.
        return config.num_local_experts, config.hidden_size

    raise ValueError(f"Unsupported architecture: {config.architectures}")


def get_benchmark(model, max_batch_size, trust_remote_code):
    """Build a triton perf_report benchmark comparing router-GEMM providers.

    Sweeps power-of-two batch sizes up to *max_batch_size* and, for each,
    times a PyTorch ``F.linear`` baseline against the specialized vLLM
    router-GEMM custom ops, reporting TFLOPs. The returned object is a
    triton benchmark; call ``.run(print_data=True)`` to execute it.
    """
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["batch_size"],
            x_vals=get_batch_size_range(max_batch_size),
            x_log=False,
            line_arg="provider",
            line_vals=[
                "torch",
                "vllm",
            ],
            line_names=["PyTorch", "vLLM"],
            styles=([("blue", "-"), ("red", "-")]),
            ylabel="TFLOPs",
            plot_name=f"{model} router gemm throughput",
            args={},
        )
    )
    def benchmark(batch_size, provider):
        # Re-resolve model dims per invocation; get_config is expected to be
        # cheap/cached — NOTE(review): confirm it caches, else hoist this.
        config = get_config(model=model, trust_remote_code=trust_remote_code)
        num_experts, hidden_size = get_model_params(config)

        # Router GEMM operands: activations (B, H), router weight (E, H),
        # optional per-expert bias (E,). bf16 matches the kernels' dtype.
        mat_a = torch.randn(
            (batch_size, hidden_size), dtype=torch.bfloat16, device="cuda"
        ).contiguous()
        mat_b = torch.randn(
            (num_experts, hidden_size), dtype=torch.bfloat16, device="cuda"
        ).contiguous()
        bias = torch.randn(
            num_experts, dtype=torch.bfloat16, device="cuda"
        ).contiguous()

        # The specialized kernels only exist for Hopper (SM90) and the
        # Blackwell family (SM100), and only for specific (E, H) shapes.
        is_hopper_or_blackwell = current_platform.is_device_capability(
            90
        ) or current_platform.is_device_capability_family(100)
        allow_dsv3_router_gemm = (
            is_hopper_or_blackwell
            and num_experts in DSV3_SUPPORTED_NUM_EXPERTS
            and hidden_size in DSV3_SUPPORTED_HIDDEN_SIZES
        )
        allow_gpt_oss_router_gemm = (
            is_hopper_or_blackwell
            and num_experts in GPT_OSS_SUPPORTED_NUM_EXPERTS
            and hidden_size in GPT_OSS_SUPPORTED_HIDDEN_SIZES
        )

        # Only the gpt-oss kernel applies a bias, so the torch baseline
        # mirrors that to keep the FLOP comparison fair.
        has_bias = False
        if allow_gpt_oss_router_gemm:
            has_bias = True

        # Median plus 20th/80th percentiles for the reported spread.
        quantiles = [0.5, 0.2, 0.8]

        if provider == "torch":

            def runner():
                if has_bias:
                    F.linear(mat_a, mat_b, bias)
                else:
                    F.linear(mat_a, mat_b)
        elif provider == "vllm":

            def runner():
                if allow_dsv3_router_gemm:
                    ops.dsv3_router_gemm(mat_a, mat_b, torch.bfloat16)
                elif allow_gpt_oss_router_gemm:
                    ops.gpt_oss_router_gemm(mat_a, mat_b, bias)
                else:
                    raise ValueError("Unsupported router gemm")

        # CUDA-graph timing amortizes launch overhead for these tiny GEMMs.
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            runner, quantiles=quantiles
        )

        def tflops(t_ms):
            # 2 * B * H * E multiply-adds for the (B,H) x (H,E) GEMM.
            flops = 2 * batch_size * hidden_size * num_experts
            return flops / (t_ms * 1e-3) / 1e12

        # min time -> max throughput, hence the (ms, max_ms, min_ms) order.
        return tflops(ms), tflops(max_ms), tflops(min_ms)

    return benchmark


if __name__ == "__main__":
    # CLI entry point: pick a model, an upper batch-size bound, and whether
    # remote config code may be executed when fetching the HF config.
    parser = FlexibleArgumentParser()
    parser.add_argument("--model", type=str, default="openai/gpt-oss-20b")
    parser.add_argument("--max-batch-size", default=16, type=int)
    parser.add_argument("--trust-remote-code", action="store_true")
    args = parser.parse_args()

    # Build the triton perf_report benchmark for the requested model.
    benchmark = get_benchmark(args.model, args.max_batch_size, args.trust_remote_code)
    # Run the sweep and print the TFLOPs table to stdout.
    benchmark.run(print_data=True)
Loading
Loading