Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
0a872d1
adding mxfp4 quant key
zyongye Feb 13, 2026
20ff584
runnable but not correct
zyongye Feb 13, 2026
d9b14fe
remove unused variable
zyongye Feb 13, 2026
b26944d
bug fix
zyongye Feb 13, 2026
4fd583d
convert bf16
zyongye Feb 13, 2026
0fcedd9
revert back scalar dtype
zyongye Feb 13, 2026
614aa94
fix trtllm moe
zyongye Feb 14, 2026
c56b55f
add tune size to flashinfer experts
zyongye Feb 15, 2026
4124fcc
move kernel setup to process_weight
zyongye Feb 15, 2026
6451020
only cast when act is fp8
zyongye Feb 15, 2026
5781493
add topk_ids contiguous assertion
zyongye Feb 15, 2026
c9704d2
add testing infrastructure
zyongye Feb 16, 2026
b4bb93f
fix pre-commit
zyongye Feb 16, 2026
19a2bec
change parameter inside the kernels
zyongye Feb 17, 2026
977387b
change ci to h100
zyongye Feb 17, 2026
49534f6
add back quant function parameters
zyongye Feb 17, 2026
3fa87c9
add back dep interface
zyongye Feb 17, 2026
413fcb1
add back dep interface
zyongye Feb 17, 2026
6ed9a44
fixing trtllm moe and pre commit
zyongye Feb 17, 2026
3658e86
assert not using dep
zyongye Feb 17, 2026
88d3be1
bring back dep
zyongye Feb 17, 2026
e348fda
pre-commit
zyongye Feb 17, 2026
fb6a6b2
update ci tests
zyongye Feb 18, 2026
eda2c4b
update device to use in moe config
zyongye Feb 18, 2026
a7a41d5
move fake scale into init
zyongye Feb 18, 2026
61d521a
add dtype into scales
zyongye Feb 18, 2026
93f4f56
unifing moe_mk interface
zyongye Feb 19, 2026
004912a
staging
zyongye Feb 19, 2026
152b8ae
backend selection done
zyongye Feb 20, 2026
781bc71
make kernels
zyongye Feb 20, 2026
313ed30
skleton ready
zyongye Feb 20, 2026
6094b0a
finish everything except marlin
zyongye Feb 20, 2026
6abf521
update compressed_tensors
zyongye Feb 20, 2026
318e0fa
fix quark
zyongye Feb 20, 2026
14f89ef
update config
zyongye Feb 20, 2026
fe3dee7
trtllm working
zyongye Feb 20, 2026
ade83af
pre-commit
zyongye Feb 22, 2026
35dc29e
fixing trtllm experts
zyongye Feb 23, 2026
7625260
add oai silu to support activation
zyongye Feb 23, 2026
3339ecc
update selection and triton experts attribute
zyongye Feb 23, 2026
c9c430c
triton backend working
zyongye Feb 23, 2026
6bbcf2a
change type annotation
zyongye Feb 23, 2026
0b7a2ac
fixing type
zyongye Feb 23, 2026
17120aa
fixing typo
zyongye Feb 23, 2026
045335c
fixing more typos
zyongye Feb 23, 2026
98cd346
Merge branch 'main' into mxfp4_oracle
zyongye Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
kFp8Static128BlockSym,
kFp8StaticChannelSym,
kFp8StaticTensorSym,
kMxfp4Static,
kNvfp4Static,
)
from vllm.platforms import current_platform
Expand Down Expand Up @@ -581,6 +582,7 @@ def _supports_quant_scheme(
kFp8StaticChannelSym,
kFp8StaticTensorSym,
kNvfp4Static,
kMxfp4Static,
]
return weight_key in SUPPORTED_W

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
kMxfp4Static,
)
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
Expand Down Expand Up @@ -386,41 +387,32 @@ def make_routing_data(
class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
@staticmethod
def _supports_current_device() -> bool:
raise NotImplementedError(
"OAITritonExperts is not yet used by an Oracle. "
"This method should not be called."
p = current_platform
return p.is_cuda_alike() and (
p.is_device_capability(90) or p.is_device_capability_family(100)
)

@staticmethod
def _supports_no_act_and_mul() -> bool:
raise NotImplementedError(
"OAITritonExperts is not yet used by an Oracle. "
"This method should not be called."
)
return False

@staticmethod
def _supports_quant_scheme(
weight_key: QuantKey | None,
activation_key: QuantKey | None,
) -> bool:
raise NotImplementedError(
"OAITritonExperts is not yet used by an Oracle. "
"This method should not be called."
)
SUPPORTED_W_A = [
(kMxfp4Static, None),
]
return (weight_key, activation_key) in SUPPORTED_W_A

@staticmethod
def _supports_activation(activation: MoEActivation) -> bool:
raise NotImplementedError(
"OAITritonExperts is not yet used by an Oracle. "
"This method should not be called."
)
raise NotImplementedError

@staticmethod
def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
raise NotImplementedError(
"OAITritonExperts is not yet used by an Oracle. "
"This method should not be called."
)
return True

def supports_expert_map(self) -> bool:
    """Report expert-map support; this implementation always returns True."""
    return True
Expand Down Expand Up @@ -477,6 +469,10 @@ def _make_routing_data(
class OAITritonExperts(BaseOAITritonExperts):
"""OAI Triton-based fused MoE expert implementation."""

@staticmethod
def _supports_activation(activation: MoEActivation) -> bool:
    """Accept only the SWIGLUOAI activation.

    Returns True when ``activation`` compares equal to
    ``MoEActivation.SWIGLUOAI`` and False for every other value.
    """
    only_supported = MoEActivation.SWIGLUOAI
    return activation == only_supported

@staticmethod
def activation_format() -> mk.FusedMoEActivationFormat:
    """Return the ``Standard`` member of ``mk.FusedMoEActivationFormat``."""
    standard_format = mk.FusedMoEActivationFormat.Standard
    return standard_format
Expand Down Expand Up @@ -561,6 +557,15 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
One use case for it is to inject LoRA modules on the activation and moe_sum.
"""

@staticmethod
def _supports_activation(activation: MoEActivation) -> bool:
    """Tell whether ``activation`` is one of the accepted activations.

    The accepted set is SILU, GELU, SWIGLUOAI and SWIGLUSTEP; any other
    value yields False.
    """
    supported_activations = (
        MoEActivation.SILU,
        MoEActivation.GELU,
        MoEActivation.SWIGLUOAI,
        MoEActivation.SWIGLUSTEP,
    )
    return activation in supported_activations

@staticmethod
def activation_format() -> mk.FusedMoEActivationFormat:
    """Return the ``Standard`` member of ``mk.FusedMoEActivationFormat``."""
    chosen_format = mk.FusedMoEActivationFormat.Standard
    return chosen_format
Expand Down
23 changes: 0 additions & 23 deletions vllm/model_executor/layers/fused_moe/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
QuantizationConfig,
)
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up

logger = init_logger(__name__)

Expand Down Expand Up @@ -245,28 +244,6 @@ def maybe_roundup_hidden_size(
hidden_size, act_dtype, moe_parallel_config
)

# we are padding globally so EP buffer allocation works
if model_type == "gpt_oss" and is_mxfp4_quant:
from vllm.model_executor.layers.quantization.mxfp4 import (
Mxfp4Backend,
get_mxfp4_backend,
)

current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled)

if (
current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
):
hidden_size = round_up(hidden_size, 128)
elif (
current_platform.is_rocm()
or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
or current_mxfp4_backend == Mxfp4Backend.MARLIN
):
hidden_size = round_up(hidden_size, 256)

return hidden_size


Expand Down
Loading