@@ -438,8 +438,8 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
            assert loaded_weight.shape[output_dim] == (
                self.org_vocab_size // param.packed_factor
            )
            start_idx = start_idx // packed_factor
            shard_size = shard_size // packed_factor
            start_idx = round(start_idx / packed_factor)
            shard_size = round(shard_size / packed_factor)
        else:
            assert loaded_weight.shape[output_dim] == self.org_vocab_size

4 changes: 2 additions & 2 deletions python/sglang/multimodal_gen/runtime/models/parameter.py
@@ -418,6 +418,6 @@ def permute_param_layout_(
def _adjust_shard_indexes_for_packing(
    shard_size, shard_offset, packed_factor
) -> tuple[Any, Any]:
    shard_size = shard_size // packed_factor
    shard_offset = shard_offset // packed_factor
    shard_size = round(shard_size / packed_factor)
    shard_offset = round(shard_offset / packed_factor)
    return shard_size, shard_offset
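Both hunks above (and the matching `weight_loader` hunks in `linear.py` below) replace plain floor division with true division plus `round()`. The distinction only matters when the pack factor is not a whole number, e.g. a `fractions.Fraction` for sub-byte dtypes where several elements share one packed word. A minimal sketch of the difference, assuming a fractional pack factor; the `Fraction(32, 3)` value is illustrative, not from this PR:

```python
from fractions import Fraction

# Hypothetical: 3-bit elements packed into 32-bit words -> 32/3 elements per word.
packed_factor = Fraction(32, 3)

# Exact case: both forms agree.
assert 4096 // packed_factor == round(4096 / packed_factor) == 384

# Non-exact case: floor truncates, round picks the nearest packed index.
assert 500 // packed_factor == 46          # floor(46.875)
assert round(500 / packed_factor) == 47    # round(46.875)
```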
1 change: 1 addition & 0 deletions python/sglang/srt/configs/model_config.py
@@ -962,6 +962,7 @@ def _verify_quantization(self) -> None:
"petit_nvfp4",
"quark",
"modelslim",
"humming",
]
compatible_quantization_methods = {
"modelopt_fp8": ["modelopt"],
17 changes: 17 additions & 0 deletions python/sglang/srt/environ.py
@@ -1,3 +1,4 @@
import json
import os
import subprocess
import warnings
@@ -110,6 +111,16 @@ def parse(self, value: str) -> str:
        return value


class EnvJSON(EnvField):
    def parse(self, value: str | None) -> list | dict | None:
        if not value:
            return None
        if os.path.exists(value):
            with open(value) as f:
                return json.load(f)
        return json.loads(value)


class EnvBool(EnvField):
    def parse(self, value: str) -> bool:
        value = value.lower()
@@ -307,6 +318,12 @@ class Envs:
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2 = EnvBool(False)
    SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE = EnvBool(False)

    # Quantization (Humming)
    SGLANG_HUMMING_ONLINE_QUANT_CONFIG = EnvJSON(None)
    SGLANG_HUMMING_INPUT_QUANT_CONFIG = EnvJSON(None)
    SGLANG_HUMMING_USE_F16_ACCUM = EnvBool(False)
    SGLANG_HUMMING_MOE_GEMM_TYPE = EnvStr("")

    # Flashinfer
    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
    SGLANG_ENABLE_FLASHINFER_FP8_GEMM = EnvBool(False)
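Because `EnvJSON.parse` checks `os.path.exists` first, the same variable can hold either inline JSON or a path to a JSON file, which keeps large humming quant configs out of the environment. A minimal sketch exercising both paths through the parser; the quant-config keys are hypothetical, not the real humming schema:

```python
import json
import tempfile

# Inline JSON string.
cfg = EnvJSON(None).parse('{"weight_dtype": "int4", "group_size": 128}')
assert cfg == {"weight_dtype": "int4", "group_size": 128}

# Path to a JSON file with the same content.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"weight_dtype": "int4", "group_size": 128}, f)
assert EnvJSON(None).parse(f.name) == {"weight_dtype": "int4", "group_size": 128}
```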
22 changes: 14 additions & 8 deletions python/sglang/srt/layers/linear.py
@@ -66,6 +66,7 @@
"IPEXAWQLinearMethod",
"PetitNvFp4LinearMethod",
"QuarkInt4Fp8LinearMethod",
"HummingLinearMethod",
]

_is_cpu = is_cpu()
@@ -209,6 +210,7 @@ def __init__(

        # All linear layers support a quant method.
        assert self.quant_method is not None
        self.with_bias = bias
        self.quant_method.create_weights(
            self,
            self.input_size,
@@ -315,6 +317,7 @@ def __init__(
            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
        )

        self.with_bias = bias
        self.gather_output = gather_output
        self.use_presharded_weights = use_presharded_weights
@@ -502,6 +505,7 @@ def __init__(
        tp_size: Optional[int] = None,
        use_presharded_weights: bool = False,
    ):
        self.with_bias = bias
        self.output_sizes = output_sizes
        if tp_rank is None:
            tp_rank = get_tensor_model_parallel_rank()
@@ -589,8 +593,8 @@ def weight_loader(
        # If quantized, we need to adjust the offset and size to account
        # for the packing.
        if packed_dim == output_dim:
            shard_size = shard_size // param.pack_factor
            shard_offset = shard_offset // param.pack_factor
            shard_size = round(shard_size / param.pack_factor)
            shard_offset = round(shard_offset / param.pack_factor)
            # Special case for Marlin.
            shard_size, shard_offset = adjust_marlin_shard(
                param, shard_size, shard_offset
@@ -622,8 +626,8 @@ def weight_loader(
        # for the packing.
        packed_dim = getattr(param, "packed_dim", None)
        if packed_dim == output_dim:
            shard_size = shard_size // param.pack_factor
            shard_offset = shard_offset // param.pack_factor
            shard_size = round(shard_size / param.pack_factor)
            shard_offset = round(shard_offset / param.pack_factor)
            # Special case for Marlin.
            shard_size, shard_offset = adjust_marlin_shard(
                param, shard_size, shard_offset
@@ -825,6 +829,7 @@ def __init__(
        v_head_size: Optional[int] = None,
        skip_block_quant_check: bool = False,
    ):
        self.with_bias = bias
        self.hidden_size = hidden_size
        self.head_size = head_size
        self.v_head_size = v_head_size if v_head_size is not None else head_size
@@ -1086,8 +1091,8 @@ def weight_loader(
        # If quantized, we need to adjust the offset and size to account
        # for the packing.
        if packed_dim == output_dim:
            shard_size = shard_size // param.pack_factor
            shard_offset = shard_offset // param.pack_factor
            shard_size = round(shard_size / param.pack_factor)
            shard_offset = round(shard_offset / param.pack_factor)

            # Special case for Marlin.
            shard_size, shard_offset = adjust_marlin_shard(
@@ -1143,8 +1148,8 @@ def weight_loader(
        # for the packing.
        packed_dim = getattr(param, "packed_dim", None)
        if packed_dim == output_dim:
            shard_size = shard_size // param.pack_factor
            shard_offset = shard_offset // param.pack_factor
            shard_size = round(shard_size / param.pack_factor)
            shard_offset = round(shard_offset / param.pack_factor)

            # Special case for Marlin.
            shard_size, shard_offset = adjust_marlin_shard(
@@ -1272,6 +1277,7 @@ def __init__(
            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
        )

        self.with_bias = bias
        self.input_is_parallel = input_is_parallel
        self.reduce_results = reduce_results
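Each linear variant now records `self.with_bias = bias` before `create_weights` runs, so a quant method can tell at allocation time whether the layer carries a bias. A minimal sketch of a consumer; the class below is illustrative, not this PR's `HummingLinearMethod`:

```python
class BiasAwareLinearMethodSketch:
    """Hypothetical quant method that branches on the new attribute."""

    def create_weights(self, layer, *weight_args, **extra_weight_attrs):
        # `with_bias` is set on the layer before create_weights is called,
        # so weight allocation can depend on whether a bias exists
        # (e.g. reserving a buffer so the kernel can fuse the bias add).
        if getattr(layer, "with_bias", False):
            ...
```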
69 changes: 69 additions & 0 deletions python/sglang/srt/layers/moe/ep_moe/kernels.py
@@ -1381,3 +1381,72 @@ def silu_and_mul_masked_post_per_tensor_quant_fwd(
        NUM_STAGE=NUM_STAGES,
    )
    return output


def moe_permute(
    inputs: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    use_int64_offset: bool = False,
    is_ep: bool = False,
    outputs: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    from sgl_kernel import moe_permute_prepare

    # Build the scatter map (src2dst) and per-expert row offsets on device.
    expert_offsets, src2dst = moe_permute_prepare(
        topk_ids=topk_ids,
        num_experts=num_experts,
        use_int64_offset=use_int64_offset,
        is_ep=is_ep,
    )
    # One output row per (token, expert) pair, grouped contiguously by expert.
    output_shape = (topk_ids.nelement(), inputs.size(-1))
    if outputs is None:
        outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)

    assert outputs.shape == output_shape
    assert outputs.dtype == inputs.dtype
    assert outputs.device == inputs.device

    deepep_permute_triton_kernel[(inputs.shape[0],)](
        inputs,
        outputs,
        src2dst,
        topk_ids,
        None,
        topk_ids.size(1),
        inputs.size(1),
        BLOCK_SIZE=512,
    )

    return outputs, src2dst, expert_offsets


def moe_unpermute(
    inputs: torch.Tensor,
    src2dst: torch.Tensor,
    topk_ids: torch.Tensor,
    topk_weights: torch.Tensor,
    outputs: torch.Tensor | None = None,
) -> torch.Tensor:
    num_tokens = topk_ids.size(0)
    output_shape = (num_tokens, inputs.size(1))
    if outputs is None:
        outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)

    assert outputs.shape == output_shape
    assert outputs.dtype == inputs.dtype
    assert outputs.device == inputs.device

    # Scatter expert-contiguous rows back to token order, applying topk_weights.
    deepep_post_reorder_triton_kernel[(num_tokens,)](
        inputs,
        outputs,
        src2dst,
        topk_ids,
        topk_weights,
        topk_ids.size(1),
        inputs.size(1),
        BLOCK_SIZE=512,
    )

    return outputs
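Together the two helpers bracket the grouped expert computation: `moe_permute` gathers one row per `(token, expert)` pair into expert-contiguous order, and `moe_unpermute` scatters the results back while applying the routing weights. A minimal round-trip sketch; the shapes, dtypes, and the elided per-expert GEMM step are assumptions:

```python
import torch

num_tokens, hidden, top_k, num_experts = 16, 512, 2, 8
x = torch.randn(num_tokens, hidden, device="cuda", dtype=torch.bfloat16)
topk_ids = torch.randint(
    0, num_experts, (num_tokens, top_k), device="cuda", dtype=torch.int32
)
topk_weights = torch.rand(num_tokens, top_k, device="cuda").softmax(dim=-1)

# Gather: rows for expert e live at permuted[expert_offsets[e]:expert_offsets[e + 1]].
permuted, src2dst, expert_offsets = moe_permute(x, topk_ids, num_experts)
assert permuted.shape == (num_tokens * top_k, hidden)

# ... per-expert GEMMs over each expert's slice of `permuted` would go here ...

# Scatter back to token order, weighting each expert's output by topk_weights.
out = moe_unpermute(permuted, src2dst, topk_ids, topk_weights)
assert out.shape == x.shape
```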
11 changes: 10 additions & 1 deletion python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -95,7 +95,16 @@ def __init__(
            routed_scaling_factor=routed_scaling_factor,
            **kwargs,
        )
        if _use_aiter or _is_npu:
        is_humming = get_moe_runner_backend().is_humming() or (
            get_moe_runner_backend().is_auto()
            and quant_config is not None
            and quant_config.get_name() == "humming"
        )
        if is_humming:
            envs.SGLANG_DEEPEP_BF16_DISPATCH.set(True)
            self.deprecate_flag = True
        elif _use_aiter or _is_npu:
            self.deprecate_flag = False
        elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and isinstance(
            quant_config, Fp8Config
3 changes: 3 additions & 0 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -191,10 +191,13 @@ def __init__(
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        self.params_dtype = params_dtype
        self.layer_name = prefix
        self.layer_id = layer_id
        self.top_k = top_k
        self.hidden_size = hidden_size
        self.num_experts = num_experts
        self.with_bias = with_bias
        self.num_fused_shared_experts = num_fused_shared_experts

        self.enable_flashinfer_cutlass_moe = (
@@ -15,7 +15,10 @@


def moe_align_block_size(
    topk_ids: torch.Tensor, block_size: int, num_experts: int
    topk_ids: torch.Tensor,
    block_size: int,
    num_experts: int,
    ignore_invalid_expert: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Aligns the token distribution across experts to be compatible with block
@@ -81,5 +84,6 @@ def moe_align_block_size(
        num_tokens_post_pad,
        cumsum_buffer,
        True,
        ignore_invalid_expert,
    )
    return sorted_ids, expert_ids, num_tokens_post_pad
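The new flag is plumbed straight through to the underlying `sgl_kernel` alignment op. A minimal call sketch under expert parallelism, where `topk_ids` may still contain global ids for experts hosted on other ranks; treating out-of-range ids as invalid and skipping them is the assumed convention here, not something this hunk shows:

```python
import torch

# 64 global experts, 32 hosted locally on this EP rank (illustrative numbers).
topk_ids = torch.randint(0, 64, (16, 2), dtype=torch.int32, device="cuda")
num_local_experts = 32

sorted_ids, expert_ids, num_tokens_post_pad = moe_align_block_size(
    topk_ids,                    # entries >= num_local_experts are "invalid" here
    block_size=64,
    num_experts=num_local_experts,
    ignore_invalid_expert=True,  # skip them instead of counting them into blocks
)
```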