Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/python/py/models/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ def create_model(
onnx_model = Gemma3Model(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
elif config.architectures[0] == "GptOssForCausalLM":
print("WARNING: This model only supports symmetric quantization for `QMoE`.")
delattr(config, "quantization_config")
if hasattr(config, "quantization_config") and config.quantization_config.get("quant_method") != "quark":
delattr(config, "quantization_config")
onnx_model = GPTOSSModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
elif config.architectures[0] == "GraniteForCausalLM":
onnx_model = GraniteModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
Expand Down
4 changes: 4 additions & 0 deletions src/python/py/models/builders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3227,7 +3227,11 @@ def make_qmoe_op(self, name, **kwargs):
kwargs.get("weight3", ""),
kwargs.get("scales3", ""),
kwargs.get("bias3", ""),
kwargs.get("zero_points1", ""),
kwargs.get("zero_points2", ""),
kwargs.get("zero_points3", ""),
]
Comment thread
thpereir marked this conversation as resolved.

output = f"{name}/output_0"

extra_kwargs = (
Expand Down
129 changes: 102 additions & 27 deletions src/python/py/models/builders/gptoss.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ def make_moe_fused(self, layer_id, mlp, root_input):
op_type = self.moe_attrs["op_type"]
moe_weight_type = f"{'q' if op_type == 'QMoE' else ''}weight"

has_quark_experts = self.has_quark_experts(mlp.experts)

# Make router nodes
router_basename = f"{basename}/router/MatMul"
router_matmul_name = self.make_matmul(mlp.router, router_basename, root_input)
Expand All @@ -572,22 +574,25 @@ def make_moe_fused(self, layer_id, mlp, root_input):
gate_up_proj_weight = f"model.layers.{layer_id}.moe.experts.gate_up_proj.{moe_weight_type}"
gate_up_proj_scales = f"model.layers.{layer_id}.moe.experts.gate_up_proj.scales"
gate_up_proj_bias = f"model.layers.{layer_id}.moe.experts.gate_up_proj.bias"
gate_up_proj_zero_points = f"model.layers.{layer_id}.moe.experts.gate_up_proj.zero_points"
down_proj_weight = f"model.layers.{layer_id}.moe.experts.down_proj.{moe_weight_type}"
down_proj_scales = f"model.layers.{layer_id}.moe.experts.down_proj.scales"
down_proj_bias = f"model.layers.{layer_id}.moe.experts.down_proj.bias"
down_proj_zero_points = f"model.layers.{layer_id}.moe.experts.down_proj.zero_points"

# Apply transpose depending on EP/op requirements
# Apply transpose depending on EP/op requirements and Quark expert presence
# For quantized QMoE on CUDA, kernels expect scales along the hidden_size axis,
# so we keep original orientation (last axis = hidden_size) when quantizing.
# For non-quantized MoE or non-CUDA EPs, transpose to align MatMul layout.
if op_type == "QMoE" and self.ep == "cuda":
gate_up_proj_layout = mlp.experts.gate_up_proj
down_proj_layout = mlp.experts.down_proj
else:
gate_up_proj_layout = mlp.experts.gate_up_proj.transpose(-1, -2)
down_proj_layout = mlp.experts.down_proj.transpose(-1, -2)

if op_type == "MoE":
if not has_quark_experts:
if op_type == "QMoE" and self.ep == "cuda":
gate_up_proj_layout = mlp.experts.gate_up_proj
down_proj_layout = mlp.experts.down_proj
else:
gate_up_proj_layout = mlp.experts.gate_up_proj.transpose(-1, -2)
down_proj_layout = mlp.experts.down_proj.transpose(-1, -2)

if op_type == "MoE" and not has_quark_experts:
# Save non-quantized MoE weights as initializers
self.make_initializer(
gate_up_proj_layout.view(self.moe_attrs["num_experts"], -1, self.hidden_size),
Expand All @@ -600,22 +605,39 @@ def make_moe_fused(self, layer_id, mlp, root_input):
to=self.io_dtype,
)
else:
# Create and save quantized MoE weights as initializers
gate_up_proj_qweight_list, gate_up_proj_scales_list = [], []
down_proj_qweight_list, down_proj_scales_list = [], []

for i in range(self.moe_attrs["num_experts"]):
qweight1, scales1 = self.make_qmoe_weights(gate_up_proj_layout[i])
gate_up_proj_qweight_list.append(qweight1)
gate_up_proj_scales_list.append(scales1)
qweight2, scales2 = self.make_qmoe_weights(down_proj_layout[i])
down_proj_qweight_list.append(qweight2)
down_proj_scales_list.append(scales2)

gate_up_proj_qweight_tensor = torch.stack(gate_up_proj_qweight_list, dim=0).to(torch.uint8)
gate_up_proj_scales_tensor = torch.stack(gate_up_proj_scales_list, dim=0)
down_proj_qweight_tensor = torch.stack(down_proj_qweight_list, dim=0).to(torch.uint8)
down_proj_scales_tensor = torch.stack(down_proj_scales_list, dim=0)
if has_quark_experts:
# Use pre-quantized Quark experts
gate_up_proj_qweight_tensor, gate_up_proj_scales_tensor, gate_up_proj_zero_points_tensor = (
mlp.experts.fc1_weights,
mlp.experts.fc1_scales,
mlp.experts.fc1_zero_points,
)
down_proj_qweight_tensor, down_proj_scales_tensor, down_proj_zero_points_tensor = (
mlp.experts.fc2_weights,
mlp.experts.fc2_scales,
mlp.experts.fc2_zero_points,
)

# Save zero points as initializers
self.make_initializer(gate_up_proj_zero_points_tensor, gate_up_proj_zero_points)
self.make_initializer(down_proj_zero_points_tensor, down_proj_zero_points)
else:
# Create and save quantized MoE weights as initializers
gate_up_proj_qweight_list, gate_up_proj_scales_list = [], []
down_proj_qweight_list, down_proj_scales_list = [], []

for i in range(self.moe_attrs["num_experts"]):
qweight1, scales1 = self.make_qmoe_weights(gate_up_proj_layout[i])
gate_up_proj_qweight_list.append(qweight1)
gate_up_proj_scales_list.append(scales1)
qweight2, scales2 = self.make_qmoe_weights(down_proj_layout[i])
down_proj_qweight_list.append(qweight2)
down_proj_scales_list.append(scales2)

gate_up_proj_qweight_tensor = torch.stack(gate_up_proj_qweight_list, dim=0).to(torch.uint8)
gate_up_proj_scales_tensor = torch.stack(gate_up_proj_scales_list, dim=0)
down_proj_qweight_tensor = torch.stack(down_proj_qweight_list, dim=0).to(torch.uint8)
down_proj_scales_tensor = torch.stack(down_proj_scales_list, dim=0)

# qweight tensors always use the same shape regardless of quantization method
pack_size = 8 // self.moe_attrs["expert_weight_bits"]
Expand All @@ -635,8 +657,15 @@ def make_moe_fused(self, layer_id, mlp, root_input):
self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype)

# Save MoE biases as initializers
self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, to=self.io_dtype)
self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype)
if has_quark_experts:
gate_up_bias = self.combine_quark_gate_up_biases_from_experts(mlp.experts)
down_bias = self.combine_quark_down_biases_from_experts(mlp.experts)
else:
gate_up_bias = mlp.experts.gate_up_proj_bias
down_bias = mlp.experts.down_proj_bias

self.make_initializer(gate_up_bias, gate_up_proj_bias, to=self.io_dtype)
self.make_initializer(down_bias, down_proj_bias, to=self.io_dtype)

moe_name = f"{basename}/{op_type}"
self.make_moe_op(
Expand All @@ -649,7 +678,53 @@ def make_moe_fused(self, layer_id, mlp, root_input):
weight2=down_proj_weight,
scales2=down_proj_scales,
bias2=down_proj_bias,
zero_points1=gate_up_proj_zero_points if has_quark_experts else "",
zero_points2=down_proj_zero_points if has_quark_experts else "",
)

# Assign output 0 of previous MoE as root input to next SkipLayerNorm
self.layernorm_attrs["skip_input"] = f"{moe_name}/output_0"

def has_quark_experts(self, experts):
    """Report whether ``experts`` carries pre-quantized Quark expert tensors.

    The check is purely structural: it returns ``True`` only when the object
    exposes both a ``fc1_weights`` and a ``fc2_weights`` attribute.
    """
    required_attrs = ("fc1_weights", "fc2_weights")
    return all(hasattr(experts, attr) for attr in required_attrs)

def combine_quark_gate_up_biases_from_experts(self, experts):
    """Stack the gate/up projection biases of every Quark expert.

    Experts are visited in ``sorted(experts.keys())`` order and each one
    contributes a single 1-D bias row:

    * when ``expert.gate_up_proj.qweight`` is not ``None`` the fused
      projection's bias is used as-is;
    * otherwise the ``gate_proj`` and ``up_proj`` biases are interleaved
      (even indices = gate, odd indices = up).

    A projection whose ``bias`` attribute is missing or ``None`` contributes
    ``torch.zeros(qweight.shape[0])`` instead.

    Args:
        experts: Container for which ``self.has_quark_experts`` is true,
            supporting ``keys()`` and item access by expert id.

    Returns:
        The per-expert bias rows stacked along a new leading dimension.

    Raises:
        ValueError: If ``experts`` is not a Quark expert container.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not self.has_quark_experts(experts):
        raise ValueError("Expected Quark experts exposing 'fc1_weights' and 'fc2_weights'")

    def bias_or_zeros(proj):
        # Fall back to a zero bias sized by the first qweight dimension.
        bias = getattr(proj, "bias", None)
        if bias is not None:
            return bias
        return torch.zeros(proj.qweight.shape[0])

    bias_rows = []

    # NOTE(review): sorted() orders string keys lexicographically ("10" < "2");
    # confirm expert ids are ints (or zero-padded) so rows line up with the weights.
    for expert_id in sorted(experts.keys()):
        expert = experts[expert_id]

        if expert.gate_up_proj.qweight is not None:
            # Fused gate_up projection
            bias_rows.append(bias_or_zeros(expert.gate_up_proj))
            continue

        # Separate projections: interleave gate and up biases
        gate_bias = bias_or_zeros(expert.gate_proj)
        up_bias = bias_or_zeros(expert.up_proj)

        interleaved = torch.zeros(
            gate_bias.shape[0] + up_bias.shape[0],
            dtype=gate_bias.dtype,
            device="cpu",
        )
        interleaved[::2] = gate_bias  # Even indices = gate
        interleaved[1::2] = up_bias  # Odd indices = up
        bias_rows.append(interleaved)

    return torch.stack(bias_rows, dim=0)

def combine_quark_down_biases_from_experts(self, experts):
    """Stack the ``down_proj`` bias of every Quark expert.

    Experts are visited in ``sorted(experts.keys())`` order. An expert whose
    ``down_proj`` has no ``bias`` attribute, or whose bias is ``None``,
    contributes ``torch.zeros(down_proj.qweight.shape[0])`` instead.

    Args:
        experts: Container for which ``self.has_quark_experts`` is true,
            supporting ``keys()`` and item access by expert id.

    Returns:
        The per-expert bias rows stacked along a new leading dimension.

    Raises:
        ValueError: If ``experts`` is not a Quark expert container.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not self.has_quark_experts(experts):
        raise ValueError("Expected Quark experts exposing 'fc1_weights' and 'fc2_weights'")

    bias_rows = []

    # NOTE(review): sorted() orders string keys lexicographically ("10" < "2");
    # confirm expert ids are ints (or zero-padded) so rows line up with the weights.
    for expert_id in sorted(experts.keys()):
        down_proj = experts[expert_id].down_proj
        down_bias = getattr(down_proj, "bias", None)
        if down_bias is None:
            # No stored bias: use zeros sized by the first qweight dimension.
            down_bias = torch.zeros(down_proj.qweight.shape[0])
        bias_rows.append(down_bias)

    return torch.stack(bias_rows, dim=0)
Loading
Loading