diff --git a/vllm_ascend/quantization/methods/w8a8_dynamic.py b/vllm_ascend/quantization/methods/w8a8_dynamic.py index 68dea55030e..b150d1a5875 100644 --- a/vllm_ascend/quantization/methods/w8a8_dynamic.py +++ b/vllm_ascend/quantization/methods/w8a8_dynamic.py @@ -79,6 +79,11 @@ def apply( tp_rank: int | None = 0, ) -> torch.Tensor: quantized_x, pertoken_scale = torch_npu.npu_dynamic_quant(x) + need_unsqz = False + if pertoken_scale.dim() == 2: + need_unsqz = True + quantized_x = quantized_x.squeeze(dim=1) + pertoken_scale = pertoken_scale.squeeze(dim=1) output = torch_npu.npu_quant_matmul( quantized_x, layer.weight, @@ -87,6 +92,8 @@ def apply( bias=bias, output_dtype=x.dtype, ) + if need_unsqz: + output = output.unsqueeze(dim=1) return output def process_weights_after_loading(self, layer): diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 337287ea8ee..d0e082ba996 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -64,6 +64,13 @@ "mm_projector.linear_1": "mm_projector.proj.0", "mm_projector.linear_2": "mm_projector.proj.2", }, + "qwen3_omni_moe_thinker": { + "thinker.lm_head.": "language_model.lm_head.", + "thinker.model.": "language_model.model.", + "thinker.": "", + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }, } # key: model_type @@ -186,6 +193,18 @@ ], "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"], }, + "qwen3_omni_moe_text": { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + }, } @@ -462,7 +481,10 @@ def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[ "to have the same precision." 
) else: - is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT" + is_skipped = any( + key.startswith(prefix + ".") and key.endswith(".weight") and value == "FLOAT" + for key, value in self.quant_description.items() + ) assert is_skipped is not None return is_skipped