diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 67a49c5755c..52e75989d2f 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -65,6 +65,7 @@ th {
 |--------------|--------|-------------------|
 | `Qwen3OmniMoeForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct` |
 | `Qwen2_5OmniForConditionalGeneration` | Qwen2.5-Omni | `Qwen/Qwen2.5-Omni-7B`, `Qwen/Qwen2.5-Omni-3B`|
+| `HunyuanImage3ForCausalMM` | HunyuanImage3.0 (DiT-only) | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` |
 | `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` |
 | `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` |
 | `QwenImageEditPipeline` | Qwen-Image-Edit | `Qwen/Qwen-Image-Edit` |
diff --git a/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py b/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py
new file mode 100644
index 00000000000..2aa1adf4449
--- /dev/null
+++ b/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for HunyuanFusedMoE (Support HunyuanImage3 Diffusion Model, 5a779b4)."""
+
+import pytest
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class TestHunyuanFusedMoEPlatformDispatch:
+    """Check that HunyuanFusedMoE selects its implementation via the platform hooks."""
+
+    def test_default_platform_uses_default_impl_qualname(self, mocker):
+        """The factory prepares the op runtime, then resolves and calls the hooked impl.
+
+        The platform and the qualname resolver are both mocked, so only the
+        dispatch sequence is exercised; no real MoE layer is constructed.
+        """
+        import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe
+
+        op_name = "hunyuan_fused_moe"
+        default_qualname = "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault"
+
+        # Platform stub that advertises the default implementation.
+        platform_stub = mocker.MagicMock()
+        platform_stub.get_diffusion_model_impl_qualname.return_value = default_qualname
+        mocker.patch.object(hunyuan_moe, "current_omni_platform", platform_stub)
+
+        # Resolver stub returns a mock "class" whose construction can be inspected.
+        impl_cls = mocker.MagicMock()
+        resolver = mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=impl_cls)
+
+        from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE
+
+        # Instantiating the factory class should run the whole dispatch sequence.
+        HunyuanFusedMoE(prefix="")
+
+        # Runtime preparation and impl lookup are both keyed by the op name.
+        platform_stub.prepare_diffusion_op_runtime.assert_called_once_with(op_name)
+        platform_stub.get_diffusion_model_impl_qualname.assert_called_once_with(op_name)
+        resolver.assert_called_once_with(default_qualname)
+        impl_cls.assert_called_once_with(prefix="")
+
+
+class TestHunyuanFusedMoEFactory:
+    """Test HunyuanFusedMoE factory __new__ and make_expert_params_mapping delegation."""
+
+    def test_new_delegates_to_impl_class(self, mocker):
+        """HunyuanFusedMoE(prefix=..., **kwargs) should instantiate and return impl instance."""
+        import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe
+
+        class MockImpl:
+            def __init__(self, *, prefix: str = "", **kwargs):
+                self.prefix = prefix
+                self.kwargs = kwargs
+
+        mock_platform = mocker.MagicMock()
+        mock_platform.get_diffusion_model_impl_qualname.return_value = "mock.impl.Qualname"
+        mocker.patch.object(hunyuan_moe, "current_omni_platform", mock_platform)
+
+        # side_effect (not a pre-built return_value) makes the mock construct MockImpl from
+        # the arguments the factory forwards, so the attribute asserts below check delegation.
+        mock_impl_class = mocker.MagicMock(side_effect=MockImpl)
+        mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class)
+
+        from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE
+
+        result = HunyuanFusedMoE(prefix="test", a=1)
+
+        assert isinstance(result, MockImpl)
+        assert result.prefix == "test"
+        assert result.kwargs == {"a": 1}
+        mock_platform.prepare_diffusion_op_runtime.assert_called_once_with("hunyuan_fused_moe")
+        mock_platform.get_diffusion_model_impl_qualname.assert_called_once_with("hunyuan_fused_moe")
+        mock_impl_class.assert_called_once_with(prefix="test", a=1)
+
+    def test_make_expert_params_mapping_delegates_to_impl(self, mocker):
+        """make_expert_params_mapping should delegate to impl class method."""
+        import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe
+
+        expected_mapping = [("a", "b", 0, "c")]
+        mock_platform = mocker.MagicMock()
+        mock_platform.get_diffusion_model_impl_qualname.return_value = "mock.impl.Qualname"
+        mocker.patch.object(hunyuan_moe, "current_omni_platform", mock_platform)
+
+        mock_impl_class = mocker.MagicMock()
+        mock_impl_class.make_expert_params_mapping = mocker.MagicMock(return_value=expected_mapping)
+        mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class)
+
+        from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import (
+            HunyuanFusedMoE,
+        )
+
+        result = HunyuanFusedMoE.make_expert_params_mapping(
+            model=None,
+            ckpt_gate_proj_name="gate",
+            ckpt_down_proj_name="down",
+            ckpt_up_proj_name="up",
+            num_experts=4,
+            num_redundant_experts=0,
+        )
+
+        assert result == expected_mapping
+        mock_platform.get_diffusion_model_impl_qualname.assert_called_once_with("hunyuan_fused_moe")
+        mock_impl_class.make_expert_params_mapping.assert_called_once_with(
+            None,
+            ckpt_gate_proj_name="gate",
+            ckpt_down_proj_name="down",
+            ckpt_up_proj_name="up",
+            num_experts=4,
+            num_redundant_experts=0,
+        )
diff --git a/tests/diffusion/test_data_is_moe.py b/tests/diffusion/test_data_is_moe.py
new file mode 100644
index 00000000000..25fa59ef1db
--- /dev/null
+++ b/tests/diffusion/test_data_is_moe.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for OmniDiffusionConfig.is_moe (fix is_moe type and threshold, 6663c0b)."""
+
+import pytest
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig, TransformerConfig
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class TestOmniDiffusionConfigIsMoE:
+    """Behavioral tests for the ``OmniDiffusionConfig.is_moe`` property.
+
+    Covers commit 6663c0b: fix is_moe type and threshold
+    - ``num_experts`` that is not an int, list or tuple yields ``False``.
+    - An int (or an int entry of a list/tuple) makes the model MoE when it is
+      greater than 0; the previous threshold was greater than 1.
+    """
+
+    @staticmethod
+    def _make_config(tf_params: dict) -> OmniDiffusionConfig:
+        """Build a minimal config whose transformer config comes from ``tf_params``."""
+        tf_config = TransformerConfig.from_dict(tf_params)
+        return OmniDiffusionConfig(model="test", tf_model_config=tf_config)
+
+    # num_experts absent, None, or of an unsupported type
+    def test_is_moe_missing_num_experts_returns_false(self):
+        """When num_experts is absent, is_moe should be False."""
+        config = self._make_config({})
+        assert config.is_moe is False
+
+    def test_is_moe_none_num_experts_returns_false(self):
+        """When num_experts is explicitly None (e.g. in params), is_moe should be False."""
+        config = self._make_config({"num_experts": None})
+        assert config.is_moe is False
+
+    def test_is_moe_non_allowed_type_returns_false(self):
+        """When num_experts is not int/list/tuple (e.g. str), is_moe should be False."""
+        config = self._make_config({"num_experts": "2"})
+        assert config.is_moe is False
+
+    # num_experts given as a plain int
+    def test_is_moe_int_zero_returns_false(self):
+        """num_experts int 0 should yield is_moe False (threshold > 0)."""
+        config = self._make_config({"num_experts": 0})
+        assert config.is_moe is False
+
+    def test_is_moe_int_one_returns_true(self):
+        """num_experts int 1 should yield is_moe True (threshold > 0, not > 1)."""
+        config = self._make_config({"num_experts": 1})
+        assert config.is_moe is True
+
+    def test_is_moe_int_gt_one_returns_true(self):
+        """num_experts int > 1 should yield is_moe True."""
+        config = self._make_config({"num_experts": 2})
+        assert config.is_moe is True
+
+    # num_experts given as a list or tuple
+    def test_is_moe_list_all_zero_returns_false(self):
+        """num_experts list with all <= 0 should yield is_moe False."""
+        config = self._make_config({"num_experts": [0]})
+        assert config.is_moe is False
+
+    def test_is_moe_list_has_positive_returns_true(self):
+        """num_experts list with any int > 0 should yield is_moe True."""
+        config = self._make_config({"num_experts": [0, 1]})
+        assert config.is_moe is True
+
+    def test_is_moe_tuple_has_positive_returns_true(self):
+        """num_experts tuple with any int > 0 should yield is_moe True."""
+        config = self._make_config({"num_experts": (0, 2)})
+        assert config.is_moe is True
+
+    def test_is_moe_list_non_int_ignored(self):
+        """num_experts list with only non-int entries should yield is_moe False."""
+        config = self._make_config({"num_experts": ["a", 0.0]})
+        assert config.is_moe is False
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index 38366469eb1..634cb329414 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -463,11 +463,17 @@ class OmniDiffusionConfig:
     @property
     def is_moe(self) -> bool:
+        """Whether the transformer config declares at least one expert.
+
+        ``num_experts`` may be an int or a list/tuple of counts; the model is
+        treated as MoE when the int, or any int entry, is greater than zero.
+        A missing value or any other type falls through to ``False``.
+        """
         num_experts = self.tf_model_config.get("num_experts", None)
         if isinstance(num_experts, int):
-            return num_experts > 1
+            return num_experts > 0
 
         if isinstance(num_experts, (list, tuple)):
-            return any(isinstance(n, int) and n > 1 for n in num_experts)
+            return any(isinstance(n, int) and n > 0 for n in num_experts)
 
         return False
 
diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py b/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py
index 98a3ac07b1c..cbc6a8ad1f4 100644
--- a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py
+++ b/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Hunyuan Image 3 diffusion model components."""
 
+from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE
 from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_image_3_transformer import (
     HunyuanImage3Model,
     HunyuanImage3Text2ImagePipeline,
@@ -10,8 +11,4 @@
     HunyuanImage3Pipeline,
 )
 
-__all__ = [
-    "HunyuanImage3Pipeline",
-    "HunyuanImage3Model",
-    "HunyuanImage3Text2ImagePipeline",
-]
+__all__ = ["HunyuanImage3Pipeline", "HunyuanImage3Model", "HunyuanImage3Text2ImagePipeline", "HunyuanFusedMoE"]
diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py
new file mode 100644
index 00000000000..5ada5ceb848
--- /dev/null
+++ b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Platform-dispatched fused MoE layer used by the HunyuanImage3 diffusion model."""
+
+from typing import Any
+
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
+from vllm.utils.import_utils import resolve_obj_by_qualname
+
+from vllm_omni.platforms import current_omni_platform
+
+# Op name passed to the platform hooks for implementation lookup and runtime setup.
+_OP_NAME = "hunyuan_fused_moe"
+
+
+class HunyuanFusedMoEDefault(SharedFusedMoE):
+    """``SharedFusedMoE`` with a one-shot pre-forward hook for post-load weight processing."""
+
+    def __init__(self, *, prefix: str = "", **kwargs: Any) -> None:
+        super().__init__(prefix=prefix, **kwargs)
+        self._prefix = prefix
+        # Keep the handle so the hook can unregister itself after its first run.
+        self._init_hook_handle = self.register_forward_pre_hook(self._initialize_kernel_hook, with_kwargs=True)
+
+    def _initialize_kernel_hook(self, module: Any, args: Any, kwargs: Any) -> None:
+        """Call the quant method's ``process_weights_after_loading`` (if any), then remove the hook."""
+        quant_method = self.quant_method
+        if quant_method:
+            quant_method.process_weights_after_loading(self)
+        self._init_hook_handle.remove()
+
+    def forward(self, hidden_states: Any, router_logits: Any) -> Any:
+        """Delegate unchanged to the parent class's ``forward``."""
+        return super().forward(hidden_states, router_logits)
+
+
+def _resolve_impl_cls() -> Any:
+    """Resolve the implementation class whose qualified name the current platform reports."""
+    return resolve_obj_by_qualname(current_omni_platform.get_diffusion_model_impl_qualname(_OP_NAME))
+
+
+class HunyuanFusedMoE:
+    """Factory facade: constructing it yields an instance of the current platform's impl class."""
+
+    def __new__(cls, *, prefix: str = "", **kwargs: Any) -> Any:
+        """Prepare the platform runtime for this op, then build the platform's implementation."""
+        current_omni_platform.prepare_diffusion_op_runtime(_OP_NAME)
+        return _resolve_impl_cls()(prefix=prefix, **kwargs)
+
+    @classmethod
+    def make_expert_params_mapping(
+        cls,
+        model: Any,
+        ckpt_gate_proj_name: str,
+        ckpt_down_proj_name: str,
+        ckpt_up_proj_name: str,
+        num_experts: int,
+        num_redundant_experts: int = 0,
+    ) -> list[tuple[str, str, int, str]]:
+        """Forward to the platform implementation's ``make_expert_params_mapping``."""
+        return _resolve_impl_cls().make_expert_params_mapping(
+            model,
+            ckpt_gate_proj_name=ckpt_gate_proj_name,
+            ckpt_down_proj_name=ckpt_down_proj_name,
+            ckpt_up_proj_name=ckpt_up_proj_name,
+            num_experts=num_experts,
+            num_redundant_experts=num_redundant_experts,
+        )
diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py
index 0c2f9e290ac..a89931550de 100644
--- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py
@@ -32,10 +32,8 @@
 from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_world_size,
-    tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -63,6 +61,7 @@
 from vllm_omni.diffusion.distributed.parallel_state import get_pp_group
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.layers.rope import RotaryEmbedding
+from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE
 
 logger = logging.getLogger(__name__)
 
@@ -1417,7 +1416,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             final_hidden_states = final_hidden_states[0] + final_hidden_states[1]
 
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
 
         return final_hidden_states.view(orig_shape)
 
@@ -1565,22 +1564,6 @@ def forward(
         return output, None, past_key_value
 
 
-class HunyuanFusedMoE(SharedFusedMoE):
-    def __init__(self, *, prefix: str = "", **kwargs):
-        super().__init__(prefix=prefix, **kwargs)
-        self._prefix = prefix
-
-        self._init_hook_handle = self.register_forward_pre_hook(self._initialize_kernel_hook, with_kwargs=True)
-
-    def _initialize_kernel_hook(self, module, args, kwargs):
-        if self.quant_method:
-            self.quant_method.process_weights_after_loading(self)
-        self._init_hook_handle.remove()
-
-    def forward(self, hidden_states, router_logits):
-        return super().forward(hidden_states, router_logits)
-
-
 class HunyuanImage3DecoderLayer(nn.Module):
     def __init__(self, config: HunyuanImage3Config, layer_idx: int, prefix: str = ""):
         super().__init__()
@@ -2454,7 +2437,6 @@ def __call__(
                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
 
                     latents = callback_outputs.pop("latents", latents)
-
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py
index f90c69e3a53..314cb3219e5 100644
--- a/vllm_omni/platforms/interface.py
+++ b/vllm_omni/platforms/interface.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import Enum
+from typing import Any
 
 import torch
 from vllm.platforms import Platform
@@ -52,6 +53,23 @@ def get_omni_generation_worker_cls(cls) -> str:
     def get_default_stage_config_path(cls) -> str:
         raise NotImplementedError
 
+    @classmethod
+    def get_diffusion_model_impl_qualname(cls, op_name: str) -> str:
+        """Return the qualified name of the class implementing diffusion op ``op_name``.
+
+        Platform subclasses may override this to substitute their own implementation.
+
+        Raises:
+            NotImplementedError: If ``op_name`` is not a known diffusion op.
+        """
+        if op_name != "hunyuan_fused_moe":
+            raise NotImplementedError(f"Unsupported diffusion model op: {op_name}")
+        return "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault"
+
+    @classmethod
+    def prepare_diffusion_op_runtime(cls, op_name: str, **kwargs: Any) -> None:
+        """Hook for platform-specific setup of diffusion op ``op_name``; a no-op by default."""
+
     @classmethod
     def get_diffusion_attn_backend_cls(
         cls,
diff --git a/vllm_omni/platforms/npu/models/__init__.py b/vllm_omni/platforms/npu/models/__init__.py
new file mode 100644
index 00000000000..208f01a7cb5
--- /dev/null
+++ b/vllm_omni/platforms/npu/models/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py
new file mode 100644
index 00000000000..46c76a65290
--- /dev/null
+++ b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Ascend NPU implementation and runtime setup for the ``hunyuan_fused_moe`` diffusion op."""
+
+from typing import Any
+
+import torch
+import vllm.forward_context as _vllm_fc
+from vllm.config import VllmConfig
+from vllm.distributed import get_ep_group
+from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import (
+    init_model_parallel_group as vllm_init_model_parallel_group,
+)
+from vllm_ascend.ascend_forward_context import MoECommType
+from vllm_ascend.ops.fused_moe.fused_moe import AscendSharedFusedMoE
+from vllm_ascend.ops.fused_moe.moe_comm_method import _MoECommMethods
+from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
+
+from vllm_omni.diffusion.distributed.parallel_state import (
+    get_data_parallel_world_size,
+    get_world_group,
+)
+from vllm_omni.diffusion.forward_context import get_forward_context as omni_get_ctx
+
+
+def _init_mc2_group_for_diffusion(
+    world_size: int,
+    data_parallel_size: int,
+    tensor_parallel_size: int,
+    backend: str,
+    local_rank: int,
+) -> None:
+    """Create vllm-ascend's module-level ``_MC2`` group unless it is already set.
+
+    Global ranks ``0..world_size - 1`` are split into consecutive chunks of
+    ``data_parallel_size * tensor_parallel_size`` ranks, one chunk per group.
+    """
+    import vllm_ascend.distributed.parallel_state as ascend_parallel_state
+
+    if getattr(ascend_parallel_state, "_MC2", None) is not None:
+        return
+
+    ranks_per_group = data_parallel_size * tensor_parallel_size
+    group_ranks = torch.arange(world_size).reshape(-1, ranks_per_group).tolist()
+    ascend_parallel_state._MC2 = vllm_init_model_parallel_group(
+        group_ranks,
+        local_rank,
+        backend,
+        group_name="mc2",
+    )
+
+
+def _select_moe_comm_method(vllm_config: VllmConfig) -> MoECommType | None:
+    """Pick the MoE communication type from the EP setup and the Ascend device type.
+
+    Raises:
+        ValueError: If expert parallelism is in effect on an unlisted device type.
+    """
+    soc_version = get_ascend_device_type()
+    # Expert parallelism disabled, or an EP group of size one: always all-gather.
+    if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group().world_size == 1:
+        return MoECommType.ALLGATHER
+    if soc_version in {AscendDeviceType.A2}:
+        return MoECommType.ALLGATHER
+    if soc_version in {AscendDeviceType.A3}:
+        return MoECommType.ALLTOALL
+    if soc_version in {AscendDeviceType._310P}:
+        return MoECommType.ALLGATHER
+    if soc_version in {AscendDeviceType.A5}:
+        return MoECommType.ALLTOALL
+    raise ValueError(f"Unsupported soc_version: {soc_version}")
+
+
+def prepare_hunyuan_fused_moe_runtime() -> None:
+    """Ensure the ``_MC2`` group exists and set MoE attributes on vLLM's ``ForwardContext`` class."""
+    world_group = get_world_group()
+    _init_mc2_group_for_diffusion(
+        world_size=torch.distributed.get_world_size(),
+        data_parallel_size=get_data_parallel_world_size(),
+        tensor_parallel_size=get_tensor_model_parallel_world_size(),
+        backend=torch.distributed.get_backend(world_group.device_group),
+        local_rank=world_group.local_rank,
+    )
+
+    fwd_ctx_cls = _vllm_fc.ForwardContext
+    # NOTE(review): the guard probes "moe_comm_method" while the body installs
+    # "in_profile_run" - confirm which attribute the check is meant to test.
+    if not hasattr(fwd_ctx_cls, "moe_comm_method"):
+        fwd_ctx_cls.__annotations__["in_profile_run"] = bool
+        fwd_ctx_cls.in_profile_run = False
+
+    # Set on the class itself rather than on a ForwardContext instance.
+    comm_type = _select_moe_comm_method(vllm_config=omni_get_ctx().vllm_config)
+    fwd_ctx_cls.moe_comm_type = comm_type
+    fwd_ctx_cls.moe_comm_method = _MoECommMethods.get(comm_type)
+    fwd_ctx_cls.flash_comm_v1_enabled = False
+
+
+class AscendHunyuanFusedMoE(AscendSharedFusedMoE):
+    """``AscendSharedFusedMoE`` with a one-shot pre-forward hook for post-load weight processing."""
+
+    def __init__(self, *, prefix: str = "", **kwargs: Any) -> None:
+        super().__init__(prefix=prefix, **kwargs)
+        self._prefix = prefix
+        # Keep the handle so the hook can unregister itself after its first run.
+        self._init_hook_handle = self.register_forward_pre_hook(self._initialize_kernel_hook, with_kwargs=True)
+
+    def _initialize_kernel_hook(self, module: Any, args: Any, kwargs: Any) -> None:
+        """Call the quant method's ``process_weights_after_loading`` (if any), then remove the hook."""
+        quant_method = self.quant_method
+        if quant_method:
+            quant_method.process_weights_after_loading(self)
+        self._init_hook_handle.remove()
+
+    def forward(self, hidden_states: Any, router_logits: Any) -> Any:
+        """Delegate unchanged to the parent class's ``forward``."""
+        return super().forward(hidden_states, router_logits)
+
+    def __del__(self):
+        """Destroy vllm-ascend's module-level ``_MC2`` group (if set) and clear the reference."""
+        # NOTE(review): ``_MC2`` is shared module state, so finalizing any one instance
+        # tears the group down for all of them - confirm this is intended.
+        import vllm_ascend.distributed.parallel_state as ascend_parallel_state
+
+        mc2_group = ascend_parallel_state._MC2
+        if mc2_group:
+            mc2_group.destroy()
+        ascend_parallel_state._MC2 = None
diff --git a/vllm_omni/platforms/npu/platform.py b/vllm_omni/platforms/npu/platform.py
index 3c2495c3d35..bda4e4f6155 100644
--- a/vllm_omni/platforms/npu/platform.py
+++ b/vllm_omni/platforms/npu/platform.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 import torch
 from vllm.logger import init_logger
 from vllm_ascend.platform import NPUPlatform
@@ -33,6 +35,22 @@ def get_omni_generation_worker_cls(cls) -> str:
     def get_default_stage_config_path(cls) -> str:
         return "vllm_omni/platforms/npu/stage_configs"
 
+    @classmethod
+    def get_diffusion_model_impl_qualname(cls, op_name: str) -> str:
+        """Return the NPU implementation for ``hunyuan_fused_moe``; defer other ops to the base."""
+        if op_name != "hunyuan_fused_moe":
+            return super().get_diffusion_model_impl_qualname(op_name)
+        return "vllm_omni.platforms.npu.models.hunyuan_fused_moe.AscendHunyuanFusedMoE"
+
+    @classmethod
+    def prepare_diffusion_op_runtime(cls, op_name: str, **kwargs: Any) -> None:
+        """Run the NPU runtime setup for ``hunyuan_fused_moe``; any other op is a no-op."""
+        if op_name == "hunyuan_fused_moe":
+            # Imported lazily, only when this op is being prepared.
+            from vllm_omni.platforms.npu.models.hunyuan_fused_moe import prepare_hunyuan_fused_moe_runtime
+
+            prepare_hunyuan_fused_moe_runtime()
+
     @classmethod
     def get_diffusion_attn_backend_cls(
         cls,