diff --git a/tests/full_tests/ci_e2e_discoverable_tests.sh b/tests/full_tests/ci_e2e_discoverable_tests.sh
index d28ddab62..083b5ad61 100755
--- a/tests/full_tests/ci_e2e_discoverable_tests.sh
+++ b/tests/full_tests/ci_e2e_discoverable_tests.sh
@@ -314,12 +314,12 @@ run_gsm8k_deepseek_test() {
 
 
 # GSM8K on deepseek v2 lite + unified attn
-run_gsm8k_deepseek_unified_mla_test() {
-    echo "➡️ Testing GSM8K on deepseek v2 lite + Unified MLA..."
-    VLLM_UNIFIED_ATTN=true VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 \
-    pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml"
-    echo "✅ GSM8K Test with deepseek v2 lite + Unified MLA passed."
-}
+#run_gsm8k_deepseek_unified_mla_test() {
+#    echo "➡️ Testing GSM8K on deepseek v2 lite + Unified MLA..."
+#    VLLM_UNIFIED_ATTN=true VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 \
+#    pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml"
+#    echo "✅ GSM8K Test with deepseek v2 lite + Unified MLA passed."
+#}
 
 # GSM8K on QWEN3-30B-A3B
 run_gsm8k_qwen3_30b_test() {
@@ -478,7 +478,7 @@ launch_all_tests() {
     run_gsm8k_granite_async_test
     run_gsm8k_granite_test_unified_attn_async
     run_gsm8k_deepseek_test
-    run_gsm8k_deepseek_unified_mla_test
+    #run_gsm8k_deepseek_unified_mla_test
     run_gsm8k_qwen3_30b_test
     run_spec_decode_ngram_test
     run_spec_decode_eagle3_test
diff --git a/tests/unit_tests/multimodal/test_hpu_multimodal_inputs.py b/tests/unit_tests/multimodal/test_hpu_multimodal_inputs.py
index f8e6adc77..322e03498 100644
--- a/tests/unit_tests/multimodal/test_hpu_multimodal_inputs.py
+++ b/tests/unit_tests/multimodal/test_hpu_multimodal_inputs.py
@@ -16,35 +16,50 @@
 )
 
 
-def _dummy_items_from_tensors(tensors: NestedTensors, modalitity: str = "image"):
+def _dummy_items_from_tensors(tensors: NestedTensors, modality: str = "image"):
     """
-    Creates MultiModalKwargsItems from a dict of modality to tensor.
+    Creates MultiModalKwargsItems from a list of tensors.
     """
-    elems = [
-        MultiModalFieldElem(modality=modalitity, key="key", data=t, field=MultiModalBatchedField()) for t in tensors
+    items = [
+        MultiModalKwargsItem({"key": MultiModalFieldElem(data=t, field=MultiModalBatchedField())}) for t in tensors
     ]
-    items = [MultiModalKwargsItem({"key": elem}) for elem in elems]
-    mm_items = MultiModalKwargsItems.from_seq(items)
+    mm_items = MultiModalKwargsItems({modality: items})
     return mm_items
 
 
 def _dummy_items_from_tensor_modalities(modality_tensor_dict: NestedTensors):
-    elems = [
-        MultiModalFieldElem(modality=modality, key="key", data=t, field=MultiModalBatchedField())
-        for modality, tensors in modality_tensor_dict.items() for t in tensors
-    ]
-    items = [MultiModalKwargsItem({"key": elem}) for elem in elems]
-    mm_items = MultiModalKwargsItems.from_seq(items)
+    """
+    Creates MultiModalKwargsItems from a dict of modality to list of tensors.
+    """
+    items_by_modality = {}
+    for modality, tensors in modality_tensor_dict.items():
+        items = [
+            MultiModalKwargsItem({"key": MultiModalFieldElem(data=t, field=MultiModalBatchedField())}) for t in tensors
+        ]
+        items_by_modality[modality] = items
+    mm_items = MultiModalKwargsItems(items_by_modality)
     return mm_items
 
 
-def _dummy_items_from_tensor_keys(key_tensor_dict: NestedTensors, modality: str = "image"):
-    elems = [
-        MultiModalFieldElem(modality=modality, key=key, data=t, field=MultiModalBatchedField())
-        for key, tensors in key_tensor_dict.items() for t in tensors
-    ]
-    items = [MultiModalKwargsItem({elem.key: elem}) for elem in elems]
-    mm_items = MultiModalKwargsItems.from_seq(items)
+def _dummy_items_from_tensor_keys(key_tensor_dict: dict[str, list], modality: str = "image"):
+    """
+    Creates MultiModalKwargsItems from a dict of key names to list of tensors.
+    Creates items where each position combines tensors from all keys at that index.
+    For example: {"key1": [t1, t2], "key2": [t3, t4]} creates:
+      - Item 0: {key1: t1, key2: t3}
+      - Item 1: {key1: t2, key2: t4}
+    """
+    # Get the number of items (should be same length for all keys)
+    num_items = len(next(iter(key_tensor_dict.values())))
+
+    items = []
+    for i in range(num_items):
+        item_dict = {}
+        for key, tensors in key_tensor_dict.items():
+            item_dict[key] = MultiModalFieldElem(data=tensors[i], field=MultiModalBatchedField())
+        items.append(MultiModalKwargsItem(item_dict))
+
+    mm_items = MultiModalKwargsItems({modality: items})
     return mm_items
 
 
@@ -58,19 +73,13 @@ def assert_nested_tensors_equal_hpu(expected: NestedTensors, actual: NestedTenso
             assert_nested_tensors_equal_hpu(expected_item, actual_item)
 
 
-def assert_multimodal_kwargs_items_equal_hpu(expected_elems: MultiModalKwargsItem, actual: dict[str, NestedTensors]):
+def assert_multimodal_kwargs_items_equal_hpu(expected: dict[str, NestedTensors], actual: dict[str, NestedTensors]):
     """HPU-aware assertion for multimodal input equality."""
 
-    assert set(expected_elems.keys()) == set(actual.keys())
-
-    for key in expected_elems:
-        if isinstance(expected_elems[key], list):
-            assert len(expected_elems[key]) == len(actual[key])
-            for expected_item, actual_item in zip(expected_elems[key], actual[key]):
-                assert_nested_tensors_equal_hpu(expected_item, actual_item)
-            continue
+    assert set(expected.keys()) == set(actual.keys())
 
-        assert_nested_tensors_equal_hpu(expected_elems[key], actual[key].data)
+    for key in expected:
+        assert_nested_tensors_equal_hpu(expected[key], actual[key])
 
 
 @pytest.mark.parametrize(
@@ -200,7 +209,7 @@ def test_hpu_device_mismatch_handling(tensor_shapes):
         result = dummy_kwargs_items.get_data()
         expected = {"key": [hpu_tensor, cpu_tensor]}
         # If successful, verify structure
-        assert_multimodal_kwargs_items_equal_hpu(result, expected)
+        assert_multimodal_kwargs_items_equal_hpu(expected, result)
     except (RuntimeError, ValueError) as e:
         # Expected behavior for device mismatch
         assert "device" in str(e).lower() or "hpu" in str(e).lower()
@@ -240,19 +249,29 @@ def test_hpu_tensor_batching_sizes(tensor_size, batch_count):
 
 
 def test_hpu_multiple_modalities():
-    """Test MultiModalKwargsItems key handling."""
+    """Test MultiModalKwargsItems handling of multiple modalities with different keys."""
     device = "hpu"
 
-    # Test multiple modalities
+    # Test multiple modalities - each should have its own key
+    # This simulates a realistic scenario where different modalities
+    # produce different output keys (e.g., pixel_values for images, audio_features for audio)
     image_tensor = torch.rand([3, 224, 224], device=device, dtype=torch.bfloat16)
     audio_tensor = torch.rand([1000], device=device, dtype=torch.bfloat16)
 
-    batch_data = {"image": [image_tensor], "audio": [audio_tensor]}
+    # Create items with different keys for different modalities
+    image_items = [
+        MultiModalKwargsItem({"pixel_values": MultiModalFieldElem(data=image_tensor, field=MultiModalBatchedField())})
+    ]
+    audio_items = [
+        MultiModalKwargsItem({"audio_features": MultiModalFieldElem(data=audio_tensor, field=MultiModalBatchedField())})
+    ]
+
+    dummy_kwargs_items = MultiModalKwargsItems({"image": image_items, "audio": audio_items})
 
-    dummy_kwargs_items: MultiModalKwargsItems = _dummy_items_from_tensor_modalities(batch_data)
     result = dummy_kwargs_items.get_data()
 
-    expected = {"key": [image_tensor, audio_tensor]}
+    # Each modality should have its own key in the output
+    expected = {"pixel_values": image_tensor.unsqueeze(0), "audio_features": audio_tensor.unsqueeze(0)}
 
     assert_multimodal_kwargs_items_equal_hpu(expected, result)
 
diff --git a/tests/unit_tests/test_prefix_caching.py b/tests/unit_tests/test_prefix_caching.py
index 9688a81ff..e0c22f6c0 100644
--- a/tests/unit_tests/test_prefix_caching.py
+++ b/tests/unit_tests/test_prefix_caching.py
@@ -16,7 +16,6 @@
 def get_vllm_config():
     model_config = ModelConfig(
         model="facebook/opt-125m",
-        task="generate",
         tokenizer="facebook/opt-125m",
         tokenizer_mode="auto",
         trust_remote_code=True,
diff --git a/tests/unit_tests/worker/test_hpu_model_runner.py b/tests/unit_tests/worker/test_hpu_model_runner.py
index 80164e6e3..bf54979d6 100644
--- a/tests/unit_tests/worker/test_hpu_model_runner.py
+++ b/tests/unit_tests/worker/test_hpu_model_runner.py
@@ -62,7 +62,6 @@ def initialize_kv_cache(runner: HPUModelRunner):
 def get_vllm_config():
     model_config = ModelConfig(
         model="facebook/opt-125m",
-        task="generate",
         tokenizer="facebook/opt-125m",
         tokenizer_mode="auto",
         trust_remote_code=True,
diff --git a/vllm_gaudi/__init__.py b/vllm_gaudi/__init__.py
index 212d4a41b..3ca74b6dd 100644
--- a/vllm_gaudi/__init__.py
+++ b/vllm_gaudi/__init__.py
@@ -14,6 +14,8 @@ def register_utils():
 
 
 def register_ops():
+    """Register custom PluggableLayers for the HPU platform"""
+    import vllm_gaudi.attention.oot_mla  # noqa: F401
     """Register custom ops for the HPU platform."""
     import vllm_gaudi.v1.sample.hpu_rejection_sampler  # noqa: F401
     import vllm_gaudi.distributed.kv_transfer.kv_connector.v1.hpu_nixl_connector  # noqa: F401
diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py
index 7f4d56f3b..370d479b8 100644
--- a/vllm_gaudi/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/attention/backends/hpu_attn.py
@@ -18,7 +18,7 @@
 
 from vllm.v1.attention.backend import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata,
                                        AttentionType)
-from vllm.model_executor.layers.attention.mla_attention import MLACommonImpl
+from vllm.model_executor.layers.attention.mla_attention import (MLACommonImpl)
 from vllm_gaudi.attention.ops.hpu_paged_attn import (HPUPagedAttention, HPUPagedAttentionMetadata,
                                                      HPUPagedAttentionMetadataBuilder)
 
@@ -281,53 +281,16 @@ def __init__(
                                                  f"heads in the layer. Sinks shape: {sinks.shape}, "
                                                  f"num_heads: {num_heads}.")
 
-    def forward(
-        self,
-        layer: AttentionLayer,
-        q: torch.Tensor,
-        k_c_normed: torch.Tensor,  # key in unified attn
-        k_pe: torch.Tensor,  # value in unified attn
-        kv_cache: torch.Tensor,
-        attn_metadata: HPUAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if output is not None:
-            raise NotImplementedError("output is not yet supported for MLAImplBase")
-
-        is_prefill = attn_metadata.is_prompt
-
-        if not is_prefill:
-            # decode
-            q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
-            # Convert from (B, N, P) to (N, B, P)
-            q_nope = q_nope.transpose(0, 1)
-            # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
-            decode_ql_nope = torch.bmm(q_nope, self.W_UK_T)
-            # Convert from (N, B, L) to (B, N, L)
-            decode_ql_nope = decode_ql_nope.transpose(0, 1)
-
-        slot_mapping = attn_metadata.slot_mapping.flatten() if attn_metadata.slot_mapping is not None else None
-
-        latent_vec_k = torch.concat((k_c_normed, k_pe.view(*k_c_normed.shape[:-1], self.qk_rope_head_dim)), dim=-1)
-        latent_vec_k = latent_vec_k.view(-1, self.qk_rope_head_dim + self.kv_lora_rank)
-
-        # write the latent and rope to kv cache
-        if kv_cache is not None and len(kv_cache) >= 2:
-            self.latent_cache_k(latent_vec_k, kv_cache[0], slot_mapping)
-            k_cache = kv_cache[0]
-
-        if is_prefill:
-            return self._forward_prefill(q, latent_vec_k, k_cache, attn_metadata)
-        else:
-            return self._forward_decode(decode_ql_nope, q_pe, k_cache, attn_metadata)
-
-    def _forward_prefill(  # type: ignore
+    def forward_mha(  # type: ignore
             self, q: torch.Tensor, latent_vec_k: torch.Tensor, k_cache: torch.Tensor,
             attn_metadata: HPUAttentionMetadata) -> torch.Tensor:
 
         ##### get prefix cache #####
         if attn_metadata.block_list is not None:
             current = latent_vec_k
+            # Patch for vllm-gaudi kv_cache tuple format.
+            if isinstance(k_cache, tuple):
+                k_cache = k_cache[0]  # Use only key_cache for MLA
             past = self.latent_cache_k.fetch_from_cache(k_cache.unflatten(0, (-1, attn_metadata.block_size)),
                                                         attn_metadata.block_list)
             past = past.view(-1, past.shape[-1])
@@ -382,9 +345,14 @@ def _forward_prefill(  # type: ignore
 
         return output.reshape(-1, self.num_heads * v.shape[-1])
 
-    def _forward_decode(  # type: ignore
+    def forward_mqa(  # type: ignore
             self, q_nope: torch.Tensor, q_pe: torch.Tensor, k_cache: torch.Tensor,
             attn_metadata: HPUAttentionMetadata) -> torch.Tensor:
+        if k_cache is not None and isinstance(k_cache, tuple):
+            key_cache, value_cache, k_scales, v_scales = \
+                HPUPagedAttention.split_kv_cache(k_cache, self.num_kv_heads, self.head_size)
+        if isinstance(k_cache, tuple):
+            k_cache = k_cache[0]  # Use only key_cache for MLA
         query = torch.cat([q_nope, q_pe], dim=-1)
         key_cache = k_cache.unsqueeze(1)
         value_cache = None
@@ -404,8 +372,7 @@ def _forward_decode(  # type: ignore
                                                   keys_fetch_func=self.latent_cache_k.fetch_from_cache,
                                                   values_fetch_func=None,
                                                   kv_lora_rank=self.kv_lora_rank)
-        result = self._v_up_proj(output)
-        return result
+        return output
 
     # NOTE(Xinyu): Make the loaded weight contiguous to avoid the transpose
     # during each graph execution
@@ -1228,15 +1195,8 @@ def get_supported_head_sizes() -> list[int]:
     def is_mla(cls) -> bool:
         return True
 
-    def _forward_decode(self, *args, **kwargs) -> torch.Tensor:
+    def forward_mqa(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("Use forward method for HPUUnifiedMLAImpl")
 
-    def _forward_prefill(self, *args, **kwargs) -> torch.Tensor:
+    def forward_mha(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("Use forward method for HPUUnifiedMLAImpl")
-
-    def process_weights_after_loading(self, act_dtype: torch.dtype):
-        # Parent MLACommonImpl extracts W_UV and W_UK_T from kv_b_proj weights
-        # These projection matrices are used for latent ↔ full space conversions
-        super().process_weights_after_loading(act_dtype)
-        self.W_UV: torch.Tensor = self.W_UV.contiguous()
-        self.W_UK_T: torch.Tensor = self.W_UK_T.contiguous()
diff --git a/vllm_gaudi/attention/oot_mla.py b/vllm_gaudi/attention/oot_mla.py
new file mode 100644
index 000000000..cc913d526
--- /dev/null
+++ b/vllm_gaudi/attention/oot_mla.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+import os
+
+from vllm.config import get_current_vllm_config
+from vllm.model_executor.custom_op import PluggableLayer
+from vllm.model_executor.layers.attention import MLAAttention
+from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper
+from vllm_gaudi.extension.utils import VLLMKVCache
+from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache)
+from vllm_gaudi.extension.unified import HPUUnifiedAttentionMetadata
+import vllm_gaudi.extension.kernels as kernels
+
+
+class HPUMLAAttention(MLAAttention):
+
+    scale: float
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.enable_fp8_attn = self.kv_cache_dtype == 'fp8_inc' and os.environ.get('QUANT_CONFIG', None) is None
+        self.latent_cache_k = VLLMKVCache() if not self.enable_fp8_attn else VLLMFP8KVCache()
+        self.scale = float(self.scale)
+        self.matmul_qk = Matmul() if not self.enable_fp8_attn \
+            else FP8Matmul()
+        self.softmax = Softmax()
+        self.matmul_av = Matmul() if not self.enable_fp8_attn \
+            else FP8Matmul()
+        self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \
+            else FP8Matmul()
+        self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \
+            else FP8Matmul()
+        self.k_cache = VLLMKVCache() if not self.enable_fp8_attn \
+            else VLLMFP8KVCache()
+        self.v_cache = VLLMKVCache(is_v_cache=True) if not self.enable_fp8_attn \
+            else VLLMFP8KVCache()
+        HPUFusedSDPA = kernels.fsdpa()
+        self.fused_scaled_dot_product_attention = None if HPUFusedSDPA is None \
+            else ModuleFusedSDPA(HPUFusedSDPA)
+
+    def forward_impl(
+        self,
+        q: torch.Tensor,
+        k_c_normed: torch.Tensor,  # key in unified attn
+        k_pe: torch.Tensor,  # value in unified attn
+        kv_cache: torch.Tensor,
+        attn_metadata: "HPUUnifiedAttentionMetadata",
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if output is not None:
+            raise NotImplementedError("output is not yet supported for MLAImplBase")
+
+        is_prefill = attn_metadata.is_prompt
+
+        if not is_prefill:
+            # decode
+            q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+            # Convert from (B, N, P) to (N, B, P)
+            q_nope = q_nope.transpose(0, 1)
+            # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
+            decode_ql_nope = torch.bmm(q_nope, self.W_UK_T)
+            # Convert from (N, B, L) to (B, N, L)
+            decode_ql_nope = decode_ql_nope.transpose(0, 1)
+
+        slot_mapping = attn_metadata.slot_mapping.flatten() if attn_metadata.slot_mapping is not None else None
+
+        latent_vec_k = torch.concat((k_c_normed, k_pe.view(*k_c_normed.shape[:-1], self.qk_rope_head_dim)), dim=-1)
+        latent_vec_k = latent_vec_k.view(-1, self.qk_rope_head_dim + self.kv_lora_rank)
+
+        # write the latent and rope to kv cache
+        if kv_cache is not None and len(kv_cache) >= 2:
+            self.latent_cache_k(latent_vec_k, kv_cache[0], slot_mapping)
+
+        if is_prefill:
+            output = self.impl.forward_mha(q, latent_vec_k, kv_cache, attn_metadata)
+            return output
+        else:
+            output = self.impl.forward_mqa(decode_ql_nope, q_pe, kv_cache, attn_metadata)
+            output = self._v_up_proj(output)
+            return output
+            # NOTE(Xinyu): Make the loaded weight contiguous to avoid the transpose
+
+    # during each graph execution
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        MLAAttention.process_weights_after_loading(self, act_dtype)
+        #super(MLAAttention, self).process_weights_after_loading(act_dtype)
+        self.W_UV: torch.Tensor = self.W_UV.contiguous()
+        self.W_UK_T: torch.Tensor = self.W_UK_T.contiguous()
+
+    # NOTE(Chendi): PR25184 using output buffer as default, which can't be used in HPU Graph,
+    # so we override and always return a new tensor
+    def _v_up_proj(self, x):
+        # Convert from (B, N, L) to (N, B, L)
+        x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
+        # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
+        x = torch.bmm(x, self.W_UV)
+        # Convert from (N, B, V) to (B, N * V)
+        x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim)
+        return x
+
+
+@PluggableLayer.register_oot(name="MultiHeadLatentAttentionWrapper")
+class HPUMultiHeadLatentAttentionWrapper(MultiHeadLatentAttentionWrapper):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        scale: float,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int | None,
+        kv_lora_rank: int,
+        mla_modules,
+        cache_config=None,
+        quant_config=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            hidden_size=hidden_size,
+            num_heads=num_heads,
+            scale=scale,
+            qk_nope_head_dim=qk_nope_head_dim,
+            qk_rope_head_dim=qk_rope_head_dim,
+            v_head_dim=v_head_dim,
+            q_lora_rank=q_lora_rank,
+            kv_lora_rank=kv_lora_rank,
+            mla_modules=mla_modules,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+        layer_name = f"{prefix}.attn"
+        static_ctx = get_current_vllm_config().compilation_config.static_forward_context
+        static_ctx.pop(layer_name, None)
+        self.mla_attn = HPUMLAAttention(
+            num_heads=self.num_heads,
+            scale=scale,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            v_head_dim=self.v_head_dim,
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=layer_name,
+            kv_b_proj=self.kv_b_proj,
+            use_sparse=self.is_sparse,
+            indexer=self.indexer,
+        )
diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index 1b1c363be..db1ab7468 100755
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -1,3 +1,4 @@
+from collections.abc import Callable
 from functools import partial
 from typing import Union
 
@@ -25,6 +26,10 @@ def __init__(self, *args, **kwargs):
             and vllm_config.model_config.hf_config is not None:
             self.model_type = vllm_config.model_config.hf_config.model_type
 
+    def _select_monolithic(self) -> Callable:
+        """Overriding base method"""
+        return self.apply_monolithic
+
     @property
     def is_monolithic(self) -> bool:
         return True
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 6e00378ec..812047e96 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -1374,7 +1374,7 @@ def _extract_mm_kwargs(
     # source: vllm/v1/worker/gpu_model_runner.py
     def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput", req_ids: list[str]):
         # Batch the multi-modal inputs.
-        mm_kwargs = list[MultiModalKwargsItem]()
+        mm_kwargs = list[tuple[str, MultiModalKwargsItem]]()
         # List of tuple (mm_hash, pos_info)
         mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
         for req_id in req_ids:
@@ -1384,7 +1384,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput", req_ids: list
                 mm_hash = mm_feature.identifier
                 if mm_hash in self.encoder_cache:
                     continue
-                mm_kwargs.append(mm_feature.data)
+                mm_kwargs.append((mm_feature.modality, mm_feature.data))
                 mm_hashes_pos.append((mm_hash, mm_feature.mm_position))
 
         if not mm_kwargs:
@@ -1435,10 +1435,21 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput", req_ids: list
                 mm_hashes_pos,
                 encoder_outputs,
         ):
-            if req_id not in self.encoder_cache:
-                self.encoder_cache[req_id] = {}
+            is_embed = pos_info.is_embed
+            if is_embed is not None:
+                is_embed = is_embed.to(device=output.device)
 
-            self.encoder_cache[mm_hash] = output
+            if is_embed is None:
+                scattered_output = output
+            else:
+                placeholders = output.new_full(
+                    (is_embed.shape[0], output.shape[-1]),
+                    fill_value=torch.nan,
+                )
+                placeholders[is_embed] = output
+                scattered_output = placeholders
+
+            self.encoder_cache[mm_hash] = scattered_output
 
     # modified from: vllm/v1/worker/gpu_model_runner.py
     def _gather_mm_embeddings(
@@ -1456,7 +1467,6 @@ def _gather_mm_embeddings(
 
         req_start_idx = 0
         for req_id in req_ids:
-            mm_embeds_req: list[torch.Tensor] = []
             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
             req_state = self.requests[req_id]
             num_computed_tokens = \
@@ -1498,11 +1508,15 @@ def _gather_mm_embeddings(
                 else:
                     mm_embeds_item = encoder_output[start_idx:end_idx]
 
+                sliced_output = encoder_output[start_idx:end_idx]
+                mm_embeds_item = sliced_output if is_embed is None else sliced_output[is_embed]
+
                 req_start_pos = req_start_idx + start_pos - num_computed_tokens
-                is_mm_embed[req_start_pos + start_idx:req_start_pos +
-                            end_idx] = (True if is_embed is None else is_embed)
-                mm_embeds_req.append(mm_embeds_item)
-            mm_embeds.extend(mm_embeds_req)
+                is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \
+                    = True
+
+                # Only whole mm items are processed
+                mm_embeds.append(mm_embeds_item)
             req_start_idx += num_scheduled_tokens
 
         # Convert bool tensor to index tensor for merge embedding statically if optimized mm
diff --git a/vllm_gaudi/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py
index d79a77e08..243490eb0 100644
--- a/vllm_gaudi/v1/worker/hpu_worker.py
+++ b/vllm_gaudi/v1/worker/hpu_worker.py
@@ -17,7 +17,6 @@
 from vllm_gaudi.extension.profiler import (HabanaMemoryProfiler, format_bytes, setup_profiler)
 from vllm_gaudi.extension.runtime import get_config
 
-import vllm.envs as envs
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment)
 from vllm.distributed.kv_transfer import (
@@ -96,8 +95,10 @@ def __init__(
 
     def init_profiler(self):
         """Initialize the profiler."""
-        if envs.VLLM_TORCH_PROFILER_DIR:
-            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+        torch_profiler_dir = os.getenv('VLLM_TORCH_PROFILER_DIR')
+        if torch_profiler_dir:
+            logger.warning("VLLM_TORCH_PROFILER_DIR is deprecated!")
+            torch_profiler_trace_dir = torch_profiler_dir
             logger.info("Profiling enabled. Traces will be saved to: %s", torch_profiler_trace_dir)
             if os.getenv('VLLM_PROFILER_ENABLED') == 'full':
                 fn = self.model_runner.profiler.full_trace_handler