From 4b6f6f943480c4dde4c01b68c7846c8173198cb5 Mon Sep 17 00:00:00 2001
From: Amir Klein <203507526+amirkl94@users.noreply.github.com>
Date: Thu, 15 Jan 2026 14:16:34 +0200
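Subject: [PATCH] Use ReplicatedLinear for fc1_latent_proj in nemotron_h

Replace the ColumnParallelLinear (gather_output=True) used for
fc1_latent_proj with ReplicatedLinear, matching fc2_latent_proj.
Also re-wrap the is_gated argument; its value is unchanged.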

Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
---
 vllm/model_executor/models/nemotron_h.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 85f36342c5af..ab57ef88265d 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -207,20 +207,19 @@ def __init__(
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
-            is_gated=config.mlp_hidden_act != "relu2" # Hack to identify non-gated MoE TODO: find a better way
+            is_gated=config.mlp_hidden_act
+            != "relu2",  # Hack to identify non-gated MoE TODO: find a better way
         )
 
         if self.use_latent_moe:
             # TODO: check if using ReplicatedLinear is better than
             # ColumnParallelLinear + all_gather
-            self.fc1_latent_proj = ColumnParallelLinear(
+            self.fc1_latent_proj = ReplicatedLinear(
                 input_size=config.hidden_size,
                 output_size=self.moe_hidden_size,
                 bias=config.mlp_bias,
                 quant_config=quant_config,
                 disable_tp=self.is_sequence_parallel,
-                # We need to gather the output to prepare input for moe
-                gather_output=True,
                 prefix=f"{prefix}.fc1_latent_proj",
             )
             self.fc2_latent_proj = ReplicatedLinear(