From 4b6f6f943480c4dde4c01b68c7846c8173198cb5 Mon Sep 17 00:00:00 2001 From: Amir Klein <203507526+amirkl94@users.noreply.github.com> Date: Thu, 15 Jan 2026 14:16:34 +0200 Subject: [PATCH] Use replicated linear latent Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com> --- vllm/model_executor/models/nemotron_h.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 85f36342c5af..ab57ef88265d 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -207,20 +207,19 @@ def __init__( enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, - is_gated=config.mlp_hidden_act != "relu2" # Hack to identify non-gated MoE TODO: find a better way + is_gated=config.mlp_hidden_act + != "relu2", # Hack to identify non-gated MoE TODO: find a better way ) if self.use_latent_moe: # TODO: check if using ReplicatedLinear is better than # ColumnParallelLinear + all_gather - self.fc1_latent_proj = ColumnParallelLinear( + self.fc1_latent_proj = ReplicatedLinear( input_size=config.hidden_size, output_size=self.moe_hidden_size, bias=config.mlp_bias, quant_config=quant_config, disable_tp=self.is_sequence_parallel, - # We need to gather the output to prepare input for moe - gather_output=True, prefix=f"{prefix}.fc1_latent_proj", ) self.fc2_latent_proj = ReplicatedLinear(