
Commit 7905d6c

[#9098][feat] Simple sharding latent experts (#9099)
Signed-off-by: greg-kwasniewski1 <[email protected]>
1 parent fbf6c16 · commit 7905d6c

2 files changed: +9 -3 lines changed

tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 0 additions & 2 deletions
@@ -73,15 +73,13 @@ transforms:
     stage: pattern_matcher
   quantize_mxfp4_moe:
     stage: pattern_matcher
-  # TODO: Infer sharding parameters (tp_size, row/column sharding) from the model config.
   detect_sharding:
     stage: sharding
     simple_shard_only: false
     sharding_source: ['factory','heuristic']
     support_partial_config: true
     sharding_dims: ['tp', 'ep', 'bmm']
     requires_shape_prop: true
-  # TODO: (hg) need to ensure run_shape_prop after sharding.
   sharding_transform_executor:
     stage: sharding
     run_shape_prop: true
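
For reference, a hypothetical override sketch (not part of this commit) of the detect_sharding options shown above, forcing the simple (gather) shard and restricting sharding to the TP dimension. It assumes the same transforms config schema as default.yaml, reuses only keys that appear in the hunk above, and the inline comments are interpretations rather than documented semantics:

transforms:
  detect_sharding:
    stage: sharding
    simple_shard_only: true          # assumption: always fall back to the simple (gather) shard
    sharding_source: ['heuristic']   # assumption: rely on heuristics only, not the factory tp_plan
    support_partial_config: true
    sharding_dims: ['tp']            # assumption: disable EP and BMM sharding
    requires_shape_prop: true
  sharding_transform_executor:
    stage: sharding
    run_shape_prop: true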

tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py

Lines changed: 9 additions & 1 deletion
@@ -187,15 +187,23 @@ def get_model_from_config_patched(config, **kwargs):
 
 _config_from_pretrained_original = AutoConfig.from_pretrained
 _nemotron_h_base_model_tp_plan = {
+    # mamba SSM layer
     "in_proj": "mamba",
     "out_proj": "rowwise",
+    # attention layer
     "q_proj": "colwise",
     "k_proj": "colwise",
     "v_proj": "colwise",
     "o_proj": "rowwise",
+    # NOTE: consider not sharding shared experts and/or
+    # latent projections at all, keeping them replicated.
+    # To do so, comment out the corresponding entries.
+    # moe layer: SHARED experts
     "up_proj": "colwise",
     "down_proj": "rowwise",
-    # "*": "gather",
+    # MoLE: latent projections: simple shard
+    "fc1_latent_proj": "gather",
+    "fc2_latent_proj": "gather",
 }
 
 