diff --git a/examples/auto_deploy/nano_v3.yaml b/examples/auto_deploy/nano_v3.yaml
index 1f2cfd0c614..a87e2624254 100644
--- a/examples/auto_deploy/nano_v3.yaml
+++ b/examples/auto_deploy/nano_v3.yaml
@@ -13,8 +13,27 @@ kv_cache_config:
   enable_block_reuse: false
 transforms:
   detect_sharding:
-    sharding_source: ['factory', 'heuristic']
     sharding_dims: ['ep', 'bmm']
+    manual_config:
+      head_dim: 128
+      tp_plan:
+        # mamba SSM layer
+        "in_proj": "mamba"
+        "out_proj": "rowwise"
+        # attention layer
+        "q_proj": "colwise"
+        "k_proj": "colwise"
+        "v_proj": "colwise"
+        "o_proj": "rowwise"
+        # NOTE: consider not sharding shared experts and/or
+        # latent projections at all, keeping them replicated.
+        # To do so, comment out the corresponding entries.
+        # moe layer: SHARED experts
+        "up_proj": "colwise"
+        "down_proj": "rowwise"
+        # MoLE: latent projections: simple shard
+        "fc1_latent_proj": "gather"
+        "fc2_latent_proj": "gather"
   multi_stream_moe:
     stage: compile
     enabled: true
diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py
index d603a8c2712..095e47f299d 100644
--- a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py
+++ b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py
@@ -190,42 +190,6 @@ def get_model_from_config_patched(config, **kwargs):
 # TODO: figure out how this can be incorporated into the export patch system
 AutoModelForCausalLM.from_config = get_model_from_config_patched
 
-# _config_from_pretrained_original = AutoConfig.from_pretrained
-# _nemotron_h_base_model_tp_plan = {
-#     # mamba SSM layer
-#     "in_proj": "mamba",
-#     "out_proj": "rowwise",
-#     # attention layer
-#     "q_proj": "colwise",
-#     "k_proj": "colwise",
-#     "v_proj": "colwise",
-#     "o_proj": "rowwise",
-#     # NOTE: consider not sharding shared experts and/or
-#     # latent projections at all, keeping them replicated.
-#     # To do so, comment out the corresponding entries.
-#     # moe layer: SHARED experts
-#     "up_proj": "colwise",
-#     "down_proj": "rowwise",
-#     # MoLE: latent projections: simple shard
-#     "fc1_latent_proj": "gather",
-#     "fc2_latent_proj": "gather",
-# }
-
-
-# def get_config_from_pretrained_patched(*args, **kwargs):
-#     ret = _config_from_pretrained_original(*args, **kwargs)
-#     config = ret[0] if isinstance(ret, tuple) else ret
-#     # heuristic to check if it's a NemotronH MoE Model
-#     model_type = getattr(config, "model_type", None)
-#     num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
-#     if model_type == "nemotron_h" and num_moe_layers > 0:
-#         config.base_model_tp_plan = _nemotron_h_base_model_tp_plan
-#     return (config, *ret[1:]) if isinstance(ret, tuple) else config
-
-
-# # TODO: figure out how this can be incorporated into the export patch system
-# AutoConfig.from_pretrained = get_config_from_pretrained_patched
-
 # TODO: figure out how this can be incorporated into the export patch system
 # Only patch if the module isn't available
 _mamba_ssm_module = "mamba_ssm"