diff --git a/modelopt/torch/puzzletron/anymodel/model_descriptor/base.py b/modelopt/torch/puzzletron/anymodel/model_descriptor/base.py index 11ef8120ed8..3c1749d46ec 100644 --- a/modelopt/torch/puzzletron/anymodel/model_descriptor/base.py +++ b/modelopt/torch/puzzletron/anymodel/model_descriptor/base.py @@ -179,6 +179,31 @@ def get_language_model_config(config): """ return config + @staticmethod + def truncate_pattern_for_subblock( + lm_config: Any, parent_layer_index: int | None = None + ) -> None: + """Adjust per-layer config fields so a single-layer model represents the correct layer type. + + The default implementation handles ``hybrid_override_pattern`` for + hybrid architectures. It is a no-op when the field is absent. + Override if a model uses a different pattern alphabet. + """ + pattern = getattr(lm_config, "hybrid_override_pattern", None) + if not pattern: + return + # Strip cosmetic pipe separators (e.g. "M|-|*" -> "M-*") before indexing. + pattern = pattern.replace("|", "") + if not pattern: + raise ValueError( + f"hybrid_override_pattern is set but contains no layer-type characters " + f"(original: {lm_config.hybrid_override_pattern!r})" + ) + if parent_layer_index is not None and 0 <= parent_layer_index < len(pattern): + lm_config.hybrid_override_pattern = pattern[parent_layer_index] + return + lm_config.hybrid_override_pattern = pattern[0] + @classmethod def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module: """Create a dummy block to replace a layer for sharded model initialization.""" diff --git a/modelopt/torch/puzzletron/entrypoint.py b/modelopt/torch/puzzletron/entrypoint.py index 37db84bc513..6a4af31ce3e 100644 --- a/modelopt/torch/puzzletron/entrypoint.py +++ b/modelopt/torch/puzzletron/entrypoint.py @@ -65,15 +65,15 @@ def puzzletron( launch_prune_ckpt(hydra_cfg) dist.barrier() - # Step 4: build_library_and_stats (single process) + # Step 3: build_library_and_stats (single process) if dist.is_master(): launch_build_library_and_stats(hydra_cfg) dist.barrier() - # Step 5: calc_one_block_scores (distributed processing) + # Step 4: calc_one_block_scores (distributed processing) launch_scoring(hydra_cfg) - # Step 6: mip_and_realize_models (distributed processing) + # Step 5: mip_and_realize_models (distributed processing) launch_mip_and_realize_model(hydra_cfg) return hydra_cfg diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py index 108707f6ee2..d893eb55bb3 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py @@ -118,7 +118,12 @@ def calculate_subblock_params( layer_config: BlockConfig | FFNConfig | AttentionConfig, descriptor: Type[ModelDescriptor], ) -> int: - """Count parameters on one meta decoder layer.""" + """Count parameters on one meta decoder layer. + + The caller is responsible for adjusting per-layer config fields (e.g. + ``hybrid_override_pattern``) before passing ``config``; see + ``ModelDescriptor.truncate_pattern_for_subblock``. + """ if isinstance(layer_config, FFNConfig): block_config = layer_config.to_blockconfig() elif isinstance(layer_config, AttentionConfig): diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 89093ac51cc..dc89a1f6450 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -16,6 +16,7 @@ """Calc subblock stats to compute memory and runtime statistics for subblocks.""" +import copy import dataclasses import json import os @@ -150,6 +151,11 @@ def calculate_subblock_stats( subblock_config = subblock_config_indexed["subblock_config"] parent_layer_indices = subblock_config_indexed["parent_layer_indices"] + layer_model_config = copy.deepcopy(model_config) + ModelDescriptor.truncate_pattern_for_subblock( + descriptor.get_language_model_config(layer_model_config), parent_layer_indices[0] + ) + if is_calc_runtime: total_runtime_ms = runtime_by_subblock_dict[subblock_config] prefill_runtime_ms = None @@ -168,17 +174,17 @@ def calculate_subblock_stats( weights_dtype, kv_cache_dtype, allocate_prefill_query, - model_config=model_config, + model_config=layer_model_config, descriptor=descriptor, ) if not isinstance(subblock_memory, dict): subblock_memory = {"memory_mib": subblock_memory, "kv_cache_memory_mib": 0.0} - subblock_params = calculate_subblock_params(model_config, subblock_config, descriptor) + subblock_params = calculate_subblock_params(layer_model_config, subblock_config, descriptor) if moe_stats_file is not None: subblock_active_params = calc_subblock_active_params( subblock_config, - model_config, + layer_model_config, descriptor, n_embd, moe_stats_file, diff --git a/tests/gpu/puzzletron/test_nemotron_h_gpu_validation.py b/tests/gpu/puzzletron/test_nemotron_h_gpu_validation.py new file mode 100644 index 00000000000..4c24e6c69c4 --- /dev/null +++ b/tests/gpu/puzzletron/test_nemotron_h_gpu_validation.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPU validation for Nemotron-H hybrid model subblock parameter counting. + +Requires HuggingFace Hub access to nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base (config only, +no weights are downloaded) and mamba_ssm (CUDA). + +Usage: + pytest -v -s -o addopts= tests/gpu/puzzletron/test_nemotron_h_gpu_validation.py +""" + +import copy + +import pytest + +import modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_model_descriptor # noqa: F401 +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.block_config import FFNConfig +from modelopt.torch.puzzletron.subblock_stats.calc_subblock_params_and_memory import ( + calculate_subblock_params, +) +from modelopt.torch.puzzletron.tools.checkpoint_utils import load_model_config + +MODEL_ID = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base" + + +@pytest.fixture +def nemotron_descriptor(): + return ModelDescriptorFactory.get("nemotron_h_v2") + + +@pytest.fixture +def nemotron_config(nemotron_descriptor): + return load_model_config( + MODEL_ID, trust_remote_code=nemotron_descriptor.requires_trust_remote_code() + ) + + +def test_ffn_variants_produce_distinct_params(nemotron_config, nemotron_descriptor): + """FFN subblocks with different intermediate_size must report different param counts. + + On hybrid models, hybrid_override_pattern must be truncated to match the subblock + type; otherwise a single-layer model always builds layer 0 (Mamba) and every FFN + variant reports identical param counts. + """ + lm_config = nemotron_descriptor.get_language_model_config(nemotron_config) + pattern = lm_config.hybrid_override_pattern.replace("|", "") + ffn_indices = [i for i, c in enumerate(pattern) if c in ("-", "E")] + assert ffn_indices, f"No FFN layers in pattern: {pattern}" + + teacher_size = lm_config.intermediate_size + sizes = [teacher_size // 4, teacher_size // 2, teacher_size] + + param_counts = {} + for size in sizes: + layer_config = copy.deepcopy(nemotron_config) + ModelDescriptor.truncate_pattern_for_subblock( + nemotron_descriptor.get_language_model_config(layer_config), ffn_indices[0] + ) + + params = calculate_subblock_params( + layer_config, FFNConfig(intermediate_size=size), nemotron_descriptor + ) + param_counts[size] = params + print(f" intermediate_size={size:>8d} -> params={params:>12,}") + + assert len(set(param_counts.values())) == len(sizes), ( + f"Expected {len(sizes)} distinct param counts, got: {param_counts}" + ) diff --git a/tests/gpu/torch/puzzletron/test_puzzletron.py b/tests/gpu/torch/puzzletron/test_puzzletron.py index 348fbd96835..a393e1e086a 100644 --- a/tests/gpu/torch/puzzletron/test_puzzletron.py +++ b/tests/gpu/torch/puzzletron/test_puzzletron.py @@ -325,8 +325,8 @@ def _assert_mip_solutions(puzzle_dir: Path, hf_model_name: str): "meta-llama/Llama-3.1-8B-Instruct": 395.63, "meta-llama/Llama-3.2-3B-Instruct": 395.63, "mistralai/Mistral-Small-24B-Instruct-2501": 395.63, - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16": 202.13, - "nvidia/NVIDIA-Nemotron-Nano-12B-v2": 202.13, + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16": 432.81, + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": 197.63, "openai/gpt-oss-20b": 437.33, "Qwen/Qwen2.5-7B-Instruct": 386.25, "Qwen/Qwen3-8B": 395.63, @@ -339,8 +339,8 @@ def _assert_mip_solutions(puzzle_dir: Path, hf_model_name: str): "meta-llama/Llama-3.1-8B-Instruct": 6096128, "meta-llama/Llama-3.2-3B-Instruct": 6096128, "mistralai/Mistral-Small-24B-Instruct-2501": 6096128, - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16": 5309184, - "nvidia/NVIDIA-Nemotron-Nano-12B-v2": 5309184, + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16": 126255872, + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": 2949888, "openai/gpt-oss-20b": 27959168, "Qwen/Qwen2.5-7B-Instruct": 1181696, "Qwen/Qwen3-8B": 6096640, diff --git a/tests/unit/torch/puzzletron/test_hybrid_pattern_truncation.py b/tests/unit/torch/puzzletron/test_hybrid_pattern_truncation.py new file mode 100644 index 00000000000..c19626f10be --- /dev/null +++ b/tests/unit/torch/puzzletron/test_hybrid_pattern_truncation.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ModelDescriptor.truncate_pattern_for_subblock. + +Validates that the base descriptor method selects the correct pattern +character when building a 1-layer model for per-subblock param counting. +""" + +from types import SimpleNamespace + +import pytest + +pytest.importorskip("transformers") + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor + +NEMOTRON_H_PATTERN = "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" + + +class TestTruncatePatternForSubblock: + """Test ModelDescriptor.truncate_pattern_for_subblock.""" + + @pytest.mark.parametrize( + ("index", "expected"), + [ + (0, "M"), + (1, "-"), + (7, "*"), + ], + ids=["mamba", "ffn", "attention"], + ) + def test_index_selects_correct_layer_type(self, index, expected): + """Parent layer index selects the matching character from the pattern.""" + cfg = _make_config() + + ModelDescriptor.truncate_pattern_for_subblock(cfg, parent_layer_index=index) + + assert cfg.hybrid_override_pattern == expected + + @pytest.mark.parametrize( + ("index", "expected"), + [ + (1, "-"), + (2, "*"), + ], + ids=["ffn_after_strip", "attention_after_strip"], + ) + def test_pipe_separators_stripped_before_indexing(self, index, expected): + """Pipe-delimited patterns like 'M|-|*' are normalised to 'M-*' before lookup.""" + cfg = _make_config("M|-|*") + + ModelDescriptor.truncate_pattern_for_subblock(cfg, parent_layer_index=index) + + assert cfg.hybrid_override_pattern == expected + + def test_missing_attribute_is_noop(self): + """Config without hybrid_override_pattern is left unchanged.""" + cfg = SimpleNamespace() + + ModelDescriptor.truncate_pattern_for_subblock(cfg, parent_layer_index=0) + + assert not hasattr(cfg, "hybrid_override_pattern") + + def test_empty_pattern_is_noop(self): + """Empty pattern string is left unchanged.""" + cfg = _make_config("") + + ModelDescriptor.truncate_pattern_for_subblock(cfg, parent_layer_index=0) + + assert cfg.hybrid_override_pattern == "" + + def test_pipes_only_pattern_raises(self): + """Pattern with only pipe separators has no layer-type characters and should error.""" + cfg = _make_config("|||") + + with pytest.raises(ValueError, match="no layer-type characters"): + ModelDescriptor.truncate_pattern_for_subblock(cfg, parent_layer_index=0) + + def test_none_index_defaults_to_first_char(self): + """Without an explicit index, defaults to pattern[0].""" + cfg = _make_config("*-M") + + ModelDescriptor.truncate_pattern_for_subblock(cfg) + + assert cfg.hybrid_override_pattern == "*" + + @pytest.mark.parametrize( + "index", + [999, -1], + ids=["above_range", "negative"], + ) + def test_out_of_range_index_defaults_to_first_char(self, index): + """Out-of-range index defaults to pattern[0].""" + cfg = _make_config("*-M") + + ModelDescriptor.truncate_pattern_for_subblock(cfg, parent_layer_index=index) + + assert cfg.hybrid_override_pattern == "*" + + +def _make_config(pattern=NEMOTRON_H_PATTERN): + return SimpleNamespace(hybrid_override_pattern=pattern)