diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 76dba6cca3..c97d0aebf6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -407,6 +407,7 @@ jobs: - script: L2_Launch_quantization_aware_training - script: L2_Launch_quantization_export - script: L2_Launch_recipes_llama_cuda_graphs + - script: L2_Launch_utils needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f9024c9d3..faef42e98f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -199,6 +199,18 @@ Unit tests are stored at `tests/unit_tests`. Please add your test to an existing **Functional tests** are integration tests that perform model training or operate on larger artifacts. We use pytest for writing these. In some cases, it might be desired to run your test (or parts of it) in a subprocess to avoid process contamination. We use `subprocess.run` for this inside the pytest function. Please add your test into one of the predefined folders. If none of the folders matches semantically, please reach out to the `@nvidia-nemo/automation` in your PR for consultation. +### Functional Test Launcher Scripts + +Functional tests that take longer to run should be placed in an `L2_Launch_*.sh` launcher script inside the [`tests/functional_tests/`](tests/functional_tests/) folder. These launcher scripts allow CI to run test groups in parallel, significantly reducing overall pipeline time. + +When adding a new `L2_Launch_*.sh` file, you **must** also update [`.github/workflows/cicd-main.yml`](.github/workflows/cicd-main.yml) to include it in the `cicd-functional-tests` job matrix. Add a new entry under `matrix.include`, for example: + +```yaml +- script: L2_Launch_your_new_test +``` + +Without this step, your new launcher script will not be picked up by CI. + ## 📦 Dependencies Management We use [uv](https://docs.astral.sh/uv/) for managing dependencies. 
For reproducible builds, our project tracks the generated `uv.lock` file in the repository. diff --git a/tests/functional_tests/models/ministral3_vl/__init__.py b/tests/functional_tests/models/ministral3_vl/__init__.py deleted file mode 100644 index 341a77c5bc..0000000000 --- a/tests/functional_tests/models/ministral3_vl/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/functional_tests/models/ministral3_vl/test_ministral3_vl_conversion.py b/tests/functional_tests/models/ministral3_vl/test_ministral3_vl_conversion.py deleted file mode 100644 index 1d2d0df65c..0000000000 --- a/tests/functional_tests/models/ministral3_vl/test_ministral3_vl_conversion.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import subprocess -from pathlib import Path - -import pytest -import torch - - -# Ministral 3 VL toy model configuration based on typical Ministral 3 VL structure -# This is a minimized version for testing purposes -HF_MINISTRAL3_VL_TOY_MODEL_CONFIG = { - "architectures": ["Mistral3ForConditionalGeneration"], - "model_type": "mistral3", - "torch_dtype": "bfloat16", - "transformers_version": "5.0.0", - "image_token_index": 10, - "text_config": { - "model_type": "mistral3_text", - "hidden_size": 512, - "intermediate_size": 1536, # 3 * hidden_size for FFN - "num_hidden_layers": 4, - "num_attention_heads": 8, - "num_key_value_heads": 2, # GQA with 4 query groups - "vocab_size": 32768, - "max_position_embeddings": 4096, - "rms_norm_eps": 1e-5, - "tie_word_embeddings": True, - "rope_theta": 1000000, - "rope_parameters": { - "rope_type": "yarn", - "factor": 16.0, - "original_max_position_embeddings": 16384, - "llama_4_scaling_beta": 0.0, - }, - }, - "vision_config": { - "model_type": "pixtral", - "hidden_size": 256, - "intermediate_size": 1024, - "num_hidden_layers": 4, - "num_attention_heads": 4, - "image_size": 448, - "patch_size": 14, - "num_channels": 3, - }, - "spatial_merge_size": 2, - "vision_feature_layer": -1, -} - - -class TestMinistral3VLConversion: - """ - Test Ministral 3 VL model conversion from local HuggingFace model with different parallelism configurations. - """ - - @pytest.fixture(scope="class") - def ministral3_vl_toy_model_path(self, tmp_path_factory): - """ - Create and save a HuggingFace Ministral 3 VL toy model from config to a temporary directory. 
- - Args: - tmp_path_factory: Pytest temporary path factory for class-scoped fixtures - - Returns: - str: Path to the saved HuggingFace model directory - """ - # Skip if transformers doesn't have Mistral3 support - pytest.importorskip("transformers", minversion="5.0.0") - - try: - from transformers import Mistral3ForConditionalGeneration - from transformers.models.mistral3.configuration_mistral3 import Mistral3Config - except ImportError: - pytest.skip("Mistral3ForConditionalGeneration not available in transformers") - - # Create a temporary directory for this test class - temp_dir = tmp_path_factory.mktemp("ministral3_vl_toy_model") - model_dir = temp_dir / "ministral3_vl_toy" - - # Create config from the toy model config - config_dict = HF_MINISTRAL3_VL_TOY_MODEL_CONFIG.copy() - - # Create config object - config = Mistral3Config(**config_dict) - config.torch_dtype = torch.bfloat16 - - # Create model with random weights and convert to bfloat16 - model = Mistral3ForConditionalGeneration(config) - model = model.bfloat16() - - # Debug: Check model dtype before saving - for name, param in model.named_parameters(): - print(f"Before save - {name}: {param.dtype}") - break # Just check the first parameter - - # Create minimal tokenizer files - tokenizer_config = { - "tokenizer_class": "LlamaTokenizer", - "vocab_size": 32768, - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - } - - model_dir.mkdir(parents=True, exist_ok=True) - with open(model_dir / "tokenizer_config.json", "w") as f: - json.dump(tokenizer_config, f, indent=2) - - # Save model and config to directory - model.save_pretrained(model_dir, safe_serialization=True) - - # Save config.json explicitly to ensure compatibility - config_path = model_dir / "config.json" - with open(config_path, "w") as f: - json.dump(config_dict, f, indent=2) - - return str(model_dir) - - def test_toy_model_creation(self, ministral3_vl_toy_model_path): - """ - Test that the toy model is created correctly and 
can be loaded. - - Args: - ministral3_vl_toy_model_path: Path to the toy Ministral 3 VL model (from fixture) - """ - # Verify the model directory exists - model_path = Path(ministral3_vl_toy_model_path) - assert model_path.exists(), f"Model directory not found at {model_path}" - - # Check essential files exist - config_file = model_path / "config.json" - assert config_file.exists(), f"config.json not found at {config_file}" - - # Check for model weights (safetensors preferred) - weights_file = model_path / "model.safetensors" - if not weights_file.exists(): - weights_file = model_path / "pytorch_model.bin" - assert weights_file.exists(), f"Model weights file not found in {model_path}" - - # Check for tokenizer files - tokenizer_config_file = model_path / "tokenizer_config.json" - assert tokenizer_config_file.exists(), f"tokenizer_config.json not found at {tokenizer_config_file}" - - # Load and verify config - with open(config_file) as f: - config_data = json.load(f) - - assert config_data["model_type"] == "mistral3" - assert "text_config" in config_data - assert "vision_config" in config_data - assert config_data["text_config"]["hidden_size"] == 512 - assert config_data["text_config"]["num_hidden_layers"] == 4 - assert config_data["text_config"]["num_attention_heads"] == 8 - assert config_data["vision_config"]["hidden_size"] == 256 - - @pytest.mark.run_only_on("GPU") - @pytest.mark.parametrize( - "tp,pp,test_name", - [ - (2, 1, "TP"), - (1, 2, "PP"), - ], - ) - def test_ministral3_vl_conversion_parallelism(self, ministral3_vl_toy_model_path, tmp_path, tp, pp, test_name): - """ - Test Ministral 3 VL model conversion with different parallelism configurations. 
- - Args: - ministral3_vl_toy_model_path: Path to the toy Ministral 3 VL model (from fixture) - tmp_path: Pytest temporary path fixture - tp: Tensor parallelism size - pp: Pipeline parallelism size - test_name: Name of the test for identification - """ - - # Create temporary output directory for conversion results - test_output_dir = tmp_path / f"ministral3_vl_{test_name}" - test_output_dir.mkdir(exist_ok=True) - - # Run hf_megatron_roundtrip_multi_gpu.py with specified parallelism configuration on our toy model - cmd = [ - "python", - "-m", - "torch.distributed.run", - "--nproc_per_node=2", - "--nnodes=1", - "-m", - "coverage", - "run", - "--data-file=/opt/Megatron-Bridge/.coverage", - "--source=/opt/Megatron-Bridge/", - "--parallel-mode", - "examples/conversion/hf_megatron_roundtrip_multi_gpu.py", - "--hf-model-id", - ministral3_vl_toy_model_path, # Use our local toy model instead of downloading - "--output-dir", - str(test_output_dir), - "--tp", - str(tp), - "--pp", - str(pp), - ] - - result = subprocess.run( - cmd, capture_output=True, text=True, cwd=Path(__file__).parent.parent.parent.parent.parent - ) - print(cmd) - - # Check that the conversion completed successfully - if result.returncode != 0: - print(f"STDOUT: {result.stdout}") - print(f"STDERR: {result.stderr}") - assert False, f"Ministral 3 VL {test_name} conversion failed with return code {result.returncode}" - - # Verify that the converted model was saved - # The output directory should be named after the last part of the model path - model_name = Path(ministral3_vl_toy_model_path).name # "ministral3_vl_toy" - converted_model_dir = test_output_dir / model_name - assert converted_model_dir.exists(), f"Converted model directory not found at {converted_model_dir}" - - # Check that essential model files exist - config_file = converted_model_dir / "config.json" - assert config_file.exists(), f"config.json not found in converted model at {config_file}" - - # Check for model weights file (could be either 
safetensors or pytorch_model.bin) - weights_file_safetensors = converted_model_dir / "model.safetensors" - weights_file_pytorch = converted_model_dir / "pytorch_model.bin" - assert weights_file_safetensors.exists() or weights_file_pytorch.exists(), ( - f"Model weights file not found in converted model at {converted_model_dir}" - ) - - # Verify the config contains Ministral 3 VL-specific parameters - with open(config_file) as f: - saved_config = json.load(f) - - assert saved_config["model_type"] == "mistral3", "Model type should be mistral3" - assert "text_config" in saved_config, "VL model should have text_config" - assert "vision_config" in saved_config, "VL model should have vision_config" - assert saved_config["text_config"]["hidden_size"] == 512, "Hidden size should match toy config" - assert saved_config["text_config"]["num_attention_heads"] == 8, ( - "Number of attention heads should match toy config" - )