diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml
index b20761ee7dda..f811e70b18fa 100644
--- a/.github/workflows/install-test.yml
+++ b/.github/workflows/install-test.yml
@@ -130,6 +130,54 @@ jobs:
             python tests/core_ptl/check_imports.py --domain "$collection"
           done
+  test-asr-install-linux-amd:
+    name: ubuntu-22.04-amd-py${{ matrix.python }}-asr
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python: ["3.10", "3.11", "3.12"]
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+
+      - name: Check disk space before cleanup
+        run: df -h
+
+      - name: Free up disk space
+        run: |
+          # Remove unnecessary packages and files on Ubuntu
+          sudo apt-get clean
+          sudo rm -rf /usr/local/lib/android || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf /usr/local/.ghcup || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/az || true
+          # Clear pip and npm caches
+          pip cache purge || true
+          sudo npm cache clean --force || true
+
+      - name: Check disk space after cleanup
+        run: df -h
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install NeMo
+        run: |
+          pip install --no-cache-dir --upgrade pip
+          pip install --no-cache-dir ".[asr]"
+
+      - name: Check disk space after installation
+        run: df -h
+
+      - name: Run import checks
+        run: |
+          # Run import checks
+          python tests/core_ptl/check_imports.py --domain asr
+
   test-installs-linux-arm:
     name: ubuntu-22.04-arm-py${{ matrix.python }}-${{ matrix.installer }}
     runs-on: ubuntu-22.04-arm
     strategy:
@@ -188,3 +236,51 @@ jobs:
           for collection in "asr" "tts" "lightning" "core"; do
             python tests/core_ptl/check_imports.py --domain "$collection"
           done
+
+  test-asr-installs-linux-arm:
+    name: ubuntu-22.04-arm-py${{ matrix.python }}-asr
+    runs-on: ubuntu-22.04-arm
+    strategy:
+      fail-fast: false
+      matrix:
+        python: ["3.10", "3.11", "3.12"]
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+
+      - name: Check disk space before cleanup
+        run: df -h
+
+      - name: Free up disk space
+        run: |
+          # Remove unnecessary packages and files on Ubuntu ARM
+          sudo apt-get clean
+          sudo rm -rf /usr/local/lib/android || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf /usr/local/.ghcup || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/az || true
+          # Clear pip and npm caches
+          pip cache purge || true
+          sudo npm cache clean --force || true
+
+      - name: Check disk space after cleanup
+        run: df -h
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install NeMo
+        run: |
+          pip install --no-cache-dir --upgrade pip
+          pip install --no-cache-dir ".[asr]"
+
+      - name: Check disk space after installation
+        run: df -h
+
+      - name: Run import checks
+        run: |
+          # Run import checks
+          python tests/core_ptl/check_imports.py --domain asr
diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py
index a202c3b36204..6c7cceb769ec 100644
--- a/nemo/lightning/fabric/strategies.py
+++ b/nemo/lightning/fabric/strategies.py
@@ -42,8 +42,19 @@
 from lightning.pytorch.loops.fetchers import _DataFetcher
 from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
 from lightning.pytorch.utilities.combined_loader import CombinedLoader
-from megatron.core.distributed import DistributedDataParallelConfig
-from megatron.core.optimizer import OptimizerConfig
+
+try:
+    from megatron.core.distributed import DistributedDataParallelConfig
+    from megatron.core.optimizer import OptimizerConfig
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    DistributedDataParallelConfig = object
+    OptimizerConfig = object
+    HAVE_MEGATRON_CORE = False
+
 from torch import Tensor, nn
 from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook
 from torch.nn import Module
diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py
index 1ff1ed5ed840..6711f86ed894 100644
--- a/nemo/lightning/io/pl.py
+++ b/nemo/lightning/io/pl.py
@@ -22,17 +22,26 @@
 from lightning.fabric.plugins import CheckpointIO
 from lightning.fabric.utilities.cloud_io import get_filesystem
 from lightning.fabric.utilities.types import _PATH
-from megatron.core.dist_checkpointing.serialization import (
-    get_default_load_sharded_strategy,
-    get_default_save_sharded_strategy,
-)
-from megatron.core.dist_checkpointing.strategies.base import SaveShardedStrategy
-from megatron.core.dist_checkpointing.strategies.fully_parallel import (
-    FullyParallelLoadStrategyWrapper,
-    FullyParallelSaveStrategyWrapper,
-)
-from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy
-from megatron.core.parallel_state import get_data_parallel_group
+
+try:
+    from megatron.core.dist_checkpointing.serialization import (
+        get_default_load_sharded_strategy,
+        get_default_save_sharded_strategy,
+    )
+    from megatron.core.dist_checkpointing.strategies.base import SaveShardedStrategy
+    from megatron.core.dist_checkpointing.strategies.fully_parallel import (
+        FullyParallelLoadStrategyWrapper,
+        FullyParallelSaveStrategyWrapper,
+    )
+    from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy
+    from megatron.core.parallel_state import get_data_parallel_group
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    HAVE_MEGATRON_CORE = False
+
 from torch import nn
 from typing_extensions import Self, override
 
diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py
index a3a600814821..140f35693cc7 100644
--- a/nemo/lightning/megatron_init.py
+++ b/nemo/lightning/megatron_init.py
@@ -60,15 +60,17 @@
 except (ImportError, ModuleNotFoundError):
 
     logging.warning("Megatron num_microbatches_calculator not found, using Apex version.")
-    from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator
-    from apex.transformer.pipeline_parallel.utils import (
-        get_current_global_batch_size,
-        get_micro_batch_size,
-        get_num_microbatches,
-    )
-    from apex.transformer.pipeline_parallel.utils import (
-        setup_microbatch_calculator as init_num_microbatches_calculator,
-    )
+
+    if HAVE_APEX:
+        from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator
+        from apex.transformer.pipeline_parallel.utils import (
+            get_current_global_batch_size,
+            get_micro_batch_size,
+            get_num_microbatches,
+        )
+        from apex.transformer.pipeline_parallel.utils import (
+            setup_microbatch_calculator as init_num_microbatches_calculator,
+        )
 
     MCORE_MB_CALCULATOR = False
 
diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
index 4c1659a39c6b..0fc056bc0f75 100644
--- a/nemo/lightning/megatron_parallel.py
+++ b/nemo/lightning/megatron_parallel.py
@@ -50,12 +50,23 @@
 import torch.distributed
 from lightning.pytorch.trainer.states import TrainerFn
 from lightning.pytorch.utilities import move_data_to_device
-from megatron.core import parallel_state
-from megatron.core.distributed import DistributedDataParallel as McoreDDP
-from megatron.core.distributed import DistributedDataParallelConfig
-from megatron.core.optimizer import OptimizerConfig
-from megatron.core.transformer.moe.moe_utils import get_moe_layer_wise_logging_tracker
-from megatron.core.transformer.transformer_config import TransformerConfig
+
+try:
+    from megatron.core import parallel_state
+    from megatron.core.distributed import DistributedDataParallel as McoreDDP
+    from megatron.core.distributed import DistributedDataParallelConfig
+    from megatron.core.optimizer import OptimizerConfig
+    from megatron.core.transformer.moe.moe_utils import get_moe_layer_wise_logging_tracker
+    from megatron.core.transformer.transformer_config import TransformerConfig
+
+    HAVE_MEGATRON_CORE = True
+except (ImportError, ModuleNotFoundError):
+
+    McoreDDP = object
+    DistributedDataParallelConfig = object
+    TransformerConfig = object
+    HAVE_MEGATRON_CORE = False
+
 from torch import Tensor, nn
 from typing_extensions import override
 
diff --git a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py
index a9fa5c681230..555163006a18 100644
--- a/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py
+++ b/nemo/lightning/pytorch/callbacks/ddp_parity_checker.py
@@ -16,7 +16,15 @@
 
 import torch
 from lightning.pytorch.callbacks.callback import Callback
-from megatron.core.utils import check_param_hashes_across_dp_replicas
+
+try:
+    from megatron.core.utils import check_param_hashes_across_dp_replicas
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    HAVE_MEGATRON_CORE = False
 
 from nemo.lightning import io
 from nemo.utils import logging
diff --git a/nemo/lightning/pytorch/callbacks/progress_printer.py b/nemo/lightning/pytorch/callbacks/progress_printer.py
index 817bd79c8a1a..854bd8d49d37 100644
--- a/nemo/lightning/pytorch/callbacks/progress_printer.py
+++ b/nemo/lightning/pytorch/callbacks/progress_printer.py
@@ -17,7 +17,16 @@
 
 from lightning.pytorch.callbacks.progress import ProgressBar
 from lightning.pytorch.utilities.types import STEP_OUTPUT
-from megatron.core.num_microbatches_calculator import get_num_microbatches
+
+try:
+    from megatron.core.num_microbatches_calculator import get_num_microbatches
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    HAVE_MEGATRON_CORE = False
+
 from typing_extensions import override
 
diff --git a/nemo/lightning/pytorch/optim/megatron.py b/nemo/lightning/pytorch/optim/megatron.py
index 2a157675978b..3e4f9b87c766 100644
--- a/nemo/lightning/pytorch/optim/megatron.py
+++ b/nemo/lightning/pytorch/optim/megatron.py
@@ -15,9 +15,19 @@
 from typing import Callable, List, Optional
 
 import lightning.pytorch as pl
-from megatron.core.distributed import finalize_model_grads
-from megatron.core.optimizer import OptimizerConfig
-from megatron.core.utils import get_model_config
+
+try:
+    from megatron.core.distributed import finalize_model_grads
+    from megatron.core.optimizer import OptimizerConfig
+    from megatron.core.utils import get_model_config
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    OptimizerConfig = object
+    HAVE_MEGATRON_CORE = False
+
 from torch.optim import Optimizer
 
 from nemo.lightning._strategy_lib import setup_megatron_optimizer
diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py
index f1dd70af8c3f..18a4e4be1b2e 100644
--- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py
+++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -29,7 +29,17 @@
 from lightning.pytorch.strategies.fsdp import FSDPStrategy as PLFSDPStrategy
 from lightning.pytorch.trainer.states import TrainerFn
 from lightning.pytorch.utilities.types import STEP_OUTPUT
-from megatron.core.transformer.transformer_layer import TransformerLayer
+
+try:
+    from megatron.core.transformer.transformer_layer import TransformerLayer
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    TransformerLayer = object
+    HAVE_MEGATRON_CORE = False
+
 from torch.distributed.checkpoint.state_dict import (  # get_state_dict,
     StateDictOptions,
     get_optimizer_state_dict,
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index bfa28c8c281d..419e2b84f72e 100755
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -52,11 +52,20 @@
 from lightning.pytorch.strategies.ddp import DDPStrategy
 from lightning.pytorch.trainer.states import RunningStage, TrainerFn
 from lightning.pytorch.utilities.types import STEP_OUTPUT
-from megatron.core import Timers
-from megatron.core.dist_checkpointing.validation import StrictHandling
-from megatron.core.distributed import DistributedDataParallelConfig
-from megatron.core.optimizer import OptimizerConfig
-from megatron.core.utils import get_torch_version, is_torch_min_version
+
+try:
+    from megatron.core import Timers
+    from megatron.core.dist_checkpointing.validation import StrictHandling
+    from megatron.core.distributed import DistributedDataParallelConfig
+    from megatron.core.optimizer import OptimizerConfig
+    from megatron.core.utils import get_torch_version, is_torch_min_version
+
+    HAVE_MEGATRON_CORE = True
+except (ImportError, ModuleNotFoundError):
+
+    DistributedDataParallelConfig = object
+    HAVE_MEGATRON_CORE = False
+
 from torch import nn
 from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook
 from torch.distributed.checkpoint.utils import CheckpointException
diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py
index 64c52889925d..3a3297414bbb 100755
--- a/nemo/lightning/pytorch/strategies/utils.py
+++ b/nemo/lightning/pytorch/strategies/utils.py
@@ -24,10 +24,23 @@
 import torch
 from lightning.fabric.plugins import ClusterEnvironment
 from lightning.pytorch.callbacks import TQDMProgressBar
-from megatron.core import parallel_state
-from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedObject, ShardedTensor
-from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor
-from megatron.core.transformer.utils import _get_extra_state_offsets
+
+try:
+    from megatron.core import parallel_state
+    from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedObject, ShardedTensor
+    from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor
+    from megatron.core.transformer.utils import _get_extra_state_offsets
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    ShardedObject = object
+    ShardedBase = object
+    ShardedTensor = object
+    HAVE_MEGATRON_CORE = False
+
+
 from torch import Tensor, nn
 from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor
 from torch.distributed._tensor import DTensor, Replicate, Shard
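
Below is a minimal sketch (not part of the patch) of the guarded optional-import pattern these HAVE_MEGATRON_CORE flags enable: the module imports cleanly in an ASR-only environment where megatron.core is absent, and a clear error is raised only when the optional dependency is actually used. The class name and error message here are illustrative, not taken from the NeMo codebase.

# Illustrative sketch only -- not part of the diff above.
try:
    from megatron.core.utils import check_param_hashes_across_dp_replicas

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    HAVE_MEGATRON_CORE = False


class ExampleMegatronOnlyCallback:  # hypothetical name, for illustration
    """Fails at construction time with a clear message instead of at import time."""

    def __init__(self) -> None:
        if not HAVE_MEGATRON_CORE:
            raise ImportError(
                "megatron.core is required for this callback; install megatron-core "
                "or remove the callback from the trainer."
            )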