From 565d3bb8c68a9cae0c301f2bf1a8502c208c1a58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 14 Feb 2023 20:07:29 +0100
Subject: [PATCH] CI: Update colossalai version (#16747)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 .azure/gpu-tests-pytorch.yml                  |  8 +-
 .azure/ipu-tests.yml                          |  3 +-
 dockers/base-cuda/Dockerfile                  |  4 +-
 dockers/base-xla/tpu_workflow_pytorch.jsonnet |  2 +-
 requirements/pytorch/strategies.txt           |  4 +-
 .../pytorch/strategies/colossalai.py          | 78 +++++++------------
 tests/tests_pytorch/run_standalone_tests.sh   |  2 +-
 .../strategies/test_colossalai.py             | 14 +++-
 8 files changed, 46 insertions(+), 69 deletions(-)

diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index df085fd8a3093..e11b515899de4 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -108,17 +108,11 @@ jobs:
 
   - bash: pip uninstall -y -r requirements/pytorch/strategies.txt
     condition: eq(variables['scope'], '')
-    displayName: 'UnInstall strategies'
+    displayName: 'Uninstall strategies'
 
   - bash: |
       set -e
-
-      CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))")
-      CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
-      pip install "colossalai==0.1.12+torch${PYTORCH_VERSION}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
-
       pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
-
       python requirements/pytorch/check-avail-strategies.py
     condition: eq(variables['scope'], 'strategies')
     displayName: 'Install strategies'
diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml
index adba405ad7be9..4ed1c17bfc3d7 100644
--- a/.azure/ipu-tests.yml
+++ b/.azure/ipu-tests.yml
@@ -76,7 +76,8 @@ jobs:
       for fpath in `ls requirements/**/*.txt`; do \
         python ./requirements/pytorch/adjust-versions.py $fpath; \
       done
-      pip install -e .[dev]
+      pip install -e .[extra,examples,test]
+      pip list
     env:
       PACKAGE_NAME: "pytorch"
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 1273776d0a41c..684650f71e70b 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -102,9 +102,7 @@ RUN \
     # install ColossalAI
     # TODO: 1.13 wheels are not released, remove skip once they are
     if [[ $PYTORCH_VERSION != "1.13" ]]; then \
-        CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
-        CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
-        pip install "colossalai==0.1.12+torch${PYTORCH_VERSION}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org ; \
+        pip install "colossalai==0.2.3"; \
        python -c "import colossalai; print(colossalai.__version__)" ; \
     fi
diff --git a/dockers/base-xla/tpu_workflow_pytorch.jsonnet b/dockers/base-xla/tpu_workflow_pytorch.jsonnet
index 13d1eca8c10a8..cfc4d14cf9e11 100644
--- a/dockers/base-xla/tpu_workflow_pytorch.jsonnet
+++ b/dockers/base-xla/tpu_workflow_pytorch.jsonnet
@@ -42,7 +42,7 @@ local tputests = base.BaseTest {
     for fpath in `ls requirements/**/*.txt`; do
       python requirements/pytorch/adjust-versions.py $fpath {PYTORCH_VERSION};
     done
-    PACKAGE_NAME=pytorch pip install .[dev]
+    PACKAGE_NAME=pytorch pip install .[extra,test]
     pip list
     pip install -q -r .actions/requirements.txt
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 91e8bec70d4e0..7b392dc0b2bb4 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -1,3 +1,5 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
-deepspeed>=0.6.0, <=0.8.0
+
+deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
+colossalai>=0.2.0, <=0.2.4
diff --git a/src/lightning/pytorch/strategies/colossalai.py b/src/lightning/pytorch/strategies/colossalai.py
index 0022938bb0d6c..103bdd151309b 100644
--- a/src/lightning/pytorch/strategies/colossalai.py
+++ b/src/lightning/pytorch/strategies/colossalai.py
@@ -36,7 +36,6 @@
 from lightning.pytorch.utilities.types import STEP_OUTPUT
 
 _COLOSSALAI_AVAILABLE = RequirementCache("colossalai")
-_COLOSSALAI_GREATER_0_1_10 = RequirementCache("colossalai>0.1.10")
 if TYPE_CHECKING and _COLOSSALAI_AVAILABLE:
     with _patch_cuda_is_available():
         from colossalai.utils.model.colo_init_context import ColoInitContext
@@ -94,7 +93,7 @@ def configure_sharded_model(self) -> None:
 
         chunk_search_n_grids: The number of intervals in the search range.
 
-        min_chunk_size: The minimum size for a chunk.
+        min_chunk_size: The minimum size for a chunk in bytes.
 
         initial_scale: The initial dynamic loss scale value.
 
@@ -130,7 +129,7 @@ def __init__(
         gpu_margin_mem_ratio: float = 0.0,
         chunk_search_range: int = 64 * 1024**2,
         chunk_search_n_grids: int = 4096,
-        min_chunk_size: Optional[int] = None,
+        min_chunk_size: int = 32 * 1024**2,
         initial_scale: float = 2**16,
         min_scale: float = 1,
         growth_factor: float = 2,
@@ -242,10 +241,7 @@ def _post_init_method(self, module: torch.nn.Module, *args: Any, **kwargs: Any)
 
     def setup_precision_plugin(self) -> None:
         with _patch_cuda_is_available():
-            from colossalai.gemini import ChunkManager, GeminiManager
             from colossalai.nn.optimizer import CPUAdam, HybridAdam
-            from colossalai.nn.parallel import ZeroDDP
-            from colossalai.tensor import ProcessGroup
             from colossalai.zero import ZeroOptimizer
 
         super().setup_precision_plugin()
@@ -265,52 +261,30 @@ def setup_precision_plugin(self) -> None:
         pl_module = self.model
 
         if not hasattr(pl_module, "_colossalai_zero"):
-            if not _COLOSSALAI_GREATER_0_1_10:
-                if self.use_chunk:
-                    chunk_size = self.chunk_size or ChunkManager.search_chunk_size(
-                        self.model, **self.chunk_size_search_kwargs
-                    )
-                else:
-                    chunk_size = None
-                process_group = ProcessGroup()
-                chunk_manager = ChunkManager(
-                    chunk_size,
-                    process_group,
-                    self.enable_distributed_storage,
-                    GeminiManager.get_default_device(self.placement_policy),
-                )
-                gemini_manager = GeminiManager(self.placement_policy, chunk_manager)
-                model = _LightningModuleWrapperBase(self.model)
-                self.model = ZeroDDP(model, gemini_manager, self.force_outputs_fp32)
-            else:
-                with _patch_cuda_is_available():
-                    from colossalai.nn.parallel import GeminiDDP
-                    from colossalai.utils import get_current_device
-                if not self.use_chunk:
-                    raise ValueError("`ColossalAIStrategy` must use chunk in versions higher than 0.1.10")
-                chunk_search_range: int = self.chunk_size_search_kwargs.get(
-                    "search_range", 32 * 1024**2
-                )  # type: ignore[assignment]
-                search_range_mb: float = chunk_search_range / 1024**2
-                search_n_grids: int = self.chunk_size_search_kwargs.get("n_grids", 4096)  # type: ignore[assignment]
-                search_interval: int = math.ceil(chunk_search_range / search_n_grids)
-                min_chunk_size_mb: float = self.chunk_size_search_kwargs.get(
-                    "min_chunk_size", 32 * 1024**2
-                )  # type: ignore[assignment]
-                if min_chunk_size_mb is not None:
-                    min_chunk_size_mb /= 1024**2
-
-                model = _LightningModuleWrapperBase(self.model)
-                self.model = GeminiDDP(
-                    module=model,
-                    device=get_current_device(),
-                    placement_policy=self.placement_policy,
-                    pin_memory=True,
-                    force_outputs_fp32=self.force_outputs_fp32,
-                    search_range_mb=search_range_mb,
-                    hidden_dim=search_interval,
-                    min_chunk_size_mb=min_chunk_size_mb,
-                )
+            with _patch_cuda_is_available():
+                from colossalai.nn.parallel import GeminiDDP
+                from colossalai.utils import get_current_device
+            if not self.use_chunk:
+                raise ValueError("`ColossalAIStrategy` must use chunk in versions higher than 0.1.10")
+            chunk_search_range: int = self.chunk_size_search_kwargs.get(
+                "search_range", 32 * 1024**2
+            )  # type: ignore[assignment]
+            search_range_mb: float = chunk_search_range / 1024**2
+            search_n_grids: int = self.chunk_size_search_kwargs.get("n_grids", 4096)  # type: ignore[assignment]
+            search_interval: int = math.ceil(chunk_search_range / search_n_grids)
+            min_chunk_size_mb = int(self.chunk_size_search_kwargs["min_chunk_size"] // (1024**2))
+
+            model = _LightningModuleWrapperBase(self.model)
+            self.model = GeminiDDP(
+                module=model,
+                device=get_current_device(),
+                placement_policy=self.placement_policy,
+                pin_memory=True,
+                force_outputs_fp32=self.force_outputs_fp32,
+                search_range_mb=search_range_mb,
+                hidden_dim=search_interval,
+                min_chunk_size_mb=min_chunk_size_mb,
+            )
 
             assert self.model is not None
             pl_module._colossalai_zero = [self.model]  # type: ignore[assignment]
diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh
index 65ead08c2a910..48dbe307b73ab 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/tests_pytorch/run_standalone_tests.sh
@@ -50,7 +50,7 @@ function show_batched_output {
   if [ -f standalone_test_output.txt ]; then  # if exists
     cat standalone_test_output.txt
     # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
-    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qv 'on_exception'; then
+    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then
       echo "Potential error! Stopping."
       rm standalone_test_output.txt
       exit 1
diff --git a/tests/tests_pytorch/strategies/test_colossalai.py b/tests/tests_pytorch/strategies/test_colossalai.py
index 77136c3f61f04..832854f82db3b 100644
--- a/tests/tests_pytorch/strategies/test_colossalai.py
+++ b/tests/tests_pytorch/strategies/test_colossalai.py
@@ -228,6 +228,7 @@ def test_multi_gpu_checkpointing(tmpdir):
         precision=16,
         strategy="colossalai",
         callbacks=[ck],
+        num_sanity_val_steps=0,  # TODO: remove once validation/test before fitting is supported again
     )
     trainer.fit(model, datamodule=dm)
 
@@ -235,11 +236,17 @@
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
     assert saved_results == results
 
-    # here, we test whether restore_checkpoint_after_setup is worked
+
+@pytest.mark.xfail(raises=AssertionError, match="You should run a completed iteration as your warmup iter")
+@RunIf(min_cuda_gpus=2, standalone=True, colossalai=True, sklearn=True)
+def test_test_without_fit(tmpdir):
     model = ModelParallelClassificationModel()
+    dm = ClassifDataModule()
     trainer = Trainer(default_root_dir=tmpdir, accelerator="gpu", devices=2, precision=16, strategy="colossalai")
-    saved_results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert saved_results == results
+
+    # Colossal requires warmup, you can't run validation/test without having fit first
+    # This is a temporary limitation
+    trainer.test(model, datamodule=dm)
 
 
 @RunIf(min_cuda_gpus=2, standalone=True, colossalai=True, sklearn=True)
@@ -255,6 +262,7 @@ def test_multi_gpu_model_colossalai_fit_test(tmpdir):
         precision=16,
         strategy=ColossalAIStrategy(initial_scale=32),
         max_epochs=1,
+        num_sanity_val_steps=0,  # TODO: remove once validation/test before fitting is supported again
     )
     trainer.fit(model, datamodule=dm)