From 565d3bb8c68a9cae0c301f2bf1a8502c208c1a58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 14 Feb 2023 20:07:29 +0100
Subject: [PATCH] CI: Update colossalai version (#16747)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 .azure/gpu-tests-pytorch.yml                  |  8 +-
 .azure/ipu-tests.yml                          |  3 +-
 dockers/base-cuda/Dockerfile                  |  4 +-
 dockers/base-xla/tpu_workflow_pytorch.jsonnet |  2 +-
 requirements/pytorch/strategies.txt           |  4 +-
 .../pytorch/strategies/colossalai.py          | 78 +++++++------------
 tests/tests_pytorch/run_standalone_tests.sh   |  2 +-
 .../strategies/test_colossalai.py             | 14 +++-
 8 files changed, 46 insertions(+), 69 deletions(-)

diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index df085fd8a3093..e11b515899de4 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -108,17 +108,11 @@ jobs:
 
   - bash: pip uninstall -y -r requirements/pytorch/strategies.txt
     condition: eq(variables['scope'], '')
-    displayName: 'UnInstall strategies'
+    displayName: 'Uninstall strategies'
 
   - bash: |
       set -e
-
-      CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))")
-      CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
-      pip install "colossalai==0.1.12+torch${PYTORCH_VERSION}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
-
       pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
-
       python requirements/pytorch/check-avail-strategies.py
     condition: eq(variables['scope'], 'strategies')
     displayName: 'Install strategies'
diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml
index adba405ad7be9..4ed1c17bfc3d7 100644
--- a/.azure/ipu-tests.yml
+++ b/.azure/ipu-tests.yml
@@ -76,7 +76,8 @@ jobs:
       for fpath in `ls requirements/**/*.txt`; do \
         python ./requirements/pytorch/adjust-versions.py $fpath; \
       done
-      pip install -e .[dev]
+      pip install -e .[extra,examples,test]
+      pip list
     env:
       PACKAGE_NAME: "pytorch"
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 1273776d0a41c..684650f71e70b 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -102,9 +102,7 @@ RUN \
     # install ColossalAI
     # TODO: 1.13 wheels are not released, remove skip once they are
     if [[ $PYTORCH_VERSION != "1.13" ]]; then \
-        CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
-        CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
-        pip install "colossalai==0.1.12+torch${PYTORCH_VERSION}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org ; \
+        pip install "colossalai==0.2.3"; \
        python -c "import colossalai; print(colossalai.__version__)" ; \
     fi
diff --git a/dockers/base-xla/tpu_workflow_pytorch.jsonnet b/dockers/base-xla/tpu_workflow_pytorch.jsonnet
index 13d1eca8c10a8..cfc4d14cf9e11 100644
--- a/dockers/base-xla/tpu_workflow_pytorch.jsonnet
+++ b/dockers/base-xla/tpu_workflow_pytorch.jsonnet
@@ -42,7 +42,7 @@ local tputests = base.BaseTest {
     for fpath in `ls requirements/**/*.txt`; do
       python requirements/pytorch/adjust-versions.py $fpath {PYTORCH_VERSION};
     done
-    PACKAGE_NAME=pytorch pip install .[dev]
+    PACKAGE_NAME=pytorch pip install .[extra,test]
     pip list
     pip install -q -r .actions/requirements.txt
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 91e8bec70d4e0..7b392dc0b2bb4 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -1,3 +1,5 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
-deepspeed>=0.6.0, <=0.8.0
+
+deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
+colossalai>=0.2.0, <=0.2.4
diff --git a/src/lightning/pytorch/strategies/colossalai.py b/src/lightning/pytorch/strategies/colossalai.py
index 0022938bb0d6c..103bdd151309b 100644
--- a/src/lightning/pytorch/strategies/colossalai.py
+++ b/src/lightning/pytorch/strategies/colossalai.py
@@ -36,7 +36,6 @@
 from lightning.pytorch.utilities.types import STEP_OUTPUT
 
 _COLOSSALAI_AVAILABLE = RequirementCache("colossalai")
-_COLOSSALAI_GREATER_0_1_10 = RequirementCache("colossalai>0.1.10")
 if TYPE_CHECKING and _COLOSSALAI_AVAILABLE:
     with _patch_cuda_is_available():
         from colossalai.utils.model.colo_init_context import ColoInitContext
@@ -94,7 +93,7 @@ def configure_sharded_model(self) -> None:
 
         chunk_search_n_grids: The number of intervals in the search range.
 
-        min_chunk_size: The minimum size for a chunk.
+        min_chunk_size: The minimum size for a chunk in bytes.
 
         initial_scale: The initial dynamic loss scale value.
 
@@ -130,7 +129,7 @@ def __init__(
         gpu_margin_mem_ratio: float = 0.0,
         chunk_search_range: int = 64 * 1024**2,
         chunk_search_n_grids: int = 4096,
-        min_chunk_size: Optional[int] = None,
+        min_chunk_size: int = 32 * 1024**2,
         initial_scale: float = 2**16,
         min_scale: float = 1,
         growth_factor: float = 2,
@@ -242,10 +241,7 @@ def _post_init_method(self, module: torch.nn.Module, *args: Any, **kwargs: Any)
 
     def setup_precision_plugin(self) -> None:
         with _patch_cuda_is_available():
-            from colossalai.gemini import ChunkManager, GeminiManager
             from colossalai.nn.optimizer import CPUAdam, HybridAdam
-            from colossalai.nn.parallel import ZeroDDP
-            from colossalai.tensor import ProcessGroup
             from colossalai.zero import ZeroOptimizer
 
         super().setup_precision_plugin()
@@ -265,52 +261,30 @@ def setup_precision_plugin(self) -> None:
         pl_module = self.model
 
         if not hasattr(pl_module, "_colossalai_zero"):
-            if not _COLOSSALAI_GREATER_0_1_10:
-                if self.use_chunk:
-                    chunk_size = self.chunk_size or ChunkManager.search_chunk_size(
-                        self.model, **self.chunk_size_search_kwargs
-                    )
-                else:
-                    chunk_size = None
-                process_group = ProcessGroup()
-                chunk_manager = ChunkManager(
-                    chunk_size,
-                    process_group,
-                    self.enable_distributed_storage,
-                    GeminiManager.get_default_device(self.placement_policy),
-                )
-                gemini_manager = GeminiManager(self.placement_policy, chunk_manager)
-                model = _LightningModuleWrapperBase(self.model)
-                self.model = ZeroDDP(model, gemini_manager, self.force_outputs_fp32)
-            else:
-                with _patch_cuda_is_available():
-                    from colossalai.nn.parallel import GeminiDDP
-                    from colossalai.utils import get_current_device
-                if not self.use_chunk:
-                    raise ValueError("`ColossalAIStrategy` must use chunk in versions higher than 0.1.10")
-                chunk_search_range: int = self.chunk_size_search_kwargs.get(
-                    "search_range", 32 * 1024**2
-                )  # type: ignore[assignment]
-                search_range_mb: float = chunk_search_range / 1024**2
-                search_n_grids: int = self.chunk_size_search_kwargs.get("n_grids", 4096)  # type: ignore[assignment]
-                search_interval: int = math.ceil(chunk_search_range / search_n_grids)
-                min_chunk_size_mb: float = self.chunk_size_search_kwargs.get(
-                    "min_chunk_size", 32 * 1024**2
-                )  # type: ignore[assignment]
-                if min_chunk_size_mb is not None:
-                    min_chunk_size_mb /= 1024**2
-
-                model = _LightningModuleWrapperBase(self.model)
-                self.model = GeminiDDP(
-                    module=model,
-                    device=get_current_device(),
-                    placement_policy=self.placement_policy,
-                    pin_memory=True,
-                    force_outputs_fp32=self.force_outputs_fp32,
-                    search_range_mb=search_range_mb,
-                    hidden_dim=search_interval,
-                    min_chunk_size_mb=min_chunk_size_mb,
-                )
+            with _patch_cuda_is_available():
+                from colossalai.nn.parallel import GeminiDDP
+                from colossalai.utils import get_current_device
+            if not self.use_chunk:
+                raise ValueError("`ColossalAIStrategy` must use chunk in versions higher than 0.1.10")
+            chunk_search_range: int = self.chunk_size_search_kwargs.get(
+                "search_range", 32 * 1024**2
+            )  # type: ignore[assignment]
+            search_range_mb: float = chunk_search_range / 1024**2
+            search_n_grids: int = self.chunk_size_search_kwargs.get("n_grids", 4096)  # type: ignore[assignment]
+            search_interval: int = math.ceil(chunk_search_range / search_n_grids)
+            min_chunk_size_mb = int(self.chunk_size_search_kwargs["min_chunk_size"] // (1024**2))
+
+            model = _LightningModuleWrapperBase(self.model)
+            self.model = GeminiDDP(
+                module=model,
+                device=get_current_device(),
+                placement_policy=self.placement_policy,
+                pin_memory=True,
+                force_outputs_fp32=self.force_outputs_fp32,
+                search_range_mb=search_range_mb,
+                hidden_dim=search_interval,
+                min_chunk_size_mb=min_chunk_size_mb,
+            )
 
             assert self.model is not None
             pl_module._colossalai_zero = [self.model]  # type: ignore[assignment]
diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh
index 65ead08c2a910..48dbe307b73ab 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/tests_pytorch/run_standalone_tests.sh
@@ -50,7 +50,7 @@ function show_batched_output {
   if [ -f standalone_test_output.txt ]; then  # if exists
     cat standalone_test_output.txt
     # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
-    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qv 'on_exception'; then
+    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then
       echo "Potential error! Stopping."
       rm standalone_test_output.txt
       exit 1
diff --git a/tests/tests_pytorch/strategies/test_colossalai.py b/tests/tests_pytorch/strategies/test_colossalai.py
index 77136c3f61f04..832854f82db3b 100644
--- a/tests/tests_pytorch/strategies/test_colossalai.py
+++ b/tests/tests_pytorch/strategies/test_colossalai.py
@@ -228,6 +228,7 @@ def test_multi_gpu_checkpointing(tmpdir):
         precision=16,
         strategy="colossalai",
         callbacks=[ck],
+        num_sanity_val_steps=0,  # TODO: remove once validation/test before fitting is supported again
     )
     trainer.fit(model, datamodule=dm)
 
@@ -235,11 +236,17 @@
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
     assert saved_results == results
 
-    # here, we test whether restore_checkpoint_after_setup is worked
+
+@pytest.mark.xfail(raises=AssertionError, match="You should run a completed iteration as your warmup iter")
+@RunIf(min_cuda_gpus=2, standalone=True, colossalai=True, sklearn=True)
+def test_test_without_fit(tmpdir):
     model = ModelParallelClassificationModel()
+    dm = ClassifDataModule()
     trainer = Trainer(default_root_dir=tmpdir, accelerator="gpu", devices=2, precision=16, strategy="colossalai")
-    saved_results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert saved_results == results
+
+    # Colossal requires warmup, you can't run validation/test without having fit first
+    # This is a temporary limitation
+    trainer.test(model, datamodule=dm)
 
 
 @RunIf(min_cuda_gpus=2, standalone=True, colossalai=True, sklearn=True)
@@ -255,6 +262,7 @@ def test_multi_gpu_model_colossalai_fit_test(tmpdir):
         precision=16,
         strategy=ColossalAIStrategy(initial_scale=32),
         max_epochs=1,
+        num_sanity_val_steps=0,  # TODO: remove once validation/test before fitting is supported again
     )
     trainer.fit(model, datamodule=dm)