CI: Update colossalai version (#16747)
Co-authored-by: Carlos Mocholí <[email protected]>
awaelchli and carmocca authored Feb 14, 2023
1 parent fbbbbf6 commit 565d3bb
Showing 8 changed files with 46 additions and 69 deletions.
8 changes: 1 addition & 7 deletions .azure/gpu-tests-pytorch.yml
@@ -108,17 +108,11 @@ jobs:
   - bash: pip uninstall -y -r requirements/pytorch/strategies.txt
     condition: eq(variables['scope'], '')
-    displayName: 'UnInstall strategies'
+    displayName: 'Uninstall strategies'

   - bash: |
       set -e
-      CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))")
-      CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
-      pip install "colossalai==0.1.12+torch${PYTORCH_VERSION}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
       pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
       python requirements/pytorch/check-avail-strategies.py
     condition: eq(variables['scope'], 'strategies')
     displayName: 'Install strategies'
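Note: the three lines removed from the install step above used to pick a prebuilt ColossalAI 0.1.12 wheel matching the local CUDA toolkit. A minimal Python sketch of what the deleted shell snippet computed (illustrative only, not part of the repository):

```python
# Reconstruction of the removed wheel-selection logic, for reference.
import torch

# torch.version.cuda is already a string such as "11.7"; the original
# ''.join(map(str, ...)) round-trip therefore left it unchanged.
cuda_version = float(torch.version.cuda)

# Pick the newest CUDA build that the local toolkit can satisfy.
wheel_cuda = [ver for ver in [11.3, 11.1] if cuda_version >= ver][0]

pytorch_version = "1.12"  # placeholder; the CI read this from $PYTORCH_VERSION
print(f"colossalai=={0.1.12!r}"[:0] or f"colossalai==0.1.12+torch{pytorch_version}cu{wheel_cuda}")
# e.g. colossalai==0.1.12+torch1.12cu11.3, fetched from release.colossalai.org
```

With ColossalAI now installed as a plainly versioned package (see the Dockerfile and requirements changes below), this per-CUDA wheel selection is no longer needed.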
3 changes: 2 additions & 1 deletion .azure/ipu-tests.yml
@@ -76,7 +76,8 @@ jobs:
       for fpath in `ls requirements/**/*.txt`; do \
         python ./requirements/pytorch/adjust-versions.py $fpath; \
       done
-      pip install -e .[dev]
+      pip install -e .[extra,examples,test]
+      pip list
     env:
       PACKAGE_NAME: "pytorch"
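The catch-all `dev` extra is replaced with the narrower `extra,examples,test` extras here. As a generic illustration of the mechanism (hypothetical packaging code, not Lightning's actual setup), pip extras are declared roughly like this:

```python
# Hypothetical setup.py; package and dependency names are placeholders.
from setuptools import setup

setup(
    name="mypkg",
    install_requires=["torch"],
    extras_require={
        "extra": ["matplotlib"],
        "examples": ["torchvision"],
        "test": ["pytest", "scikit-learn"],
    },
)
```

`pip install -e .[extra,examples,test]` then installs the union of the named groups, which keeps the CI environment smaller than an all-inclusive `dev` extra would.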
4 changes: 1 addition & 3 deletions dockers/base-cuda/Dockerfile
@@ -102,9 +102,7 @@ RUN \
     # install ColossalAI
     # TODO: 1.13 wheels are not released, remove skip once they are
     if [[ $PYTORCH_VERSION != "1.13" ]]; then \
-        CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
-        CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
-        pip install "colossalai==0.1.12+torch${PYTORCH_VERSION}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org ; \
+        pip install "colossalai==0.2.3"; \
         python -c "import colossalai; print(colossalai.__version__)" ; \
     fi

2 changes: 1 addition & 1 deletion dockers/base-xla/tpu_workflow_pytorch.jsonnet
@@ -42,7 +42,7 @@ local tputests = base.BaseTest {
     for fpath in `ls requirements/**/*.txt`; do
       python requirements/pytorch/adjust-versions.py $fpath {PYTORCH_VERSION};
     done
-    PACKAGE_NAME=pytorch pip install .[dev]
+    PACKAGE_NAME=pytorch pip install .[extra,test]
     pip list
     pip install -q -r .actions/requirements.txt
4 changes: 3 additions & 1 deletion requirements/pytorch/strategies.txt
@@ -1,3 +1,5 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
-deepspeed>=0.6.0, <=0.8.0
+
+deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
+colossalai>=0.2.0, <=0.2.4
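The new pins keep DeepSpeed below 0.8 and accept ColossalAI 0.2.0 through 0.2.4, which covers the `0.2.3` pin used in the CUDA Dockerfile above. A quick sketch with the `packaging` library (an illustration, not repository code) shows how such specifier sets evaluate:

```python
from packaging.specifiers import SpecifierSet

colossalai_spec = SpecifierSet(">=0.2.0,<=0.2.4")
deepspeed_spec = SpecifierSet(">=0.6.0,<0.8.0")

print("0.2.3" in colossalai_spec)  # True: the version installed in the Dockerfile
print("0.2.5" in colossalai_spec)  # False: above the inclusive <= bound
print("0.8.0" in deepspeed_spec)   # False: the exclusive < bound rules it out
```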
78 changes: 26 additions & 52 deletions src/lightning/pytorch/strategies/colossalai.py
@@ -36,7 +36,6 @@
 from lightning.pytorch.utilities.types import STEP_OUTPUT

 _COLOSSALAI_AVAILABLE = RequirementCache("colossalai")
-_COLOSSALAI_GREATER_0_1_10 = RequirementCache("colossalai>0.1.10")
 if TYPE_CHECKING and _COLOSSALAI_AVAILABLE:
     with _patch_cuda_is_available():
         from colossalai.utils.model.colo_init_context import ColoInitContext
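The `_COLOSSALAI_GREATER_0_1_10` check can be dropped because the requirements now enforce `colossalai>=0.2.0` outright. For reference, a `RequirementCache` is truthy only when an installed distribution satisfies its specifier; a small sketch of the idiom (assuming `lightning_utilities` is installed):

```python
from lightning_utilities.core.imports import RequirementCache

_COLOSSALAI_AVAILABLE = RequirementCache("colossalai")

if _COLOSSALAI_AVAILABLE:  # bool(...) checks the installed version against the spec
    import colossalai
    print(colossalai.__version__)
else:
    print("colossalai is not installed")
```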
@@ -94,7 +93,7 @@ def configure_sharded_model(self) -> None:
         chunk_search_n_grids: The number of intervals in the search range.
-        min_chunk_size: The minimum size for a chunk.
+        min_chunk_size: The minimum size for a chunk in bytes.
         initial_scale: The initial dynamic loss scale value.
@@ -130,7 +129,7 @@ def __init__(
         gpu_margin_mem_ratio: float = 0.0,
         chunk_search_range: int = 64 * 1024**2,
         chunk_search_n_grids: int = 4096,
-        min_chunk_size: Optional[int] = None,
+        min_chunk_size: int = 32 * 1024**2,
         initial_scale: float = 2**16,
         min_scale: float = 1,
         growth_factor: float = 2,
@@ -242,10 +241,7 @@ def _post_init_method(self, module: torch.nn.Module, *args: Any, **kwargs: Any)

     def setup_precision_plugin(self) -> None:
         with _patch_cuda_is_available():
-            from colossalai.gemini import ChunkManager, GeminiManager
             from colossalai.nn.optimizer import CPUAdam, HybridAdam
-            from colossalai.nn.parallel import ZeroDDP
-            from colossalai.tensor import ProcessGroup
             from colossalai.zero import ZeroOptimizer

         super().setup_precision_plugin()
@@ -265,52 +261,30 @@ def setup_precision_plugin(self) -> None:
         pl_module = self.model

         if not hasattr(pl_module, "_colossalai_zero"):
-            if not _COLOSSALAI_GREATER_0_1_10:
-                if self.use_chunk:
-                    chunk_size = self.chunk_size or ChunkManager.search_chunk_size(
-                        self.model, **self.chunk_size_search_kwargs
-                    )
-                else:
-                    chunk_size = None
-                process_group = ProcessGroup()
-                chunk_manager = ChunkManager(
-                    chunk_size,
-                    process_group,
-                    self.enable_distributed_storage,
-                    GeminiManager.get_default_device(self.placement_policy),
-                )
-                gemini_manager = GeminiManager(self.placement_policy, chunk_manager)
-                model = _LightningModuleWrapperBase(self.model)
-                self.model = ZeroDDP(model, gemini_manager, self.force_outputs_fp32)
-            else:
-                with _patch_cuda_is_available():
-                    from colossalai.nn.parallel import GeminiDDP
-                    from colossalai.utils import get_current_device
-                if not self.use_chunk:
-                    raise ValueError("`ColossalAIStrategy` must use chunk in versions higher than 0.1.10")
-                chunk_search_range: int = self.chunk_size_search_kwargs.get(
-                    "search_range", 32 * 1024**2
-                )  # type: ignore[assignment]
-                search_range_mb: float = chunk_search_range / 1024**2
-                search_n_grids: int = self.chunk_size_search_kwargs.get("n_grids", 4096)  # type: ignore[assignment]
-                search_interval: int = math.ceil(chunk_search_range / search_n_grids)
-                min_chunk_size_mb: float = self.chunk_size_search_kwargs.get(
-                    "min_chunk_size", 32 * 1024**2
-                )  # type: ignore[assignment]
-                if min_chunk_size_mb is not None:
-                    min_chunk_size_mb /= 1024**2
-
-                model = _LightningModuleWrapperBase(self.model)
-                self.model = GeminiDDP(
-                    module=model,
-                    device=get_current_device(),
-                    placement_policy=self.placement_policy,
-                    pin_memory=True,
-                    force_outputs_fp32=self.force_outputs_fp32,
-                    search_range_mb=search_range_mb,
-                    hidden_dim=search_interval,
-                    min_chunk_size_mb=min_chunk_size_mb,
-                )
+            with _patch_cuda_is_available():
+                from colossalai.nn.parallel import GeminiDDP
+                from colossalai.utils import get_current_device
+            if not self.use_chunk:
+                raise ValueError("`ColossalAIStrategy` must use chunk in versions higher than 0.1.10")
+            chunk_search_range: int = self.chunk_size_search_kwargs.get(
+                "search_range", 32 * 1024**2
+            )  # type: ignore[assignment]
+            search_range_mb: float = chunk_search_range / 1024**2
+            search_n_grids: int = self.chunk_size_search_kwargs.get("n_grids", 4096)  # type: ignore[assignment]
+            search_interval: int = math.ceil(chunk_search_range / search_n_grids)
+            min_chunk_size_mb = int(self.chunk_size_search_kwargs["min_chunk_size"] // (1024**2))
+
+            model = _LightningModuleWrapperBase(self.model)
+            self.model = GeminiDDP(
+                module=model,
+                device=get_current_device(),
+                placement_policy=self.placement_policy,
+                pin_memory=True,
+                force_outputs_fp32=self.force_outputs_fp32,
+                search_range_mb=search_range_mb,
+                hidden_dim=search_interval,
+                min_chunk_size_mb=min_chunk_size_mb,
+            )

         assert self.model is not None
         pl_module._colossalai_zero = [self.model]  # type: ignore[assignment]
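Note the simplification above: with the pre-0.2 code path gone, `min_chunk_size` is a required byte count (defaulting to 32 MiB in `__init__`) instead of an `Optional[int]`, so the `None` handling disappears and the value is converted to megabytes for `GeminiDDP` in one step. A worked example of the unit conversions, using the defaults shown earlier (illustrative only):

```python
import math

# Defaults from ColossalAIStrategy.__init__ above.
chunk_search_range = 64 * 1024**2  # bytes
chunk_search_n_grids = 4096
min_chunk_size = 32 * 1024**2      # bytes; no longer Optional

search_range_mb = chunk_search_range / 1024**2                          # 64.0
search_interval = math.ceil(chunk_search_range / chunk_search_n_grids)  # 16384
min_chunk_size_mb = int(min_chunk_size // (1024**2))                    # 32

print(search_range_mb, search_interval, min_chunk_size_mb)  # 64.0 16384 32
```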
2 changes: 1 addition & 1 deletion tests/tests_pytorch/run_standalone_tests.sh
@@ -50,7 +50,7 @@ function show_batched_output {
   if [ -f standalone_test_output.txt ]; then  # if exists
     cat standalone_test_output.txt
     # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
-    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qv 'on_exception'; then
+    if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then
       echo "Potential error! Stopping."
       rm standalone_test_output.txt
       exit 1
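The added `xfailed` exclusion prevents pytest summary lines such as "1 xfailed" (which contain the substring "failed") from tripping the error heuristic, now that an xfail-marked ColossalAI test exists. A Python sketch of the same two-stage filter (illustrative, not repository code):

```python
import re

# Made-up sample log lines.
lines = [
    "2 passed, 1 xfailed in 12.3s",  # benign: "xfailed" contains "failed"
    "on_exception hook was called",  # benign: explicitly excluded
    "RuntimeError: CUDA error",      # a real failure
]

error_pat = re.compile(r"error|exception|traceback|failed", re.IGNORECASE)
benign_pat = re.compile(r"on_exception|xfailed")

# Mirrors `grep -iE ... | grep -qvE 'on_exception|xfailed'`: stop only when at
# least one error-looking line is not excused by the benign pattern.
should_stop = any(error_pat.search(ln) and not benign_pat.search(ln) for ln in lines)
print(should_stop)  # True, because of the CUDA error line alone
```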
14 changes: 11 additions & 3 deletions tests/tests_pytorch/strategies/test_colossalai.py
@@ -228,18 +228,25 @@ def test_multi_gpu_checkpointing(tmpdir):
         precision=16,
         strategy="colossalai",
         callbacks=[ck],
+        num_sanity_val_steps=0,  # TODO: remove once validation/test before fitting is supported again
     )
     trainer.fit(model, datamodule=dm)

     results = trainer.test(datamodule=dm)
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
     assert saved_results == results

-    # here, we test whether restore_checkpoint_after_setup is worked
+
+@pytest.mark.xfail(raises=AssertionError, match="You should run a completed iteration as your warmup iter")
+@RunIf(min_cuda_gpus=2, standalone=True, colossalai=True, sklearn=True)
+def test_test_without_fit(tmpdir):
+    model = ModelParallelClassificationModel()
+    dm = ClassifDataModule()
     trainer = Trainer(default_root_dir=tmpdir, accelerator="gpu", devices=2, precision=16, strategy="colossalai")
-    saved_results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert saved_results == results

+    # Colossal requires warmup, you can't run validation/test without having fit first
+    # This is a temporary limitation
+    trainer.test(model, datamodule=dm)


 @RunIf(min_cuda_gpus=2, standalone=True, colossalai=True, sklearn=True)
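The new `test_test_without_fit` documents a current limitation as an expected failure: pytest records the raised `AssertionError` as `xfailed` rather than `failed`, which is exactly why `run_standalone_tests.sh` above now ignores `xfailed` lines. A minimal standalone sketch of the pattern (hypothetical test, not the repository's code):

```python
import pytest

@pytest.mark.xfail(raises=AssertionError)
def test_expected_failure():
    # The body raises the expected exception, so pytest reports `xfailed`.
    # If no AssertionError were raised, the test would be reported as `xpassed`.
    raise AssertionError("You should run a completed iteration as your warmup iter")
```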
@@ -255,6 +262,7 @@ def test_multi_gpu_model_colossalai_fit_test(tmpdir):
         precision=16,
         strategy=ColossalAIStrategy(initial_scale=32),
         max_epochs=1,
+        num_sanity_val_steps=0,  # TODO: remove once validation/test before fitting is supported again
     )
     trainer.fit(model, datamodule=dm)

