diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index bf8ceca533c9..945457b304c5 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -34,6 +34,8 @@ jobs:
           pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+          sudo apt-get update
+          sudo apt-get install -y libaio-dev
 
       - name: Python environment
         run: |
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 457228f34700..24a171fcac5f 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -646,6 +646,14 @@ def zero_offload_optimizer(self):
     def zero_offload_param(self):
         return self._config.zero_config.offload_param
 
+    def zero_use_cpu_optimizer(self):
+        if self._config.zero_config.offload_optimizer is not None:
+            return self._config.zero_config.offload_optimizer.device in [
+                OffloadDeviceEnum.cpu,
+                OffloadDeviceEnum.nvme
+            ]
+        return False
+
     def zero_cpu_offload(self):
         if self._config.zero_config.offload_optimizer is not None:
             return self._config.zero_config.offload_optimizer.device == OffloadDeviceEnum.cpu
@@ -1188,7 +1196,7 @@ def _configure_basic_optimizer(self, model_parameters):
                     optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters)
             else:
-                if self.zero_cpu_offload():
+                if self.zero_use_cpu_optimizer():
                     if self.optimizer_name() == ADAGRAD_OPTIMIZER:
                         from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad
                         optimizer = DeepSpeedCPUAdagrad(model_parameters,
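
Usage note (not part of the patch): since zero_use_cpu_optimizer() now also returns True for NVMe offload, a ZeRO config that offloads optimizer state to NVMe selects the CPU-kernel optimizers (DeepSpeedCPUAdam/DeepSpeedCPUAdagrad) rather than the default path, which is presumably why the workflow adds libaio-dev (needed to build DeepSpeed's async I/O op). A minimal config sketch, assuming the standard ZeRO stage-3 offload_optimizer schema; the batch size and nvme_path below are illustrative, not taken from this PR:

    # Sketch only: exercises the new zero_use_cpu_optimizer() path via NVMe offload.
    ds_config = {
        "train_batch_size": 8,                      # illustrative value
        "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "nvme",                   # now routed to the CPU-kernel optimizer too
                "nvme_path": "/local_nvme"          # illustrative path to a fast local NVMe mount
            }
        }
    }
    # Pass ds_config to deepspeed.initialize(..., config=ds_config) as usual.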