From a4a595e1f5bf31185087c67109584acbdfdf3dc0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Mar 2023 17:08:23 -0800 Subject: [PATCH 1/3] [deepspeed] offload + non-cpuadam optimizer exception --- setup.py | 2 +- tests/deepspeed/test_deepspeed.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1be88908c937..59dea97226b2 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,7 @@ "dataclasses", "datasets!=2.5.0", "decord==0.6.0", - "deepspeed>=0.6.5", + "deepspeed>=0.8.3", "dill<0.3.5", "evaluate>=0.2.0", "fairscale>0.3", diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 9b203dfd7b95..5b9e14eb64f0 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -426,6 +426,7 @@ def test_hf_optimizer_with_offload(self, stage, dtype): del ds_config_dict["optimizer"] # force default HF Trainer optimizer # force cpu offload ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" + ds_config_dict["zero_force_ds_cpu_optimizer"] = True # offload is not efficient w/o CPUAdam with mockenv_context(**self.dist_env_1_gpu): kwargs = {"local_rank": 0, "deepspeed": ds_config_dict} kwargs[dtype] = True @@ -776,6 +777,7 @@ def test_load_best_model(self, stage, dtype): ds_config_dict = self.get_config_dict(stage) del ds_config_dict["optimizer"] # will use HF Trainer optimizer del ds_config_dict["scheduler"] # will use HF Trainer scheduler + ds_config_dict["zero_force_ds_cpu_optimizer"] = True # offload is not efficient w/o CPUAdam # must use this setting to get the reload path exercised ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True From 00eed151c6124259541015b6c98bae27ee25c9b2 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Mar 2023 17:14:15 -0800 Subject: [PATCH 2/3] flip --- tests/deepspeed/test_deepspeed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 5b9e14eb64f0..ba9c269cd107 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -426,7 +426,7 @@ def test_hf_optimizer_with_offload(self, stage, dtype): del ds_config_dict["optimizer"] # force default HF Trainer optimizer # force cpu offload ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" - ds_config_dict["zero_force_ds_cpu_optimizer"] = True # offload is not efficient w/o CPUAdam + ds_config_dict["zero_force_ds_cpu_optimizer"] = False # offload is not efficient w/o CPUAdam with mockenv_context(**self.dist_env_1_gpu): kwargs = {"local_rank": 0, "deepspeed": ds_config_dict} kwargs[dtype] = True @@ -777,7 +777,7 @@ def test_load_best_model(self, stage, dtype): ds_config_dict = self.get_config_dict(stage) del ds_config_dict["optimizer"] # will use HF Trainer optimizer del ds_config_dict["scheduler"] # will use HF Trainer scheduler - ds_config_dict["zero_force_ds_cpu_optimizer"] = True # offload is not efficient w/o CPUAdam + ds_config_dict["zero_force_ds_cpu_optimizer"] = False # offload is not efficient w/o CPUAdam # must use this setting to get the reload path exercised ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True From e9d5ad54ec316fd79cc04c0ca781340f9ac521c8 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 8 Mar 2023 17:23:41 -0800 Subject: [PATCH 3/3] revert min version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 59dea97226b2..1be88908c937 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,7 @@ "dataclasses", "datasets!=2.5.0", "decord==0.6.0", - "deepspeed>=0.8.3", + "deepspeed>=0.6.5", "dill<0.3.5", "evaluate>=0.2.0", "fairscale>0.3",