diff --git a/docker/rocm/ds_config_cpu_offload_off.json b/docker/rocm/ds_config_cpu_offload_off.json
new file mode 100644
index 000000000000..18ce5a399923
--- /dev/null
+++ b/docker/rocm/ds_config_cpu_offload_off.json
@@ -0,0 +1,47 @@
+{
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  "zero_optimization": {
+    "stage": 2,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 2e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 2e8,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+
+  "zero_allow_untested_optimizer": true,
+
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": 3e-5,
+      "betas": [
+        0.8,
+        0.999
+      ],
+      "eps": 1e-8,
+      "weight_decay": 3e-7
+    }
+  },
+
+  "scheduler": {
+    "type": "WarmupLR",
+    "params": {
+      "warmup_min_lr": 0,
+      "warmup_max_lr": 3e-5,
+      "warmup_num_steps": 500
+    }
+  },
+
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
diff --git a/docker/rocm/ds_config_cpu_offload_on.json b/docker/rocm/ds_config_cpu_offload_on.json
new file mode 100644
index 000000000000..0170afb5d292
--- /dev/null
+++ b/docker/rocm/ds_config_cpu_offload_on.json
@@ -0,0 +1,47 @@
+{
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  "zero_optimization": {
+    "stage": 2,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 1.5e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 1.5e8,
+    "contiguous_gradients": true,
+    "cpu_offload": true
+  },
+
+  "zero_allow_untested_optimizer": true,
+
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": 3e-5,
+      "betas": [
+        0.8,
+        0.999
+      ],
+      "eps": 1e-8,
+      "weight_decay": 3e-7
+    }
+  },
+
+  "scheduler": {
+    "type": "WarmupLR",
+    "params": {
+      "warmup_min_lr": 0,
+      "warmup_max_lr": 3e-5,
+      "warmup_num_steps": 500
+    }
+  },
+
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
diff --git a/docker/rocm/hugginface_zero.Dockerfile b/docker/rocm/hugginface_zero.Dockerfile
new file mode 100644
index 000000000000..42cba5d713f1
--- /dev/null
+++ b/docker/rocm/hugginface_zero.Dockerfile
@@ -0,0 +1,21 @@
+# Select base image
+FROM rraminen/deepspeed:DeepSpeed_Megatron-LM-GPT2_bingBERT_rocm4.0
+
+# Install system dependencies. apt-get is used instead of apt because apt's
+# command-line interface is not guaranteed stable for scripted use; the package
+# lists are removed in the same layer so they do not bloat the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies without keeping pip's download cache in the layer
+RUN pip3 install --no-cache-dir regex sacremoses filelock gitpython rouge_score sacrebleu datasets fairscale
+
+# Copy the build context into the workspace and install it as a package
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+# Set work dir
+WORKDIR /workspace/transformers