[AutoConfig]add benchmark scripts (PaddlePaddle#7897)
* add auto_tuner

* fix

* update log_file

* update json

* close eval/predict

* fix run_mode

* update

* fix

* Revert "fix"

This reverts commit e526c86.

* Revert "update"

This reverts commit 9cbd773.

* update prepare

* Revert "Revert "update""

This reverts commit 811b6a4.

* Revert "Revert "fix""

This reverts commit 32cc005.

* update finetune prepare

* update

* add

* update sft/lora steps

* update json

* update

* add benchmark

* update years

* update a100
Liujie0926 authored Jan 26, 2024
1 parent 95c0dd4 commit 879a853
Showing 14 changed files with 799 additions and 0 deletions.
33 changes: 33 additions & 0 deletions tests/test_tipc/auto_tuner/autoconfig/check.sh
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

autoconfig_json_file=$(basename "$1") # e.g. autoconfig/llama7b_pretrain.json
model_name=${autoconfig_json_file%.*}
auto_log_file=./autoconfig/${model_name}_auto_tuner.log

if [ -f "$auto_log_file" ] && grep -q "Launch best cfg:" "$auto_log_file"; then
    echo "autotuner found the best configuration"
    if [ -d "./autoconfig/best_cfg" ]; then
        echo "autotuner ran the best configuration"
        exit 0
    else
        echo "autotuner did not run the best configuration"
        exit -1
    fi
else
    echo "autotuner run failed: check that the log file exists and contains the expected text!"
    exit -1
fi
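
A minimal usage sketch for the check script above, assuming it is run from tests/test_tipc/auto_tuner (the directory that contains ./autoconfig); the config name comes from the example comment in the script itself:

cd tests/test_tipc/auto_tuner
bash ./autoconfig/check.sh autoconfig/llama7b_pretrain.json
# Exits 0 only if ./autoconfig/llama7b_pretrain_auto_tuner.log contains
# "Launch best cfg:" and the ./autoconfig/best_cfg directory exists;
# otherwise it exits with a non-zero status.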
80 changes: 80 additions & 0 deletions tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
@@ -0,0 +1,80 @@
{
"dp_degree": "auto",
"invalid_strategy": [
"stage3_mp*"
],
"max_search_time": 900,
"max_time_per_task": 300,
"metric_cfg": {
"OptimizationDirection": "Maximize",
"name": "interval_samples_per_second"
},
"micro_batch_size": "auto",
"mode": "LoRA",
"model_cfg": {
"global_batch_size": 8,
"hidden_size": 4096,
"num_attention_heads": 32,
"num_layers": 28,
"vocab_size": 65024
},
"mp_degree": [
1
],
"need_baseline": true,
"pp_degree": [
1
],
"run_cmd": {
"gradient_accumulation_steps": [
"./autoconfig/llama7b_lora_params.json",
"gradient_accumulation_steps"
],
"micro_batch_size": [
"./autoconfig/llama7b_lora_params.json",
"per_device_train_batch_size"
],
"mp_degree": [
"./autoconfig/llama7b_lora_params.json",
"tensor_parallel_degree"
],
"pp_degree": [
"./autoconfig/llama7b_lora_params.json",
"pipeline_parallel_degree"
],
"run_best_stage": {
"autotuner_benchmark": [
"./autoconfig/llama7b_lora_params.json",
"autotuner_benchmark",
0
]
},
"search_stage": {
"autotuner_benchmark": [
"./autoconfig/llama7b_lora_params.json",
"autotuner_benchmark",
1
]
},
"sharding_degree": [
"./autoconfig/llama7b_lora_params.json",
"sharding_parallel_degree"
],
"sharding_stage": [
"./autoconfig/llama7b_lora_params.json",
"sharding",
"stage"
],
"use_recompute": [
"./autoconfig/llama7b_lora_params.json",
"recompute"
]
},
"schedule_prior": [
"mp4"
],
"sharding_degree": "auto",
"sharding_stage": "auto",
"task_limit": 2000,
"use_recompute": "auto"
}
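
In the run_cmd block above, each entry maps one tuner dimension to a [params file, key] pair (with an optional fixed value, as in the search_stage and run_best_stage overrides), so each candidate configuration is materialized by rewriting ./autoconfig/llama7b_lora_params.json before a trial is launched. Below is a minimal illustration of that rewrite, assuming jq is available; the tuner performs the substitution internally, so this only sketches the effect for a hypothetical candidate:

# Hypothetical candidate: tensor_parallel_degree=2, per_device_train_batch_size=2,
# plus the search-stage override autotuner_benchmark=1.
jq '.tensor_parallel_degree = 2 | .per_device_train_batch_size = 2 | .autotuner_benchmark = 1' \
    ./autoconfig/llama7b_lora_params.json > candidate_params.json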
38 changes: 38 additions & 0 deletions tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_params.json
@@ -0,0 +1,38 @@
{
"model_name_or_path": "facebook/llama-7b",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_lora_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps": 16,
"num_train_epochs": 1,
"max_steps": 100,
"learning_rate": 0.0003,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "no",
"save_strategy": "steps",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": false,
"disable_tqdm": true,
"load_best_model_at_end": false,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"lora": true,
"zero_padding": false,
"use_flash_attention": true,
"sharding_parallel_degree": 8,
"sharding": "stage3",
"recompute_granularity": "full_attn",
"autotuner_benchmark": 1,
"benchmark": 1
}
87 changes: 87 additions & 0 deletions tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain.json
@@ -0,0 +1,87 @@
{
"dp_degree": "auto",
"max_search_time": 900,
"max_time_per_task": 400,
"metric_cfg": {
"OptimizationDirection": "Maximize",
"name": "interval_samples_per_second"
},
"micro_batch_size": "auto",
"model_cfg": {
"global_batch_size": 8,
"hidden_size": 5120,
"num_attention_heads": 40,
"num_layers": 40,
"vocab_size": 32000
},
"mp_degree": "auto",
"pp_degree": "auto",
"run_cmd": {
"gradient_accumulation_steps": [
"./autoconfig/llama7b_pretrain_params.json",
"gradient_accumulation_steps"
],
"micro_batch_size": [
"./autoconfig/llama7b_pretrain_params.json",
"per_device_train_batch_size"
],
"mp_degree": [
"./autoconfig/llama7b_pretrain_params.json",
"tensor_parallel_degree"
],
"pp_degree": [
"./autoconfig/llama7b_pretrain_params.json",
"pipeline_parallel_degree"
],
"run_best_stage": {
"continue_training": [
"./autoconfig/llama7b_pretrain_params.json",
"continue_training",
0
],
"autotuner_benchmark": [
"./autoconfig/llama7b_pretrain_params.json",
"autotuner_benchmark",
0
]
},
"search_stage": {
"continue_training": [
"./autoconfig/llama7b_pretrain_params.json",
"continue_training",
0
],
"autotuner_benchmark": [
"./autoconfig/llama7b_pretrain_params.json",
"autotuner_benchmark",
1
]
},
"sharding_degree": [
"./autoconfig/llama7b_pretrain_params.json",
"sharding_parallel_degree"
],
"sharding_stage": [
"./autoconfig/llama7b_pretrain_params.json",
"sharding",
"stage"
],
"use_recompute": [
"./autoconfig/llama7b_pretrain_params.json",
"recompute"
],
"recompute_granularity": [
"./autoconfig/llama7b_pretrain_params.json",
"recompute_granularity"
]
},
"sharding_degree": "auto",
"sharding_stage": "auto",
"task_limit": 2000,
"use_recompute": "auto",
"recompute_granularity": "auto",
"invalid_strategy": ["stage3_mp*"],
"schedule_prior": ["mp4"],
"need_baseline": true,
"mode": "Pretrain"
}
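
A sketch of how this search configuration would typically be launched, assuming the paddle.distributed.launch module in this Paddle release accepts an --auto_tuner_json flag and that run_pretrain.py is the PaddleNLP entry point that consumes the params file; both names are assumptions and do not appear in this commit:

# Assumed launch command; --devices, --auto_tuner_json and the script path are
# not specified anywhere in this diff.
python -m paddle.distributed.launch \
    --devices "0,1,2,3,4,5,6,7" \
    --auto_tuner_json ./autoconfig/llama7b_pretrain.json \
    run_pretrain.py ./autoconfig/llama7b_pretrain_params.json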
42 changes: 42 additions & 0 deletions tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_params.json
@@ -0,0 +1,42 @@
{
"model_name_or_path": "facebook/llama-7b",
"tokenizer_name_or_path": "facebook/llama-7b",
"input_dir": "./data",
"output_dir": "./checkpoints/llama_pretrain_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 8,
"per_device_eval_batch_size": 2,
"tensor_parallel_degree": 8,
"pipeline_parallel_degree": 1,
"sharding": "stage3",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"use_fused_rope": true,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 100,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"dataloader_num_workers": 1,
"continue_training": 0,
"do_train": true,
"do_eval": false,
"do_predict": false,
"disable_tqdm": true,
"recompute": true,
"distributed_dataloader": 1,
"recompute_granularity": "full",
"save_total_limit": 2,
"sharding_parallel_degree": 1,
"autotuner_benchmark": 1
}
78 changes: 78 additions & 0 deletions tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json
@@ -0,0 +1,78 @@
{
"dp_degree": "auto",
"invalid_strategy": [
"stage3_mp*"
],
"max_search_time": 900,
"max_time_per_task": 300,
"metric_cfg": {
"OptimizationDirection": "Maximize",
"name": "interval_samples_per_second"
},
"micro_batch_size": "auto",
"mode": "SFT",
"model_cfg": {
"global_batch_size": 8,
"hidden_size": 4096,
"num_attention_heads": 32,
"num_layers": 28,
"vocab_size": 65024
},
"mp_degree": "auto",
"need_baseline": true,
"pp_degree": [
1
],
"run_cmd": {
"gradient_accumulation_steps": [
"./autoconfig/llama7b_sft_params.json",
"gradient_accumulation_steps"
],
"micro_batch_size": [
"./autoconfig/llama7b_sft_params.json",
"per_device_train_batch_size"
],
"mp_degree": [
"./autoconfig/llama7b_sft_params.json",
"tensor_parallel_degree"
],
"pp_degree": [
"./autoconfig/llama7b_sft_params.json",
"pipeline_parallel_degree"
],
"run_best_stage": {
"autotuner_benchmark": [
"./autoconfig/llama7b_sft_params.json",
"autotuner_benchmark",
0
]
},
"search_stage": {
"autotuner_benchmark": [
"./autoconfig/llama7b_sft_params.json",
"autotuner_benchmark",
1
]
},
"sharding_degree": [
"./autoconfig/llama7b_sft_params.json",
"sharding_parallel_degree"
],
"sharding_stage": [
"./autoconfig/llama7b_sft_params.json",
"sharding",
"stage"
],
"use_recompute": [
"./autoconfig/llama7b_sft_params.json",
"recompute"
]
},
"schedule_prior": [
"mp4"
],
"sharding_degree": "auto",
"sharding_stage": "auto",
"task_limit": 2000,
"use_recompute": "auto"
}
