Merge remote-tracking branch 'upstream/develop' into add_master_grad
heavyrain-lzy committed Dec 13, 2023
2 parents 4cb6cb4 + a73a7bf commit 877da15
Showing 33 changed files with 366 additions and 201 deletions.
14 changes: 10 additions & 4 deletions .github/codecov.yml
@@ -1,8 +1,14 @@
codecov:
notify:
require_ci_to_pass: yes

coverage:
status:
project:
default:
informational: true
default:
target: 75% # overall project Coverage < 75% CI will fail
informational: true
patch:
default:
informational: true
default:
target: 90% # lines adjusted Coverage < 90% CI will fail
informational: true
1 change: 1 addition & 0 deletions llm/data.py
@@ -208,6 +208,7 @@ def convert_rounds_example_common(example, tokenizer, data_args, is_test=True, i
input_ids = rounds_inputs.pop("input_ids")
# shift input_ids and labels
input_ids, labels = input_ids[:-1], labels[1:]

seq_length = len(input_ids)
features = {"input_ids": input_ids, "labels": labels}
if intokens:
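The shift shown in this hunk is the standard causal-LM alignment: each input position is trained to predict the token one step ahead, so the inputs drop their last token and the labels drop their first. A minimal illustration with hypothetical token ids, not part of the commit:

# Hypothetical example of the shift above: inputs lose the last token,
# labels lose the first, so input position i predicts label position i.
input_ids = [101, 7592, 2088, 102]
labels = [101, 7592, 2088, 102]
input_ids, labels = input_ids[:-1], labels[1:]
assert input_ids == [101, 7592, 2088]
assert labels == [7592, 2088, 102]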
2 changes: 1 addition & 1 deletion llm/gpt-3/finetune_generation.py
@@ -100,7 +100,7 @@ def main():
training_args.tgt_length = data_args.tgt_length
paddle.set_device(training_args.device)

set_seed(args=training_args)
set_seed(seed=training_args.seed)

# Log on each process the small summary:
logger.warning(
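This change (repeated below in llm/llama/benchmark.py and paddlenlp/trainer/trainer.py) switches set_seed from receiving the whole TrainingArguments object to receiving the bare integer seed. As a rough sketch of what such a helper typically does, assuming the usual trio of RNGs; the real paddlenlp implementation may seed additional state (for example per-worker offsets in distributed runs):

# Hedged sketch of a seed helper with the new calling convention; illustrative only.
import random

import numpy as np
import paddle


def set_seed(seed: int) -> None:
    random.seed(seed)      # Python's built-in RNG
    np.random.seed(seed)   # NumPy RNG
    paddle.seed(seed)      # Paddle's global generator (CPU and GPU)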
28 changes: 15 additions & 13 deletions llm/llama/auto_parallel/run_auto.sh
@@ -12,20 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# just for debug
# just for debug auto_parallel

set -x
unset CUDA_VISIBLE_DEVICES

export FLAGS_call_stack_level=2

task_name="llama_auto_dp2mp2pp2"
rm -rf output/$task_name/
task_name="llama_auto_dp2sharding2mp2pp2_vpp2"
# rm -rf output/$task_name/ # ckpt is saved in 'output/''
rm -rf "output/$task_name""_log"

export SOT_LOG_LEVEL=4
export PARALLEL_CROSS_ENTROPY=true
export FLAGS_call_stack_level=2
export PYTHONPATH=../../../:$PYTHONPATH

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "output/$task_name""_log" \
@@ -38,17 +36,19 @@ python -u -m paddle.distributed.launch \
--split 949,50,1 \
--max_seq_length 2048 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 8 \
--use_flash_attention 0 \
--use_fused_rms_norm 0 \
--fp16 0 \
--fp16_opt_level "O2" \
--scale_loss 1024 \
--pipeline_parallel_degree 2 \
--tensor_parallel_degree 2 \
--sharding_parallel_degree 1 \
--sharding "stage1" \
--pipeline_parallel_degree 2 \
--virtual_pp_degree 2 \
--pipeline_schedule_mode "VPP" \
--sharding_parallel_degree 2 \
--sharding "stage2" \
--learning_rate 0.0001 \
--min_learning_rate 0.00001 \
--max_steps 10 \
@@ -58,14 +58,16 @@ python -u -m paddle.distributed.launch \
--max_grad_norm 1.0 \
--logging_steps 1 \
--dataloader_num_workers 1 \
--sharding "" \
--eval_steps 1000 \
--report_to "visualdl" \
--disable_tqdm true \
--continue_training 0 \
--recompute 1 \
--recompute_granularity full \
--do_train \
--do_eval \
--device "gpu" \
--data_impl "mmap" \
--parallel_mode "auto"

# --resume_from_checkpoint "output/llama_auto_serial/checkpoint-2" \
41 changes: 41 additions & 0 deletions llm/llama/auto_parallel/run_pretrain_auto.py
@@ -32,8 +32,10 @@
PdArgumentParser,
Trainer,
TrainingArguments,
get_last_checkpoint,
speed_metrics,
)
from paddlenlp.trainer.trainer_utils import PREFIX_CHECKPOINT_DIR
from paddlenlp.transformers import (
AutoTokenizer,
CosineAnnealingWithWarmupDecay,
@@ -98,6 +100,9 @@ class PreTrainingArguments(TrainingArguments):
"help": "Enable fused_linear_param_grad pass, which should replace add_n_op with add_op for gradients accumulation."
},
)
pipeline_schedule_mode: str = field(
    default="1F1B", metadata={"help": "The pipeline schedule mode, support FThenB, 1F1B, VPP and Eager-1F1B."}
)

def __post_init__(self):
super().__post_init__()
@@ -441,6 +446,21 @@ def main():
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}"
)

# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    # if last_checkpoint is None and len(
    #     os.listdir(training_args.output_dir)) > 1:
    #     raise ValueError(
    #         f"Output directory ({training_args.output_dir}) already exists and is not empty. "
    #         "Use --overwrite_output_dir to overcome.")
    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )

config_class, model_class = MODEL_CLASSES[model_args.model_type]

tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
@@ -553,6 +573,17 @@ def loss_func(loss, outputs):
print_config(training_args)

engine = auto.Engine(model, loss_func, optimizer, strategy=training_args.strategy)

checkpoint = None
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
    checkpoint = last_checkpoint

if checkpoint:
    logger.info(f"Starting training from resume_from_checkpoint : {checkpoint}")
    engine.load(os.path.join(checkpoint, "auto"))

engine.prepare(
[
paddle.static.InputSpec(
@@ -638,6 +669,16 @@ def loss_func(loss, outputs):
start_time_last_logged = time.time()
tr_loss = float(0)

if training_args.save_steps > 0 and global_step % training_args.save_steps == 0:
    paddle.device.cuda.synchronize()
    checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{global_step}"
    run_dir = training_args.output_dir
    output_dir = os.path.join(run_dir, checkpoint_folder)
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Saving model checkpoint to {output_dir}")
    prefix_path = os.path.join(output_dir, "auto")
    engine.save(prefix_path, training=True)

if global_step >= training_args.max_steps:
break

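Taken together, the hunks above wire checkpoint handling into the auto-parallel pretraining script: find the newest checkpoint folder in output_dir, engine.load() its "auto" prefix before training unless --resume_from_checkpoint overrides it, and engine.save() every save_steps steps into a PREFIX_CHECKPOINT_DIR-<step> folder. As a rough sketch of what a get_last_checkpoint-style lookup does, assuming the "checkpoint-<step>" naming used here (the commit imports the real helper from paddlenlp.trainer, which may differ in details):

# Hypothetical re-implementation for illustration only.
import os
import re
from typing import Optional

_CHECKPOINT_RE = re.compile(r"^checkpoint-(\d+)$")


def get_last_checkpoint(output_dir: str) -> Optional[str]:
    """Return the checkpoint-<step> subfolder with the largest step, if any."""
    best_step, best_path = -1, None
    for name in os.listdir(output_dir):
        match = _CHECKPOINT_RE.match(name)
        path = os.path.join(output_dir, name)
        if match and os.path.isdir(path) and int(match.group(1)) > best_step:
            best_step, best_path = int(match.group(1)), path
    return best_path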
2 changes: 1 addition & 1 deletion llm/llama/benchmark.py
@@ -175,7 +175,7 @@ def main():

paddle.set_device(training_args.device)

set_seed(args=training_args)
set_seed(seed=training_args.seed)

# Log on each process the small summary:
logger.warning(
4 changes: 3 additions & 1 deletion llm/predictor.py
@@ -43,6 +43,7 @@
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
ChatGLMv2Tokenizer,
LlamaTokenizer,
PretrainedModel,
PretrainedTokenizer,
@@ -197,7 +198,8 @@ def _preprocess(self, source):
return_tensors=self.return_tensors,
padding=True,
# when use chat_template, it should not add special tokens
add_special_tokens=self.config.chat_template is None,
# chatglm2 prefix-tokens can not be tokenized into ids
add_special_tokens=self.tokenizer.chat_template is None or isinstance(self.tokenizer, ChatGLMv2Tokenizer),
)
return tokenized_source

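This hunk, and the matching one in llm/utils.py below, keeps special tokens enabled for ChatGLMv2 even when a chat template is set, because its prefix tokens cannot be recovered by tokenizing plain text. The condition used at both call sites reduces to a small predicate; a hedged restatement (the real code checks isinstance against the imported ChatGLMv2Tokenizer class rather than the class name):

# Restates the add_special_tokens condition from the two hunks; illustrative only.
def should_add_special_tokens(tokenizer) -> bool:
    # Special tokens are normally skipped when a chat template formats the
    # prompt, except for ChatGLMv2, whose prefix tokens are not reachable
    # through plain text tokenization.
    is_chatglm2 = type(tokenizer).__name__ == "ChatGLMv2Tokenizer"
    return tokenizer.chat_template is None or is_chatglm2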
4 changes: 2 additions & 2 deletions llm/utils.py
@@ -29,7 +29,7 @@
from paddlenlp.datasets import InTokensIterableDataset
from paddlenlp.trainer import Trainer, TrainerCallback
from paddlenlp.trainer.trainer_utils import IterableDatasetShard, has_length
from paddlenlp.transformers import LlamaForCausalLMPipe
from paddlenlp.transformers import ChatGLMv2Tokenizer, LlamaForCausalLMPipe
from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer
from paddlenlp.utils.log import logger

@@ -409,7 +409,7 @@ def dybatch_preprocess(
padding=True,
max_length=src_length,
# if use chat_template, it will not add special_tokens
add_special_tokens=tokenizer.chat_template is None,
add_special_tokens=tokenizer.chat_template is None or isinstance(tokenizer, ChatGLMv2Tokenizer),
)
input_ids.append(tokens["input_ids"][0])
position_ids.append(tokens["position_ids"][0])
@@ -24,7 +24,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h
@@ -22,7 +22,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h
@@ -22,7 +22,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
@@ -26,7 +26,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h
@@ -24,7 +24,7 @@
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h
@@ -24,7 +24,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
@@ -24,7 +24,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_miro_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_opt_op.h
@@ -24,7 +24,7 @@
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
@@ -24,7 +24,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
@@ -26,7 +26,7 @@ limitations under the License. */
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
@@ -29,7 +29,7 @@
#ifdef PADDLE_ON_INFERENCE
#include "paddle/extension.h"
#include "paddle_inference_api.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/common/exception.h"
#else
#include "paddle/extension.h"
#endif
2 changes: 1 addition & 1 deletion paddlenlp/trainer/trainer.py
@@ -260,7 +260,7 @@ def __init__(
self._memory_tracker.start()

# Seed must be set before instantiating the model when using model
set_seed(args=self.args)
set_seed(seed=self.args.seed)

if model is None:
raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument")