diff --git a/.github/codecov.yml b/.github/codecov.yml index a793f5919561..b61b3a0df9fe 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -1,8 +1,14 @@ +codecov: + notify: + require_ci_to_pass: yes + coverage: status: project: - default: - informational: true + default: + target: 75% # overall project Coverage < 75% CI will fail + informational: true patch: - default: - informational: true \ No newline at end of file + default: + target: 90% # lines adjusted Coverage < 90% CI will fail + informational: true diff --git a/llm/data.py b/llm/data.py index f8de60eaa1e6..b454c2082511 100644 --- a/llm/data.py +++ b/llm/data.py @@ -208,6 +208,7 @@ def convert_rounds_example_common(example, tokenizer, data_args, is_test=True, i input_ids = rounds_inputs.pop("input_ids") # shift input_ids and labels input_ids, labels = input_ids[:-1], labels[1:] + seq_length = len(input_ids) features = {"input_ids": input_ids, "labels": labels} if intokens: diff --git a/llm/gpt-3/finetune_generation.py b/llm/gpt-3/finetune_generation.py index 9018221f78da..0d0df71d8100 100644 --- a/llm/gpt-3/finetune_generation.py +++ b/llm/gpt-3/finetune_generation.py @@ -100,7 +100,7 @@ def main(): training_args.tgt_length = data_args.tgt_length paddle.set_device(training_args.device) - set_seed(args=training_args) + set_seed(seed=training_args.seed) # Log on each process the small summary: logger.warning( diff --git a/llm/llama/auto_parallel/run_auto.sh b/llm/llama/auto_parallel/run_auto.sh index 8e01dcec3406..f8a114870dad 100644 --- a/llm/llama/auto_parallel/run_auto.sh +++ b/llm/llama/auto_parallel/run_auto.sh @@ -12,20 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -# just for debug +# just for debug auto_parallel set -x unset CUDA_VISIBLE_DEVICES -export FLAGS_call_stack_level=2 - -task_name="llama_auto_dp2mp2pp2" -rm -rf output/$task_name/ +task_name="llama_auto_dp2sharding2mp2pp2_vpp2" +# rm -rf output/$task_name/ # ckpt is saved in 'output/'' rm -rf "output/$task_name""_log" -export SOT_LOG_LEVEL=4 +export PARALLEL_CROSS_ENTROPY=true +export FLAGS_call_stack_level=2 export PYTHONPATH=../../../:$PYTHONPATH - python -u -m paddle.distributed.launch \ --gpus "0,1,2,3,4,5,6,7" \ --log_dir "output/$task_name""_log" \ @@ -38,17 +36,19 @@ python -u -m paddle.distributed.launch \ --split 949,50,1 \ --max_seq_length 2048 \ --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 8 \ + --gradient_accumulation_steps 8 \ --use_flash_attention 0 \ --use_fused_rms_norm 0 \ --fp16 0 \ --fp16_opt_level "O2" \ --scale_loss 1024 \ - --pipeline_parallel_degree 2 \ --tensor_parallel_degree 2 \ - --sharding_parallel_degree 1 \ - --sharding "stage1" \ + --pipeline_parallel_degree 2 \ + --virtual_pp_degree 2 \ + --pipeline_schedule_mode "VPP" \ + --sharding_parallel_degree 2 \ + --sharding "stage2" \ --learning_rate 0.0001 \ --min_learning_rate 0.00001 \ --max_steps 10 \ @@ -58,14 +58,16 @@ python -u -m paddle.distributed.launch \ --max_grad_norm 1.0 \ --logging_steps 1 \ --dataloader_num_workers 1 \ - --sharding "" \ --eval_steps 1000 \ --report_to "visualdl" \ --disable_tqdm true \ --continue_training 0 \ --recompute 1 \ + --recompute_granularity full \ --do_train \ --do_eval \ --device "gpu" \ --data_impl "mmap" \ --parallel_mode "auto" + + # --resume_from_checkpoint "output/llama_auto_serial/checkpoint-2" \ diff --git a/llm/llama/auto_parallel/run_pretrain_auto.py 
b/llm/llama/auto_parallel/run_pretrain_auto.py index 91ffcf50ffdc..d728f5437516 100644 --- a/llm/llama/auto_parallel/run_pretrain_auto.py +++ b/llm/llama/auto_parallel/run_pretrain_auto.py @@ -32,8 +32,10 @@ PdArgumentParser, Trainer, TrainingArguments, + get_last_checkpoint, speed_metrics, ) +from paddlenlp.trainer.trainer_utils import PREFIX_CHECKPOINT_DIR from paddlenlp.transformers import ( AutoTokenizer, CosineAnnealingWithWarmupDecay, @@ -98,6 +100,9 @@ class PreTrainingArguments(TrainingArguments): "help": "Enable fused_linear_param_grad pass, which should replace add_n_op with add_op for gradients accumulation." }, ) + pipeline_schedule_mode: str = field( + default="1F1B", metadata={"help": "The pipeline schedule mode, support FThenB, 1F1B, VPP and Eager-1F1B."} + ) def __post_init__(self): super().__post_init__() @@ -441,6 +446,21 @@ def main(): + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + # if last_checkpoint is None and len( + # os.listdir(training_args.output_dir)) > 1: + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome.") + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + config_class, model_class = MODEL_CLASSES[model_args.model_type] tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path) @@ -553,6 +573,17 @@ def loss_func(loss, outputs): print_config(training_args) engine = auto.Engine(model, loss_func, optimizer, strategy=training_args.strategy) + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if checkpoint: + logger.info(f"Starting training from resume_from_checkpoint : {checkpoint}") + engine.load(os.path.join(checkpoint, "auto")) + engine.prepare( [ paddle.static.InputSpec( @@ -638,6 +669,16 @@ def loss_func(loss, outputs): start_time_last_logged = time.time() tr_loss = float(0) + if training_args.save_steps > 0 and global_step % training_args.save_steps == 0: + paddle.device.cuda.synchronize() + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{global_step}" + run_dir = training_args.output_dir + output_dir = os.path.join(run_dir, checkpoint_folder) + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + prefix_path = os.path.join(output_dir, "auto") + engine.save(prefix_path, training=True) + if global_step >= training_args.max_steps: break diff --git a/llm/llama/benchmark.py b/llm/llama/benchmark.py index 8b488d89a8a3..16adae39c2c1 100644 --- a/llm/llama/benchmark.py +++ b/llm/llama/benchmark.py @@ -175,7 +175,7 @@ def main(): paddle.set_device(training_args.device) - set_seed(args=training_args) + set_seed(seed=training_args.seed) # Log on each process the small summary: logger.warning( diff --git a/llm/predictor.py b/llm/predictor.py index 13ca1981227b..b4245e58b747 100644 --- a/llm/predictor.py +++ 
b/llm/predictor.py @@ -43,6 +43,7 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, + ChatGLMv2Tokenizer, LlamaTokenizer, PretrainedModel, PretrainedTokenizer, @@ -197,7 +198,8 @@ def _preprocess(self, source): return_tensors=self.return_tensors, padding=True, # when use chat_template, it should not add special tokens - add_special_tokens=self.config.chat_template is None, + # chatglm2 prefix-tokens can not be tokenized into ids + add_special_tokens=self.tokenizer.chat_template is None or isinstance(self.tokenizer, ChatGLMv2Tokenizer), ) return tokenized_source diff --git a/llm/utils.py b/llm/utils.py index e36e5cb7b1c3..fc438d3f3802 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -29,7 +29,7 @@ from paddlenlp.datasets import InTokensIterableDataset from paddlenlp.trainer import Trainer, TrainerCallback from paddlenlp.trainer.trainer_utils import IterableDatasetShard, has_length -from paddlenlp.transformers import LlamaForCausalLMPipe +from paddlenlp.transformers import ChatGLMv2Tokenizer, LlamaForCausalLMPipe from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer from paddlenlp.utils.log import logger @@ -409,7 +409,7 @@ def dybatch_preprocess( padding=True, max_length=src_length, # if use chat_template, it will not add special_tokens - add_special_tokens=tokenizer.chat_template is None, + add_special_tokens=tokenizer.chat_template is None or isinstance(tokenizer, ChatGLMv2Tokenizer), ) input_ids.append(tokens["input_ids"][0]) position_ids.append(tokens["position_ids"][0]) diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h index f1480b861856..82219aba6ebe 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h @@ -24,7 +24,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h index ec4e2966b60a..e9cc413b42dc 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h @@ -22,7 +22,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h index c6e476302086..419649092abe 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h @@ -26,7 +26,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h index fd6daa2f0d4a..f9427e4c0eca 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h index 973329c17b69..99eef25111d6 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h @@ -26,7 +26,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h index 39b73b89809a..75394d5a8ee2 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h @@ -24,7 +24,7 @@ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h index c41d645fda76..48c70553fd52 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h @@ -24,7 +24,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h index 159d4e9070ee..cf21beea10f0 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h @@ -24,7 +24,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h index 081fef05010d..c8213cb1dcad 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h index b87ccdc46885..0519df524010 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h @@ -24,7 +24,7 @@ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h index 5a4e2962cd74..43ccad5a23c7 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h @@ -24,7 +24,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h index 15750959fed1..1fe581d4879c 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h @@ -25,7 +25,7 @@ limitations under the License. */ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h index 6ca034f3c79c..071636b6029c 100644 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h +++ b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/allocator.h b/paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/allocator.h index 074ab2880919..3ed6a1577e89 100644 --- a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/allocator.h +++ b/paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/allocator.h @@ -29,7 +29,7 @@ #ifdef PADDLE_ON_INFERENCE #include "paddle/extension.h" #include "paddle_inference_api.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #else #include "paddle/extension.h" #endif diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 0f6c631c70d5..fa19373979a8 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -260,7 +260,7 @@ def __init__( self._memory_tracker.start() # Seed must be set before instantiating the model when using model - set_seed(args=self.args) + set_seed(seed=self.args.seed) if model is None: raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument") diff --git a/paddlenlp/trainer/trainer_utils.py b/paddlenlp/trainer/trainer_utils.py index e90c7eea2f67..eaa0a4046123 100644 --- a/paddlenlp/trainer/trainer_utils.py +++ b/paddlenlp/trainer/trainer_utils.py @@ -57,77 +57,65 @@ ] -def set_seed(seed: int = 1234, args=None): - # NOTE(shenliang03): For parameter init seed: +def set_seed(seed: int = 1234): + # NOTE: For parameter init seed: # seed: dp/mp_undistributed_paramter/sharding is same; others is different # For compute seed(dropout): # global seed: only mp group is same. 
# local seed: all groups are different - if args is None: - random.seed(seed) - np.random.seed(seed) - paddle.seed(seed) + hcg = fleet.get_hybrid_communicate_group() if hasattr(fleet.fleet, "_hcg") else None + if hcg is not None and paddle.distributed.get_world_size() > 1: + # obtain rank message of hybrid parallel - else: - hcg = fleet.get_hybrid_communicate_group() if hasattr(fleet.fleet, "_hcg") else None - if hcg is not None and paddle.distributed.get_world_size() > 1: - # obtain rank message of hybrid parallel - - mp_rank = hcg.get_model_parallel_rank() - mp_size = hcg.get_model_parallel_world_size() - - pp_rank = hcg.get_stage_id() - pp_size = hcg.get_pipe_parallel_world_size() + mp_rank = hcg.get_model_parallel_rank() + mp_size = hcg.get_model_parallel_world_size() - dp_rank = hcg.get_data_parallel_rank() - dp_size = hcg.get_data_parallel_world_size() + pp_rank = hcg.get_stage_id() + pp_size = hcg.get_pipe_parallel_world_size() - sharding_rank = hcg.get_sharding_parallel_rank() - # sharding_size = hcg.get_sharding_parallel_world_size() - else: - mp_rank, mp_size = 0, 1 - pp_rank, pp_size = 0, 1 - dp_rank, dp_size = 0, 1 - sharding_rank, _ = 0, 1 - - # NOTE: the commented seeds are set only for precision validation - # seed += 100 * pp_rank - random.seed(seed + 100 * pp_rank) - np.random.seed(seed + 100 * pp_rank) - - # seed = mp_rank + - # pp_rank * (mp_size) + - # dp_rank * (mp_size * pp_size) + - # sharding_rank * (mp_size * pp_size * dp_size) - # seed offset is order to avoid conflicts with the parameter initialization seed - - seed_offset = seed + 1024 + paddle.distributed.get_world_size() - global_seed = ( - seed_offset - + pp_rank * (mp_size) - + dp_rank * (mp_size * pp_size) - + sharding_rank * (mp_size * pp_size * dp_size) - ) + dp_rank = hcg.get_data_parallel_rank() + dp_size = hcg.get_data_parallel_world_size() - seed_offset += paddle.distributed.get_world_size() - local_seed = ( - seed_offset - + mp_rank - + pp_rank * (mp_size) - + dp_rank * (mp_size * pp_size) - + sharding_rank * (mp_size * pp_size * dp_size) - ) + sharding_rank = hcg.get_sharding_parallel_rank() + # sharding_size = hcg.get_sharding_parallel_world_size() + else: + mp_rank, mp_size = 0, 1 + pp_rank, pp_size = 0, 1 + dp_rank, dp_size = 0, 1 + sharding_rank, _ = 0, 1 + + # NOTE: the commented seeds are set only for precision validation + # seed += 100 * pp_rank + random.seed(seed + 100 * pp_rank) + np.random.seed(seed + 100 * pp_rank) + + seed_offset = seed + global_seed = ( + seed_offset + + pp_rank * (mp_size) + + dp_rank * (mp_size * pp_size) + + sharding_rank * (mp_size * pp_size * dp_size) + ) + + seed_offset += paddle.distributed.get_world_size() + local_seed = ( + seed_offset + + mp_rank + + pp_rank * (mp_size) + + dp_rank * (mp_size * pp_size) + + sharding_rank * (mp_size * pp_size * dp_size) + ) - tracker = get_rng_state_tracker() - if "global_seed" not in tracker.states_: - tracker.add("global_seed", global_seed) + tracker = get_rng_state_tracker() + if "global_seed" not in tracker.states_: + tracker.add("global_seed", global_seed) - if "local_seed" not in tracker.states_: - tracker.add("local_seed", local_seed) + if "local_seed" not in tracker.states_: + tracker.add("local_seed", local_seed) - paddle.seed(global_seed) + paddle.seed(global_seed) - logger.info("The global seed is set to {} and local seed is set to {}.".format(global_seed, local_seed)) + logger.info("The global seed is set to {} and local seed is set to {}.".format(global_seed, local_seed)) class ExplicitEnum(Enum): diff --git 
a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 975d67e7835b..d48b804ed143 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -1095,7 +1095,7 @@ def is_segment_parallel_supported(): pipeline.enable_send_recv_overlap = "enable_send_recv_overlap" in pipeline_parallel_config pipeline.accumulate_steps = self.gradient_accumulation_steps pipeline.micro_batch_size = self.per_device_train_batch_size - pipeline.schedule_mode = "1F1B" + pipeline.schedule_mode = self.pipeline_schedule_mode logger.info(f"PP configs:{strategy.pipeline}, use master_grad: {self.amp_master_grad}") diff --git a/paddlenlp/transformers/chatglm_v2/tokenizer.py b/paddlenlp/transformers/chatglm_v2/tokenizer.py index 8ede3ac23d96..bfe4ba48fab4 100644 --- a/paddlenlp/transformers/chatglm_v2/tokenizer.py +++ b/paddlenlp/transformers/chatglm_v2/tokenizer.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import os from typing import Dict, List, Optional, Union @@ -278,3 +279,13 @@ def _pad( encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input return encoded_inputs + + def encode_chat_inputs(self, conversations: List[List[str, str]]): + # encode system + result = super().encode_chat_inputs(conversations) + if "system" in result: + result["system"] = self.get_prefix_tokens() + result["system"] + else: + result["conversations"][0][0] = self.get_prefix_tokens() + result["conversations"][0][0] + + return result diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 26b5928132f7..31142b14c4e8 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -306,15 +306,16 @@ def _prepare_qkv(self, query, key, value, use_cache=False, past_key_value=None): return query_states, key_states, value_states, past_key_value def _flash_attention(self, q, k, v, attention_mask=None, output_attentions=False): - out, weights = flash_attention( - query=q, - key=k, - value=v, - dropout=self.config.attention_probs_dropout_prob, - causal=q.shape[1] != 1, - return_softmax=output_attentions, - training=self.training, - ) + with seed_guard_context("local_seed"): + out, weights = flash_attention( + query=q, + key=k, + value=v, + dropout=self.config.attention_probs_dropout_prob, + causal=q.shape[1] != 1, + return_softmax=output_attentions, + training=self.training, + ) # [bs, seq_len, num_head, head_dim] -> [bs, seq_len, num_head * head_dim] out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) return (out, weights) if output_attentions else out @@ -639,7 +640,11 @@ def forward( attention_weights = hidden_states[1] if output_attentions else None hidden_states = hidden_states[0] if (use_cache or output_attentions) else hidden_states - with seed_guard_context("global_seed"): + # Use a ternary operator for a more concise assignment of current_seed + current_seed = "local_seed" if self.config.sequence_parallel else "global_seed" + + # The 'with' block ensures the correct seed context is used + with seed_guard_context(current_seed): if self.config.use_fused_dropout_add: hidden_states = self.fused_dropout_add1(hidden_states, residual) else: @@ -654,7 +659,7 @@ def forward( # when sequence_parallel=True: # hidden_states => [bs * seq_len / n, embed_dim] - with 
seed_guard_context("global_seed"): + with seed_guard_context(current_seed): if not self.config.use_fused_dropout_add: hidden_states = residual + self.dropout2( self.linear2(self.activation(self.linear1(hidden_states), approximate=True)) @@ -730,7 +735,12 @@ def forward(self, input_ids, position_ids=None, inputs_embeddings=None): embeddings = paddle.reshape_(embeddings, [bs * seq_len, hidden_size]) # [bs * seq_len / n, dim] (n is mp parallelism) embeddings = ScatterOp.apply(embeddings) - embeddings = self.dropout(embeddings) + + # Use a ternary operator for a more concise assignment of current_seed + current_seed = "local_seed" if self.config.sequence_parallel else "global_seed" + # The 'with' block ensures the correct seed context is used + with seed_guard_context(current_seed): + embeddings = self.dropout(embeddings) return embeddings diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py index 1bda7d640538..9d9f44dc26ba 100644 --- a/paddlenlp/transformers/gpt/modeling_pp.py +++ b/paddlenlp/transformers/gpt/modeling_pp.py @@ -22,18 +22,15 @@ from paddle.distributed.fleet.utils import recompute from paddlenlp.transformers.model_utils import PipelinePretrainedModel -from paddlenlp.transformers.sequence_parallel_utils import ( - GatherOp, - mark_as_sequence_parallel_parameter, -) +from ..sequence_parallel_utils import mark_as_sequence_parallel_parameter from .modeling import ( GPTConfig, GPTDecoderLayer, GPTEmbeddings, + GPTLMHead, GPTPretrainedModel, GPTPretrainingCriterion, - parallel_matmul, ) __all__ = [ @@ -141,27 +138,13 @@ def forward(self, args): return hidden_states -class GPTLMHeadPipe(GPTEmbeddings): +class GPTLMHeadPipe(GPTLMHead): def __init__(self, config): super(GPTLMHeadPipe, self).__init__(config) @property def embedding_weight(self): - return get_attr(self.word_embeddings, "weight") - - def forward(self, output): - if self.config.sequence_parallel: - output = GatherOp.apply(output) - output = paddle.reshape_(output, [-1, self.config.seq_length, self.config.hidden_size]) - - tensor_parallel_output = False if self.config.tensor_parallel_degree > 1 else True - output = parallel_matmul( - output, - self.embedding_weight, - transpose_y=True, - tensor_parallel_output=tensor_parallel_output, - ) - return output + return get_attr(self, "weight") class GPTForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): @@ -198,7 +181,9 @@ def __init__( config.tensor_parallel_rank = tensor_parallel_rank self.add_sequential_layer( - SharedLayerDesc("gpt", GPTEmbeddingPipe, shared_weight_attr="embedding_weight", config=config), + SharedLayerDesc( + "gpt_shared_weight", GPTEmbeddingPipe, shared_weight_attr="embedding_weight", config=config + ), "gpt.embeddings", ) for i in range(config.num_hidden_layers): @@ -209,8 +194,8 @@ def __init__( self.add_sequential_layer(LayerDesc(LayerNormPipe, config=config), "gpt.decoder.norm") self.add_sequential_layer( - SharedLayerDesc("gpt", GPTLMHeadPipe, shared_weight_attr="embedding_weight", config=config), - "gpt.embeddings", + SharedLayerDesc("gpt_shared_weight", GPTLMHeadPipe, shared_weight_attr="embedding_weight", config=config), + "gpt.embeddings.word_embeddings", ) recompute_interval = 0 diff --git a/paddlenlp/transformers/llama/modeling_auto.py b/paddlenlp/transformers/llama/modeling_auto.py index 17efd5cb03ce..b52c35235753 100644 --- a/paddlenlp/transformers/llama/modeling_auto.py +++ b/paddlenlp/transformers/llama/modeling_auto.py @@ -51,6 +51,7 @@ LlamaNTKScalingRotaryEmbedding, LlamaRotaryEmbedding, 
_expand_2d_mask, + _make_causal_mask, apply_rotary_pos_emb, build_alibi_tensor, get_triangle_upper_mask, @@ -71,11 +72,17 @@ ] -def get_dist_attr(shard_specs, pp_idx=0): +def get_mesh(pp_idx=None): mesh = fleet.auto.get_mesh() + if pp_idx is None: + return mesh if "pp" in mesh.dim_names: mesh = mesh.get_mesh_with_dim("pp")[pp_idx] + return mesh + +def get_dist_attr(shard_specs, pp_idx=None): + mesh = get_mesh(pp_idx) new_spec = [] for spec in shard_specs: if not spec: @@ -89,22 +96,6 @@ def get_dist_attr(shard_specs, pp_idx=0): return mesh, new_spec -def _make_causal_mask(input_ids_shape, past_key_values_length): - """ - Make causal mask used for self-attention - """ - batch_size, target_length = input_ids_shape # target_length: seq_len - - mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) - - if past_key_values_length > 0: - # [tgt_len, tgt_len + past_len] - mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) - - # [bs, 1, tgt_len, tgt_len + past_len] - return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) - - def scaled_dot_product_attention( query_states, config, @@ -118,14 +109,13 @@ def scaled_dot_product_attention( _, kv_seq_len, _, _ = value_states.shape if config.use_flash_attention and flash_attention: - # Flash Attention now ignore attention mask - # Current Flash Attention doesn't support attn maskt # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] - if alibi is not None: - attention_mask = attention_mask.cast(alibi.dtype) + alibi + version = paddle.version.full_version if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") attn_output, attn_weights = flash_attention( query_states, key_states, @@ -134,6 +124,9 @@ def scaled_dot_product_attention( return_softmax=output_attentions, ) else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi attn_output = F.scaled_dot_product_attention( query_states, key_states, @@ -229,7 +222,6 @@ def __init__(self, config, ipp: Optional[int] = None): self.intermediate_size = config.intermediate_size self.fuse_attention_ffn = config.fuse_attention_ffn self.ipp = ipp - self.config = config if config.fuse_attention_ffn: self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) @@ -240,7 +232,7 @@ def __init__(self, config, ipp: Optional[int] = None): self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) def forward(self, x): - if self.config.fuse_attention_ffn: + if self.fuse_attention_ffn: fleet.auto.shard_tensor(self.gate_up_fused_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) else: fleet.auto.shard_tensor(self.gate_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) @@ -485,7 +477,9 @@ def forward( class LlamaDecoderLayerAuto(nn.Layer): - def __init__(self, config, layerwise_recompute: bool = False, ipp: Optional[int] = None): + def __init__( + self, config, layerwise_recompute: bool = False, ipp: Optional[int] = None, ichunk: Optional[int] = None + ): super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -499,6 +493,7 @@ def __init__(self, config, layerwise_recompute: bool = False, ipp: Optional[int] self.layerwise_recompute = layerwise_recompute self.recompute_granularity = 
config.recompute_granularity self.ipp = ipp + self.ichunk = ichunk def forward( self, @@ -707,18 +702,26 @@ def __init__(self, config: LlamaConfig): self.hidden_size, ) - def get_layer_ipp(layer_index): - mesh = fleet.auto.get_mesh() - if "pp" not in mesh.dim_names: - return None - else: - pp_degree = mesh.get_dim_size("pp") - layer_per_stage = math.ceil(config.num_hidden_layers / pp_degree) - return layer_index // layer_per_stage + mesh = get_mesh() + if "pp" not in mesh.dim_names: + pp_degree = 1 + else: + pp_degree = mesh.get_dim_size("pp") + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + assert config.num_hidden_layers % (pp_degree * virtual_pp_degree) == 0 + + num_layer_per_chunk = math.ceil(config.num_hidden_layers / pp_degree / virtual_pp_degree) + total_virtual_chunks = config.num_hidden_layers // virtual_pp_degree + self.layer_to_ipp = [(i // num_layer_per_chunk) % pp_degree for i in range(config.num_hidden_layers)] + self.layer_to_ichunk = [ + (i // total_virtual_chunks) % virtual_pp_degree for i in range(config.num_hidden_layers) + ] self.layers = nn.LayerList( [ - LlamaDecoderLayerAuto(config, i not in self.no_recompute_layers, get_layer_ipp(i)) + LlamaDecoderLayerAuto( + config, i not in self.no_recompute_layers, self.layer_to_ipp[i], self.layer_to_ichunk[i] + ) for i in range(config.num_hidden_layers) ] ) @@ -744,7 +747,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values input_shape, past_key_values_length=past_key_values_length ) # NOTE(zhaoyingli): infer spmd does not support [seq_len, seq_len] --> [batch, 1, seq_len, seq_len] in data_parallel - fleet.auto.shard_tensor(combined_attention_mask, *get_dist_attr([None, None, None, None])) + fleet.auto.shard_tensor(combined_attention_mask, get_mesh(), [None, None, None, None]) expanded_attn_mask = expanded_attn_mask & combined_attention_mask # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] elif len(attention_mask.shape) == 3: @@ -779,6 +782,10 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # NOTE(zhaoyingli): temprorary method to guarantee the later ops are placed all ranks until meeting new annotaion. + full = fleet.auto.shard_op(paddle.full, get_mesh(), chunk_id=0) + full(shape=[1], fill_value=0) + # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") @@ -798,9 +805,14 @@ def forward( cache_length = paddle.shape(past_key_values[0][0])[1] seq_length_with_past += cache_length - fleet.auto.shard_tensor(self.embed_tokens.weight, *get_dist_attr(["mp", None], 0)) if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) + fleet.auto.shard_tensor(self.embed_tokens.weight, *get_dist_attr(["mp", None], 0)) + embed_tokens = fleet.auto.shard_op(self.embed_tokens, get_mesh(0), chunk_id=0) + inputs_embeds = embed_tokens(input_ids) + + # NOTE(zhaoyingli): temprorary method to guarantee the later ops are placed all ranks until meeting new annotaion. 
+ full = fleet.auto.shard_op(paddle.full, get_mesh(), chunk_id=0) + full(shape=[1], fill_value=0) # embed positions if attention_mask is None: @@ -815,8 +827,9 @@ def forward( if position_ids is None: position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) - # NOTE(zhaoyingli): infer spmd does not support [seq_len] --> [batch, seq_len] in data_parallel - fleet.auto.shard_tensor(position_ids, *get_dist_attr([None, None])) + # NOTE(zhaoyingli): + # 1. infer spmd does not support [seq_len] --> [batch, seq_len] in data_parallel + fleet.auto.shard_tensor(position_ids, get_mesh(), [None, None]) attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype @@ -832,12 +845,16 @@ def forward( next_decoder_cache = () if use_cache else None for idx, (decoder_layer) in enumerate(self.layers): + ipp = decoder_layer.ipp + ichunk = decoder_layer.ichunk + fleet.auto.shard_tensor(hidden_states, *get_dist_attr(["dp", None, None], ipp)) + decoder_layer = fleet.auto.shard_op(decoder_layer, get_mesh(ipp), chunk_id=ichunk) + if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None has_gradient = not hidden_states.stop_gradient - fleet.auto.shard_tensor(hidden_states, *get_dist_attr(["dp", None, None], decoder_layer.ipp)) if ( self.enable_recompute and idx not in self.no_recompute_layers @@ -875,6 +892,26 @@ def forward( if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + return fleet.auto.shard_op(self._post_output, get_mesh(-1), chunk_id=self.layer_to_ichunk[-1])( + hidden_states, + output_hidden_states, + next_decoder_cache, + all_self_attns, + all_hidden_states, + use_cache, + return_dict, + ) + + def _post_output( + self, + hidden_states, + output_hidden_states, + next_decoder_cache, + all_self_attns, + all_hidden_states, + use_cache, + return_dict, + ): hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer @@ -1078,13 +1115,15 @@ def forward( self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 ) - logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + lm_head = fleet.auto.shard_op(self.lm_head, get_mesh(-1), chunk_id=self.llama.layer_to_ichunk[-1]) + logits = lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) loss = None if labels is not None: labels.stop_gradient = True fleet.auto.shard_tensor(labels, *get_dist_attr(["dp", None], -1)) - loss = self.criterion(logits, labels) + criterion = fleet.auto.shard_op(self.criterion, get_mesh(-1), chunk_id=self.llama.layer_to_ichunk[-1]) + loss = criterion(logits, labels) if not return_dict: output = (logits,) + outputs[1:] diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 0c1b9fb413ee..187ebbc3e378 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -42,6 +42,7 @@ ) from huggingface_hub.utils import EntryNotFoundError from paddle import Tensor +from paddle.distributed.fleet.meta_parallel.parallel_layers import SharedLayerDesc from paddle.nn import Embedding, Layer # TODO(fangzeyang) Temporary fix and replace by paddle framework downloader later @@ -1416,7 +1417,6 @@ def _resolve_model_file_path( use_safetensors: bool | None = None, variant=None, ) -> str: - """resolve model target file path from `` and `cache_dir` 1. 
when it is file path: @@ -2399,7 +2399,7 @@ def add_sequential_layer(self, layer_desc, name_prefix=""): def get_sequential_layers(self): return [x["layer"] for x in self._sequential_layers] - def get_sequential_name_prefixs(self): + def get_sequential_name_prefixes(self): return {str(index): x["name_prefix"] for index, x in enumerate(self._sequential_layers)} def _set_pipeline_name_mapping(self, mappings=None): @@ -2420,43 +2420,35 @@ def _set_pipeline_name_mapping(self, mappings=None): # else it will be like 0.xxx use_virtual_pp_degree = first_key[0].isdigit() and first_key[1].isdigit() - prefixs = self.get_sequential_name_prefixs() + prefixes = self.get_sequential_name_prefixes() for k in state_dict_keys: name_splited = k.split(".") if use_virtual_pp_degree: if name_splited[0].isdigit(): if name_splited[1].isdigit(): idx = str(int(name_splited[0]) + int(name_splited[1])) - single_name = [prefixs[idx]] + single_name = [prefixes[idx]] single_name.extend(name_splited[2:]) else: - single_name = [prefixs[str(len(prefixs) - 1)]] + single_name = [prefixes[str(len(prefixes) - 1)]] single_name.extend(name_splited[2:]) logger.warning( f"Please check! we treat this key as last layer, get {k}, set origin name as {'.'.join(single_name)}" ) elif name_splited[0] == "shared_layers": - # TODO: it treat shared_layers as first layer - single_name = [prefixs["0"]] + single_name = [self.get_shardlayer_prefix(name_splited)] single_name.extend(name_splited[2:]) - logger.warning( - f"Please check! we treat shared_layers as first layer, get {k}, set origin name as {'.'.join(single_name)}" - ) else: raise ValueError(f"Unexpected key: {k} for pp layer.") else: idx = name_splited[0] # for normal pp layer if idx.isdigit(): - single_name = [prefixs[idx]] + single_name = [prefixes[idx]] single_name.extend(name_splited[1:]) elif idx == "shared_layers": - # TODO: it treat shared_layers as first layer - single_name = [prefixs["0"]] + single_name = [self.get_shardlayer_prefix(name_splited)] single_name.extend(name_splited[2:]) - logger.warning( - f"Please check! we treat shared_layers as first layer, get {k}, set origin name as {'.'.join(single_name)}" - ) else: raise ValueError(f"Unexpected key: {k} for pp layer.") @@ -2468,6 +2460,34 @@ def _set_pipeline_name_mapping(self, mappings=None): return self._single_to_pp_mapping + def get_shardlayer_prefix(self, name_splited): + """_summary_ + This function retrieves the prefix of a shared layer. The process involves: + 1. Identifying all key names of shared layers, like 'shared_weight01', 'shared_weight02', etc. + 2. For instance, given name_splited = ['shared_layers', 'shared_weight01', 'weight'], + the 'shared_layer_key' would be name_splited[1], which is 'shared_weight01'. + 3. By traversing through all layers, the function checks if the specified + shared_layer is present in the current stage. If found, it returns the corresponding prefix. + + Note: For retrieving all SharedLayer instances in Paddle, you can refer to the following Paddle code. + https://github.com/PaddlePaddle/Paddle/blob/2cf724d055679a1a0e48766dfb1708b920273078/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py#L460-L513 + Args: + name_splited (_type_): _description_ + + Returns: + _type_: _description_ + """ + shared_layer_names = {s.layer_name for s in self._layers_desc if isinstance(s, SharedLayerDesc)} + assert name_splited[1] in shared_layer_names, f"The shared layer name {name_splited[1]} must be in prefixes!" 
+ shared_layer_key = name_splited[1] + for idx, layer in enumerate(self._layers_desc): + if isinstance(layer, SharedLayerDesc) and layer.layer_name == shared_layer_key: + if self.get_stage_from_index(idx) == self._stage_id: + return self.get_sequential_name_prefixes()[str(idx)] + + # the prefix must be in the current stage, else raise error + raise ValueError(f"The shared layer {shared_layer_key} must be in the current stage!") + def state_dict(self, *args, **kwargs): state_dict = super().state_dict(*args, **kwargs) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index da8f9430d218..c8ef61d3fffd 100644 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -49,6 +49,7 @@ function llama_case_list_auto() { llama_auto_recompute_bs16_fp32_DP2-MP1-PP1 llama_auto_recompute_bs16_fp32_DP2-MP2-PP1 llama_auto_recompute_bs16_fp32_DP2-MP2-PP2 + llama_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2 } function case_list_auto_pir() { @@ -837,7 +838,6 @@ function llama_auto_recompute_bs8_fp32_DP1-MP1-PP1() { echo "=========== $FUNCNAME run begin ===========" export PYTHONPATH=$root_path/:$PYTHONPATH export FLAGS_call_stack_level=2 - export SOT_LOG_LEVEL=4 task_name="llama_auto_bs8_dp1mp1pp1" case_out_dir="output/$task_name" @@ -878,7 +878,6 @@ function llama_auto_recompute_bs8_fp32_DP1-MP1-PP1() { --max_grad_norm 1.0 \ --logging_steps 1 \ --dataloader_num_workers 1 \ - --sharding "" \ --eval_steps 1000 \ --report_to "visualdl" \ --disable_tqdm true \ @@ -905,7 +904,6 @@ function llama_auto_recompute_bs16_fp32_DP2-MP1-PP1() { echo "=========== $FUNCNAME run begin ===========" export PYTHONPATH=$root_path/:$PYTHONPATH export FLAGS_call_stack_level=2 - export SOT_LOG_LEVEL=4 task_name="llama_auto_bs16_dp2mp1pp1" case_out_dir="output/$task_name" @@ -946,7 +944,6 @@ function llama_auto_recompute_bs16_fp32_DP2-MP1-PP1() { --max_grad_norm 1.0 \ --logging_steps 1 \ --dataloader_num_workers 1 \ - --sharding "" \ --eval_steps 1000 \ --report_to "visualdl" \ --disable_tqdm true \ @@ -973,7 +970,6 @@ function llama_auto_recompute_bs16_fp32_DP2-MP2-PP1() { echo "=========== $FUNCNAME run begin ===========" export PYTHONPATH=$root_path/:$PYTHONPATH export FLAGS_call_stack_level=2 - export SOT_LOG_LEVEL=4 task_name="llama_auto_bs16_dp2mp2pp1" case_out_dir="output/$task_name" @@ -1014,7 +1010,6 @@ function llama_auto_recompute_bs16_fp32_DP2-MP2-PP1() { --max_grad_norm 1.0 \ --logging_steps 1 \ --dataloader_num_workers 1 \ - --sharding "" \ --eval_steps 1000 \ --report_to "visualdl" \ --disable_tqdm true \ @@ -1041,9 +1036,8 @@ function llama_auto_recompute_bs16_fp32_DP2-MP2-PP2() { echo "=========== $FUNCNAME run begin ===========" export PYTHONPATH=$root_path/:$PYTHONPATH export FLAGS_call_stack_level=2 - export SOT_LOG_LEVEL=4 - task_name="llama_auto_bs16_dp2mp2pp1" + task_name="llama_auto_bs16_dp2mp2pp2" case_out_dir="output/$task_name" case_log_dir="output/$task_name""_log" rm -rf $case_out_dir @@ -1082,7 +1076,6 @@ function llama_auto_recompute_bs16_fp32_DP2-MP2-PP2() { --max_grad_norm 1.0 \ --logging_steps 1 \ --dataloader_num_workers 1 \ - --sharding "" \ --eval_steps 1000 \ --report_to "visualdl" \ --disable_tqdm true \ @@ -1105,6 +1098,73 @@ function llama_auto_recompute_bs16_fp32_DP2-MP2-PP2() { echo "=========== $FUNCNAME run end ===========" } +function llama_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2() { + echo "=========== $FUNCNAME run begin ===========" + export PYTHONPATH=$root_path/:$PYTHONPATH + export 
FLAGS_call_stack_level=2 + + task_name="llama_auto_bs16_dp2mp2pp2vpp2sharding2" + case_out_dir="output/$task_name" + case_log_dir="output/$task_name""_log" + rm -rf $case_out_dir + rm -rf $case_log_dir + + python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \ + --model_type "llama" \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --hidden_size 1024 \ + --intermediate_size 3072 \ + --num_hidden_layers 8 \ + --num_attention_heads 32 \ + --input_dir "./data" \ + --output_dir $case_out_dir \ + --split 949,50,1 \ + --max_seq_length 2048 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 8 \ + --gradient_accumulation_steps 8 \ + --use_flash_attention 0 \ + --use_fused_rms_norm 0 \ + --fp16 0 \ + --fp16_opt_level "O2" \ + --scale_loss 1024 \ + --tensor_parallel_degree 2 \ + --pipeline_parallel_degree 2 \ + --virtual_pp_degree 2 \ + --pipeline_schedule_mode "VPP" \ + --sharding_parallel_degree 2 \ + --sharding "stage2" \ + --learning_rate 0.0001 \ + --min_learning_rate 0.00001 \ + --max_steps 10 \ + --save_steps 5000 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --max_grad_norm 1.0 \ + --logging_steps 1 \ + --dataloader_num_workers 1 \ + --eval_steps 1000 \ + --report_to "visualdl" \ + --disable_tqdm true \ + --continue_training 0 \ + --recompute 1 \ + --do_train \ + --do_eval \ + --device "gpu" \ + --data_impl "mmap" \ + --parallel_mode "auto" \ + >>${log_path}/$FUNCNAME 2>&1 + loss=`cat $case_log_dir/workerlog.3 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=-1 + mem=-1 + echo "result: loss=$loss ips=$ips mem=$mem" + loss_base=9.53802204 + ips_base=-1 + mem_base=-1 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} ############ case end ############ function check_result() { diff --git a/tests/test_tipc/llm/inference/run_predictor.sh b/tests/test_tipc/llm/inference/run_predictor.sh index 4d2d213c501a..5373fe2a6cea 100644 --- a/tests/test_tipc/llm/inference/run_predictor.sh +++ b/tests/test_tipc/llm/inference/run_predictor.sh @@ -28,7 +28,7 @@ data_file=${data_file:-"tests/fixtures/llm/zh_query.json"} benchmark=${benchmark:-"0"} common_arguments="--decode_strategy ${decode_strategy} --src_length 300 --max_length 200 --benchmark ${benchmark} --dtype ${dtype} --batch_size 3 --inference_model ${inference_model} " -common_arguments+="--data_file ${data_file} --top_p ${top_p}" +common_arguments+="--data_file ${data_file} --top_p ${top_p} --chat_template none" echo "pwd -> " diff --git a/tests/test_tipc/llm/inference/run_predictor_precaches.sh b/tests/test_tipc/llm/inference/run_predictor_precaches.sh index 40d70f298217..705cf00faae7 100644 --- a/tests/test_tipc/llm/inference/run_predictor_precaches.sh +++ b/tests/test_tipc/llm/inference/run_predictor_precaches.sh @@ -28,7 +28,7 @@ common_arguments="--decode_strategy ${decode_strategy} --src_length 300 --max_le if [ $fused_model ]; then common_arguments+=" --inference_model " fi -common_arguments+="--data_file ${data_file} --top_p ${top_p} --benchmark ${benchmark}" +common_arguments+="--data_file ${data_file} --top_p ${top_p} --benchmark ${benchmark} --chat_template none" cd .. echo "precache ing"
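
Usage note on the `set_seed` signature change in this patch: callers that previously passed the full training arguments object (`set_seed(args=training_args)`) now pass the integer seed directly, as updated in `llm/gpt-3/finetune_generation.py`, `llm/llama/benchmark.py`, and `paddlenlp/trainer/trainer.py`. Below is a minimal sketch of the new call pattern, not part of the patch; the import path follows the definition in `paddlenlp/trainer/trainer_utils.py` shown above, and the behavior described in the comments is read off the refactored function, so treat it as illustrative rather than authoritative.

```python
# Sketch of the updated call pattern, assuming the refactored
# set_seed(seed: int = 1234) defined in paddlenlp/trainer/trainer_utils.py.
from paddlenlp.trainer.trainer_utils import set_seed

# Old: set_seed(args=training_args)
# New: pass the integer seed directly.
set_seed(seed=1234)

# Without a hybrid-parallel communicate group, every rank offset is zero,
# so this seeds Python's `random`, NumPy, and Paddle's global generator.
# With hybrid parallel (mp/pp/dp/sharding), the same call derives
# rank-dependent global/local seeds and adds them to Paddle's RNG state
# tracker, following the logic shown in the diff above.
```

The same pattern applies inside the trainer itself: `paddlenlp/trainer/trainer.py` now calls `set_seed(seed=self.args.seed)` before the model is instantiated, so parameter initialization stays reproducible across the refactor.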