diff --git a/applications/DeepSpeed-Chat/dschat/__init__.py b/applications/DeepSpeed-Chat/dschat/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/__init__.py b/applications/DeepSpeed-Chat/dschat/rlhf/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/applications/DeepSpeed-Chat/dschat/utils/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/data/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py
index 0cf1c28ab..e89f701cc 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py
@@ -22,7 +22,10 @@ def get_train_ds_config(offload,
                         enable_tensorboard=False,
                         enable_mixed_precision_lora=False,
                         tb_path="",
-                        tb_name=""):
+                        tb_name="",
+                        offload_optimizer_config=None,
+                        offload_param_config=None,
+                        aio_config=None):

     device = "cpu" if offload else "none"
     if dtype == "fp16":
@@ -45,12 +48,16 @@ def get_train_ds_config(offload,
         "stage3_prefetch_bucket_size": 3e7,
         "memory_efficient_linear": False
     }
+    if offload_optimizer_config:
+        zero_opt_dict["offload_optimizer"].update(offload_optimizer_config)
+    if offload_param_config:
+        zero_opt_dict["offload_param"].update(offload_param_config)
     if enable_mixed_precision_lora:
         zero_opt_dict["zero_quantized_nontrainable_weights"] = True
         if dist.get_world_size() != get_accelerator().device_count():
             zero_opt_dict["zero_hpz_partition_size"] = get_accelerator(
             ).device_count()
-    return {
+    config = {
         "train_batch_size": GLOBAL_BATCH_SIZE,
         "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
         "steps_per_print": 10,
@@ -73,6 +80,9 @@ def get_train_ds_config(offload,
             "job_name": f"{tb_name}_tensorboard"
         }
     }
+    if aio_config:
+        config["aio"] = aio_config
+    return config


 def get_eval_ds_config(offload, dtype, stage=0):
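Taken together, the ds_utils.py changes let callers patch individual ZeRO offload fields without rebuilding the whole `zero_optimization` section, and attach an `aio` block verbatim. Below is a minimal sketch (not part of the patch) of how the new hooks compose, assuming the `dschat` package from this PR and DeepSpeed are importable; the values mirror `run_llama2_7b_lora.sh` at the end of this diff.

```python
# Sketch: composing a ZeRO-3 NVMe-offload config via the new override hooks.
from dschat.utils.ds_utils import get_train_ds_config

ds_config = get_train_ds_config(
    offload=True,  # base offload device is "cpu"; the overrides switch it to NVMe
    dtype="bf16",
    stage=3,
    offload_optimizer_config={
        "device": "nvme",
        "nvme_path": "/mnt/nvme_deepspeed",
        "pin_memory": True,
        "ratio": 0.3,
        "buffer_count": 8,
        "fast_init": False,
    },
    offload_param_config={
        "device": "nvme",
        "nvme_path": "/mnt/nvme_deepspeed",
        "pin_memory": True,
        "buffer_size": 134217728,  # raise this if the embedding layer is larger
        "buffer_count": 32,
        "max_in_cpu": 0,
    },
    aio_config={
        "block_size": 1048576,
        "queue_depth": 8,
        "intra_op_parallelism": 1,
        "single_submit": False,
        "overlap_events": True,
        "use_gds": True,  # requires GPUDirect Storage support
    },
)

# The overrides are merged into the existing offload sections, and the
# "aio" block is attached at the top level of the config.
assert ds_config["zero_optimization"]["offload_optimizer"]["device"] == "nvme"
assert ds_config["zero_optimization"]["offload_param"]["nvme_path"] == "/mnt/nvme_deepspeed"
assert ds_config["aio"]["queue_depth"] == 8
```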
diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/model/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/applications/DeepSpeed-Chat/dschat/utils/module/__init__.py b/applications/DeepSpeed-Chat/dschat/utils/module/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/applications/DeepSpeed-Chat/dschat/utils/utils.py b/applications/DeepSpeed-Chat/dschat/utils/utils.py
index e4dc7d036..167234d06 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/utils.py
@@ -74,8 +74,7 @@ def get(self):

 def get_tokenizer(model_name_or_path, fast_tokenizer=True):
     if "llama" in model_name_or_path:
-        from transformers.models.llama import LlamaTokenizer
-        tokenizer = LlamaTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
             model_name_or_path, fast_tokenizer=fast_tokenizer)
         if tokenizer.pad_token is None:
             # assert tokenizer.eos_token is not None
@@ -94,16 +93,13 @@
 def load_hf_tokenizer(model_name_or_path,
                       fast_tokenizer=True,
                       add_special_tokens=None):
-    if os.path.exists(model_name_or_path):
-        # Locally tokenizer loading has some issue, so we need to force download
-        model_json = os.path.join(model_name_or_path, "config.json")
-        if os.path.exists(model_json):
-            model_json_file = json.load(open(model_json))
-            model_name = model_json_file.get("_name_or_path",
-                                             model_name_or_path)
-            tokenizer = get_tokenizer(model_name,
-                                      fast_tokenizer=fast_tokenizer)
+    # Load directly from a local checkpoint directory;
+    # os.path.isdir already implies the path exists.
+    if os.path.isdir(model_name_or_path):
+        tokenizer = get_tokenizer(model_name_or_path,
+                                  fast_tokenizer=fast_tokenizer)
     else:
+        # Otherwise resolve the name via the HuggingFace Hub
         tokenizer = get_tokenizer(model_name_or_path,
                                   fast_tokenizer=fast_tokenizer)

diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
index aa505a25d..44d99b76d 100755
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
@@ -5,6 +5,7 @@
 # DeepSpeed Team
 import argparse
 import math
+from pprint import pformat

 import torch
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
@@ -29,6 +30,18 @@
 from dschat.utils.perf import print_throughput


+def str2bool(value):
+    if isinstance(value, bool):
+        return value
+    lowered = value.lower()
+    if lowered in ("yes", "true", "t", "1"):
+        return True
+    if lowered in ("no", "false", "f", "0"):
+        return False
+    raise argparse.ArgumentTypeError(
+        f"Boolean value expected, got `{value}`.")
+
+
 def parse_args():
     parser = argparse.ArgumentParser(
         description=
@@ -145,6 +158,80 @@ def parse_args():
     parser.add_argument('--offload',
                         action='store_true',
                         help='Enable ZeRO Offload techniques.')
+    parser.add_argument('--offload_optimizer_device',
+                        type=str,
+                        choices=['cpu', 'nvme'],
+                        default=None,
+                        help='Device to use for ZeRO optimizer state offload.')
+    parser.add_argument('--offload_optimizer_nvme_path',
+                        type=str,
+                        default=None,
+                        help='NVMe path used when offloading optimizer states to nvme.')
+    parser.add_argument('--offload_optimizer_pin_memory',
+                        type=str2bool,
+                        default=None,
+                        help='Whether to pin optimizer offload memory (true|false).')
+    parser.add_argument('--offload_optimizer_ratio',
+                        type=float,
+                        default=None,
+                        help='Fraction of optimizer states offloaded to CPU Adam (ZeRO stage 3 only).')
+    parser.add_argument('--offload_optimizer_buffer_count',
+                        type=int,
+                        default=None,
+                        help='Number of optimizer offload buffers.')
+    parser.add_argument('--offload_optimizer_fast_init',
+                        type=str2bool,
+                        default=None,
+                        help='Enable fast optimizer initialization when offloading to NVMe (true|false).')
+    parser.add_argument('--offload_param_device',
+                        type=str,
+                        choices=['cpu', 'nvme'],
+                        default=None,
+                        help='Device to use for ZeRO parameter offload.')
+    parser.add_argument('--offload_param_nvme_path',
+                        type=str,
+                        default=None,
+                        help='NVMe path used when offloading parameters to nvme.')
+    parser.add_argument('--offload_param_pin_memory',
+                        type=str2bool,
+                        default=None,
+                        help='Whether to pin parameter offload memory (true|false).')
+    parser.add_argument('--offload_param_buffer_size',
+                        type=int,
+                        default=None,
+                        help='Parameter offload buffer size (number of elements). Increase if the embedding layer is larger than the default.')
+    parser.add_argument('--offload_param_buffer_count',
+                        type=int,
+                        default=None,
+                        help='Number of parameter offload buffers.')
+    parser.add_argument('--offload_param_max_in_cpu',
+                        type=float,
+                        default=None,
+                        help='Number of parameter elements to maintain in CPU memory when offloading to NVMe.')
+    parser.add_argument('--aio_block_size',
+                        type=int,
+                        default=1048576,
+                        help='AIO block size for NVMe offload (bytes).')
+    parser.add_argument('--aio_queue_depth',
+                        type=int,
+                        default=8,
+                        help='AIO queue depth for NVMe offload.')
+    parser.add_argument('--aio_intra_op_parallelism',
+                        type=int,
+                        default=1,
+                        help='AIO intra_op_parallelism for NVMe offload.')
+    parser.add_argument('--aio_single_submit',
+                        type=str2bool,
+                        default=False,
+                        help='AIO single_submit flag.')
+    parser.add_argument('--aio_overlap_events',
+                        type=str2bool,
+                        default=True,
+                        help='AIO overlap_events flag.')
+    parser.add_argument('--aio_use_gds',
+                        type=str2bool,
+                        default=False,
+                        help='AIO use_gds flag.')
     parser.add_argument('--dtype',
                         type=str,
                         default='fp16',
@@ -222,18 +309,64 @@ def main():

     args.global_rank = torch.distributed.get_rank()

+    offload_optimizer_overrides = {
+        "device": args.offload_optimizer_device,
+        "nvme_path": args.offload_optimizer_nvme_path,
+        "pin_memory": args.offload_optimizer_pin_memory,
+        "ratio": args.offload_optimizer_ratio,
+        "buffer_count": args.offload_optimizer_buffer_count,
+        "fast_init": args.offload_optimizer_fast_init
+    }
+    offload_optimizer_overrides = {
+        key: value
+        for key, value in offload_optimizer_overrides.items()
+        if value is not None
+    }
+    offload_param_overrides = {
+        "device": args.offload_param_device,
+        "nvme_path": args.offload_param_nvme_path,
+        "pin_memory": args.offload_param_pin_memory,
+        "buffer_size": args.offload_param_buffer_size,
+        "buffer_count": args.offload_param_buffer_count,
+        "max_in_cpu": args.offload_param_max_in_cpu
+    }
+    offload_param_overrides = {
+        key: value
+        for key, value in offload_param_overrides.items()
+        if value is not None
+    }
+    aio_config = {
+        "block_size": args.aio_block_size,
+        "queue_depth": args.aio_queue_depth,
+        "intra_op_parallelism": args.aio_intra_op_parallelism,
+        "single_submit": args.aio_single_submit,
+        "overlap_events": args.aio_overlap_events,
+        "use_gds": args.aio_use_gds,
+    }
     ds_config = get_train_ds_config(offload=args.offload,
                                     dtype=args.dtype,
                                     stage=args.zero_stage,
                                     enable_tensorboard=args.enable_tensorboard,
                                     tb_path=args.tensorboard_path,
-                                    tb_name="step1_model")
+                                    tb_name="step1_model",
+                                    offload_optimizer_config=(
+                                        offload_optimizer_overrides
+                                        if offload_optimizer_overrides else None),
+                                    offload_param_config=(
+                                        offload_param_overrides
+                                        if offload_param_overrides else None),
+                                    aio_config=aio_config)
     ds_config[
         'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size
     ds_config[
         'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size(
         ) * args.gradient_accumulation_steps
+
+    # ds_config is fully assembled at this point, so log the effective configuration.
+    print_rank_0("***** DeepSpeed User Provided config *****", args.global_rank)
+    print_rank_0(pformat(ds_config), args.global_rank)
+
     # If passed along, set the training seed now.
     set_random_seed(args.seed)
@@ -245,6 +378,9 @@ def main():
                                       fast_tokenizer=True,
                                       add_special_tokens=additional_special_tokens)

+    print_rank_0("***** Tokenizer *****", args.global_rank)
+    print_rank_0(tokenizer, args.global_rank)
+
     model = create_hf_model(AutoModelForCausalLM,
                             args.model_name_or_path,
                             tokenizer,
@@ -264,6 +400,10 @@ def main():
         model = only_optimize_lora_parameters(model)
         model = make_model_gradient_checkpointing_compatible(model)

+    # Print full model architecture (rank 0 only to avoid log spam)
+    print_rank_0("***** Model architecture *****", args.global_rank)
+    print_rank_0(model, args.global_rank)
+
     # Prepare the data
     train_phase = 1
     train_dataset, eval_dataset = create_prompt_dataset(
@@ -319,6 +459,7 @@ def evaluation(model, eval_dataloader):
         model, args.weight_decay, args.lora_learning_rate)

     AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam
+    print_rank_0(f"offload: {args.offload}", args.global_rank)
     optimizer = AdamOptimizer(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               betas=(0.9, 0.95))
@@ -348,8 +489,9 @@ def evaluation(model, eval_dataloader):
     print_rank_0(
         f"***** Evaluating perplexity, Epoch {0}/{args.num_train_epochs} *****",
         args.global_rank)
-    perplexity, eval_loss = evaluation(model, eval_dataloader)
-    print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", args.global_rank)
+    print_rank_0("Skipping initial evaluation", args.global_rank)
+    # perplexity, eval_loss = evaluation(model, eval_dataloader)
+    # print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", args.global_rank)

     for epoch in range(args.num_train_epochs):
         print_rank_0(
@@ -372,6 +514,11 @@ def evaluation(model, eval_dataloader):
             if torch.distributed.get_rank() == 0:
                 print_throughput(model.model, args, end - start,
                                  args.global_rank)
+
+            # Early exit for debugging: stop after 20 training steps
+            if step > 20:
+                return 0
+

     # Evaluate perplexity on the validation set.
     print_rank_0(
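Before pointing the launch script below at a real NVMe mount, it can be worth a quick preflight that the DeepSpeed ops backing `--offload_*_device nvme` and `--aio_use_gds true` are actually buildable on the machine. A sketch, not part of the patch: `AsyncIOBuilder` and `GDSBuilder` are DeepSpeed's standard op builders, while the mount path and messages here are illustrative.

```python
# Preflight sketch for the NVMe/GDS offload run configured below.
import os

from deepspeed.ops.op_builder import AsyncIOBuilder

nvme_path = "/mnt/nvme_deepspeed"  # same mount as --offload_*_nvme_path below
assert os.path.isdir(nvme_path) and os.access(nvme_path, os.W_OK), \
    f"{nvme_path} must exist and be writable for NVMe offload"

# The async_io op backs ZeRO-Infinity NVMe offload and needs libaio to build.
assert AsyncIOBuilder().is_compatible(), \
    "async_io op unavailable: install libaio-dev and rebuild/JIT-build DeepSpeed"

# --aio_use_gds true additionally needs the GDS op (recent DeepSpeed only).
try:
    from deepspeed.ops.op_builder import GDSBuilder
    assert GDSBuilder().is_compatible(), \
        "gds op unavailable: install NVIDIA GPUDirect Storage or pass --aio_use_gds false"
except ImportError:
    print("This DeepSpeed build predates GDS support; use --aio_use_gds false")

print("NVMe offload preflight passed")
```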
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
index 7689266ee..6f69b5e9c 100755
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
@@ -13,24 +13,40 @@ if [ "$ZERO_STAGE" == "" ]; then
 fi
 mkdir -p $OUTPUT

-deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path meta-llama/Llama-2-7b-hf \
-   --per_device_train_batch_size 4 \
-   --per_device_eval_batch_size 4 \
-   --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 4 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --gradient_checkpointing \
-   --zero_stage $ZERO_STAGE \
-   --deepspeed \
-   --lora_dim 128 \
-   --lora_module_name "layers." \
-   --output_dir $OUTPUT \
+CUDA_VISIBLE_DEVICES=0 deepspeed --master_port=29600 main.py \
+   --offload \
+   --offload_optimizer_device nvme \
+   --offload_optimizer_nvme_path /mnt/nvme_deepspeed \
+   --offload_optimizer_pin_memory true \
+   --offload_optimizer_ratio 0.3 \
+   --offload_optimizer_buffer_count 8 \
+   --offload_optimizer_fast_init false \
+   --offload_param_device nvme \
+   --offload_param_nvme_path /mnt/nvme_deepspeed \
+   --offload_param_pin_memory true \
+   --offload_param_buffer_size 134217728 \
+   --offload_param_buffer_count 32 \
+   --offload_param_max_in_cpu 0 \
+   --aio_use_gds true \
+   --dtype bf16 \
+   --data_path Dahoas/rm-static \
+   --data_split 2,4,4 \
+   --model_name_or_path meta-llama/Llama-2-7b-hf \
+   --per_device_train_batch_size 4 \
+   --per_device_eval_batch_size 4 \
+   --max_seq_len 512 \
+   --learning_rate 9.65e-6 \
+   --weight_decay 0. \
+   --num_train_epochs 4 \
+   --gradient_accumulation_steps 1 \
+   --lr_scheduler_type cosine \
+   --num_warmup_steps 0 \
+   --seed 1234 \
+   --gradient_checkpointing \
+   --zero_stage $ZERO_STAGE \
+   --deepspeed \
+   --lora_dim 128 \
+   --lora_module_name "layers." \
+   --data_output_path /tmp/data_files2 \
+   --output_dir $OUTPUT \
    &> $OUTPUT/training.log
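Two operational notes on the revised launch script. First, recent DeepSpeed launchers translate `CUDA_VISIBLE_DEVICES=0` into `--include localhost:0`, so the job runs on a single GPU even on multi-GPU hosts, with all console output redirected to `$OUTPUT/training.log`. Second, the main.py changes above skip the initial perplexity evaluation and return after roughly 20 steps; both debug shortcuts should be reverted before a full training run.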