From d14e267300ef19bbc1475070ffdcce623491f95d Mon Sep 17 00:00:00 2001 From: Cathy <815244047@qq.com> Date: Fri, 10 May 2024 07:12:27 +0800 Subject: [PATCH] Add QWen1.5/Qwen2 support (#303) * add qwen2 support * update yaml * update format --------- Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> --- .../autoconfig/scripts/compare_throughput.py | 6 +- auto_configurator/autoconfig/search_config.py | 11 +- .../autoconfig/training_config.py | 39 ++- auto_configurator/autoconfig/utils.py | 11 +- auto_configurator/base_configs/qwen2_14b.yaml | 235 ++++++++++++++++ auto_configurator/base_configs/qwen2_4b.yaml | 235 ++++++++++++++++ auto_configurator/base_configs/qwen2_72b.yaml | 235 ++++++++++++++++ auto_configurator/base_configs/qwen2_7b.yaml | 235 ++++++++++++++++ .../conf/search_config/qwen2/14b.yaml | 23 ++ .../conf/search_config/qwen2/4b.yaml | 22 ++ .../conf/search_config/qwen2/72b.yaml | 23 ++ .../conf/search_config/qwen2/7b.yaml | 22 ++ .../conf/conversion/qwen2/convert_qwen2.yaml | 22 ++ .../conf/evaluation/qwen2/evaluate_all.yaml | 24 ++ .../conf/evaluation/qwen2/evaluate_boolq.yaml | 24 ++ .../conf/fine_tuning/qwen2/squad.yaml | 194 +++++++++++++ launcher_scripts/conf/peft/qwen2/sft.yaml | 263 ++++++++++++++++++ launcher_scripts/conf/peft/qwen2/squad.yaml | 242 ++++++++++++++++ .../conf/training/qwen2/qwen2_14b.yaml | 231 +++++++++++++++ .../conf/training/qwen2/qwen2_4b.yaml | 234 ++++++++++++++++ .../conf/training/qwen2/qwen2_72b.yaml | 234 ++++++++++++++++ .../conf/training/qwen2/qwen2_7b.yaml | 234 ++++++++++++++++ launcher_scripts/main.py | 3 + .../eval_harness/lm_eval/models/__init__.py | 2 + .../eval_harness/lm_eval/models/nemo_qwen2.py | 261 +++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 9 + 26 files changed, 3057 insertions(+), 17 deletions(-) create mode 100644 auto_configurator/base_configs/qwen2_14b.yaml create mode 100644 auto_configurator/base_configs/qwen2_4b.yaml create mode 100644 auto_configurator/base_configs/qwen2_72b.yaml create mode 100644 auto_configurator/base_configs/qwen2_7b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/14b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/4b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/72b.yaml create mode 100644 auto_configurator/conf/search_config/qwen2/7b.yaml create mode 100644 launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml create mode 100644 launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml create mode 100644 launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml create mode 100644 launcher_scripts/conf/fine_tuning/qwen2/squad.yaml create mode 100644 launcher_scripts/conf/peft/qwen2/sft.yaml create mode 100644 launcher_scripts/conf/peft/qwen2/squad.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_14b.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_4b.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_72b.yaml create mode 100644 launcher_scripts/conf/training/qwen2/qwen2_7b.yaml create mode 100644 launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py index a1e9c96ab9..9771bc8045 100644 --- a/auto_configurator/autoconfig/scripts/compare_throughput.py +++ b/auto_configurator/autoconfig/scripts/compare_throughput.py @@ -78,12 +78,12 @@ def main(cfg): gbs = model_cfg.get("global_batch_size") 
enc_seq_len = ( model_cfg.get("encoder_seq_length") - if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm") + if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2") else model_cfg.get("seq_length") ) dec_seq_len = data_cfg.get("seq_length_dec") - if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm"): + if model_name in ("gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"): hs = model_cfg.get("hidden_size") ffn_hs = None layers = model_cfg.get("num_layers") @@ -249,7 +249,7 @@ def calculate_tflops( Bert Formula: Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) """ - if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: # Model FLOPS calculation model_flops = ( ( diff --git a/auto_configurator/autoconfig/search_config.py b/auto_configurator/autoconfig/search_config.py index 1059be8c99..5bd83b964b 100644 --- a/auto_configurator/autoconfig/search_config.py +++ b/auto_configurator/autoconfig/search_config.py @@ -20,7 +20,16 @@ from autoconfig.inference_sweep import search_inference_config from autoconfig.training_config import search_training_config -SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert", "llama", "baichuan2", "chatglm"] +SUPPORTED_MODELS = [ + "gpt3", + "t5", + "mt5", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", +] def search_config( diff --git a/auto_configurator/autoconfig/training_config.py b/auto_configurator/autoconfig/training_config.py index 56919b981c..ac007794a4 100644 --- a/auto_configurator/autoconfig/training_config.py +++ b/auto_configurator/autoconfig/training_config.py @@ -81,17 +81,19 @@ def generate_grid_search_configs( # 2 * num_layers is needed because of encoder/decoder architecture. multiplier = ( - 1 if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] else 2 + 1 + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] + else 2 ) seq_length = base_cfg["model"]["data"]["seq_length"] num_layers = ( base_cfg["model"]["num_layers"] - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] else base_cfg["model"]["encoder"]["num_layers"] ) - if model_name in ["gpt3", "bert", "llama"]: + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"]: act_method = base_cfg["model"].get("activations_checkpoint_method", "None") else: act_method = base_cfg["model"]["encoder"].get( @@ -126,7 +128,14 @@ def generate_grid_search_configs( base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] ) gbs = base_cfg["model"]["global_batch_size"] - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + ]: att_heads = base_cfg["model"]["num_attention_heads"] num_layers = base_cfg["model"]["num_layers"] else: @@ -222,7 +231,8 @@ def _set_activations_checkpoint_params( max_layers_per_pipe = num_layers interval_layers_per_pipe = act_multiple if ( - model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] and pp > 2 + model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] + and pp > 2 ): # Interleaved pipeline scheduling. 
virtual_pipelines = ( num_layers // pp @@ -246,7 +256,14 @@ def _set_activations_checkpoint_params( 0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple ) - if pp > 1 and model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if pp > 1 and model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + ]: # Num micro batches with partial act ckpt num_micro_batches_partial_act_ckpt = list( range(min_micro_b, max_micro_b + 1, interval_micro_b) @@ -824,14 +841,18 @@ def _calculate_tp_pp_mbs_grid( gpu_memory_gb = train_cfg.get("gpu_memory_gb") multiplier = ( - 1 if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"] else 2 + 1 + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"] + else 2 + ) + init_pp = ( + [] if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"] else [1] ) - init_pp = [] if model_name in ["gpt3", "llama", "baichuan2", "chatglm"] else [1] valid_pp = init_pp + [ multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 ] # Only divisors of num_layers are possible. - if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: if gpu_memory_gb == 80: ( tp, diff --git a/auto_configurator/autoconfig/utils.py b/auto_configurator/autoconfig/utils.py index c53f71021e..c8d366e1f2 100644 --- a/auto_configurator/autoconfig/utils.py +++ b/auto_configurator/autoconfig/utils.py @@ -45,7 +45,7 @@ def _calculate_model_size( :rtype: float :raises NotImplementedError: if the model name is not valid. """ - if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: model_size = ( 12 * num_layers @@ -113,7 +113,7 @@ def calculate_model_size_params( :raises NotImplementedError: if the model name is not supported. """ ffn, kv = None, None # Only needed for some models. 
- if model_name in ["gpt3", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2"]: if model_size_in_b < 0.25: hs, att_h, lr = 768, 12, 6e-4 elif model_size_in_b < 0.5: @@ -395,7 +395,7 @@ def modify_cfg( """ new_cfg = copy.deepcopy(base_cfg) if act is not None: - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"]: new_cfg["model"]["activations_checkpoint_num_layers"] = act else: new_cfg["model"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 @@ -407,6 +407,7 @@ def modify_cfg( "llama", "baichuan2", "chatglm", + "qwen2", ]: new_cfg["model"][ "num_micro_batches_with_partial_activation_checkpoints" @@ -418,6 +419,7 @@ def modify_cfg( "llama", "baichuan2", "chatglm", + "qwen2", ]: new_cfg["model"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe @@ -427,6 +429,7 @@ def modify_cfg( "llama", "baichuan2", "chatglm", + "qwen2", ]: new_cfg["model"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines @@ -434,7 +437,7 @@ def modify_cfg( new_cfg["model"]["pipeline_model_parallel_size"] = pp new_cfg["model"]["micro_batch_size"] = mbs - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm"]: + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2"]: att_heads = new_cfg["model"]["num_attention_heads"] num_layers = new_cfg["model"]["num_layers"] else: diff --git a/auto_configurator/base_configs/qwen2_14b.yaml b/auto_configurator/base_configs/qwen2_14b.yaml new file mode 100644 index 0000000000..dd6beb9d8a --- /dev/null +++ b/auto_configurator/base_configs/qwen2_14b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_14b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + 
hidden_size: 5120 + ffn_hidden_size: 13696 + num_attention_heads: 40 + num_query_groups: 40 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-14B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/base_configs/qwen2_4b.yaml b/auto_configurator/base_configs/qwen2_4b.yaml new file mode 100644 index 0000000000..b2268b32e7 --- /dev/null +++ b/auto_configurator/base_configs/qwen2_4b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_4b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + 
create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + hidden_size: 2560 + ffn_hidden_size: 6912 + num_attention_heads: 20 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + num_query_groups: 20 + override_vocab_size: 151936 + rotary_base: 5000000.0 + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-4B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/base_configs/qwen2_72b.yaml b/auto_configurator/base_configs/qwen2_72b.yaml new file mode 100644 index 0000000000..0f01f0b423 --- /dev/null +++ b/auto_configurator/base_configs/qwen2_72b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_72b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + 
create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 24576 + num_attention_heads: 64 + num_query_groups: 64 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-72B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/base_configs/qwen2_7b.yaml b/auto_configurator/base_configs/qwen2_7b.yaml new file mode 100644 index 0000000000..cfb67e2934 --- /dev/null +++ b/auto_configurator/base_configs/qwen2_7b.yaml @@ -0,0 +1,235 @@ +run: + name: qwen2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: false + enable_model_summary: false + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + 
create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + num_query_groups: 32 + override_vocab_size: 151936 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-7B + model: null # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: false + masked_softmax_fusion: true + get_attention_mask_from_fusion: true + apply_rope_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: true # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: false # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to false. 
+ overlap_p2p_comm: false + batch_p2p_comm: true + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: 900,50,50 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + validation_drop_last: true + no_seqlen_plus_one_input_tokens: false + pad_samples_to_global_batch_size: false + shuffle_documents: true + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/auto_configurator/conf/search_config/qwen2/14b.yaml b/auto_configurator/conf/search_config/qwen2/14b.yaml new file mode 100644 index 0000000000..050ff98bf3 --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/14b.yaml @@ -0,0 +1,23 @@ +train_settings: + model_size_in_b: 14 # unit in billion parameters + num_nodes: 2 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 152064 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_14b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + diff --git a/auto_configurator/conf/search_config/qwen2/4b.yaml b/auto_configurator/conf/search_config/qwen2/4b.yaml new file mode 100644 index 0000000000..4945230665 --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/4b.yaml @@ -0,0 +1,22 @@ +train_settings: + model_size_in_b: 4 # unit in billion parameters + num_nodes: 1 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. + vocab_size: 151936 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_4b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] diff --git a/auto_configurator/conf/search_config/qwen2/72b.yaml b/auto_configurator/conf/search_config/qwen2/72b.yaml new file mode 100644 index 0000000000..4e5ff55bae --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/72b.yaml @@ -0,0 +1,23 @@ +train_settings: + model_size_in_b: 72 # unit in billion parameters + num_nodes: 8 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. 
+ max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. + vocab_size: 152064 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_72b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + diff --git a/auto_configurator/conf/search_config/qwen2/7b.yaml b/auto_configurator/conf/search_config/qwen2/7b.yaml new file mode 100644 index 0000000000..ac32a47292 --- /dev/null +++ b/auto_configurator/conf/search_config/qwen2/7b.yaml @@ -0,0 +1,22 @@ +train_settings: + model_size_in_b: 7 # unit in billion parameters + num_nodes: 1 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 151936 + seq_length: 32768 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: ${auto_configurator_path}/base_configs/qwen2_7b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] diff --git a/launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml b/launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml new file mode 100644 index 0000000000..ab670c4f50 --- /dev/null +++ b/launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml @@ -0,0 +1,22 @@ +run: + name: convert_${conversion.run.model_train_name} + nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node + time_limit: "1:00:00" + dependency: "singleton" + ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} + convert_name: convert_nemo + model_train_name: qwen2_7b + train_dir: ${base_results_dir}/${.model_train_name} + results_dir: ${.train_dir}/${.convert_name} + nemo_file_name: megatron_qwen2.nemo # name of nemo checkpoint; must be .nemo file + pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory + +model: + model_type: gpt # gpt or t5, use t5 for mt5 as well + checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_qwen2-*last.ckpt) + hparams_file: ${conversion.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model diff --git a/launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml b/launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml new file mode 100644 index 0000000000..4ba8dea87c --- /dev/null +++ b/launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml @@ -0,0 +1,24 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "02:00:00" + dependency: "singleton" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_all + model_train_name: qwen2_7b + train_dir: ${base_results_dir}/${.model_train_name} + tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-qwen2 + nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints + #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
 megatron_gpt-*last.ckpt)
+  #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model
diff --git a/launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml b/launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml
new file mode 100644
index 0000000000..49ba25236c
--- /dev/null
+++ b/launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml
@@ -0,0 +1,24 @@
+run:
+  name: ${.eval_name}_${.model_train_name}
+  time_limit: "02:00:00"
+  dependency: "singleton"
+  nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node
+  ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}}
+  eval_name: eval_boolq
+  model_train_name: qwen2_7b
+  train_dir: ${base_results_dir}/${.model_train_name}
+  tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks
+  results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name}
+
+model:
+  model_type: nemo-qwen2
+  nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints
+  #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints
+  #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt)
+  #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model
diff --git a/launcher_scripts/conf/fine_tuning/qwen2/squad.yaml b/launcher_scripts/conf/fine_tuning/qwen2/squad.yaml
new file mode 100644
index 0000000000..66df27b43c
--- /dev/null
+++ b/launcher_scripts/conf/fine_tuning/qwen2/squad.yaml
@@ -0,0 +1,194 @@
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: qwen2_7b
+  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
+  task_name: "squad" # Rename this name to be more clear
+  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 13000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged
+  val_check_interval: 300 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g.
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${fine_tuning.run.results_dir}/results + exp_dir: null + name: megatron_qwen2_${fine_tuning.run.task_name} + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_qwen2_${fine_tuning.run.task_name} + name: ${fine_tuning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + save_top_k: 5 + mode: min + save_nemo_on_train_end: True + filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}} + save_best_model: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + global_batch_size: 32 + micro_batch_size: 4 + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_qwen2.nemo # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True # not used right now + gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: True # if not None, will match the base model's value + + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.1 + + # FSDP + fsdp: False # Enable training with torch FSDP. + fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'. + fsdp_grad_reduce_dtype: 'bf16' # Gradient reduction data type. + fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. + fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme. + + data: + chat: False # whether use chatbot data or not + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: True + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + test_ds: + file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: 1e-6 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + min_lr: 1e-8 + warmup_steps: 1000 + last_epoch: -1 + + diff --git a/launcher_scripts/conf/peft/qwen2/sft.yaml b/launcher_scripts/conf/peft/qwen2/sft.yaml new file mode 100644 index 0000000000..09cb2b3430 --- /dev/null +++ b/launcher_scripts/conf/peft/qwen2/sft.yaml @@ -0,0 +1,263 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: sft_${.model_train_name} + results_dir: ${base_results_dir}/${.name} + time_limit: 04:00:00 + dependency: singleton + convert_name: convert_nemo + model_train_name: qwen2 + convert_dir: ${base_results_dir}/${.model_train_name}/${.convert_name} + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${peft.run.results_dir}/results + exp_dir: null + name: ${peft.run.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.exp_manager.name}--{${.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${peft.model.tensor_model_parallel_size}, ${peft.model.pipeline_model_parallel_size}} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + create_tensorboard_logger: False + log_global_rank_0_only: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null + + encoder_seq_length: ${peft.model.data.train_ds.max_seq_length} + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ${peft.run.convert_dir}/results/megatron_qwen2.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + overlap_p2p_comm: False + ub_tp_comm_overlap: False + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + + megatron_amp_O2: True + mcore_gpt: True + transformer_engine: False + + get_attention_mask_from_fusion: True + apply_rope_fusion: True + bias_activation_fusion: True + bias_dropout_add_fusion: True + masked_softmax_fusion: True + gradient_accumulation_fusion: True + grad_div_ar_fusion: True + + fp8: False + fp8_e4m3: False + fp8_hybrid: True + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 512 + fp8_amax_compute_algo: max + fp8_wgrad: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
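+  # Note: sequence parallelism only takes effect when tensor_model_parallel_size > 1; with the default TP=1 it is a no-op.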
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: False + gradient_as_bucket_view: True + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: null # null (SFT, no PEFT), ptuning, lora + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. 
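+      # '???' is OmegaConf's mandatory-value marker: the job fails fast unless this is overridden at launch time,
+      # e.g. peft.model.data.train_ds.file_names=[/path/to/train.jsonl] (illustrative path).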
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 2 + memmap_workers: 2 + pin_memory: True + packed_sequence: False # Set to true to load with GPTSFTPackedDataset + pad_to_max_length: False + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + validation_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: ${peft.model.data.train_ds.num_workers} + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: False + label_key: ${peft.model.data.train_ds.label_key} + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${peft.model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: False
+      num_workers: ${peft.model.data.train_ds.num_workers}
+      memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+      pin_memory: True
+      max_seq_length: ${peft.model.data.train_ds.max_seq_length}
+      min_seq_length: ${peft.model.data.train_ds.min_seq_length}
+      drop_last: False
+      label_key: ${peft.model.data.train_ds.label_key}
+      add_eos: ${peft.model.data.train_ds.add_eos}
+      add_sep: ${peft.model.data.train_ds.add_sep}
+      add_bos: ${peft.model.data.train_ds.add_bos}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: ${peft.model.data.train_ds.truncation_field} # Options: keys in prompt_template
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${peft.model.data.train_ds.prompt_template}
+      tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+      truncation_method: 'right' # Truncation from which position, Options: ['left', 'right']
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+
+  optim:
+    name: distributed_fused_adam
+    lr: 1e-4
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.98
+    bucket_cap_mb: 50
+    dtype: bf16 # fp32 | bf16
+    grad_sync_dtype: bf16 # fp32 | bf16
+    overlap_grad_sync: True
+    overlap_param_sync: True
+    contiguous_grad_buffer: True
+    contiguous_param_buffer: True
+
diff --git a/launcher_scripts/conf/peft/qwen2/squad.yaml b/launcher_scripts/conf/peft/qwen2/squad.yaml
new file mode 100644
index 0000000000..059fa78715
--- /dev/null
+++ b/launcher_scripts/conf/peft/qwen2/squad.yaml
@@ -0,0 +1,242 @@
+name: megatron_qwen2_peft_tuning-${peft.model.peft.peft_scheme}
+
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: qwen2_7b
+  convert_dir: ${base_results_dir}/${.model_train_name}/${.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/peft_${.name}
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 9999
+  max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged
+  val_check_interval: 200 # If an int n > 1, will run val every n training steps; if a float 0.0 - 1.0, will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: ${peft.run.results_dir}/results
+  exp_dir: null
+  name: ${peft.name}
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: nemo_qwen2_${peft.run.task_name}
+    name: ${peft.run.name}
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${peft.model.data.validation_ds.metric.name}
+    save_top_k: 1
+    mode: min
+    save_nemo_on_train_end: True
+    filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${peft.model.tensor_model_parallel_size}
+    always_save_nemo: False
+    save_best_model: True
+  create_early_stopping_callback: True
+  early_stopping_callback_params:
+    monitor: "val_loss"
+    mode: "min"
+    min_delta: 0.001
+    patience: 10
+    verbose: True
+    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+
+  global_batch_size: 128
+  micro_batch_size: 4
+  restore_from_path: ${peft.run.convert_dir}/results/megatron_qwen2.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
+  sync_batch_comm: False
+  megatron_amp_O2: False
+  mcore_gpt: True
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  activations_checkpoint_layers_per_pipeline: null
+  answer_only_loss: True
+  gradient_as_bucket_view: False
+
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+
+  # FSDP
+  fsdp: False # Enable training with torch FSDP.
+  fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'.
+  fsdp_grad_reduce_dtype: 'bf16' # Gradient reduction data type.
+  fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.
+  fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme.
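+  # The peft block below selects the tuning method: peft_scheme "lora" trains only the LoRA adapter weights,
+  # while setting it to null falls back to full-parameter SFT (cf. conf/peft/qwen2/sft.yaml).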
+ + peft: + peft_scheme: "lora" # can be either adapter, ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. 
Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${peft.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml new file mode 100644 index 0000000000..70d02245b0 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml @@ -0,0 +1,231 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h5120_tp2_mbs1_seqlen4096 + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 32 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13696 + num_attention_heads: 40 + num_query_groups: 40 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-14B + model: null # /path/to/tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + 
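+  # With megatron_amp_O2 enabled, module weights run in bf16 while the optimizer keeps fp32 main parameters.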
grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: true + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document diff --git a/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml new file mode 100644 index 0000000000..6217eb0145 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml @@ -0,0 +1,234 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_4b + results_dir: ${base_results_dir}/${.name} + time_limit: 
"0-01:30:00" + dependency: "singleton" +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 40 + hidden_size: 2560 + ffn_hidden_size: 6912 + num_attention_heads: 20 + num_query_groups: 20 + override_vocab_size: 151936 + rotary_base: 5000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-4B + model: null + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + + ## Transformer Engine + 
transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document + diff --git a/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml new file mode 100644 index 0000000000..0ba8267c42 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml @@ -0,0 +1,234 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h8192_tp4_mbs1_seqlen4096 + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_72b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 128 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + 
limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: 20 + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 24576 + num_attention_heads: 64 + num_query_groups: 64 + override_vocab_size: 152064 + rotary_base: 1000000.0 + init_method_std: 0.02 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-72B + model: null # /path/to/tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: true + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + overlap_p2p_comm: true + batch_p2p_comm: false + gc_interval: 3 + optim: + name: distributed_fused_adam + lr: 0.00015 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: 
CosineAnnealing + warmup_steps: 2000 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document + diff --git a/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml new file mode 100644 index 0000000000..98bd353954 --- /dev/null +++ b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml @@ -0,0 +1,234 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: qwen2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:30:00" + dependency: "singleton" +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_qwen2 + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_qwen2_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 
'megatron_qwen2--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 32768 + max_position_embeddings: 32768 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + num_query_groups: 32 + override_vocab_size: 151936 + rotary_base: 1000000.0 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + qkv_bias: true + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'huggingface' + type: Qwen/Qwen1.5-7B + model: null # /path/to/tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + tp_comm_atomic_ag: False + tp_comm_atomic_rs: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 32768 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-qwen2_00_text_document + - .0333 + - ${data_dir}/my-qwen2_01_text_document + - .0333 + - ${data_dir}/my-qwen2_02_text_document + - .0333 + - ${data_dir}/my-qwen2_03_text_document + - .0333 + - ${data_dir}/my-qwen2_04_text_document + - .0333 + - ${data_dir}/my-qwen2_05_text_document + - .0333 + - ${data_dir}/my-qwen2_06_text_document + - .0333 + - ${data_dir}/my-qwen2_07_text_document + - .0333 + - ${data_dir}/my-qwen2_08_text_document + - .0333 + - ${data_dir}/my-qwen2_09_text_document + - .0333 + - ${data_dir}/my-qwen2_10_text_document + - .0333 + - ${data_dir}/my-qwen2_11_text_document + - .0333 + - ${data_dir}/my-qwen2_12_text_document + - .0333 + - ${data_dir}/my-qwen2_13_text_document + - .0333 + - ${data_dir}/my-qwen2_14_text_document + - .0333 + - ${data_dir}/my-qwen2_15_text_document + - .0333 + - ${data_dir}/my-qwen2_16_text_document + - .0333 + - ${data_dir}/my-qwen2_17_text_document + - .0333 + - ${data_dir}/my-qwen2_18_text_document + - .0333 + - ${data_dir}/my-qwen2_19_text_document + - .0333 + - ${data_dir}/my-qwen2_20_text_document + - .0333 + - ${data_dir}/my-qwen2_21_text_document + - .0333 + - ${data_dir}/my-qwen2_22_text_document + - .0333 + - ${data_dir}/my-qwen2_23_text_document + - .0333 + - ${data_dir}/my-qwen2_24_text_document + - .0333 + - ${data_dir}/my-qwen2_25_text_document + - .0333 + - ${data_dir}/my-qwen2_26_text_document + - .0333 + - ${data_dir}/my-qwen2_27_text_document + - .0333 + - ${data_dir}/my-qwen2_28_text_document + - .0334 + - ${data_dir}/my-qwen2_29_text_document + diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 60f524c884..78f2a35780 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -90,6 +90,7 @@ "chatglm", "mistral", "mixtral", + "qwen2", ], NeMoEvaluation: [ "t5", @@ -111,6 +112,7 @@ "starcoder2", "peft_mistral", "peft_mixtral", + "peft_qwen2", ], DiffusionModelEvaluation: ["stable_diffusion", "imagen"], }, @@ -124,6 +126,7 @@ "falcon", "baichuan2", "chatglm", + "qwen2", ], MC4DataPreparation: ["mt5"], SteerLMDataPreparation: ["steerlm"], diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py index 46a264649e..cb1a4e681a 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py @@ -25,6 +25,7 @@ nemo_chatglm, nemo_mistral, nemo_mixtral, + nemo_qwen2, ) MODEL_REGISTRY = { @@ -37,6 +38,7 @@ "nemo-falcon": nemo_falcon.NeMo_FalconLM_TP_PP, "nemo-mistral": nemo_mistral.NeMo_MISTRAL_TP_PP, "nemo-mixtral": nemo_mixtral.NeMo_MIXTRAL_TP_PP, + "nemo-qwen2": nemo_qwen2.NeMo_QWEN2_TP_PP, "dummy": dummy.DummyLM, } diff --git 
a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py new file mode 100644 index 0000000000..7eb7b7c67b --- /dev/null +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_qwen2.py @@ -0,0 +1,261 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +import tqdm +from lm_eval import utils +from lm_eval.base import LM +from megatron.core import parallel_state +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + MegatronGPTModel, +) +from nemo.collections.nlp.modules.common.megatron.megatron_init import ( + fake_initialize_model_parallel, +) +from nemo.collections.nlp.modules.common.text_generation_utils import ( + generate, + get_computeprob_response, +) +from nemo.collections.nlp.parts.nlp_overrides import ( + NLPDDPStrategy, + NLPSaveRestoreConnector, +) +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.model_utils import inject_model_parallel_rank +from omegaconf import OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate + +from .nemo_gpt3 import DDP_initialize, RequestDataset, setup_trainer_and_model + + +class NeMo_QWEN2_TP_PP(LM): + def __init__(self, args, truncate=False, batch_size=1): + super().__init__() + + # get nemo megatron + logging.info(f"**** Building Qwen2 model ...") + self.trainer, self.model = setup_trainer_and_model(args) + self.tokenizer = self.model.tokenizer + self.model.eval() + + self.max_length = self.model.cfg.get("max_position_embeddings") + + self.truncate = truncate + self.batch_size = batch_size + + # initialize DDP and move model to GPU + DDP_initialize(self.model) + self.model = self.model.cuda() + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config={}): + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(args, **args2) + + def loglikelihood(self, requests): + return self._loglikelihood(requests) + + """ + request: (context, continuation) + how this all works: + CTX CONT + inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + gpt2 \ \ + logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice + cont_toks 4 5 6 7 8 9 + when too long to fit in context, truncate from the left + """ + + def _loglikelihood(self, requests): + def pad_collate(batch, eos_id=2): + tokens = [item[0] for item in batch] + conti_lens = [item[1] for item in batch] + lens = [ + len(token) - 1 for token in tokens + ] # fake delete last token by reducing input len + max_len = max(lens) 
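+            # Round max_len up to the next multiple of 8; the extra positions are filled with eos_id below
+            # so every sample in the batch shares the same padded length expected by NeMo's generate API.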
+ extra_pad_len = 0 + if max_len % 8 != 0: + extra_pad_len = 8 - (max_len % 8) + max_len += extra_pad_len + # extra_pad_len = 2048 - max_len + # max_len += extra_pad_len + + tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id) + if extra_pad_len > 0: + extra_pad = torch.ones(extra_pad_len, len(batch)) * eos_id + extra_pad = extra_pad.type_as(tokens_pad) + tokens_pad = torch.vstack((tokens_pad, extra_pad)) + # Add padding to all samples to adapt nemo generate api + + new_batch = [] + for token, lenn, conti_len in zip(tokens_pad.T, lens, conti_lens): + # (token, lenn, tokens_to_generate, compute_logprobs) + new_batch.append((token, max_len, lenn, conti_len)) + + new_batch = default_collate(new_batch) + return new_batch + + def _collate(x): # used to reorder request and remove duplications + """ + the negative sign on len(toks) sorts descending - this has a few advantages: + - time estimates will always be over not underestimates, which is more useful for planning + - to know the size of a batch when going through the list, you know the first one is always the batch padded context length. + this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement + - any OOMs will happen right away rather than near the end + """ + toks = x[0] + x[1] + return -len(toks), tuple(toks) + + reord = utils.Reorderer(requests, _collate) + request_ds = RequestDataset( + reord.get_reordered(), self.model.tokenizer, self.max_length + ) + request_dl = DataLoader( + request_ds, + collate_fn=pad_collate, + batch_size=self.batch_size, + shuffle=False, + ) + + def logits_to_results(batch, response): + input_token_ids_batch, _, lens, conti_lens = batch + batch_size = len(lens) + assert ( + len(response["token_ids"]) == batch_size + ), "Response's length not equal to batch size." + + batch_res = [] + for index in range(batch_size): + inp_len = lens[index] + conti_len = conti_lens[index] + + inp_token_ids = input_token_ids_batch[index].tolist()[ + : inp_len + 1 + ] # recover fake deleted token + response_token_ids = response["token_ids"][index][:inp_len] + + assert ( + response_token_ids == inp_token_ids[:-1] + ), f"Mismatch in input tokens." 
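+                # Keep only the continuation's log-probs: the last conti_len positions of the returned
+                # full_logprob tensor; the preceding context positions are discarded.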
+                log_probs = response["full_logprob"][index][:inp_len]  # [inp_len, vocab] tensor
+                log_probs = log_probs[-conti_len:]
+
+                greedy_tokens = log_probs.argmax(dim=-1)
+                greedy_tokens = self.tokenizer.ids_to_tokens(
+                    greedy_tokens.cpu().numpy().tolist()
+                )
+
+                conti_token_ids = inp_token_ids[-conti_len:]
+                conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids)
+
+                max_equal = greedy_tokens == conti_tokens
+                log_probs = log_probs.cpu().to(torch.float32)
+                conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens))
+                conti_probs = torch.gather(
+                    log_probs, 1, conti_enc.unsqueeze(-1)
+                ).squeeze(-1)
+
+                batch_res.append(
+                    (
+                        float(conti_probs.sum()),
+                        bool(max_equal),
+                        greedy_tokens,
+                        conti_tokens,
+                    )
+                )
+            return batch_res
+
+        res = []
+        for batch in tqdm.tqdm(request_dl):
+            # inputs = (padded token ids, padded input lengths) as expected by generate()
+            inputs = (batch[0].cuda(), batch[1].cuda())
+            response = generate(
+                model=self.model,
+                inputs=inputs,
+                tokens_to_generate=1,
+                all_probs=True,
+                temperature=1.0,
+                add_BOS=False,
+                top_k=0,
+                top_p=0.9,
+                greedy=True,
+                repetition_penalty=1.0,
+                min_tokens_to_generate=0,
+                compute_logprob=True,
+                end_strings=[""],
+            )
+            response = get_computeprob_response(self.tokenizer, response, inputs)
+
+            if is_global_rank_zero():
+                res.extend(logits_to_results(batch, response))
+
+            del inputs, response
+
+        return reord.get_original(res) if self.can_access_output() else None
+
+    def loglikelihood_rolling(self, requests):
+        loglikelihoods = []
+        len_rolling_token_windows = [0]
+        all_rolling_token_windows = []
+
+        for (string,) in requests:
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tokenizer.text_to_ids(string),
+                        prefix_token=2,
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            len_rolling_token_windows.append(
+                len(rolling_token_windows) + len_rolling_token_windows[-1]
+            )
+            all_rolling_token_windows.extend(rolling_token_windows)
+
+        string_nll = self._loglikelihood(all_rolling_token_windows)
+        if self.can_access_output():
+            string_nll = [x[0] for x in string_nll]  # keep the log-prob, discard is_greedy
+            for i in range(len(len_rolling_token_windows) - 1):
+                loglikelihoods.append(
+                    sum(
+                        string_nll[
+                            len_rolling_token_windows[i] : len_rolling_token_windows[
+                                i + 1
+                            ]
+                        ]
+                    )
+                )
+
+        return loglikelihoods
+
+    def greedy_until(self, requests):
+        raise NotImplementedError
+
+    def can_access_output(self):
+        return is_global_rank_zero()
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index ba5e6a4af5..cae697fbcb 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -44,6 +44,7 @@
     "mixtral",
     "starcoder2",
     "chatglm",
+    "qwen2",
 ]
 __VISION_MODELS_LIST__ = ["vit"]
 __MULTIMODAL_MODELS_LIST__ = [
@@ -881,6 +882,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
             "mixtral": self._nemo_code_path
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
+            "qwen2": self._nemo_code_path
+            / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
         }
         return model_type_to_code_path[model_type]
@@ -966,6 +969,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
             "mixtral": self._nemo_code_path
             / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
+            "qwen2": self._nemo_code_path
+            / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
         }
return model_type_to_code_path[model_type] @@ -1113,6 +1118,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path: / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", "mixtral": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", + "qwen2": self._nemo_code_path + / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", } return model_type_to_code_path[model_type] @@ -1668,6 +1675,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path: / "examples/nlp/language_modeling/tuning/megatron_gpt_generate.py", "peft_mixtral": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_generate.py", + "peft_qwen2": self._nemo_code_path + / "examples/nlp/language_modeling/tuning/megatron_gpt_generate.py", "vit": self._nemo_code_path / "examples/vision/vision_transformer/megatron_vit_classification_evaluate.py", "clip": self._nemo_code_path