diff --git a/evaluation/results/bloom/run_bloom_bs_evaluation_176b.slurm b/evaluation/results/bloom/run_bloom_bs_evaluation_176b.slurm new file mode 100644 index 00000000..5e24f21d --- /dev/null +++ b/evaluation/results/bloom/run_bloom_bs_evaluation_176b.slurm @@ -0,0 +1,305 @@ +#!/bin/bash +#SBATCH --job-name=bs-eval-bloom-176b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-171 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +conda activate muennighofflmeval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step95000 +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed-bloom +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. +# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=bigscience/tokenizer + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy.
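+# (Assumption, not stated in this diff: the -1 placeholders below are overridden by the real model sizes stored in the checkpoint loaded via --load; they exist only so that Megatron's argument parser accepts the command line.)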
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list GEM/web_nlg_en,GEM/web_nlg_en_challenge_test_numbers,GEM/web_nlg_en_challenge_test_scramble,GEM/web_nlg_en_challenge_validation_sample,GEM/web_nlg_ru,GEM/web_nlg_ru_challenge_test_scramble,GEM/web_nlg_ru_challenge_validation_sample,GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05,GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc,GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05,GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc,GEM/wiki_auto_asset_turk_test_asset,GEM/wiki_auto_asset_turk_test_turk,GEM/wiki_lingua_ar,GEM/wiki_lingua_cs,GEM/wiki_lingua_de,GEM/wiki_lingua_en,GEM/wiki_lingua_es,GEM/wiki_lingua_fr,GEM/wiki_lingua_hi,GEM/wiki_lingua_id,GEM/wiki_lingua_it,GEM/wiki_lingua_ja,GEM/wiki_lingua_ko,GEM/wiki_lingua_nl,GEM/wiki_lingua_pt,GEM/wiki_lingua_ru,GEM/wiki_lingua_th,GEM/wiki_lingua_tr,GEM/wiki_lingua_vi,GEM/wiki_lingua_zh,gem_xsum,gem_xsum_challenge_sample,gem_xsum_challenge_test_backtranslation,gem_xsum_challenge_test_bfp_02,gem_xsum_challenge_test_bfp_05,gem_xsum_challenge_test_covid,gem_xsum_challenge_test_nopunc,axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en\ + +TASKS=( +GEM/web_nlg_en +GEM/web_nlg_en_challenge_test_numbers +GEM/web_nlg_en_challenge_test_scramble +GEM/web_nlg_en_challenge_validation_sample +GEM/web_nlg_ru +GEM/web_nlg_ru_challenge_test_scramble +GEM/web_nlg_ru_challenge_validation_sample +GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc +GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc +GEM/wiki_auto_asset_turk_test_asset +GEM/wiki_auto_asset_turk_test_turk +GEM/wiki_lingua_ar +GEM/wiki_lingua_cs +GEM/wiki_lingua_de +GEM/wiki_lingua_en +GEM/wiki_lingua_es +GEM/wiki_lingua_fr +GEM/wiki_lingua_hi +GEM/wiki_lingua_id +GEM/wiki_lingua_it +GEM/wiki_lingua_ja +GEM/wiki_lingua_ko +GEM/wiki_lingua_nl +GEM/wiki_lingua_pt +GEM/wiki_lingua_ru +GEM/wiki_lingua_th +GEM/wiki_lingua_tr +GEM/wiki_lingua_vi +GEM/wiki_lingua_zh +gem_xsum +gem_xsum_challenge_sample +gem_xsum_challenge_test_backtranslation +gem_xsum_challenge_test_bfp_02 +gem_xsum_challenge_test_bfp_05 +gem_xsum_challenge_test_covid +gem_xsum_challenge_test_nopunc +axb +axg +boolq +cb +cola +copa +crows_pairs_english +crows_pairs_french +diabla
+e2e_nlg_cleaned +mnli +mnli_mismatched +multirc +piaf +qqp +rte +sst +tydiqa_primary +tydiqa_secondary +wic +wsc +wnli +wino_bias_type1_anti +wino_bias_type1_pro +wino_bias_type2_anti +wino_bias_type2_pro +xquad_ar +xquad_en +gsarti/flores_101_afr +gsarti/flores_101_amh +gsarti/flores_101_ara +gsarti/flores_101_hye +gsarti/flores_101_asm +gsarti/flores_101_ast +gsarti/flores_101_azj +gsarti/flores_101_bel +gsarti/flores_101_ben +gsarti/flores_101_bos +gsarti/flores_101_bul +gsarti/flores_101_mya +gsarti/flores_101_cat +gsarti/flores_101_ceb +gsarti/flores_101_zho_simpl +gsarti/flores_101_zho_trad +gsarti/flores_101_hrv +gsarti/flores_101_ces +gsarti/flores_101_dan +gsarti/flores_101_nld +gsarti/flores_101_eng +gsarti/flores_101_est +gsarti/flores_101_tgl +gsarti/flores_101_fin +gsarti/flores_101_fra +gsarti/flores_101_ful +gsarti/flores_101_glg +gsarti/flores_101_lug +gsarti/flores_101_kat +gsarti/flores_101_deu +gsarti/flores_101_ell +gsarti/flores_101_guj +gsarti/flores_101_hau +gsarti/flores_101_heb +gsarti/flores_101_hin +gsarti/flores_101_hun +gsarti/flores_101_isl +gsarti/flores_101_ibo +gsarti/flores_101_ind +gsarti/flores_101_gle +gsarti/flores_101_ita +gsarti/flores_101_jpn +gsarti/flores_101_jav +gsarti/flores_101_kea +gsarti/flores_101_kam +gsarti/flores_101_kan +gsarti/flores_101_kaz +gsarti/flores_101_khm +gsarti/flores_101_kor +gsarti/flores_101_kir +gsarti/flores_101_lao +gsarti/flores_101_lav +gsarti/flores_101_lin +gsarti/flores_101_lit +gsarti/flores_101_luo +gsarti/flores_101_ltz +gsarti/flores_101_mkd +gsarti/flores_101_msa +gsarti/flores_101_mal +gsarti/flores_101_mlt +gsarti/flores_101_mri +gsarti/flores_101_mar +gsarti/flores_101_mon +gsarti/flores_101_npi +gsarti/flores_101_nso +gsarti/flores_101_nob +gsarti/flores_101_nya +gsarti/flores_101_oci +gsarti/flores_101_ory +gsarti/flores_101_orm +gsarti/flores_101_pus +gsarti/flores_101_fas +gsarti/flores_101_pol +gsarti/flores_101_por +gsarti/flores_101_pan +gsarti/flores_101_ron +gsarti/flores_101_rus +gsarti/flores_101_srp +gsarti/flores_101_sna +gsarti/flores_101_snd +gsarti/flores_101_slk +gsarti/flores_101_slv +gsarti/flores_101_som +gsarti/flores_101_ckb +gsarti/flores_101_spa +gsarti/flores_101_swh +gsarti/flores_101_swe +gsarti/flores_101_tgk +gsarti/flores_101_tam +gsarti/flores_101_tel +gsarti/flores_101_tha +gsarti/flores_101_tur +gsarti/flores_101_ukr +gsarti/flores_101_umb +gsarti/flores_101_urd +gsarti/flores_101_uzb +gsarti/flores_101_vie +gsarti/flores_101_cym +gsarti/flores_101_wol +gsarti/flores_101_xho +gsarti/flores_101_yor +gsarti/flores_101_zul +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as the it doesn't correspond to the number of models we want to evaluate. 
Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +BLOOM_FOLDER=$WORK/bloom/bloom-176b/$TASK +mkdir -p $BLOOM_FOLDER + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $BLOOM_FOLDER/bs_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $BLOOM_FOLDER/bs-eval-harness.log diff --git a/evaluation/results/bloom/run_bloom_evaluation_175b.slurm b/evaluation/results/bloom/run_bloom_evaluation_175b.slurm new file mode 100644 index 00000000..679ae7d2 --- /dev/null +++ b/evaluation/results/bloom/run_bloom_evaluation_175b.slurm @@ -0,0 +1,166 @@ +#!/bin/bash +#SBATCH --job-name=eai-eval-bloom-176b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-32 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +# Required in order to load the opt tokenizer +conda activate thomas_lm_eval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step95000 +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed-bloom +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. +# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=bigscience/tokenizer + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. 
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ +TASKS=( +arc_challenge +arc_easy +boolq +copa +headqa +hellaswag +lambada +logiqa +mathqa +mc_taco +mrpc +multirc +openbookqa +piqa +prost +pubmedqa +qnli +qqp +race +rte +sciq +sst +triviaqa +webqs +wic +winogrande +wnli +wsc +lambada_mt_en +lambada_mt_fr +lambada_mt_de +lambada_mt_it +lambada_mt_es +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as it doesn't correspond to the number of models we want to evaluate. Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +BLOOM_FOLDER=$WORK/bloom/bloom-176b/$TASK +mkdir -p $BLOOM_FOLDER + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $BLOOM_FOLDER/eai_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $BLOOM_FOLDER/eval-harness.log
diff --git a/evaluation/results/opt/README.md b/evaluation/results/opt/README.md new file mode 100644 index 00000000..5fb71237 --- /dev/null +++ b/evaluation/results/opt/README.md @@ -0,0 +1 @@ +Utilities required to run OPT evaluation
diff --git a/evaluation/results/opt/concatenate_all_results.py b/evaluation/results/opt/concatenate_all_results.py new file mode 100644 index 00000000..4e8fe616 --- /dev/null +++ b/evaluation/results/opt/concatenate_all_results.py @@ -0,0 +1,99 @@ +import argparse +import json +import re +from pathlib import Path +from re import Pattern +from typing import List, Dict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--results-dir", required=True, type=Path, help="Directory containing the per-task result files") + parser.add_argument("--concatenate-output-file", required=True, type=Path, help="Path to store the final output file") + return parser.parse_args() + +MODEL = "opt-175b-meg-ds" +# MODEL = "global_step95000" +RESULTS_REGEX = re.compile(rf"(eai|bs)_results_lm-eval_{MODEL}_(\d{{4}}-\d{{2}}-\d{{2}}-\d{{2}}-\d{{2}}-\d{{2}})_backup\.json") +def get_all_files_that_match_results_in_folder(root_folder: Path) -> List[Path]: + json_files = [] + for folder in root_folder.iterdir(): + if folder.is_dir(): + json_files += get_all_files_that_match_results_in_folder(folder) + else: + # it's actually a file + file = folder + + match = RESULTS_REGEX.match(file.name) + + if match is None: + continue + else: + json_files.append(file) + return json_files + +def sort_dict(dictionary: Dict) -> Dict: + results = {} + + for key, value in sorted(dictionary.items(), key=lambda item: item[0]): + new_value = value + + if isinstance(value, dict): + new_value = sort_dict(new_value) + elif isinstance(value, list): + new_value = sorted(value) + + results[key] = new_value + + return results + +def main(): + args = get_args() + + # Get all json files + json_files = get_all_files_that_match_results_in_folder(args.results_dir) + + # Merge all json files + final_result = { + "results": {}, + "versions": {} + } + for file in json_files: + with open(file, "r") as fi: + task_result = json.load(fi) + + match = RESULTS_REGEX.match(file.name) + assert match is not None + prefix = match.group(1) + datetime_string = match.group(2) + + if prefix == "eai": + results_key = "results" + elif prefix == "bs": + results_key = "table_results" + else: + raise ValueError(f"Unsupported key: {prefix}") + + for key, value in task_result[results_key].items(): + if key not in final_result["results"]: + final_result["results"][key] = { + datetime_string: value + } + else: + assert datetime_string not in final_result["results"][key] + final_result["results"][key][datetime_string] = value + + for key, value in task_result["versions"].items(): + final_result["versions"][key] = value + + # We sort dict, better for serialization + final_result = sort_dict(final_result) + + # Save result + with open(args.concatenate_output_file, "w") as fo: + json.dump(final_result, fo, indent=2) + +if __name__ == "__main__": + main()
diff --git a/evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py b/evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py new file mode 100644 index 00000000..b49dae9e --- /dev/null +++ b/evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py @@ -0,0 +1,247 @@ +import argparse +import json +import re, os +from functools import partial +from multiprocessing import Pool +from typing import List, Optional, Dict + +import torch +from tqdm import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--opt_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the transformers OPT checkpoint.", + ) + parser.add_argument( + "--opt_sharded_index_path", + default=None, + type=str, + required=True, + help="Path to the transformers OPT sharded checkpoint index (metadata) file.", + ) + parser.add_argument( + "--megatron_dump_folder_path", default=None, type=str, required=True, + help="Path to the output Megatron-DS model."
+ ) + parser.add_argument( + "--num-proc", default=1, type=int, + ) + return parser.parse_args() + + +def compute_meg_ds_weight_names(num_layers: int): + return { + "layer_01-model_00-model_states.pt": [ + "word_embeddings.weight", + "position_embeddings.weight", + ], + **{ + f"layer_{str(layer_id).zfill(2)}-model_00-model_states.pt": [ + "input_layernorm.weight", + "input_layernorm.bias", + "self_attention.query_key_value.weight", + "self_attention.query_key_value.bias", + "self_attention.dense.weight", + "self_attention.dense.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "mlp.dense_4h_to_h.weight", + "mlp.dense_4h_to_h.bias", + ] + for layer_id in range(3, num_layers + 3) + }, + f"layer_{str(num_layers + 4).zfill(2)}-model_00-model_states.pt": [ + "weight", + "bias" + ] + } + +NON_TRANSFORMERS_BLOCK_WEIGHTS = { + "word_embeddings.weight": "decoder.embed_tokens.weight", + "position_embeddings.weight": "decoder.embed_positions.weight", + "weight": "decoder.final_layer_norm.weight", + "bias": "decoder.final_layer_norm.bias" +} +TRANSFORMERS_BLOCK_WEIGHTS = { + "input_layernorm.weight": ["self_attn_layer_norm.weight"], + "input_layernorm.bias": ["self_attn_layer_norm.bias"], + "self_attention.query_key_value.weight": ["self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"], + "self_attention.query_key_value.bias": ["self_attn.q_proj.bias", "self_attn.k_proj.bias", "self_attn.v_proj.bias"], + "self_attention.dense.weight": ["self_attn.out_proj.weight"], + "self_attention.dense.bias": ["self_attn.out_proj.bias"], + "post_attention_layernorm.weight": ["final_layer_norm.weight"], + "post_attention_layernorm.bias": ["final_layer_norm.bias"], + "mlp.dense_h_to_4h.weight": ["fc1.weight"], + "mlp.dense_h_to_4h.bias": ["fc1.bias"], + "mlp.dense_4h_to_h.weight": ["fc2.weight"], + "mlp.dense_4h_to_h.bias": ["fc2.bias"] +} +def get_transformers_weight_names(meg_ds_weight: str, layer_id: Optional[int]) -> List[str]: + if layer_id is None: + return [NON_TRANSFORMERS_BLOCK_WEIGHTS[meg_ds_weight]] + else: + return [f"decoder.layers.{layer_id}.{tfrs_block_name}" for tfrs_block_name in TRANSFORMERS_BLOCK_WEIGHTS[meg_ds_weight]] + +def get_layer_id(meg_ds_filename: str, total_num_layers: int) -> Optional[int]: + layer_id = int(re.match(r"layer_(\d*)-model_00-model_states.pt", meg_ds_filename)[1]) - 3 + + if layer_id < 0: + return None + + if layer_id >= total_num_layers: + return None + + return layer_id + +def merge_layers(layers, num_heads: int, hidden_size: int): + if len(layers) == 1: + return layers[0] + else: + # We merge QKV + if len(layers[0].shape) == 1: + # bias + return torch.reshape( + torch.cat( + [ + layer.view(num_heads, 1, hidden_size // num_heads) + for layer in layers + ], + dim=1 + ), + (3 * hidden_size, ) + ) + else: + #weight + return torch.reshape( + torch.cat( + [ + layer.view(num_heads, 1, hidden_size // num_heads, hidden_size) + for layer in layers + ], + dim=1 + ), + (3 * hidden_size, hidden_size) + ) + +def find_transformers_weights_and_save_meg_ds_weights( + meg_ds_filename: str, + meg_ds_weight_names: List[str], + opt_checkpoint_path: str, + megatron_dump_folder_path:str, + total_num_layers: int, + num_heads: int, + hidden_size: int, + trfs_weight_map: Dict[str, str] +): + layer_id = get_layer_id(meg_ds_filename, total_num_layers=total_num_layers) + trfs_weight_namess = {meg_ds_weight_name: get_transformers_weight_names(meg_ds_weight_name, layer_id=layer_id) for 
meg_ds_weight_name in meg_ds_weight_names} + + # Find the path they live in. + trfs_filenames = set(trfs_weight_map[trfs_weight_name] for trfs_weight_names in trfs_weight_namess.values() for trfs_weight_name in trfs_weight_names) + trfs_filename_to_weights = { + trfs_filename: torch.load(os.path.join(opt_checkpoint_path, trfs_filename), map_location="cpu") + for trfs_filename in trfs_filenames + } + + # query those weights + result = { + meg_ds_weight_name: [ + trfs_filename_to_weights[trfs_weight_map[tfrs_weight_name]][tfrs_weight_name] + for tfrs_weight_name in tfrs_weight_names + ] + for meg_ds_weight_name, tfrs_weight_names in trfs_weight_namess.items() + } + + # possibly concatenate + save_path = os.path.join(megatron_dump_folder_path, meg_ds_filename) + with open(save_path, "wb") as fo: + # qkv are mixed s.t. [q1 k1 v1 q2 k2 v2 ...] with (1,2..) being head_id + torch.save( + { + key: merge_layers(values, num_heads=num_heads, hidden_size=hidden_size) + for key, values in result.items() + }, + fo + ) + + +def convert_opt_checkpoint_to_megatron( + opt_checkpoint_path: str, + megatron_dump_folder_path: str, + opt_index_path: str, + num_proc: int +): + # Get total number of layers + with open(opt_index_path, "r") as fi: + index_file = json.load(fi)["weight_map"] + # Compute total amount of layers + with open(os.path.join(opt_checkpoint_path, "config.json"), "r") as fi: + config = json.load(fi) + total_amount_of_layers = config["num_hidden_layers"] + num_heads = config["num_attention_heads"] + hidden_size = config["hidden_size"] + + # Given the total number of layers we can compute exactly each meg_ds params we need to find. + meg_ds_filename_to_meg_ds_weights = compute_meg_ds_weight_names(total_amount_of_layers) + + # Given the needed weights we can query them from the transformers checkpoint + # We have to be smart about it and load a bin file once and get everything. 
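+ # Illustrative example of the QKV merge done by merge_layers above (not executed here): with + # num_heads=2 and hidden_size=4, the q/k/v biases (each of shape (4,)) are viewed as (2, 1, 2), + # concatenated on dim=1 into (2, 3, 2) and reshaped to (12,), i.e. the head-interleaved layout + # [q_h1 k_h1 v_h1 q_h2 k_h2 v_h2] described in the comment below.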
+ if num_proc == 1: + for meg_ds_filename, meg_ds_weight_names in tqdm(meg_ds_filename_to_meg_ds_weights.items()): + find_transformers_weights_and_save_meg_ds_weights( + meg_ds_filename=meg_ds_filename, + meg_ds_weight_names=meg_ds_weight_names, + opt_checkpoint_path=opt_checkpoint_path, + megatron_dump_folder_path=megatron_dump_folder_path, + total_num_layers=total_amount_of_layers, + trfs_weight_map=index_file, + num_heads=num_heads, + hidden_size=hidden_size + ) + else: + with Pool(num_proc) as pool: + pool.starmap( + partial( + find_transformers_weights_and_save_meg_ds_weights, + opt_checkpoint_path=opt_checkpoint_path, + megatron_dump_folder_path=megatron_dump_folder_path, + total_num_layers=total_amount_of_layers, + trfs_weight_map=index_file, + num_heads=num_heads, + hidden_size=hidden_size + ), + tqdm(meg_ds_filename_to_meg_ds_weights.items()) + ) + + # Create dummy mp_rank_00_model_states.pt + torch.save( + { + "mp_world_size": 1, + "module": None, + "dp_world_size": 1, + "checkpoint_version": 3, + "iteration": 0 + }, + os.path.join(megatron_dump_folder_path, "mp_rank_00_model_states.pt") + ) + +def main(): + args = get_args() + convert_opt_checkpoint_to_megatron( + opt_checkpoint_path=args.opt_checkpoint_path, + megatron_dump_folder_path=args.megatron_dump_folder_path, + opt_index_path=args.opt_sharded_index_path, + num_proc=args.num_proc + ) + +if __name__ == "__main__": + main() diff --git a/evaluation/results/opt/run_opt_bs_evaluation_125m.slurm b/evaluation/results/opt/run_opt_bs_evaluation_125m.slurm new file mode 100644 index 00000000..bf4afde9 --- /dev/null +++ b/evaluation/results/opt/run_opt_bs_evaluation_125m.slurm @@ -0,0 +1,215 @@ +#!/bin/bash +#SBATCH --job-name=bs-eval-opt-125m +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=10 +#SBATCH --hint=nomultithread +#SBATCH --gres=gpu:1 +#SBATCH --time 20:00:00 +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out +#SBATCH --account=six@v100 +#SBATCH --array=0-69 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +conda activate muennighofflmeval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-125m-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. 
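+# ("Slow" presumably means the pure-Python tokenizer implementation, i.e. loading with AutoTokenizer.from_pretrained(..., use_fast=False); this is an assumption, the flag is not set in this script.)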
+# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=1 +TP_SIZE=1 + +NHIDDEN=768 +NLAYERS=12 +NHEADS=12 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list GEM/web_nlg_en,GEM/web_nlg_en_challenge_test_numbers,GEM/web_nlg_en_challenge_test_scramble,GEM/web_nlg_en_challenge_validation_sample,GEM/web_nlg_ru,GEM/web_nlg_ru_challenge_test_scramble,GEM/web_nlg_ru_challenge_validation_sample,GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05,GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc,GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05,GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc,GEM/wiki_auto_asset_turk_test_asset,GEM/wiki_auto_asset_turk_test_turk,GEM/wiki_lingua_ar,GEM/wiki_lingua_cs,GEM/wiki_lingua_de,GEM/wiki_lingua_en,GEM/wiki_lingua_es,GEM/wiki_lingua_fr,GEM/wiki_lingua_hi,GEM/wiki_lingua_id,GEM/wiki_lingua_it,GEM/wiki_lingua_ja,GEM/wiki_lingua_ko,GEM/wiki_lingua_nl,GEM/wiki_lingua_pt,GEM/wiki_lingua_ru,GEM/wiki_lingua_th,GEM/wiki_lingua_tr,GEM/wiki_lingua_vi,GEM/wiki_lingua_zh,gem_xsum,gem_xsum_challenge_sample,gem_xsum_challenge_test_backtranslation,gem_xsum_challenge_test_bfp_02,gem_xsum_challenge_test_bfp_05,gem_xsum_challenge_test_covid,gem_xsum_challenge_test_nopunc,axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en\ + +TASKS=( +GEM/web_nlg_en +GEM/web_nlg_en_challenge_test_numbers +GEM/web_nlg_en_challenge_test_scramble +GEM/web_nlg_en_challenge_validation_sample +GEM/web_nlg_ru +GEM/web_nlg_ru_challenge_test_scramble +GEM/web_nlg_ru_challenge_validation_sample +GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc +GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05
+GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc +GEM/wiki_auto_asset_turk_test_asset +GEM/wiki_auto_asset_turk_test_turk +GEM/wiki_lingua_ar +GEM/wiki_lingua_cs +GEM/wiki_lingua_de +GEM/wiki_lingua_en +GEM/wiki_lingua_es +GEM/wiki_lingua_fr +GEM/wiki_lingua_hi +GEM/wiki_lingua_id +GEM/wiki_lingua_it +GEM/wiki_lingua_ja +GEM/wiki_lingua_ko +GEM/wiki_lingua_nl +GEM/wiki_lingua_pt +GEM/wiki_lingua_ru +GEM/wiki_lingua_th +GEM/wiki_lingua_tr +GEM/wiki_lingua_vi +GEM/wiki_lingua_zh +gem_xsum +gem_xsum_challenge_sample +gem_xsum_challenge_test_backtranslation +gem_xsum_challenge_test_bfp_02 +gem_xsum_challenge_test_bfp_05 +gem_xsum_challenge_test_covid +gem_xsum_challenge_test_nopunc +axb +axg +boolq +cb +cola +copa +crows_pairs_english +crows_pairs_french +diabla +e2e_nlg_cleaned +mnli +mnli_mismatched +multirc +piaf +qqp +rte +sst +tydiqa_primary +tydiqa_secondary +wic +wsc +wnli +wino_bias_type1_anti +wino_bias_type1_pro +wino_bias_type2_anti +wino_bias_type2_pro +xquad_ar +xquad_en +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as the it doesn't correspond to the number of models we want to evaluate. Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-125m/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/bs_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/bs-eval-harness.log diff --git a/evaluation/results/opt/run_opt_bs_evaluation_175b.slurm b/evaluation/results/opt/run_opt_bs_evaluation_175b.slurm new file mode 100644 index 00000000..097581be --- /dev/null +++ b/evaluation/results/opt/run_opt_bs_evaluation_175b.slurm @@ -0,0 +1,321 @@ +#!/bin/bash +#SBATCH --job-name=bs-eval-opt-175b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=64 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-171 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +conda activate muennighofflmeval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-175b-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. +# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=8 +TP_SIZE=1 + +NHIDDEN=12288 +NLAYERS=96 +NHEADS=96 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list
GEM/web_nlg_en,GEM/web_nlg_en_challenge_test_numbers,GEM/web_nlg_en_challenge_test_scramble,GEM/web_nlg_en_challenge_validation_sample,GEM/web_nlg_ru,GEM/web_nlg_ru_challenge_test_scramble,GEM/web_nlg_ru_challenge_validation_sample,GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05,GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc,GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05,GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc,GEM/wiki_auto_asset_turk_test_asset,GEM/wiki_auto_asset_turk_test_turk,GEM/wiki_lingua_ar,GEM/wiki_lingua_cs,GEM/wiki_lingua_de,GEM/wiki_lingua_en,GEM/wiki_lingua_es,GEM/wiki_lingua_fr,GEM/wiki_lingua_hi,GEM/wiki_lingua_id,GEM/wiki_lingua_it,GEM/wiki_lingua_ja,GEM/wiki_lingua_ko,GEM/wiki_lingua_nl,GEM/wiki_lingua_pt,GEM/wiki_lingua_ru,GEM/wiki_lingua_th,GEM/wiki_lingua_tr,GEM/wiki_lingua_vi,GEM/wiki_lingua_zh,gem_xsum,gem_xsum_challenge_sample,gem_xsum_challenge_test_backtranslation,gem_xsum_challenge_test_bfp_02,gem_xsum_challenge_test_bfp_05,gem_xsum_challenge_test_covid,gem_xsum_challenge_test_nopunc,axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en\ + +TASKS=( +GEM/web_nlg_en +GEM/web_nlg_en_challenge_test_numbers +GEM/web_nlg_en_challenge_test_scramble +GEM/web_nlg_en_challenge_validation_sample +GEM/web_nlg_ru +GEM/web_nlg_ru_challenge_test_scramble +GEM/web_nlg_ru_challenge_validation_sample +GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc +GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc +GEM/wiki_auto_asset_turk_test_asset +GEM/wiki_auto_asset_turk_test_turk +GEM/wiki_lingua_ar +GEM/wiki_lingua_cs +GEM/wiki_lingua_de +GEM/wiki_lingua_en +GEM/wiki_lingua_es +GEM/wiki_lingua_fr +GEM/wiki_lingua_hi +GEM/wiki_lingua_id +GEM/wiki_lingua_it +GEM/wiki_lingua_ja +GEM/wiki_lingua_ko +GEM/wiki_lingua_nl +GEM/wiki_lingua_pt +GEM/wiki_lingua_ru +GEM/wiki_lingua_th +GEM/wiki_lingua_tr +GEM/wiki_lingua_vi +GEM/wiki_lingua_zh +gem_xsum +gem_xsum_challenge_sample +gem_xsum_challenge_test_backtranslation +gem_xsum_challenge_test_bfp_02 +gem_xsum_challenge_test_bfp_05 +gem_xsum_challenge_test_covid +gem_xsum_challenge_test_nopunc +axb +axg +boolq +cb +cola +copa +crows_pairs_english +crows_pairs_french +diabla +e2e_nlg_cleaned +mnli +mnli_mismatched +multirc +piaf +qqp +rte +sst +tydiqa_primary +tydiqa_secondary +wic +wsc +wnli +wino_bias_type1_anti +wino_bias_type1_pro +wino_bias_type2_anti +wino_bias_type2_pro +xquad_ar +xquad_en +gsarti/flores_101_afr +gsarti/flores_101_amh +gsarti/flores_101_ara +gsarti/flores_101_hye +gsarti/flores_101_asm +gsarti/flores_101_ast +gsarti/flores_101_azj +gsarti/flores_101_bel +gsarti/flores_101_ben +gsarti/flores_101_bos +gsarti/flores_101_bul +gsarti/flores_101_mya +gsarti/flores_101_cat +gsarti/flores_101_ceb +gsarti/flores_101_zho_simpl 
+gsarti/flores_101_zho_trad +gsarti/flores_101_hrv +gsarti/flores_101_ces +gsarti/flores_101_dan +gsarti/flores_101_nld +gsarti/flores_101_eng +gsarti/flores_101_est +gsarti/flores_101_tgl +gsarti/flores_101_fin +gsarti/flores_101_fra +gsarti/flores_101_ful +gsarti/flores_101_glg +gsarti/flores_101_lug +gsarti/flores_101_kat +gsarti/flores_101_deu +gsarti/flores_101_ell +gsarti/flores_101_guj +gsarti/flores_101_hau +gsarti/flores_101_heb +gsarti/flores_101_hin +gsarti/flores_101_hun +gsarti/flores_101_isl +gsarti/flores_101_ibo +gsarti/flores_101_ind +gsarti/flores_101_gle +gsarti/flores_101_ita +gsarti/flores_101_jpn +gsarti/flores_101_jav +gsarti/flores_101_kea +gsarti/flores_101_kam +gsarti/flores_101_kan +gsarti/flores_101_kaz +gsarti/flores_101_khm +gsarti/flores_101_kor +gsarti/flores_101_kir +gsarti/flores_101_lao +gsarti/flores_101_lav +gsarti/flores_101_lin +gsarti/flores_101_lit +gsarti/flores_101_luo +gsarti/flores_101_ltz +gsarti/flores_101_mkd +gsarti/flores_101_msa +gsarti/flores_101_mal +gsarti/flores_101_mlt +gsarti/flores_101_mri +gsarti/flores_101_mar +gsarti/flores_101_mon +gsarti/flores_101_npi +gsarti/flores_101_nso +gsarti/flores_101_nob +gsarti/flores_101_nya +gsarti/flores_101_oci +gsarti/flores_101_ory +gsarti/flores_101_orm +gsarti/flores_101_pus +gsarti/flores_101_fas +gsarti/flores_101_pol +gsarti/flores_101_por +gsarti/flores_101_pan +gsarti/flores_101_ron +gsarti/flores_101_rus +gsarti/flores_101_srp +gsarti/flores_101_sna +gsarti/flores_101_snd +gsarti/flores_101_slk +gsarti/flores_101_slv +gsarti/flores_101_som +gsarti/flores_101_ckb +gsarti/flores_101_spa +gsarti/flores_101_swh +gsarti/flores_101_swe +gsarti/flores_101_tgk +gsarti/flores_101_tam +gsarti/flores_101_tel +gsarti/flores_101_tha +gsarti/flores_101_tur +gsarti/flores_101_ukr +gsarti/flores_101_umb +gsarti/flores_101_urd +gsarti/flores_101_uzb +gsarti/flores_101_vie +gsarti/flores_101_cym +gsarti/flores_101_wol +gsarti/flores_101_xho +gsarti/flores_101_yor +gsarti/flores_101_zul +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as the it doesn't correspond to the number of models we want to evaluate. 
Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-175b/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/bs_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/bs-eval-harness.log diff --git a/evaluation/results/opt/run_opt_evaluation_125m.slurm b/evaluation/results/opt/run_opt_evaluation_125m.slurm new file mode 100644 index 00000000..cc657325 --- /dev/null +++ b/evaluation/results/opt/run_opt_evaluation_125m.slurm @@ -0,0 +1,173 @@ +#!/bin/bash +#SBATCH --job-name=eai-eval-opt-125m +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=10 +#SBATCH --hint=nomultithread +#SBATCH --gres=gpu:1 +#SBATCH --time 20:00:00 +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out +#SBATCH --account=six@v100 +#SBATCH --array=0-27 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +# Required in order to load the opt tokenizer +conda activate thomas_lm_eval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-125m-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. 
+# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=1 +TP_SIZE=1 + +NHIDDEN=768 +NLAYERS=12 +NHEADS=12 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ +TASKS=( +arc_challenge +arc_easy +boolq +copa +headqa +hellaswag +lambada +logiqa +mathqa +mc_taco +mrpc +multirc +openbookqa +piqa +prost +pubmedqa +qnli +qqp +race +rte +sciq +sst +triviaqa +webqs +wic +winogrande +wnli +wsc +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as it doesn't correspond to the number of models we want to evaluate.
Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-125m/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/eai_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 8 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/eval-harness.log diff --git a/evaluation/results/opt/run_opt_evaluation_175b.slurm b/evaluation/results/opt/run_opt_evaluation_175b.slurm new file mode 100644 index 00000000..515564cd --- /dev/null +++ b/evaluation/results/opt/run_opt_evaluation_175b.slurm @@ -0,0 +1,182 @@ +#!/bin/bash +#SBATCH --job-name=eai-eval-opt-175b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-32 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +# Required in order to load the opt tokenizer +conda activate thomas_lm_eval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-175b-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. 
+# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=8 +TP_SIZE=1 + +NHIDDEN=12288 +NLAYERS=96 +NHEADS=96 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ +TASKS=( +arc_challenge +arc_easy +boolq +copa +headqa +hellaswag +lambada +logiqa +mathqa +mc_taco +mrpc +multirc +openbookqa +piqa +prost +pubmedqa +qnli +qqp +race +rte +sciq +sst +triviaqa +webqs +wic +winogrande +wnli +wsc +lambada_mt_en +lambada_mt_fr +lambada_mt_de +lambada_mt_it +lambada_mt_es +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as it doesn't correspond to the number of models we want to evaluate. Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-175b/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/eai_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/eval-harness.log
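# Illustrative end-to-end flow for the OPT utilities above (a sketch, not taken from this diff:
# the placeholder paths and the pytorch_model.bin.index.json filename are assumptions):
#
#   python evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py \
#       --opt_checkpoint_path /path/to/hf/opt-175b \
#       --opt_sharded_index_path /path/to/hf/opt-175b/pytorch_model.bin.index.json \
#       --megatron_dump_folder_path /path/to/opt-175b-meg-ds \
#       --num-proc 8
#   sbatch evaluation/results/opt/run_opt_evaluation_175b.slurm
#   python evaluation/results/opt/concatenate_all_results.py \
#       --results-dir $WORK/opt/opt-175b \
#       --concatenate-output-file $WORK/opt/opt-175b/all_results.json
#
# Note that concatenate_all_results.py only picks up files named like
# eai_results_lm-eval_opt-175b-meg-ds_<timestamp>_backup.json (see RESULTS_REGEX), which the
# harness presumably writes alongside the eai_results.json/bs_results.json paths used above.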