diff --git a/evaluation/results/bloom/run_bloom_bs_evaluation_176b.slurm b/evaluation/results/bloom/run_bloom_bs_evaluation_176b.slurm new file mode 100644 index 00000000..5e24f21d --- /dev/null +++ b/evaluation/results/bloom/run_bloom_bs_evaluation_176b.slurm @@ -0,0 +1,305 @@ +#!/bin/bash +#SBATCH --job-name=bs-eval-bloom-176b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-171 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +conda activate muennighofflmeval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step95000 +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed-bloom +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. +# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=bigscience/tokenizer + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy.
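+# (Assumption, not stated in this diff: the -1 placeholders below are overridden by the real model sizes stored in the checkpoint loaded via --load; they exist only so that Megatron's argument parser accepts the command line.)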
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list GEM/web_nlg_en,GEM/web_nlg_en_challenge_test_numbers,GEM/web_nlg_en_challenge_test_scramble,GEM/web_nlg_en_challenge_validation_sample,GEM/web_nlg_ru,GEM/web_nlg_ru_challenge_test_scramble,GEM/web_nlg_ru_challenge_validation_sample,GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05,GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc,GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05,GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc,GEM/wiki_auto_asset_turk_test_asset,GEM/wiki_auto_asset_turk_test_turk,GEM/wiki_lingua_ar,GEM/wiki_lingua_cs,GEM/wiki_lingua_de,GEM/wiki_lingua_en,GEM/wiki_lingua_es,GEM/wiki_lingua_fr,GEM/wiki_lingua_hi,GEM/wiki_lingua_id,GEM/wiki_lingua_it,GEM/wiki_lingua_ja,GEM/wiki_lingua_ko,GEM/wiki_lingua_nl,GEM/wiki_lingua_pt,GEM/wiki_lingua_ru,GEM/wiki_lingua_th,GEM/wiki_lingua_tr,GEM/wiki_lingua_vi,GEM/wiki_lingua_zh,gem_xsum,gem_xsum_challenge_sample,gem_xsum_challenge_test_backtranslation,gem_xsum_challenge_test_bfp_02,gem_xsum_challenge_test_bfp_05,gem_xsum_challenge_test_covid,gem_xsum_challenge_test_nopunc,axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en\ + +TASKS=( +GEM/web_nlg_en +GEM/web_nlg_en_challenge_test_numbers +GEM/web_nlg_en_challenge_test_scramble +GEM/web_nlg_en_challenge_validation_sample +GEM/web_nlg_ru +GEM/web_nlg_ru_challenge_test_scramble +GEM/web_nlg_ru_challenge_validation_sample +GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc +GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc +GEM/wiki_auto_asset_turk_test_asset +GEM/wiki_auto_asset_turk_test_turk +GEM/wiki_lingua_ar +GEM/wiki_lingua_cs +GEM/wiki_lingua_de +GEM/wiki_lingua_en +GEM/wiki_lingua_es +GEM/wiki_lingua_fr +GEM/wiki_lingua_hi +GEM/wiki_lingua_id +GEM/wiki_lingua_it +GEM/wiki_lingua_ja +GEM/wiki_lingua_ko +GEM/wiki_lingua_nl +GEM/wiki_lingua_pt +GEM/wiki_lingua_ru +GEM/wiki_lingua_th +GEM/wiki_lingua_tr +GEM/wiki_lingua_vi +GEM/wiki_lingua_zh +gem_xsum +gem_xsum_challenge_sample +gem_xsum_challenge_test_backtranslation +gem_xsum_challenge_test_bfp_02 +gem_xsum_challenge_test_bfp_05 +gem_xsum_challenge_test_covid +gem_xsum_challenge_test_nopunc +axb +axg +boolq +cb +cola +copa +crows_pairs_english +crows_pairs_french +diabla
+e2e_nlg_cleaned +mnli +mnli_mismatched +multirc +piaf +qqp +rte +sst +tydiqa_primary +tydiqa_secondary +wic +wsc +wnli +wino_bias_type1_anti +wino_bias_type1_pro +wino_bias_type2_anti +wino_bias_type2_pro +xquad_ar +xquad_en +gsarti/flores_101_afr +gsarti/flores_101_amh +gsarti/flores_101_ara +gsarti/flores_101_hye +gsarti/flores_101_asm +gsarti/flores_101_ast +gsarti/flores_101_azj +gsarti/flores_101_bel +gsarti/flores_101_ben +gsarti/flores_101_bos +gsarti/flores_101_bul +gsarti/flores_101_mya +gsarti/flores_101_cat +gsarti/flores_101_ceb +gsarti/flores_101_zho_simpl +gsarti/flores_101_zho_trad +gsarti/flores_101_hrv +gsarti/flores_101_ces +gsarti/flores_101_dan +gsarti/flores_101_nld +gsarti/flores_101_eng +gsarti/flores_101_est +gsarti/flores_101_tgl +gsarti/flores_101_fin +gsarti/flores_101_fra +gsarti/flores_101_ful +gsarti/flores_101_glg +gsarti/flores_101_lug +gsarti/flores_101_kat +gsarti/flores_101_deu +gsarti/flores_101_ell +gsarti/flores_101_guj +gsarti/flores_101_hau +gsarti/flores_101_heb +gsarti/flores_101_hin +gsarti/flores_101_hun +gsarti/flores_101_isl +gsarti/flores_101_ibo +gsarti/flores_101_ind +gsarti/flores_101_gle +gsarti/flores_101_ita +gsarti/flores_101_jpn +gsarti/flores_101_jav +gsarti/flores_101_kea +gsarti/flores_101_kam +gsarti/flores_101_kan +gsarti/flores_101_kaz +gsarti/flores_101_khm +gsarti/flores_101_kor +gsarti/flores_101_kir +gsarti/flores_101_lao +gsarti/flores_101_lav +gsarti/flores_101_lin +gsarti/flores_101_lit +gsarti/flores_101_luo +gsarti/flores_101_ltz +gsarti/flores_101_mkd +gsarti/flores_101_msa +gsarti/flores_101_mal +gsarti/flores_101_mlt +gsarti/flores_101_mri +gsarti/flores_101_mar +gsarti/flores_101_mon +gsarti/flores_101_npi +gsarti/flores_101_nso +gsarti/flores_101_nob +gsarti/flores_101_nya +gsarti/flores_101_oci +gsarti/flores_101_ory +gsarti/flores_101_orm +gsarti/flores_101_pus +gsarti/flores_101_fas +gsarti/flores_101_pol +gsarti/flores_101_por +gsarti/flores_101_pan +gsarti/flores_101_ron +gsarti/flores_101_rus +gsarti/flores_101_srp +gsarti/flores_101_sna +gsarti/flores_101_snd +gsarti/flores_101_slk +gsarti/flores_101_slv +gsarti/flores_101_som +gsarti/flores_101_ckb +gsarti/flores_101_spa +gsarti/flores_101_swh +gsarti/flores_101_swe +gsarti/flores_101_tgk +gsarti/flores_101_tam +gsarti/flores_101_tel +gsarti/flores_101_tha +gsarti/flores_101_tur +gsarti/flores_101_ukr +gsarti/flores_101_umb +gsarti/flores_101_urd +gsarti/flores_101_uzb +gsarti/flores_101_vie +gsarti/flores_101_cym +gsarti/flores_101_wol +gsarti/flores_101_xho +gsarti/flores_101_yor +gsarti/flores_101_zul +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as the it doesn't correspond to the number of models we want to evaluate. 
Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +BLOOM_FOLDER=$WORK/bloom/bloom-176b/$TASK +mkdir -p $BLOOM_FOLDER + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $BLOOM_FOLDER/bs_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $BLOOM_FOLDER/bs-eval-harness.log diff --git a/evaluation/results/bloom/run_bloom_evaluation_175b.slurm b/evaluation/results/bloom/run_bloom_evaluation_175b.slurm new file mode 100644 index 00000000..679ae7d2 --- /dev/null +++ b/evaluation/results/bloom/run_bloom_evaluation_175b.slurm @@ -0,0 +1,166 @@ +#!/bin/bash +#SBATCH --job-name=eai-eval-bloom-176b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-32 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +# Required in order to load the opt tokenizer +conda activate thomas_lm_eval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step95000 +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed-bloom +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. +# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=bigscience/tokenizer + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. 
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ +TASKS=( +arc_challenge +arc_easy +boolq +copa +headqa +hellaswag +lambada +logiqa +mathqa +mc_taco +mrpc +multirc +openbookqa +piqa +prost +pubmedqa +qnli +qqp +race +rte +sciq +sst +triviaqa +webqs +wic +winogrande +wnli +wsc +lambada_mt_en +lambada_mt_fr +lambada_mt_de +lambada_mt_it +lambada_mt_es +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as it doesn't correspond to the number of models we want to evaluate. Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +BLOOM_FOLDER=$WORK/bloom/bloom-176b/$TASK +mkdir -p $BLOOM_FOLDER + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $BLOOM_FOLDER/eai_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $BLOOM_FOLDER/eval-harness.log
diff --git a/evaluation/results/opt/README.md b/evaluation/results/opt/README.md new file mode 100644 index 00000000..5fb71237 --- /dev/null +++ b/evaluation/results/opt/README.md @@ -0,0 +1 @@ +Utilities required to run OPT evaluation
diff --git a/evaluation/results/opt/concatenate_all_results.py b/evaluation/results/opt/concatenate_all_results.py new file mode 100644 index 00000000..4e8fe616 --- /dev/null +++ b/evaluation/results/opt/concatenate_all_results.py @@ -0,0 +1,99 @@ +import argparse +import json +import re +from pathlib import Path +from re import Pattern +from typing import List, Dict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--results-dir", required=True, type=Path, help="Directory containing the per-task result files") + parser.add_argument("--concatenate-output-file", required=True, type=Path, help="Path to store the final output file") + return parser.parse_args() + +MODEL = "opt-175b-meg-ds" +# MODEL = "global_step95000" +RESULTS_REGEX = re.compile(rf"(eai|bs)_results_lm-eval_{MODEL}_(\d{{4}}-\d{{2}}-\d{{2}}-\d{{2}}-\d{{2}}-\d{{2}})_backup\.json") +def get_all_files_that_match_results_in_folder(root_folder: Path) -> List[Path]: + json_files = [] + for folder in root_folder.iterdir(): + if folder.is_dir(): + json_files += get_all_files_that_match_results_in_folder(folder) + else: + # it's actually a file + file = folder + + match = RESULTS_REGEX.match(file.name) + + if match is None: + continue + else: + json_files.append(file) + return json_files + +def sort_dict(dictionary: Dict) -> Dict: + results = {} + + for key, value in sorted(dictionary.items(), key=lambda item: item[0]): + new_value = value + + if isinstance(value, dict): + new_value = sort_dict(new_value) + elif isinstance(value, list): + new_value = sorted(value) + + results[key] = new_value + + return results + +def main(): + args = get_args() + + # Get all json files + json_files = get_all_files_that_match_results_in_folder(args.results_dir) + + # Merge all json files + final_result = { + "results": {}, + "versions": {} + } + for file in json_files: + with open(file, "r") as fi: + task_result = json.load(fi) + + match = RESULTS_REGEX.match(file.name) + assert match is not None + prefix = match.group(1) + datetime_string = match.group(2) + + if prefix == "eai": + results_key = "results" + elif prefix == "bs": + results_key = "table_results" + else: + raise ValueError(f"Unsupported key: {prefix}") + + for key, value in task_result[results_key].items(): + if key not in final_result["results"]: + final_result["results"][key] = { + datetime_string: value + } + else: + assert datetime_string not in final_result["results"][key] + final_result["results"][key][datetime_string] = value + + for key, value in task_result["versions"].items(): + final_result["versions"][key] = value + + # We sort dict, better for serialization + final_result = sort_dict(final_result) + + # Save result + with open(args.concatenate_output_file, "w") as fo: + json.dump(final_result, fo, indent=2) + +if __name__ == "__main__": + main()
diff --git a/evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py b/evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py new file mode 100644 index 00000000..b49dae9e --- /dev/null +++ b/evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py @@ -0,0 +1,247 @@ +import argparse +import json +import re, os +from functools import partial +from multiprocessing import Pool +from typing import List, Optional, Dict + +import torch +from tqdm import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--opt_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the transformers OPT checkpoint.", + ) + parser.add_argument( + "--opt_sharded_index_path", + default=None, + type=str, + required=True, + help="Path to the transformers OPT sharded checkpoint index (metadata) file.", + ) + parser.add_argument( + "--megatron_dump_folder_path", default=None, type=str, required=True, + help="Path to the output Megatron-DS model."
+ ) + parser.add_argument( + "--num-proc", default=1, type=int, + ) + return parser.parse_args() + + +def compute_meg_ds_weight_names(num_layers: int): + return { + "layer_01-model_00-model_states.pt": [ + "word_embeddings.weight", + "position_embeddings.weight", + ], + **{ + f"layer_{str(layer_id).zfill(2)}-model_00-model_states.pt": [ + "input_layernorm.weight", + "input_layernorm.bias", + "self_attention.query_key_value.weight", + "self_attention.query_key_value.bias", + "self_attention.dense.weight", + "self_attention.dense.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "mlp.dense_4h_to_h.weight", + "mlp.dense_4h_to_h.bias", + ] + for layer_id in range(3, num_layers + 3) + }, + f"layer_{str(num_layers + 4).zfill(2)}-model_00-model_states.pt": [ + "weight", + "bias" + ] + } + +NON_TRANSFORMERS_BLOCK_WEIGHTS = { + "word_embeddings.weight": "decoder.embed_tokens.weight", + "position_embeddings.weight": "decoder.embed_positions.weight", + "weight": "decoder.final_layer_norm.weight", + "bias": "decoder.final_layer_norm.bias" +} +TRANSFORMERS_BLOCK_WEIGHTS = { + "input_layernorm.weight": ["self_attn_layer_norm.weight"], + "input_layernorm.bias": ["self_attn_layer_norm.bias"], + "self_attention.query_key_value.weight": ["self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"], + "self_attention.query_key_value.bias": ["self_attn.q_proj.bias", "self_attn.k_proj.bias", "self_attn.v_proj.bias"], + "self_attention.dense.weight": ["self_attn.out_proj.weight"], + "self_attention.dense.bias": ["self_attn.out_proj.bias"], + "post_attention_layernorm.weight": ["final_layer_norm.weight"], + "post_attention_layernorm.bias": ["final_layer_norm.bias"], + "mlp.dense_h_to_4h.weight": ["fc1.weight"], + "mlp.dense_h_to_4h.bias": ["fc1.bias"], + "mlp.dense_4h_to_h.weight": ["fc2.weight"], + "mlp.dense_4h_to_h.bias": ["fc2.bias"] +} +def get_transformers_weight_names(meg_ds_weight: str, layer_id: Optional[int]) -> List[str]: + if layer_id is None: + return [NON_TRANSFORMERS_BLOCK_WEIGHTS[meg_ds_weight]] + else: + return [f"decoder.layers.{layer_id}.{tfrs_block_name}" for tfrs_block_name in TRANSFORMERS_BLOCK_WEIGHTS[meg_ds_weight]] + +def get_layer_id(meg_ds_filename: str, total_num_layers: int) -> Optional[int]: + layer_id = int(re.match(r"layer_(\d*)-model_00-model_states.pt", meg_ds_filename)[1]) - 3 + + if layer_id < 0: + return None + + if layer_id >= total_num_layers: + return None + + return layer_id + +def merge_layers(layers, num_heads: int, hidden_size: int): + if len(layers) == 1: + return layers[0] + else: + # We merge QKV + if len(layers[0].shape) == 1: + # bias + return torch.reshape( + torch.cat( + [ + layer.view(num_heads, 1, hidden_size // num_heads) + for layer in layers + ], + dim=1 + ), + (3 * hidden_size, ) + ) + else: + #weight + return torch.reshape( + torch.cat( + [ + layer.view(num_heads, 1, hidden_size // num_heads, hidden_size) + for layer in layers + ], + dim=1 + ), + (3 * hidden_size, hidden_size) + ) + +def find_transformers_weights_and_save_meg_ds_weights( + meg_ds_filename: str, + meg_ds_weight_names: List[str], + opt_checkpoint_path: str, + megatron_dump_folder_path:str, + total_num_layers: int, + num_heads: int, + hidden_size: int, + trfs_weight_map: Dict[str, str] +): + layer_id = get_layer_id(meg_ds_filename, total_num_layers=total_num_layers) + trfs_weight_namess = {meg_ds_weight_name: get_transformers_weight_names(meg_ds_weight_name, layer_id=layer_id) for 
meg_ds_weight_name in meg_ds_weight_names} + + # Find the path they live in. + trfs_filenames = set(trfs_weight_map[trfs_weight_name] for trfs_weight_names in trfs_weight_namess.values() for trfs_weight_name in trfs_weight_names) + trfs_filename_to_weights = { + trfs_filename: torch.load(os.path.join(opt_checkpoint_path, trfs_filename), map_location="cpu") + for trfs_filename in trfs_filenames + } + + # query those weights + result = { + meg_ds_weight_name: [ + trfs_filename_to_weights[trfs_weight_map[tfrs_weight_name]][tfrs_weight_name] + for tfrs_weight_name in tfrs_weight_names + ] + for meg_ds_weight_name, tfrs_weight_names in trfs_weight_namess.items() + } + + # possibly concatenate + save_path = os.path.join(megatron_dump_folder_path, meg_ds_filename) + with open(save_path, "wb") as fo: + # qkv are mixed s.t. [q1 k1 v1 q2 k2 v2 ...] with (1,2..) being head_id + torch.save( + { + key: merge_layers(values, num_heads=num_heads, hidden_size=hidden_size) + for key, values in result.items() + }, + fo + ) + + +def convert_opt_checkpoint_to_megatron( + opt_checkpoint_path: str, + megatron_dump_folder_path: str, + opt_index_path: str, + num_proc: int +): + # Get total number of layers + with open(opt_index_path, "r") as fi: + index_file = json.load(fi)["weight_map"] + # Compute total amount of layers + with open(os.path.join(opt_checkpoint_path, "config.json"), "r") as fi: + config = json.load(fi) + total_amount_of_layers = config["num_hidden_layers"] + num_heads = config["num_attention_heads"] + hidden_size = config["hidden_size"] + + # Given the total number of layers we can compute exactly each meg_ds params we need to find. + meg_ds_filename_to_meg_ds_weights = compute_meg_ds_weight_names(total_amount_of_layers) + + # Given the needed weights we can query them from the transformers checkpoint + # We have to be smart about it and load a bin file once and get everything. 
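+ # Illustrative example of the QKV merge done by merge_layers above (not executed here): with + # num_heads=2 and hidden_size=4, the q/k/v biases (each of shape (4,)) are viewed as (2, 1, 2), + # concatenated on dim=1 into (2, 3, 2) and reshaped to (12,), i.e. the head-interleaved layout + # [q_h1 k_h1 v_h1 q_h2 k_h2 v_h2] described in the comment below.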
+ if num_proc == 1: + for meg_ds_filename, meg_ds_weight_names in tqdm(meg_ds_filename_to_meg_ds_weights.items()): + find_transformers_weights_and_save_meg_ds_weights( + meg_ds_filename=meg_ds_filename, + meg_ds_weight_names=meg_ds_weight_names, + opt_checkpoint_path=opt_checkpoint_path, + megatron_dump_folder_path=megatron_dump_folder_path, + total_num_layers=total_amount_of_layers, + trfs_weight_map=index_file, + num_heads=num_heads, + hidden_size=hidden_size + ) + else: + with Pool(num_proc) as pool: + pool.starmap( + partial( + find_transformers_weights_and_save_meg_ds_weights, + opt_checkpoint_path=opt_checkpoint_path, + megatron_dump_folder_path=megatron_dump_folder_path, + total_num_layers=total_amount_of_layers, + trfs_weight_map=index_file, + num_heads=num_heads, + hidden_size=hidden_size + ), + tqdm(meg_ds_filename_to_meg_ds_weights.items()) + ) + + # Create dummy mp_rank_00_model_states.pt + torch.save( + { + "mp_world_size": 1, + "module": None, + "dp_world_size": 1, + "checkpoint_version": 3, + "iteration": 0 + }, + os.path.join(megatron_dump_folder_path, "mp_rank_00_model_states.pt") + ) + +def main(): + args = get_args() + convert_opt_checkpoint_to_megatron( + opt_checkpoint_path=args.opt_checkpoint_path, + megatron_dump_folder_path=args.megatron_dump_folder_path, + opt_index_path=args.opt_sharded_index_path, + num_proc=args.num_proc + ) + +if __name__ == "__main__": + main() diff --git a/evaluation/results/opt/run_opt_bs_evaluation_125m.slurm b/evaluation/results/opt/run_opt_bs_evaluation_125m.slurm new file mode 100644 index 00000000..bf4afde9 --- /dev/null +++ b/evaluation/results/opt/run_opt_bs_evaluation_125m.slurm @@ -0,0 +1,215 @@ +#!/bin/bash +#SBATCH --job-name=bs-eval-opt-125m +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=10 +#SBATCH --hint=nomultithread +#SBATCH --gres=gpu:1 +#SBATCH --time 20:00:00 +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out +#SBATCH --account=six@v100 +#SBATCH --array=0-69 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +conda activate muennighofflmeval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-125m-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. 
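+# ("Slow" presumably means the pure-Python tokenizer implementation, i.e. loading with AutoTokenizer.from_pretrained(..., use_fast=False); this is an assumption, the flag is not set in this script.)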
+# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=1 +TP_SIZE=1 + +NHIDDEN=768 +NLAYERS=12 +NHEADS=12 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list GEM/web_nlg_en,GEM/web_nlg_en_challenge_test_numbers,GEM/web_nlg_en_challenge_test_scramble,GEM/web_nlg_en_challenge_validation_sample,GEM/web_nlg_ru,GEM/web_nlg_ru_challenge_test_scramble,GEM/web_nlg_ru_challenge_validation_sample,GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05,GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc,GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05,GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc,GEM/wiki_auto_asset_turk_test_asset,GEM/wiki_auto_asset_turk_test_turk,GEM/wiki_lingua_ar,GEM/wiki_lingua_cs,GEM/wiki_lingua_de,GEM/wiki_lingua_en,GEM/wiki_lingua_es,GEM/wiki_lingua_fr,GEM/wiki_lingua_hi,GEM/wiki_lingua_id,GEM/wiki_lingua_it,GEM/wiki_lingua_ja,GEM/wiki_lingua_ko,GEM/wiki_lingua_nl,GEM/wiki_lingua_pt,GEM/wiki_lingua_ru,GEM/wiki_lingua_th,GEM/wiki_lingua_tr,GEM/wiki_lingua_vi,GEM/wiki_lingua_zh,gem_xsum,gem_xsum_challenge_sample,gem_xsum_challenge_test_backtranslation,gem_xsum_challenge_test_bfp_02,gem_xsum_challenge_test_bfp_05,gem_xsum_challenge_test_covid,gem_xsum_challenge_test_nopunc,axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en\ + +TASKS=( +GEM/web_nlg_en +GEM/web_nlg_en_challenge_test_numbers +GEM/web_nlg_en_challenge_test_scramble +GEM/web_nlg_en_challenge_validation_sample +GEM/web_nlg_ru +GEM/web_nlg_ru_challenge_test_scramble +GEM/web_nlg_ru_challenge_validation_sample +GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc +GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05
+GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc +GEM/wiki_auto_asset_turk_test_asset +GEM/wiki_auto_asset_turk_test_turk +GEM/wiki_lingua_ar +GEM/wiki_lingua_cs +GEM/wiki_lingua_de +GEM/wiki_lingua_en +GEM/wiki_lingua_es +GEM/wiki_lingua_fr +GEM/wiki_lingua_hi +GEM/wiki_lingua_id +GEM/wiki_lingua_it +GEM/wiki_lingua_ja +GEM/wiki_lingua_ko +GEM/wiki_lingua_nl +GEM/wiki_lingua_pt +GEM/wiki_lingua_ru +GEM/wiki_lingua_th +GEM/wiki_lingua_tr +GEM/wiki_lingua_vi +GEM/wiki_lingua_zh +gem_xsum +gem_xsum_challenge_sample +gem_xsum_challenge_test_backtranslation +gem_xsum_challenge_test_bfp_02 +gem_xsum_challenge_test_bfp_05 +gem_xsum_challenge_test_covid +gem_xsum_challenge_test_nopunc +axb +axg +boolq +cb +cola +copa +crows_pairs_english +crows_pairs_french +diabla +e2e_nlg_cleaned +mnli +mnli_mismatched +multirc +piaf +qqp +rte +sst +tydiqa_primary +tydiqa_secondary +wic +wsc +wnli +wino_bias_type1_anti +wino_bias_type1_pro +wino_bias_type2_anti +wino_bias_type2_pro +xquad_ar +xquad_en +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as the it doesn't correspond to the number of models we want to evaluate. Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-125m/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/bs_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/bs-eval-harness.log diff --git a/evaluation/results/opt/run_opt_bs_evaluation_175b.slurm b/evaluation/results/opt/run_opt_bs_evaluation_175b.slurm new file mode 100644 index 00000000..097581be --- /dev/null +++ b/evaluation/results/opt/run_opt_bs_evaluation_175b.slurm @@ -0,0 +1,321 @@ +#!/bin/bash +#SBATCH --job-name=bs-eval-opt-175b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=64 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-171 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +conda activate muennighofflmeval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-175b-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. +# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=8 +TP_SIZE=1 + +NHIDDEN=12288 +NLAYERS=96 +NHEADS=96 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list
GEM/web_nlg_en,GEM/web_nlg_en_challenge_test_numbers,GEM/web_nlg_en_challenge_test_scramble,GEM/web_nlg_en_challenge_validation_sample,GEM/web_nlg_ru,GEM/web_nlg_ru_challenge_test_scramble,GEM/web_nlg_ru_challenge_validation_sample,GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02,GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05,GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc,GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02,GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05,GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc,GEM/wiki_auto_asset_turk_test_asset,GEM/wiki_auto_asset_turk_test_turk,GEM/wiki_lingua_ar,GEM/wiki_lingua_cs,GEM/wiki_lingua_de,GEM/wiki_lingua_en,GEM/wiki_lingua_es,GEM/wiki_lingua_fr,GEM/wiki_lingua_hi,GEM/wiki_lingua_id,GEM/wiki_lingua_it,GEM/wiki_lingua_ja,GEM/wiki_lingua_ko,GEM/wiki_lingua_nl,GEM/wiki_lingua_pt,GEM/wiki_lingua_ru,GEM/wiki_lingua_th,GEM/wiki_lingua_tr,GEM/wiki_lingua_vi,GEM/wiki_lingua_zh,gem_xsum,gem_xsum_challenge_sample,gem_xsum_challenge_test_backtranslation,gem_xsum_challenge_test_bfp_02,gem_xsum_challenge_test_bfp_05,gem_xsum_challenge_test_covid,gem_xsum_challenge_test_nopunc,axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en\ + +TASKS=( +GEM/web_nlg_en +GEM/web_nlg_en_challenge_test_numbers +GEM/web_nlg_en_challenge_test_scramble +GEM/web_nlg_en_challenge_validation_sample +GEM/web_nlg_ru +GEM/web_nlg_ru_challenge_test_scramble +GEM/web_nlg_ru_challenge_validation_sample +GEM/wiki_auto_asset_turk_challenge_test_asset_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_asset_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_asset_nopunc +GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp02 +GEM/wiki_auto_asset_turk_challenge_test_turk_bfp05 +GEM/wiki_auto_asset_turk_challenge_test_turk_nopunc +GEM/wiki_auto_asset_turk_test_asset +GEM/wiki_auto_asset_turk_test_turk +GEM/wiki_lingua_ar +GEM/wiki_lingua_cs +GEM/wiki_lingua_de +GEM/wiki_lingua_en +GEM/wiki_lingua_es +GEM/wiki_lingua_fr +GEM/wiki_lingua_hi +GEM/wiki_lingua_id +GEM/wiki_lingua_it +GEM/wiki_lingua_ja +GEM/wiki_lingua_ko +GEM/wiki_lingua_nl +GEM/wiki_lingua_pt +GEM/wiki_lingua_ru +GEM/wiki_lingua_th +GEM/wiki_lingua_tr +GEM/wiki_lingua_vi +GEM/wiki_lingua_zh +gem_xsum +gem_xsum_challenge_sample +gem_xsum_challenge_test_backtranslation +gem_xsum_challenge_test_bfp_02 +gem_xsum_challenge_test_bfp_05 +gem_xsum_challenge_test_covid +gem_xsum_challenge_test_nopunc +axb +axg +boolq +cb +cola +copa +crows_pairs_english +crows_pairs_french +diabla +e2e_nlg_cleaned +mnli +mnli_mismatched +multirc +piaf +qqp +rte +sst +tydiqa_primary +tydiqa_secondary +wic +wsc +wnli +wino_bias_type1_anti +wino_bias_type1_pro +wino_bias_type2_anti +wino_bias_type2_pro +xquad_ar +xquad_en +gsarti/flores_101_afr +gsarti/flores_101_amh +gsarti/flores_101_ara +gsarti/flores_101_hye +gsarti/flores_101_asm +gsarti/flores_101_ast +gsarti/flores_101_azj +gsarti/flores_101_bel +gsarti/flores_101_ben +gsarti/flores_101_bos +gsarti/flores_101_bul +gsarti/flores_101_mya +gsarti/flores_101_cat +gsarti/flores_101_ceb +gsarti/flores_101_zho_simpl 
+gsarti/flores_101_zho_trad +gsarti/flores_101_hrv +gsarti/flores_101_ces +gsarti/flores_101_dan +gsarti/flores_101_nld +gsarti/flores_101_eng +gsarti/flores_101_est +gsarti/flores_101_tgl +gsarti/flores_101_fin +gsarti/flores_101_fra +gsarti/flores_101_ful +gsarti/flores_101_glg +gsarti/flores_101_lug +gsarti/flores_101_kat +gsarti/flores_101_deu +gsarti/flores_101_ell +gsarti/flores_101_guj +gsarti/flores_101_hau +gsarti/flores_101_heb +gsarti/flores_101_hin +gsarti/flores_101_hun +gsarti/flores_101_isl +gsarti/flores_101_ibo +gsarti/flores_101_ind +gsarti/flores_101_gle +gsarti/flores_101_ita +gsarti/flores_101_jpn +gsarti/flores_101_jav +gsarti/flores_101_kea +gsarti/flores_101_kam +gsarti/flores_101_kan +gsarti/flores_101_kaz +gsarti/flores_101_khm +gsarti/flores_101_kor +gsarti/flores_101_kir +gsarti/flores_101_lao +gsarti/flores_101_lav +gsarti/flores_101_lin +gsarti/flores_101_lit +gsarti/flores_101_luo +gsarti/flores_101_ltz +gsarti/flores_101_mkd +gsarti/flores_101_msa +gsarti/flores_101_mal +gsarti/flores_101_mlt +gsarti/flores_101_mri +gsarti/flores_101_mar +gsarti/flores_101_mon +gsarti/flores_101_npi +gsarti/flores_101_nso +gsarti/flores_101_nob +gsarti/flores_101_nya +gsarti/flores_101_oci +gsarti/flores_101_ory +gsarti/flores_101_orm +gsarti/flores_101_pus +gsarti/flores_101_fas +gsarti/flores_101_pol +gsarti/flores_101_por +gsarti/flores_101_pan +gsarti/flores_101_ron +gsarti/flores_101_rus +gsarti/flores_101_srp +gsarti/flores_101_sna +gsarti/flores_101_snd +gsarti/flores_101_slk +gsarti/flores_101_slv +gsarti/flores_101_som +gsarti/flores_101_ckb +gsarti/flores_101_spa +gsarti/flores_101_swh +gsarti/flores_101_swe +gsarti/flores_101_tgk +gsarti/flores_101_tam +gsarti/flores_101_tel +gsarti/flores_101_tha +gsarti/flores_101_tur +gsarti/flores_101_ukr +gsarti/flores_101_umb +gsarti/flores_101_urd +gsarti/flores_101_uzb +gsarti/flores_101_vie +gsarti/flores_101_cym +gsarti/flores_101_wol +gsarti/flores_101_xho +gsarti/flores_101_yor +gsarti/flores_101_zul +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as the it doesn't correspond to the number of models we want to evaluate. 
Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-175b/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/bs_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/bs-eval-harness.log diff --git a/evaluation/results/opt/run_opt_evaluation_125m.slurm b/evaluation/results/opt/run_opt_evaluation_125m.slurm new file mode 100644 index 00000000..cc657325 --- /dev/null +++ b/evaluation/results/opt/run_opt_evaluation_125m.slurm @@ -0,0 +1,173 @@ +#!/bin/bash +#SBATCH --job-name=eai-eval-opt-125m +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=10 +#SBATCH --hint=nomultithread +#SBATCH --gres=gpu:1 +#SBATCH --time 20:00:00 +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out +#SBATCH --account=six@v100 +#SBATCH --array=0-27 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +# Required in order to load the opt tokenizer +conda activate thomas_lm_eval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-125m-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. 
+# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=1 +TP_SIZE=1 + +NHIDDEN=768 +NLAYERS=12 +NHEADS=12 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ +TASKS=( +arc_challenge +arc_easy +boolq +copa +headqa +hellaswag +lambada +logiqa +mathqa +mc_taco +mrpc +multirc +openbookqa +piqa +prost +pubmedqa +qnli +qqp +race +rte +sciq +sst +triviaqa +webqs +wic +winogrande +wnli +wsc +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as it doesn't correspond to the number of models we want to evaluate.
Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-125m/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/eai_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 8 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/eval-harness.log diff --git a/evaluation/results/opt/run_opt_evaluation_175b.slurm b/evaluation/results/opt/run_opt_evaluation_175b.slurm new file mode 100644 index 00000000..515564cd --- /dev/null +++ b/evaluation/results/opt/run_opt_evaluation_175b.slurm @@ -0,0 +1,182 @@ +#!/bin/bash +#SBATCH --job-name=eai-eval-opt-175b +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfswork/rech/six/uty16tp/logs/%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --array=0-32 + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 +# Required in order to load the opt tokenizer +conda activate thomas_lm_eval + +echo "START TIME: $(date)" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/opt/opt-175b-meg-ds +MEGATRON_DEEPSPEED_REPO=/gpfswork/rech/six/uty16tp/code/big_science/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +# Make sure you use the slow version of the tokenizer. 
+# Same tokenizer for 125m and 175b +TOKENIZER_NAME_OR_PATH=/gpfsscratch/rech/six/commun/opt/opt-125m + +PP_SIZE=8 +TP_SIZE=1 + +NHIDDEN=12288 +NLAYERS=96 +NHEADS=96 +SEQ_LEN=2048 +MAX_POSITION_EMBEDDINGS=2050 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +MEGATRON_REQUIRED_ARGS=" + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --fp16 \ + --relu \ + --seed 42 \ + --pad-vocab-size-to 50272 \ + --make-vocab-size-divisible-by 1\ + --no-bias-gelu-fusion\ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ +TASKS=( +arc_challenge +arc_easy +boolq +copa +headqa +hellaswag +lambada +logiqa +mathqa +mc_taco +mrpc +multirc +openbookqa +piqa +prost +pubmedqa +qnli +qqp +race +rte +sciq +sst +triviaqa +webqs +wic +winogrande +wnli +wsc +lambada_mt_en +lambada_mt_fr +lambada_mt_de +lambada_mt_it +lambada_mt_es +) + +#if [ "${#TASKS[@]}" -ne "$SLURM_ARRAY_TASK_COUNT" ]; +# then +# echo "Please update the array size as it doesn't correspond to the number of models we want to evaluate. Array size: $SLURM_ARRAY_TASK_COUNT, number of models: ${#TASKS[@]}" +# exit 1 +#fi +TASK=${TASKS[$SLURM_ARRAY_TASK_ID]} + +OPT_FOLDER=$WORK/opt/opt-175b/$TASK +mkdir -p $OPT_FOLDER + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $OPT_FOLDER/eai_results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --task_list $TASK\ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 16 \ + --offloadearly \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $OPT_FOLDER/eval-harness.log
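# Illustrative end-to-end flow for the OPT utilities above (a sketch, not taken from this diff:
# the placeholder paths and the pytorch_model.bin.index.json filename are assumptions):
#
#   python evaluation/results/opt/convert_transformers_checkpoint_to_meg_ds.py \
#       --opt_checkpoint_path /path/to/hf/opt-175b \
#       --opt_sharded_index_path /path/to/hf/opt-175b/pytorch_model.bin.index.json \
#       --megatron_dump_folder_path /path/to/opt-175b-meg-ds \
#       --num-proc 8
#   sbatch evaluation/results/opt/run_opt_evaluation_175b.slurm
#   python evaluation/results/opt/concatenate_all_results.py \
#       --results-dir $WORK/opt/opt-175b \
#       --concatenate-output-file $WORK/opt/opt-175b/all_results.json
#
# Note that concatenate_all_results.py only picks up files named like
# eai_results_lm-eval_opt-175b-meg-ds_<timestamp>_backup.json (see RESULTS_REGEX), which the
# harness presumably writes alongside the eai_results.json/bs_results.json paths used above.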