Add nemo to mcore GPT conversion script (#7730)
* add conversion script

Signed-off-by: Chen Cui <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove references to 'ckpt'

Signed-off-by: Chen Cui <[email protected]>

* add one more sanity check to make sure there are no unexpected keys in state dict

Signed-off-by: Chen Cui <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* make cpu loading work

Signed-off-by: Chen Cui <[email protected]>

* make script work for llama2 models

Signed-off-by: Chen Cui <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* address code check

Signed-off-by: Chen Cui <[email protected]>

* remove trainer precision (was for old sanity check)

Signed-off-by: Chen Cui <[email protected]>

* fix script for llama2 model

Signed-off-by: Chen Cui <[email protected]>

* remove commented code

Signed-off-by: Chen Cui <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Chen Cui <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Harper <[email protected]>
3 people committed Oct 23, 2023
1 parent 4523562 commit fb6fb2f
scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py (271 additions, 0 deletions)
@@ -0,0 +1,271 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from argparse import ArgumentParser
from collections import OrderedDict

import torch
from omegaconf import OmegaConf, open_dict
from pytorch_lightning.trainer.trainer import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from nemo.utils import AppState, logging

r"""
Script to convert a legacy (non-mcore path) nemo checkpoint into mcore-path checkpoint for GPT models.
*Important* Before running this script, please first
1) convert your legacy checkpoint to TP1 PP1 format:
python examples/nlp/language_modeling/megatron_change_num_partitions.py \
<follow the readme in that script> \
--target_tensor_model_parallel_size=1 \
--target_pipeline_model_parallel_size=1
2) extract your nemo file to a folder with
tar -xvf filename.nemo
Then, run this conversion script:
python convert_nemo_gpt_to_mcore.py \
--in-file <path to extracted, TP1 PP1 legacy checkpoint folder> \
--out-file <path to output nemo ile>
"""


def get_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--in-file", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
    )
    parser.add_argument(
        "--out-file", type=str, default=None, required=True, help="Path to output mcore weights file (ends in .nemo)."
    )
    parser.add_argument(
        "--cpu-only",
        action="store_true",
        help="Load the model on CPU only. Useful if the model cannot fit in GPU memory, "
        "but this option makes the conversion script significantly slower.",
    )
    args = parser.parse_args()
    return args


def get_mcore_model_from_nemo_file(nemo_restore_from_path, cpu_only=False):
    model_cfg = MegatronGPTModel.restore_from(nemo_restore_from_path, return_config=True)
    model_cfg.tokenizer.vocab_file = None
    model_cfg.tokenizer.merge_file = None
    model_cfg.mcore_gpt = True
    model_cfg.use_cpu_initialization = cpu_only

    logging.info("*** initializing mcore model with the following config")
    logging.info(OmegaConf.to_yaml(model_cfg))
    trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())

    app_state = AppState()
    if os.path.isdir(nemo_restore_from_path):
        app_state.nemo_file_folder = nemo_restore_from_path
    else:
        logging.warning(
            "`nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
        )
    mcore_model = MegatronGPTModel(model_cfg, trainer=trainer)
    return mcore_model


def print_mcore_parameter_names(restore_from_path):
    mcore_model = get_mcore_model_from_nemo_file(restore_from_path)

    print("*********")
    print('\n'.join(sorted([k + '###' + str(v.shape) for k, v in mcore_model.named_parameters()])))
    print("*********")


def build_key_mapping(nemo_cfg):
    num_layers = nemo_cfg.num_layers
    has_bias = nemo_cfg.get("bias", True)
    has_layernorm_bias = (
        nemo_cfg.get("normalization", "layernorm") != "rmsnorm"
    )  # llama model uses rmsnorm which does not have bias
    model_str = 'model.module' if nemo_cfg.get('megatron_amp_O2', False) else 'model'

    # For GPT there is a 1:1 mapping of keys
    mcore_to_nemo_mapping = {
        f"{model_str}.embedding.word_embeddings.weight": "model.language_model.embedding.word_embeddings.weight",
        f"{model_str}.decoder.final_layernorm.weight": "model.language_model.encoder.final_layernorm.weight",
    }
    if has_layernorm_bias:
        mcore_to_nemo_mapping[
            f"{model_str}.decoder.final_layernorm.bias"
        ] = "model.language_model.encoder.final_layernorm.bias"

    if not nemo_cfg.get("share_embeddings_and_output_weights", True):
        mcore_to_nemo_mapping[f"{model_str}.output_layer.weight"] = "model.language_model.output_layer.weight"

    if nemo_cfg.get("position_embedding_type", 'learned_absolute') == 'rope':
        mcore_to_nemo_mapping[f"{model_str}.rotary_pos_emb.inv_freq"] = "model.language_model.rotary_pos_emb.inv_freq"
    else:
        mcore_to_nemo_mapping[
            f"{model_str}.embedding.position_embeddings.weight"
        ] = "model.language_model.embedding.position_embeddings.weight"

    nemo_prefix = "model.language_model.encoder.layers"
    mcore_prefix = f"{model_str}.decoder.layers"
    for i in range(num_layers):
        for wb in ('weight', 'bias') if has_bias else ('weight',):
            mcore_to_nemo_mapping.update(
                {
                    f"{mcore_prefix}.{i}.mlp.linear_fc2.{wb}": f"{nemo_prefix}.{i}.mlp.dense_4h_to_h.{wb}",
                    f"{mcore_prefix}.{i}.mlp.linear_fc1.{wb}": f"{nemo_prefix}.{i}.mlp.dense_h_to_4h.{wb}",
                    f"{mcore_prefix}.{i}.self_attention.linear_proj.{wb}": f"{nemo_prefix}.{i}.self_attention.dense.{wb}",
                    f"{mcore_prefix}.{i}.self_attention.linear_qkv.{wb}": f"{nemo_prefix}.{i}.self_attention.query_key_value.{wb}",
                }
            )
        # layernorm layers always have bias, but llama model uses rmsnorm which does not have bias
        for wb in ('weight', 'bias') if has_layernorm_bias else ('weight',):
            mcore_to_nemo_mapping.update(
                {
                    f"{mcore_prefix}.{i}.self_attention.linear_qkv.layer_norm_{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}",
                    f"{mcore_prefix}.{i}.mlp.linear_fc1.layer_norm_{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}",
                }
            )

    return mcore_to_nemo_mapping
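# For illustration, with megatron_amp_O2=False the mapping above pairs keys like the
# following for layer 0 (the same pattern repeats for every layer):
#   model.decoder.layers.0.self_attention.linear_qkv.weight
#       -> model.language_model.encoder.layers.0.self_attention.query_key_value.weight
#   model.decoder.layers.0.mlp.linear_fc1.layer_norm_weight
#       -> model.language_model.encoder.layers.0.post_attention_layernorm.weight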


def load_model(model, state_dict):
    for name, module in model.named_parameters():
        if name in state_dict:
            module.data = state_dict.pop(name)
        else:
            raise RuntimeError(f"Unexpected key: {name} not in state_dict but in model.")

    for name, buffer in model.named_buffers():
        if name in state_dict:
            buffer.data = state_dict.pop(name)

    if len(state_dict.keys()) != 0:
        raise RuntimeError(f"Additional keys: {state_dict.keys()} in state_dict but not in model.")

    return model


def restore_model(nemo_file, cpu_only=False):
    dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
    if cpu_only:
        map_location = torch.device('cpu')
        model_config = MegatronGPTModel.restore_from(
            nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
        )
        model_config.use_cpu_initialization = True
    else:
        model_config, map_location = None, None
    return MegatronGPTModel.restore_from(
        nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
    )


def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=False):
    if skip_if_output_exists and os.path.exists(output_nemo_file):
        logging.info(f"Output file already exists ({output_nemo_file}), skipping conversion...")
        return
    nemo_model = restore_model(input_nemo_file, cpu_only=cpu_only)

    nemo_tokenizer_model = nemo_model.cfg.tokenizer.model
    nemo_state_dict = nemo_model.state_dict()
    mcore_state_dict = OrderedDict()
    for mcore_param, nemo_param in build_key_mapping(nemo_model.cfg).items():
        if mcore_param.endswith("linear_fc1.weight"):
            # in llama models, need to concat dense_h_to_4h.weight and dense_h_to_4h_2.weight
            # for the corresponding linear_fc1.weight
            second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
            if second_param in nemo_state_dict:
                mcore_state_dict[mcore_param] = torch.cat(
                    [nemo_state_dict[nemo_param], nemo_state_dict[second_param]], dim=0
                )
            else:
                # plain GPT models have no second h-to-4h weight, so copy the weight directly
                mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
        else:
            mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]

    mcore_model = get_mcore_model_from_nemo_file(input_nemo_file, cpu_only=cpu_only)
    mcore_model = load_model(mcore_model, mcore_state_dict)

    if nemo_model.cfg.tokenizer.model is not None:
        logging.info("registering artifact: tokenizer.model = " + nemo_tokenizer_model)
        mcore_model.register_artifact("tokenizer.model", nemo_tokenizer_model)

    mcore_model.cfg.use_cpu_initialization = False
    mcore_model.save_to(output_nemo_file)
    logging.info(f"Done. Model saved to {output_nemo_file}")


def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
    nemo_model = restore_model(nemo_file, cpu_only=cpu_only).eval()
    mcore_model = restore_model(mcore_file, cpu_only=cpu_only).eval()

    logging.debug("*** Mcore model restored config")
    logging.debug(OmegaConf.to_yaml(mcore_model.cfg))

    nemo_summary = nemo_model.summarize()
    mcore_summary = mcore_model.summarize()

    logging.info("Sanity checks:")

    # check that the number of weights matches
    assert nemo_summary.total_parameters == mcore_summary.total_parameters, "❌ total parameters do not match"
    assert nemo_summary.model_size == mcore_summary.model_size, "❌ model sizes do not match"
    logging.info("✅ Number of weights match")

    # check that the weights match
    mcore_state_dict = mcore_model.state_dict()
    nemo_state_dict = nemo_model.state_dict()
    with open_dict(nemo_model.cfg):
        nemo_model.cfg.megatron_amp_O2 = False  # we want build_key_mapping in the next line to not use the O2 prefix
    for mcore_param, nemo_param in build_key_mapping(nemo_model.cfg).items():
        try:
            mcore_weight = mcore_state_dict.pop(mcore_param)
            nemo_weight = nemo_state_dict.pop(nemo_param)
            if mcore_param.endswith("linear_fc1.weight"):
                # linear_fc1.weight should map to concat(dense_h_to_4h.weight, dense_h_to_4h_2.weight),
                # but build_key_mapping only maps it to dense_h_to_4h.weight, so we handle the concat here.
                second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
                if second_param in nemo_state_dict:
                    nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
            assert torch.allclose(mcore_weight, nemo_weight), f"❌ parameter {mcore_param} does not match"
        except KeyError:
            # some tensors (e.g. rotary inv_freq) are registered as buffers rather than
            # appearing in the state dict, so check named_buffers() before failing
            buffers = [k for k, v in mcore_model.named_buffers()]
            assert (
                mcore_param in buffers or mcore_param.replace('model.', 'model.module.', 1) in buffers
            ), f"❌ parameter {mcore_param} is not found in the state dict or named_buffers()"
            nemo_state_dict.pop(nemo_param)

    logging.info("✅ Weights match")

    # check for unexpected weights in the state dicts
    assert len(nemo_state_dict) == 0, f"❌ unexpected items in nemo_state_dict: {nemo_state_dict}"
    assert (
        len([k for k in mcore_state_dict if not k.endswith('_extra_state')]) == 0
    ), f"❌ unexpected items in mcore_state_dict: {mcore_state_dict}"
    logging.info("✅ No unexpected weights in state dicts")


if __name__ == '__main__':
    args = get_args()

    input_nemo_file = args.in_file
    output_nemo_file = args.out_file
    cpu_only = args.cpu_only

    output_dir = os.path.dirname(output_nemo_file)
    if output_dir:  # guard: os.makedirs('') raises when --out-file has no directory part
        os.makedirs(output_dir, exist_ok=True)
    convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=cpu_only)
    torch.cuda.empty_cache()
    run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
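# A minimal sketch of driving the conversion from Python rather than the CLI
# (hypothetical paths; assumes this script is importable as a module):
#
#   from convert_nemo_gpt_to_mcore import convert, run_sanity_checks
#
#   convert("/ckpts/gpt_tp1_pp1_extracted", "/ckpts/gpt_mcore.nemo", cpu_only=True)
#   run_sanity_checks("/ckpts/gpt_tp1_pp1_extracted", "/ckpts/gpt_mcore.nemo", cpu_only=True)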
