From 3344f0bb1cfaec8ad226271e5da85d6c5a0518b3 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Sun, 14 Apr 2024 14:46:02 -0700 Subject: [PATCH] Builder (#184) * install gguf * moved model build to builder.py --- builder.py | 193 +++++++++++++++++++++++++++++++++++++++++++++++ eval.py | 3 +- export.py | 3 +- export_aoti.py | 2 +- export_et.py | 4 +- generate.py | 157 +------------------------------------- requirements.txt | 1 + 7 files changed, 202 insertions(+), 161 deletions(-) create mode 100644 builder.py diff --git a/builder.py b/builder.py new file mode 100644 index 0000000000..72f3292bdf --- /dev/null +++ b/builder.py @@ -0,0 +1,193 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import itertools +import sys +import time +from pathlib import Path +from typing import Optional, Tuple + +import torch +import torch._dynamo.config +import torch._inductor.config + +from quantize import quantize_model, name_to_dtype, set_precision, get_precision +from cli import cli_args + +def device_sync(device): + if "cuda" in device: + torch.cuda.synchronize(device) + elif ("cpu" in device) or ("mps" in device): + pass + else: + print(f"device={ device } is not yet suppported") + + +torch._inductor.config.coordinate_descent_tuning = True +torch._inductor.config.triton.unique_kernel_names = True +torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future + + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from sentencepiece import SentencePieceProcessor + +from model import Transformer + +def _load_model( + checkpoint_path, + checkpoint_dir, + params_path, + params_table, + gguf_path, + device, + precision, + use_tp # =False +): + use_cuda = "cuda" in device + with torch.device("meta"): + if params_path: + model = Transformer.from_params(params_path) + elif params_table: + model = Transformer.from_table(params_path) + elif gguf_path: + model = Transformer.from_gguf(gguf_path) + else: + model = Transformer.from_name(checkpoint_path.parent.name) + + # checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) + cps = [] + if checkpoint_dir is not None: + # Load multiple checkpoint; ignore the single path. + checkpoint_path = None + for i in range(4): + cp_name = f"consolidated.{i}.pth" + print(f"Loading {cp_name}") + cps.append( + torch.load( + os.path.join(checkpoint_dir, cp_name), + map_location=device, + mmap=True, + ) + ) + + checkpoint = {} + for key in cps[0].keys(): + if not torch.allclose(cps[0][key], cps[1][key]): + values = (cps[0][key], cps[1][key], cps[2][key], cps[3][key]) + if key.endswith("wo.weight") or key.endswith("w2.weight"): + checkpoint[key] = torch.cat(values, dim=1) + else: + checkpoint[key] = torch.cat(values, dim=0) + else: + checkpoint[key] = cps[0][key] + else: + checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True, weights_only=True) + + if "model" in checkpoint and "stories" in str(checkpoint_path): + checkpoint = checkpoint["model"] + + model.load_state_dict(checkpoint, assign=True) + + if use_tp: + from tp import apply_tp + + print("Applying tensor parallel to model ...") + apply_tp(model) + + model = model.to(device=device, dtype=precision) + return model.eval() + + +def _initialize_model( + checkpoint_path, + checkpoint_dir, + params_path, + params_table, + gguf_path, + dso_path, + pte_path, + quantize, + device, + precision, + setup_caches, + use_tp # =False +): + assert ( + (checkpoint_path and checkpoint_path.is_file()) or + (checkpoint_dir and checkpoint_path.is_dir()) or + (gguf_path and gguf_path.is_file()) or + (dso_path and Path(dso_path).is_file()) or + (pte_path and Path(pte_path).is_file()) + ), "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path" + assert not (dso_path and pte_path), "specify either DSO path or PTE path, but not both" + + if (checkpoint_path and (dso_path or pte_path)): + print("Warning: checkpoint path ignored because an exported DSO or PTE path specified") + if (checkpoint_dir and (dso_path or pte_path)): + print("Warning: checkpoint dir ignored because an exported DSO or PTE path specified") + if (gguf_path and (dso_path or pte_path)): + print("Warning: GGUF path ignored because an exported DSO or PTE path specified") + + print("Loading model ...") + t0 = time.time() + model_ = _load_model( + checkpoint_path, + checkpoint_dir, + params_path, + params_table, + gguf_path, + device, + precision, + use_tp + ) + device_sync(device=device) # MKG + print(f"Time to load model: {time.time() - t0:.02f} seconds") + + if dso_path: + # make sure user did not try to set dtype + # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export." + assert quantize is None or quantize == "{ }", f"quantize not valid for exported DSO model. Specify quantization during export." + try: + model = model_ + # Replace model forward with the AOT-compiled forward + # This is a hacky way to quickly demo AOTI's capability. + # model is still a Python object, and any mutation to its + # attributes will NOT be seen on by AOTI-compiled forward + # function, e.g. calling model.setup_cache will NOT touch + # AOTI compiled and maintained model buffers such as kv_cache. + model.forward = torch._export.aot_load(str(dso_path.absolute()), device) + except: + raise RuntimeError(f"Failed to load AOTI compiled {dso_path}") + elif pte_path: + # make sure user did not try to set dtype + # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export." + assert quantize is None or quantize == "{ }", f"quantize not valid for exported PTE model. Specify quantization during export." + try: + from model_et import PTEModel + model = PTEModel(model_.config, pte_path) + except Exception as e: + raise RuntimeError(f"Failed to load ET compiled {pte_path}") + else: + model = model_ + + if quantize: + t0q = time.time() + quantize_model(model, quantize) + device_sync(device=device) # MKG + print(f"Time to quantize model: {time.time() - t0q:.02f} seconds") + + if setup_caches: + max_seq_length = 350 + with torch.device(device): + model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) + + model.to(dtype=precision) + + return model + + diff --git a/eval.py b/eval.py index a28eda0ef0..a63cd5f2cf 100644 --- a/eval.py +++ b/eval.py @@ -31,7 +31,8 @@ except: lm_eval_available = False -from generate import _initialize_model, encode_tokens, model_forward +from builder import _initialize_model +from generate import encode_tokens, model_forward if lm_eval_available: try: # lm_eval version 0.4 diff --git a/export.py b/export.py index 9a783e2289..c5c05c9617 100644 --- a/export.py +++ b/export.py @@ -25,7 +25,8 @@ from export_aoti import export_model as export_model_aoti from model import Transformer -from generate import _load_model, decode_one_token, _initialize_model +from builder import _initialize_model +from generate import decode_one_token from quantize import quantize_model, name_to_dtype from torch._export import capture_pre_autograd_graph diff --git a/export_aoti.py b/export_aoti.py index 7a5306b5b6..c7d4d6d92d 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -14,7 +14,7 @@ import torch.nn as nn from torch.export import Dim, export -from generate import _load_model, decode_one_token +from generate import decode_one_token from quantize import quantize_model from model import Transformer diff --git a/export_et.py b/export_et.py index 6630ea86fa..bfdb8d0872 100644 --- a/export_et.py +++ b/export_et.py @@ -11,7 +11,7 @@ import torch.nn as nn from torch.export import Dim, export -from generate import _load_model, decode_one_token +from generate import decode_one_token from quantize import quantize_model from quantize import quantize_model, name_to_dtype, set_precision, get_precision @@ -28,8 +28,6 @@ from executorch.exir.passes.quant_fusion_pass import QuantFusionPass from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass -from generate import _load_model - from model import Transformer from torch._export import capture_pre_autograd_graph diff --git a/generate.py b/generate.py index 61d5185bbe..ac99d6542e 100644 --- a/generate.py +++ b/generate.py @@ -8,6 +8,7 @@ import time from pathlib import Path from typing import Optional, Tuple +from builder import _load_model, _initialize_model import torch import torch._dynamo.config @@ -274,161 +275,6 @@ def encode_tokens(tokenizer, string, bos=True, device="cuda"): return torch.tensor(tokens, dtype=torch.int, device=device) -def _load_model( - checkpoint_path, - checkpoint_dir, - params_path, - params_table, - gguf_path, - device, - precision, - use_tp # =False -): - use_cuda = "cuda" in device - with torch.device("meta"): - if params_path: - model = Transformer.from_params(params_path) - elif params_table: - model = Transformer.from_table(params_path) - elif gguf_path: - model = Transformer.from_gguf(gguf_path) - else: - model = Transformer.from_name(checkpoint_path.parent.name) - - # checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) - cps = [] - if checkpoint_dir is not None: - # Load multiple checkpoint; ignore the single path. - checkpoint_path = None - for i in range(4): - cp_name = f"consolidated.{i}.pth" - print(f"Loading {cp_name}") - cps.append( - torch.load( - os.path.join(checkpoint_dir, cp_name), - map_location=device, - mmap=True, - ) - ) - - checkpoint = {} - for key in cps[0].keys(): - if not torch.allclose(cps[0][key], cps[1][key]): - values = (cps[0][key], cps[1][key], cps[2][key], cps[3][key]) - if key.endswith("wo.weight") or key.endswith("w2.weight"): - checkpoint[key] = torch.cat(values, dim=1) - else: - checkpoint[key] = torch.cat(values, dim=0) - else: - checkpoint[key] = cps[0][key] - else: - checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True, weights_only=True) - - if "model" in checkpoint and "stories" in str(checkpoint_path): - checkpoint = checkpoint["model"] - - model.load_state_dict(checkpoint, assign=True) - - if use_tp: - from tp import apply_tp - - print("Applying tensor parallel to model ...") - apply_tp(model) - - model = model.to(device=device, dtype=precision) - return model.eval() - - -B_INST, E_INST = "[INST]", "[/INST]" - -def _initialize_model( - checkpoint_path, - checkpoint_dir, - params_path, - params_table, - gguf_path, - dso_path, - pte_path, - quantize, - device, - precision, - setup_caches, - use_tp # =False -): - assert ( - (checkpoint_path and checkpoint_path.is_file()) or - (checkpoint_dir and checkpoint_path.is_dir()) or - (gguf_path and gguf_path.is_file()) or - (dso_path and Path(dso_path).is_file()) or - (pte_path and Path(pte_path).is_file()) - ), "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path" - assert not (dso_path and pte_path), "specify either DSO path or PTE path, but not both" - - if (checkpoint_path and (dso_path or pte_path)): - print("Warning: checkpoint path ignored because an exported DSO or PTE path specified") - if (checkpoint_dir and (dso_path or pte_path)): - print("Warning: checkpoint dir ignored because an exported DSO or PTE path specified") - if (gguf_path and (dso_path or pte_path)): - print("Warning: GGUF path ignored because an exported DSO or PTE path specified") - - print("Loading model ...") - t0 = time.time() - model_ = _load_model( - checkpoint_path, - checkpoint_dir, - params_path, - params_table, - gguf_path, - device, - precision, - use_tp - ) - device_sync(device=device) # MKG - print(f"Time to load model: {time.time() - t0:.02f} seconds") - - if dso_path: - # make sure user did not try to set dtype - # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export." - assert quantize is None or quantize == "{ }", f"quantize not valid for exported DSO model. Specify quantization during export." - try: - model = model_ - # Replace model forward with the AOT-compiled forward - # This is a hacky way to quickly demo AOTI's capability. - # model is still a Python object, and any mutation to its - # attributes will NOT be seen on by AOTI-compiled forward - # function, e.g. calling model.setup_cache will NOT touch - # AOTI compiled and maintained model buffers such as kv_cache. - model.forward = torch._export.aot_load(str(dso_path.absolute()), device) - except: - raise RuntimeError(f"Failed to load AOTI compiled {dso_path}") - elif pte_path: - # make sure user did not try to set dtype - # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export." - assert quantize is None or quantize == "{ }", f"quantize not valid for exported PTE model. Specify quantization during export." - try: - from model_et import PTEModel - model = PTEModel(model_.config, pte_path) - except Exception as e: - raise RuntimeError(f"Failed to load ET compiled {pte_path}") - else: - model = model_ - - if quantize: - t0q = time.time() - quantize_model(model, quantize) - device_sync(device=device) # MKG - print(f"Time to quantize model: {time.time() - t0q:.02f} seconds") - - if setup_caches: - max_seq_length = 350 - with torch.device(device): - model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) - - model.to(dtype=precision) - - return model - - def _main( prompt: str = "Hello, my name is", chat_mode: bool = False, @@ -494,6 +340,7 @@ def _main( # will add a version of _initialize_model in future # (need additional args) if is_speculative: + from builder import _load_model draft_model = _load_model( draft_checkpoint_path, None, # checkpoint_dir diff --git a/requirements.txt b/requirements.txt index 2f1a2a7cc1..0f59dc82c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ torch sentencepiece numpy +gguf