Commit

rename to meet review
rnwang04 committed Feb 7, 2025
1 parent d8c5221 commit 27eadad
Showing 5 changed files with 40 additions and 39 deletions.
@@ -58,6 +58,16 @@
model_path = args.repo_id_or_model_path
save_dir = args.save_directory

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

trans_version = transformers.__version__
if version.parse(trans_version) >= version.parse("4.45.0"):
tokenizer_json = os.path.join(model_path, "tokenizer.json")
dst_path = os.path.join(save_dir, "tokenizer.json")
shutil.copy(tokenizer_json, dst_path)
else:
tokenizer.save_pretrained(save_dir)

t0 = time.perf_counter()
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
@@ -73,15 +83,6 @@
compile_blob=not args.disable_compile_blob)
t1 = time.perf_counter()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

trans_version = transformers.__version__
if version.parse(trans_version) >= version.parse("4.45.0"):
tokenizer_json = os.path.join(model_path, "tokenizer.json")
dst_path = os.path.join(save_dir, "tokenizer.json")
shutil.copy(tokenizer_json, dst_path)
else:
tokenizer.save_pretrained(save_dir)

print("-" * 80)
print(f"Convert model cost {t1 - t0}s.")
@@ -440,9 +440,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
weight_dir = os.path.join(save_directory, "model_weights")
if not os.path.exists(weight_dir):
os.mkdir(weight_dir)
layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
const_parameter = os.environ.get("IPEX_LLM_NPU_CONST_PARAMETER", "1") == "1"
if keep_ir:
layernorm_const = False
const_parameter = False

lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn")
if hasattr(model, "lm_head") and not isinstance(model.lm_head, SlicedLMHead):
@@ -471,7 +471,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
"head_dim": model.model.layers[0].self_attn.head_dim,
"transpose_value_cache": transpose_value_cache,
"max_prompt_len": max_prompt_len,
"layernorm_const": layernorm_const,
"layernorm_const": const_parameter,
"group_size": group_size,
"fused_layers": fused_layers,
"qkv_bias": True,
@@ -487,12 +487,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
# save fused_layers blobs of fused decoder layers
convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
save_directory, weight_dir, transpose_value_cache, kv_len,
group_size, layernorm_const, "decode",
group_size, const_parameter, "decode",
keep_ir=keep_ir, compile_blob=compile_blob)
# save blob of single prefill layer
convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
save_directory, weight_dir, transpose_value_cache, max_prompt_len,
group_size, layernorm_const, "prefill",
group_size, const_parameter, "prefill",
keep_ir=keep_ir, compile_blob=compile_blob)
# save blob of lmhead and bin of embedding
convert_lm_head_and_embedding(model, save_directory, weight_dir,
@@ -532,7 +532,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
"head_dim": model.model.layers[0].self_attn.head_dim,
"transpose_value_cache": transpose_value_cache,
"max_prompt_len": max_prompt_len,
"layernorm_const": layernorm_const,
"layernorm_const": const_parameter,
"group_size": group_size,
"fused_layers": fused_layers,
"qkv_bias": False,
@@ -556,12 +556,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
# save fused_layers blobs of fused decoder layers
convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
save_directory, weight_dir, transpose_value_cache, kv_len,
group_size, layernorm_const, "decode",
group_size, const_parameter, "decode",
keep_ir=keep_ir, compile_blob=compile_blob)
# save blob of single prefill layer
convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
save_directory, weight_dir, transpose_value_cache, max_prompt_len,
group_size, layernorm_const, "prefill",
group_size, const_parameter, "prefill",
keep_ir=keep_ir, compile_blob=compile_blob)
elif model.config.model_type == "minicpm":
if group_size == 0:
@@ -573,7 +573,7 @@ def convert_llm_for_deploy(model: torch.nn.Module,
"head_dim": model.model.layers[0].self_attn.head_dim,
"transpose_value_cache": transpose_value_cache,
"max_prompt_len": max_prompt_len,
"layernorm_const": layernorm_const,
"layernorm_const": const_parameter,
"group_size": group_size,
"fused_layers": fused_layers,
"qkv_bias": False,
@@ -591,12 +591,12 @@ def convert_llm_for_deploy(model: torch.nn.Module,
# save fused_layers blobs of fused decoder layers
convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
save_directory, weight_dir, transpose_value_cache, kv_len,
group_size, layernorm_const, "decode",
group_size, const_parameter, "decode",
keep_ir=keep_ir, compile_blob=compile_blob)
# save blob of single prefill layer
convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
save_directory, weight_dir, transpose_value_cache, max_prompt_len,
group_size, layernorm_const, "prefill",
group_size, const_parameter, "prefill",
keep_ir=keep_ir, compile_blob=compile_blob)
# save blob of lmhead and bin of embedding and embedding_post
convert_lm_head_and_embedding(model, n_splits_linear,
16 changes: 8 additions & 8 deletions python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -259,7 +259,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,

def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
layernorm_const, mode="decode",
const_parameter, mode="decode",
keep_ir=False, compile_blob=True):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -297,14 +297,14 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
else:
input_len = kv_len
decoder_name = "decoder_layer_prefill"
layernorm_const = False
const_parameter = False
keep_position_ids = False
npu_dpu_groups = 6

single_decoder = LowBitLlamaMultiDecoderlayer(
[1, input_len, num_heads * head_dim],
input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
input_layernorm_weights=[layer_norm_0] if const_parameter else None,
post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
cached_cos=cached_cos,
cached_sin=cached_sin,
num_heads=num_heads,
@@ -334,7 +334,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
if mode == "decode":
if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
# llama-2-7B & llama-3-8B
if layernorm_const:
if const_parameter:
st_idx = 5
else:
input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
@@ -344,7 +344,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
st_idx = 7
else:
# llama-3.2-3B & llama-3.2-1B
if layernorm_const:
if const_parameter:
st_idx = 6
else:
input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
@@ -375,7 +375,7 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
layernorm_const, mode="decode",
const_parameter, mode="decode",
keep_ir=False, compile_blob=True):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -446,7 +446,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
else: # FP16 Linear
np_dtype = np.float16

if not layernorm_const:
if not const_parameter:
input_layer_norm_weights = None
post_attn_layernorm_weights = None

@@ -301,7 +301,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,

def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
layernorm_const, mode="decode",
const_parameter, mode="decode",
keep_ir=False, compile_blob=True):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -333,12 +333,12 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
else:
input_len = kv_len
decoder_name = "decoder_layer_prefill"
layernorm_const = False
const_parameter = False

single_decoder = LowBitMinicpmMultiDecoderlayer(
[1, input_len, num_heads * head_dim],
input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
input_layernorm_weights=[layer_norm_0] if const_parameter else None,
post_attn_layernorm_weights=[layer_norm_1] if const_parameter else None,
cached_cos=cached_cos,
cached_sin=cached_sin,
num_heads=num_heads,
@@ -364,7 +364,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
os.remove(os.path.join(temp_dir, decoder_name + ".bin"))

if mode == "decode":
if layernorm_const:
if const_parameter:
st_idx = 5
else:
input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
@@ -394,7 +394,7 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
layernorm_const, mode="decode",
const_parameter, mode="decode",
keep_ir=False, compile_blob=True):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -461,7 +461,7 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
else: # FP16 Linear
np_dtype = np.float16

if not layernorm_const:
if not const_parameter:
input_layer_norm_weights = None
post_attn_layernorm_weights = None

@@ -130,7 +130,7 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,

def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
layernorm_const, mode="decode",
const_parameter, mode="decode",
keep_ir=False, compile_blob=True):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -196,7 +196,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

# 0, 1, 2 are input_embed/attention_mask/position_id
if mode == "decode":
if layernorm_const:
if const_parameter:
st_idx = 3
else:
input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
@@ -234,7 +234,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
layernorm_const, mode="decode",
const_parameter, mode="decode",
keep_ir=False, compile_blob=True):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
@@ -313,7 +313,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
else: # FP16 Linear
np_dtype = np.float16

if not layernorm_const:
if not const_parameter:
input_layer_norm_weights = None
post_attn_layernorm_weights = None
q_biases = None