15 changes: 11 additions & 4 deletions neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -157,10 +157,15 @@ def load_tensor(path, tensor_name=None, prefix=None):
     return state_dict
 
 
-def load_tensor_from_safetensors(path, tensor_name=None, device="cpu"):
+def load_tensor_from_safetensors(path, tensor_name=None, prefix=None, device="cpu"):
     """Load a tensor from safetensors file with given tensor name."""
     with safe_open(path, framework="pt", device=device) as f:
-        value = f.get_tensor(tensor_name)
+        if tensor_name in f.keys():
+            value = f.get_tensor(tensor_name)
+        elif prefix and tensor_name.replace(f"{prefix}.", "") in f.keys():
+            value = f.get_tensor(tensor_name.replace(f"{prefix}.", ""))
+        else:
+            raise ValueError(f"Tensor '{tensor_name}' not found in the file '{path}'")
     return value
 
 
@@ -212,9 +217,11 @@ def load_value(model, param_name, path, device="cpu"):
     files = os.listdir(path)
     safetensors_files = [filename for filename in files if filename.endswith(".safetensors")]
     if len(safetensors_files) == 1:
-        value = load_tensor_from_safetensors(os.path.join(path, "model.safetensors"), param_name, device=device)
+        value = load_tensor_from_safetensors(
+            os.path.join(path, "model.safetensors"), param_name, prefix=prefix, device=device
+        )
     elif len(safetensors_files) >= 2:
-        value = load_tensor_from_safetensors_shard(path, param_name, device=device)
+        value = load_tensor_from_safetensors_shard(path, param_name, prefix=prefix, device=device)
     elif "pytorch_model.bin.index.json" in files:
         value = load_tensor_from_shard(path, param_name, prefix)
     else:
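The change above lets layer-wise loading resolve a tensor whose stored key lacks the module prefix the caller uses: the raw name is tried first, then the name with "<prefix>." stripped, and only then does the lookup fail loudly. A minimal sketch of that rule, with a plain set standing in for the safe_open() handle and illustrative tensor names (not taken from the PR):

# Sketch of the prefix-fallback lookup used by load_tensor_from_safetensors.
def resolve_tensor_name(available_keys, tensor_name, prefix=None):
    if tensor_name in available_keys:
        return tensor_name
    if prefix:
        stripped = tensor_name.replace(f"{prefix}.", "")
        if stripped in available_keys:
            return stripped
    raise ValueError(f"Tensor '{tensor_name}' not found")

# A checkpoint may store "h.0.attn.weight" while the caller asks for
# "transformer.h.0.attn.weight" with prefix="transformer":
keys = {"h.0.attn.weight"}
assert resolve_tensor_name(keys, "transformer.h.0.attn.weight", prefix="transformer") == "h.0.attn.weight"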
8 changes: 7 additions & 1 deletion neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -21,6 +21,7 @@
 import tempfile
 
 import torch
+from packaging.version import parse
 
 from neural_compressor.common.utils import AWQ, TEQ, save_config_mapping
 from neural_compressor.torch.utils import (
@@ -809,6 +810,7 @@ def _get_resolved_archive_file(self, **kwargs):
         return resolved_archive_file, is_sharded
 
     def _init_hf_model(self, model_class, config):
+        import transformers
         from accelerate.big_modeling import init_empty_weights
         from transformers.modeling_utils import no_init_weights
         from transformers.utils import ContextManagers
@@ -846,7 +848,11 @@ def _init_hf_model(self, model_class, config):
 
         dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
 
-        init_contexts = [no_init_weights(_enable=_fast_init)]
+        init_contexts = (
+            [no_init_weights(_enable=_fast_init)]
+            if parse(transformers.__version__) < parse("4.51")
+            else [no_init_weights()]
+        )
         init_contexts.append(init_empty_weights())
 
         with ContextManagers(init_contexts):
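This hunk (and the identical one in modeling_auto.py below) gates the no_init_weights() call on the installed transformers version; the gate implies that transformers 4.51 dropped the `_enable` keyword. A hedged sketch of the guard in isolation, assuming only that reading of the version check (the helper name is illustrative):

# Version-gated construction of the weight-init contexts.
import transformers
from packaging.version import parse
from transformers.modeling_utils import no_init_weights

def make_init_contexts(fast_init=True):
    if parse(transformers.__version__) < parse("4.51"):
        return [no_init_weights(_enable=fast_init)]  # older signature
    return [no_init_weights()]  # 4.51+ takes no argument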
56 changes: 31 additions & 25 deletions neural_compressor/transformers/models/modeling_auto.py
@@ -38,6 +38,7 @@
 import transformers
 from accelerate import init_empty_weights
 from accelerate.utils import is_xpu_available
+from packaging.version import parse
 from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_utils import load_state_dict
@@ -678,7 +679,11 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             quantization_config.weight_dtype = "int4"
             logger.warning("int4 weight_dtype is used, please change the config.json if you don't want to use it.")
 
-        init_contexts = [no_init_weights(_enable=_fast_init)]
+        init_contexts = (
+            [no_init_weights(_enable=_fast_init)]
+            if parse(transformers.__version__) < parse("4.51")
+            else [no_init_weights()]
+        )
         init_contexts.append(init_empty_weights())
 
         with ContextManagers(init_contexts):
@@ -704,35 +709,36 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     model, resolved_archive_file, loaded_state_dict_keys, quantization_config, is_sharded
                 )
             else:
-                if transformers.__version__ >= "4.50":
-                    model_message = model_class._load_pretrained_model(
-                        model,
-                        None,
-                        checkpoint_files,
-                        pretrained_model_name_or_path,
-                        sharded_metadata=sharded_metadata,
-                        _fast_init=_fast_init,
-                        low_cpu_mem_usage=True,
-                        disk_offload_folder=offload_folder,
-                        offload_state_dict=offload_state_dict,
-                        dtype=torch_dtype,
-                        keep_in_fp32_modules=[],
-                    )
-                else:
-                    model_message = model_class._load_pretrained_model(
-                        model,
-                        None,
-                        loaded_state_dict_keys,  # XXX: rename?
-                        resolved_archive_file,
-                        pretrained_model_name_or_path,
-                        sharded_metadata=sharded_metadata,
-                        _fast_init=_fast_init,
-                        low_cpu_mem_usage=True,
-                        offload_folder=offload_folder,
-                        offload_state_dict=offload_state_dict,
-                        dtype=torch_dtype,
-                        keep_in_fp32_modules=[],
-                    )
+                if parse(transformers.__version__) < parse("4.50"):
+                    tmp_args = (
+                        model,
+                        None,
+                        loaded_state_dict_keys,
+                        resolved_archive_file,
+                        pretrained_model_name_or_path,
+                    )
+                    tmp_kwargs = {
+                        "sharded_metadata": sharded_metadata,
+                        "_fast_init": _fast_init,
+                        "low_cpu_mem_usage": True,
+                        "offload_folder": offload_folder,
+                        "offload_state_dict": offload_state_dict,
+                        "dtype": torch_dtype,
+                        "keep_in_fp32_modules": [],
+                    }
+                else:
+                    tmp_args = (model, None, checkpoint_files, pretrained_model_name_or_path)
+                    tmp_kwargs = {
+                        "sharded_metadata": sharded_metadata,
+                        "disk_offload_folder": offload_folder,
+                        "offload_state_dict": offload_state_dict,
+                        "dtype": torch_dtype,
+                    }
+                    if parse(transformers.__version__) < parse("4.51"):
+                        tmp_kwargs["_fast_init"] = _fast_init
+                        tmp_kwargs["low_cpu_mem_usage"] = True
+
+                model_message = model_class._load_pretrained_model(*tmp_args, **tmp_kwargs)
                 model = model_message[0]
 
         # make sure token embedding weights are still tied if needed
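Two things change in this rewrite. First, the old guard compared version strings lexicographically (`transformers.__version__ >= "4.50"`), which misorders releases such as "4.9" vs. "4.50"; packaging.version.parse() compares them correctly. Second, the two near-duplicate _load_pretrained_model() calls collapse into a build-args-then-call-once pattern that absorbs the signature churn: positional arguments changed in 4.50, offload_folder became disk_offload_folder, and _fast_init/low_cpu_mem_usage appear to be gone in 4.51. A generic sketch of the idiom, with illustrative argument values rather than the real transformers API:

# Collect version-dependent arguments first, keep a single call site.
from packaging.version import parse

def call_versioned(loader, version, folder):
    if parse(version) < parse("4.50"):
        args = ("model", None, "state_dict_keys", "archive_file", "path")
        kwargs = {"offload_folder": folder, "low_cpu_mem_usage": True}
    else:
        args = ("model", None, "checkpoint_files", "path")
        kwargs = {"disk_offload_folder": folder}  # renamed in 4.50
        if parse(version) < parse("4.51"):
            kwargs["low_cpu_mem_usage"] = True  # dropped in 4.51
    return loader(*args, **kwargs)

# usage: call_versioned(lambda *a, **k: (a, k), "4.50.3", "/tmp/offload")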
24 changes: 14 additions & 10 deletions test/3x/torch/quantization/weight_only/test_transformers.py
@@ -19,6 +19,8 @@
     TeqConfig,
 )
 
+torch.manual_seed(42)
+
 ipex_version = get_ipex_version()
 
 try:
@@ -30,7 +32,7 @@
 
 class TestTansformersLikeAPI:
     def setup_class(self):
-        self.model_name_or_path = "hf-internal-testing/tiny-random-gptj"
+        self.model_name_or_path = "hf-tiny-model-private/tiny-random-GPTJForCausalLM"
         self.autoawq_model = "casperhansen/opt-125m-awq"
         self.prompt = "One day, the little girl"
         self.generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
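(The switch to the hf-tiny-model-private checkpoint, together with the fixed torch.manual_seed(42) above, is presumably what moves every expected scalar in the assertions below; the old positive reference values were recorded against the previous tiny-random-gptj model.)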
@@ -58,8 +60,9 @@ def test_quantization_for_llm(self):
         woq_model.eval()
 
         output = woq_model(dummy_input)[0]
-        assert torch.allclose(output, label, atol=0.1), "Accuracy gap atol > 0.1 is unexpected."
-        assert isclose(float(output[0][0][0]), 0.17786270380020142, rel_tol=1e-04)
+        assert torch.allclose(output, label, atol=0.12), "Accuracy gap atol > 0.12 is unexpected."
+        # label[0][0][0] = -0.0910
+        assert isclose(float(output[0][0][0]), -0.1006, abs_tol=1e-04)
 
         # AWQ
         woq_config = AwqConfig(
@@ -69,14 +72,14 @@ def test_quantization_for_llm(self):
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
-        assert isclose(float(output[0][0][0][0]), 0.19592927396297455, rel_tol=1e-04)
+        assert isclose(float(output[0][0][0][0]), -0.1045, abs_tol=1e-04)
 
         # TEQ
         woq_config = TeqConfig(bits=4, n_samples=5, batch_size=1, seq_len=512, group_size=16, tokenizer=tokenizer)
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
-        assert isclose(float(output[0][0][0][0]), 0.17786270380020142, rel_tol=1e-04)
+        assert isclose(float(output[0][0][0][0]), -0.1006, abs_tol=1e-04)
 
         # GPTQ
         woq_config = GPTQConfig(
@@ -95,11 +98,11 @@ def test_quantization_for_llm(self):
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
-        # Since the output of torch.cholesky() has changed in different Torch versions
+        # The output of torch.cholesky() changes across torch versions
         if ipex_version < Version("2.5.0"):
-            assert isclose(float(output[0][0][0][0]), 0.17234990000724792, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.08614, abs_tol=1e-04)
         else:
-            assert isclose(float(output[0][0][0][0]), 0.17049233615398407, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.0874, abs_tol=1e-04)
 
         # AUTOROUND
         woq_config = AutoRoundConfig(
@@ -108,10 +111,11 @@ def test_quantization_for_llm(self):
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
+        # The output may change when the device supports bf16
         if CpuInfo().bf16:
-            assert isclose(float(output[0][0][0][0]), 0.19140625, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.07275, abs_tol=1e-04)
         else:
-            assert isclose(float(output[0][0][0][0]), 0.18400897085666656, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.0786, abs_tol=1e-04)
 
     def test_save_load(self):
         model_name_or_path = self.model_name_or_path
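A note on the assertion style in these tests: the scalar checks switch from rel_tol to abs_tol because the new reference values are rounded to a few decimal places, and math.isclose() with rel_tol scales its tolerance by the operands' magnitude, which is too strict to compare against a rounded reference near zero. Illustrative numbers (not taken from the test):

from math import isclose

ref = -0.1006        # a reference rounded to four decimal places
measured = -0.10055  # a raw model output that should be accepted

assert isclose(measured, ref, abs_tol=1e-4)      # fixed +/- 1e-4 band: passes
assert not isclose(measured, ref, rel_tol=1e-6)  # band scaled to ~1e-7: fails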