diff --git a/neural_compressor/torch/algorithms/layer_wise/utils.py b/neural_compressor/torch/algorithms/layer_wise/utils.py
index 76e2f7fc3b1..c0ebf8bd199 100644
--- a/neural_compressor/torch/algorithms/layer_wise/utils.py
+++ b/neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -157,10 +157,15 @@ def load_tensor(path, tensor_name=None, prefix=None):
     return state_dict
 
 
-def load_tensor_from_safetensors(path, tensor_name=None, device="cpu"):
+def load_tensor_from_safetensors(path, tensor_name=None, prefix=None, device="cpu"):
     """Load a tensor from safetensors file with given tensor name."""
     with safe_open(path, framework="pt", device=device) as f:
-        value = f.get_tensor(tensor_name)
+        if tensor_name in f.keys():
+            value = f.get_tensor(tensor_name)
+        elif prefix and tensor_name.replace(f"{prefix}.", "") in f.keys():
+            value = f.get_tensor(tensor_name.replace(f"{prefix}.", ""))
+        else:
+            raise ValueError(f"Tensor '{tensor_name}' not found in the file '{path}'")
     return value
 
 
@@ -212,9 +217,11 @@ def load_value(model, param_name, path, device="cpu"):
     files = os.listdir(path)
     safetensors_files = [filename for filename in files if filename.endswith(".safetensors")]
     if len(safetensors_files) == 1:
-        value = load_tensor_from_safetensors(os.path.join(path, "model.safetensors"), param_name, device=device)
+        value = load_tensor_from_safetensors(
+            os.path.join(path, "model.safetensors"), param_name, prefix=prefix, device=device
+        )
     elif len(safetensors_files) >= 2:
-        value = load_tensor_from_safetensors_shard(path, param_name, device=device)
+        value = load_tensor_from_safetensors_shard(path, param_name, prefix=prefix, device=device)
     elif "pytorch_model.bin.index.json" in files:
         value = load_tensor_from_shard(path, param_name, prefix)
     else:
diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index b3c6051ba4b..6b50133dd5e 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -21,6 +21,7 @@
 import tempfile
 
 import torch
+from packaging.version import parse
 
 from neural_compressor.common.utils import AWQ, TEQ, save_config_mapping
 from neural_compressor.torch.utils import (
@@ -809,6 +810,7 @@ def _get_resolved_archive_file(self, **kwargs):
         return resolved_archive_file, is_sharded
 
     def _init_hf_model(self, model_class, config):
+        import transformers
         from accelerate.big_modeling import init_empty_weights
         from transformers.modeling_utils import no_init_weights
         from transformers.utils import ContextManagers
@@ -846,7 +848,11 @@ def _init_hf_model(self, model_class, config):
 
             dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
 
-        init_contexts = [no_init_weights(_enable=_fast_init)]
+        init_contexts = (
+            [no_init_weights(_enable=_fast_init)]
+            if parse(transformers.__version__) < parse("4.51")
+            else [no_init_weights()]
+        )
         init_contexts.append(init_empty_weights())
 
         with ContextManagers(init_contexts):
diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py
index 11fce9c3029..20f207eb34d 100644
--- a/neural_compressor/transformers/models/modeling_auto.py
+++ b/neural_compressor/transformers/models/modeling_auto.py
@@ -38,6 +38,7 @@
 import transformers
 from accelerate import init_empty_weights
 from accelerate.utils import is_xpu_available
+from packaging.version import parse
 from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_utils import load_state_dict
@@ -678,7 +679,11 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             quantization_config.weight_dtype = "int4"
             logger.warning("int4 weight_dtype is used, please change the config.json if you don't want to use it.")
 
-        init_contexts = [no_init_weights(_enable=_fast_init)]
+        init_contexts = (
+            [no_init_weights(_enable=_fast_init)]
+            if parse(transformers.__version__) < parse("4.51")
+            else [no_init_weights()]
+        )
         init_contexts.append(init_empty_weights())
 
         with ContextManagers(init_contexts):
@@ -704,35 +709,36 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 model, resolved_archive_file, loaded_state_dict_keys, quantization_config, is_sharded
             )
         else:
-            if transformers.__version__ >= "4.50":
-                model_message = model_class._load_pretrained_model(
+            if parse(transformers.__version__) < parse("4.50"):
+                tmp_args = (
                     model,
                     None,
-                    checkpoint_files,
-                    pretrained_model_name_or_path,
-                    sharded_metadata=sharded_metadata,
-                    _fast_init=_fast_init,
-                    low_cpu_mem_usage=True,
-                    disk_offload_folder=offload_folder,
-                    offload_state_dict=offload_state_dict,
-                    dtype=torch_dtype,
-                    keep_in_fp32_modules=[],
-                )
-            else:
-                model_message = model_class._load_pretrained_model(
-                    model,
-                    None,
-                    loaded_state_dict_keys,  # XXX: rename?
+                    loaded_state_dict_keys,
                     resolved_archive_file,
                     pretrained_model_name_or_path,
-                    sharded_metadata=sharded_metadata,
-                    _fast_init=_fast_init,
-                    low_cpu_mem_usage=True,
-                    offload_folder=offload_folder,
-                    offload_state_dict=offload_state_dict,
-                    dtype=torch_dtype,
-                    keep_in_fp32_modules=[],
                 )
+                tmp_kwargs = {
+                    "sharded_metadata": sharded_metadata,
+                    "_fast_init": _fast_init,
+                    "low_cpu_mem_usage": True,
+                    "offload_folder": offload_folder,
+                    "offload_state_dict": offload_state_dict,
+                    "dtype": torch_dtype,
+                    "keep_in_fp32_modules": [],
+                }
+            else:
+                tmp_args = (model, None, checkpoint_files, pretrained_model_name_or_path)
+                tmp_kwargs = {
+                    "sharded_metadata": sharded_metadata,
+                    "disk_offload_folder": offload_folder,
+                    "offload_state_dict": offload_state_dict,
+                    "dtype": torch_dtype,
+                }
+                if parse(transformers.__version__) < parse("4.51"):
+                    tmp_kwargs["_fast_init"] = _fast_init
+                    tmp_kwargs["low_cpu_mem_usage"] = True
+
+            model_message = model_class._load_pretrained_model(*tmp_args, **tmp_kwargs)
             model = model_message[0]
 
         # make sure token embedding weights are still tied if needed
diff --git a/test/3x/torch/quantization/weight_only/test_transformers.py b/test/3x/torch/quantization/weight_only/test_transformers.py
index 29259dd8b94..190c4cea33a 100644
--- a/test/3x/torch/quantization/weight_only/test_transformers.py
+++ b/test/3x/torch/quantization/weight_only/test_transformers.py
@@ -19,6 +19,8 @@
     TeqConfig,
 )
 
+torch.manual_seed(42)
+
 ipex_version = get_ipex_version()
 
 try:
@@ -30,7 +32,7 @@
 
 class TestTansformersLikeAPI:
     def setup_class(self):
-        self.model_name_or_path = "hf-internal-testing/tiny-random-gptj"
+        self.model_name_or_path = "hf-tiny-model-private/tiny-random-GPTJForCausalLM"
         self.autoawq_model = "casperhansen/opt-125m-awq"
         self.prompt = "One day, the little girl"
         self.generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
@@ -58,8 +60,9 @@ def test_quantization_for_llm(self):
 
         woq_model.eval()
         output = woq_model(dummy_input)[0]
-        assert torch.allclose(output, label, atol=0.1), "Accuracy gap atol > 0.1 is unexpected."
-        assert isclose(float(output[0][0][0]), 0.17786270380020142, rel_tol=1e-04)
+        assert torch.allclose(output, label, atol=0.12), "Accuracy gap atol > 0.1 is unexpected."
+        # label[0][0][0] = -0.0910
+        assert isclose(float(output[0][0][0]), -0.1006, abs_tol=1e-04)
 
         # AWQ
         woq_config = AwqConfig(
@@ -69,14 +72,14 @@ def test_quantization_for_llm(self):
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
-        assert isclose(float(output[0][0][0][0]), 0.19592927396297455, rel_tol=1e-04)
+        assert isclose(float(output[0][0][0][0]), -0.1045, abs_tol=1e-04)
 
         # TEQ
         woq_config = TeqConfig(bits=4, n_samples=5, batch_size=1, seq_len=512, group_size=16, tokenizer=tokenizer)
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
-        assert isclose(float(output[0][0][0][0]), 0.17786270380020142, rel_tol=1e-04)
+        assert isclose(float(output[0][0][0][0]), -0.1006, abs_tol=1e-04)
 
         # GPTQ
         woq_config = GPTQConfig(
@@ -95,11 +98,11 @@ def test_quantization_for_llm(self):
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
-        # Since the output of torch.cholesky() has changed in different Torch versions
+        # The output of torch.cholesky() changes on different torch version
         if ipex_version < Version("2.5.0"):
-            assert isclose(float(output[0][0][0][0]), 0.17234990000724792, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.08614, abs_tol=1e-04)
         else:
-            assert isclose(float(output[0][0][0][0]), 0.17049233615398407, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.0874, abs_tol=1e-04)
 
         # AUTOROUND
         woq_config = AutoRoundConfig(
@@ -108,10 +111,11 @@ def test_quantization_for_llm(self):
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config)
         woq_model.eval()
         output = woq_model(dummy_input)
+        # The output might change when device supports bf16
         if CpuInfo().bf16:
-            assert isclose(float(output[0][0][0][0]), 0.19140625, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.07275, abs_tol=1e-04)
         else:
-            assert isclose(float(output[0][0][0][0]), 0.18400897085666656, rel_tol=1e-04)
+            assert isclose(float(output[0][0][0][0]), -0.0786, abs_tol=1e-04)
 
     def test_save_load(self):
         model_name_or_path = self.model_name_or_path
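
Note (illustrative, not part of the patch): the layer_wise change above makes load_tensor_from_safetensors retry the lookup with the prefix stripped before raising. Below is a minimal standalone sketch of that lookup using safetensors' safe_open API; the helper name and the key/prefix in the usage comment are hypothetical.

# Illustrative sketch of the prefix fallback added to load_tensor_from_safetensors.
from safetensors import safe_open


def load_tensor_with_prefix_fallback(path, tensor_name, prefix=None, device="cpu"):
    """Return the tensor stored under tensor_name, retrying with the prefix stripped."""
    with safe_open(path, framework="pt", device=device) as f:
        if tensor_name in f.keys():
            return f.get_tensor(tensor_name)
        # e.g. param name "model.lm_head.weight" vs. checkpoint key "lm_head.weight"
        if prefix and tensor_name.replace(f"{prefix}.", "") in f.keys():
            return f.get_tensor(tensor_name.replace(f"{prefix}.", ""))
        raise ValueError(f"Tensor '{tensor_name}' not found in the file '{path}'")


# Hypothetical usage:
# value = load_tensor_with_prefix_fallback("model.safetensors", "model.lm_head.weight", prefix="model")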
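
Note (illustrative, not part of the patch): the modeling_auto change replaces the string comparison transformers.__version__ >= "4.50" with packaging.version. A quick check of why: string comparison is lexicographic and mis-orders multi-digit version components.

# Lexicographic string comparison mis-orders versions with multi-digit components,
# while packaging.version compares them numerically (as the patch now does).
from packaging.version import parse

print("4.9.0" >= "4.50")                # True, although 4.9 predates 4.50
print(parse("4.9.0") >= parse("4.50"))  # False, the intended ordering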
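
Note (illustrative, not part of the patch): both save_load.py and modeling_auto.py now build init_contexts behind a version gate, reflecting the patch's assumption that transformers 4.51 drops the _enable argument of no_init_weights. A hedged sketch of that gate follows; _fast_init = True is an illustrative stand-in for the flag used in the surrounding code.

# Sketch of the transformers version gate used in the patch; `_fast_init` is illustrative.
import transformers
from packaging.version import parse
from transformers.modeling_utils import no_init_weights

_fast_init = True

if parse(transformers.__version__) < parse("4.51"):
    # Older transformers: no_init_weights still accepts the _enable flag.
    init_contexts = [no_init_weights(_enable=_fast_init)]
else:
    # Newer transformers (per the patch's assumption): no_init_weights takes no arguments.
    init_contexts = [no_init_weights()]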