diff --git a/docs/source/ar/llm_tutorial_optimization.md b/docs/source/ar/llm_tutorial_optimization.md
index 59f17f7f8a92..887f718241f4 100644
--- a/docs/source/ar/llm_tutorial_optimization.md
+++ b/docs/source/ar/llm_tutorial_optimization.md
@@ -231,7 +231,7 @@ flush()
 دعنا نرى ما هو استهلاك ذاكرة GPU الذروة الذي يوفره تكميم 4 بت. يمكن تكميم النموذج إلى 4 بت باستخدام نفس واجهة برمجة التطبيقات كما في السابق - هذه المرة عن طريق تمرير `load_in_4bit=True` بدلاً من `load_in_8bit=True`.
 ```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
diff --git a/docs/source/ar/trainer.md b/docs/source/ar/trainer.md
index b6bc21974430..8b0d05c783d0 100644
--- a/docs/source/ar/trainer.md
+++ b/docs/source/ar/trainer.md
@@ -459,7 +459,7 @@ args = TrainingArguments(
 model_id = "google/gemma-2b"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id، low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
 trainer = trl.SFTTrainer(
     model=model،
@@ -503,7 +503,7 @@ args = TrainingArguments(
 # تحميل النموذج والمجزىء اللغوي
 model_id = "google/gemma-2b"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
 # تهيئة المدرب
 trainer = Trainer(
@@ -547,7 +547,7 @@ args = TrainingArguments(
 model_id = "google/gemma-2b"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
 trainer = trl.SFTTrainer(
     model=model,
diff --git a/docs/source/en/internal/model_debugging_utils.md b/docs/source/en/internal/model_debugging_utils.md
index 69f622ae1096..f43b70ea9fb1 100644
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@@ -51,7 +51,7 @@ torch.random.manual_seed(673)
 # load pretrained model and processor
 model_id = "llava-hf/llava-1.5-7b-hf"
 processor = LlavaProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
+model = LlavaForConditionalGeneration.from_pretrained(model_id)
 # create random image input
 random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy())
diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md
index c7c53765a2f8..038aa76689f4 100644
--- a/docs/source/en/llm_tutorial_optimization.md
+++ b/docs/source/en/llm_tutorial_optimization.md
@@ -236,7 +236,7 @@ flush()
 Let's see what peak GPU memory consumption 4-bit quantization gives. Quantizing the model to 4-bit can be done with the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.
 ```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md
index 3810b3590a00..e7edca9fd3a3 100644
--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@@ -170,7 +170,6 @@ model_id = "facebook/chameleon-7b"
 model = ChameleonForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
     attn_implementation="flash_attention_2"
 ).to(0)
 ```
diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md
index 2af882b6118d..cfc60d074c7e 100644
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@@ -157,7 +157,7 @@ import requests
 processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
+model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16)
 model.to("cuda:0")
 # prepare image and text prompt, using the appropriate prompt template
@@ -292,7 +292,6 @@ from transformers import AutoModelForImageTextToText
 model = AutoModelForImageTextToText.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
     use_flash_attention_2=True
 ).to(0)
 ```
diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md
index 14d5f6508add..da3359f7e320 100644
--- a/docs/source/en/model_doc/llava_onevision.md
+++ b/docs/source/en/model_doc/llava_onevision.md
@@ -121,7 +121,6 @@ processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-
 model = LlavaOnevisionForConditionalGeneration.from_pretrained(
     "llava-hf/llava-onevision-qwen2-7b-ov-hf",
     torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
     device_map="cuda:0"
 )
@@ -286,7 +285,6 @@ from transformers import LlavaOnevisionForConditionalGeneration
 model = LlavaOnevisionForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
     use_flash_attention_2=True
 ).to(0)
 ```
diff --git a/docs/source/en/models.md b/docs/source/en/models.md
index 4cef0d1553ef..fb76f0264bec 100644
--- a/docs/source/en/models.md
+++ b/docs/source/en/models.md
@@ -148,11 +148,6 @@ You need enough memory to hold two copies of the model weights (random and pretr
 Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types.
-### Fast initialization
-
-A PyTorch model is instantiated with random weights, or "empty" tensors, that take up space in memory without filling it.
-
-Transformers boosts loading speed by skipping random weight initialization with the [_fast_init](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter if the pretrained weights are correctly initialized. This parameter is set to `True` by default.
 ### Sharded checkpoints
@@ -245,7 +240,7 @@ Big Model Inference's second feature relates to how weights are loaded and dispa
 Both features combined reduces memory usage and loading times for big pretrained models.
-Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference. This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`, such that not more than 1x the model size is used in CPU memory.
+Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference.
 ```py
 from transformers import AutoModelForCausalLM
diff --git a/docs/source/ja/main_classes/model.md b/docs/source/ja/main_classes/model.md
index 916040c4a3b2..0923fea1cc05 100644
--- a/docs/source/ja/main_classes/model.md
+++ b/docs/source/ja/main_classes/model.md
@@ -39,19 +39,8 @@ rendered properly in your Markdown viewer.
 Transformers 4.20.0では、[`~PreTrainedModel.from_pretrained`] メソッドが再設計され、[Accelerate](https://huggingface.co/docs/accelerate/big_modeling) を使用して大規模モデルを扱うことが可能になりました。これには Accelerate >= 0.9.0 と PyTorch >= 1.9.0 が必要です。以前の方法でフルモデルを作成し、その後事前学習の重みを読み込む代わりに(これにはメモリ内のモデルサイズが2倍必要で、ランダムに初期化されたモデル用と重み用の2つが必要でした)、モデルを空の外殻として作成し、事前学習の重みが読み込まれるときにパラメーターを実体化するオプションが追加されました。
-このオプションは `low_cpu_mem_usage=True` で有効にできます。モデルはまず空の重みを持つメタデバイス上に作成され、その後状態辞書が内部に読み込まれます(シャードされたチェックポイントの場合、シャードごとに読み込まれます)。この方法で使用される最大RAMは、モデルの完全なサイズだけです。
-
-
-```py
-from transformers import AutoModelForSeq2SeqLM
-
-t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
-```
-
 さらに、モデルが完全にRAMに収まらない場合(現時点では推論のみ有効)、異なるデバイスにモデルを直接配置できます。`device_map="auto"` を使用すると、Accelerateは各レイヤーをどのデバイスに配置するかを決定し、最速のデバイス(GPU)を最大限に活用し、残りの部分をCPU、あるいはGPU RAMが不足している場合はハードドライブにオフロードします。モデルが複数のデバイスに分割されていても、通常どおり実行されます。
-`device_map` を渡す際、`low_cpu_mem_usage` は自動的に `True` に設定されるため、それを指定する必要はありません。
-
 ```py
 from transformers import AutoModelForSeq2SeqLM
diff --git a/docs/source/ko/llm_tutorial_optimization.md b/docs/source/ko/llm_tutorial_optimization.md
index d43affd288fc..5a95e2d9b596 100644
--- a/docs/source/ko/llm_tutorial_optimization.md
+++ b/docs/source/ko/llm_tutorial_optimization.md
@@ -227,7 +227,7 @@ flush()
 이제 4비트 양자화가 제공하는 최대 GPU 메모리 사용량을 확인해 봅시다. 4비트로 모델을 양자화하려면 이전과 동일한 API를 사용하되 이번에는 `load_in_8bit=True` 대신 `load_in_4bit=True`를 전달하면 됩니다.
 ```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
diff --git a/docs/source/ko/model_doc/chameleon.md b/docs/source/ko/model_doc/chameleon.md
index 14a18a09765b..ac2fa16b7703 100644
--- a/docs/source/ko/model_doc/chameleon.md
+++ b/docs/source/ko/model_doc/chameleon.md
@@ -148,7 +148,6 @@ model_id = "facebook/chameleon-7b"
 model = ChameleonForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
     attn_implementation="flash_attention_2"
 ).to(0)
 ```
diff --git a/docs/source/ko/trainer.md b/docs/source/ko/trainer.md
index 0e6f1d7ed59c..7def9cccd894 100644
--- a/docs/source/ko/trainer.md
+++ b/docs/source/ko/trainer.md
@@ -421,7 +421,7 @@ args = TrainingArguments(
 model_id = "google/gemma-2b"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)
 trainer = trl.SFTTrainer(
     model=model,
diff --git a/docs/source/zh/main_classes/model.md b/docs/source/zh/main_classes/model.md
index 6c0ee3e2b2c0..57c1b374ed1d 100644
--- a/docs/source/zh/main_classes/model.md
+++ b/docs/source/zh/main_classes/model.md
@@ -29,18 +29,8 @@ http://www.apache.org/licenses/LICENSE-2.0
 在 Transformers 4.20.0 中,[`~PreTrainedModel.from_pretrained`] 方法已重新设计,以适应使用 [Accelerate](https://huggingface.co/docs/accelerate/big_modeling) 加载大型模型的场景。这需要您使用的 Accelerate 和 PyTorch 版本满足: Accelerate >= 0.9.0, PyTorch >= 1.9.0。除了创建完整模型,然后在其中加载预训练权重(这会占用两倍于模型大小的内存空间,一个用于随机初始化模型,一个用于预训练权重),我们提供了一种选项,将模型创建为空壳,然后只有在加载预训练权重时才实例化其参数。
-您可以使用 `low_cpu_mem_usage=True` 激活此选项。首先,在 Meta 设备上创建模型(带有空权重),然后将状态字典加载到其中(在分片检查点的情况下逐片加载)。这样,最大使用的内存占用仅为模型的完整大小。
-
-```python
-from transformers import AutoModelForSeq2SeqLM
-
-t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
-```
-
 此外,如果内存不足以放下加载整个模型(目前仅适用于推理),您可以直接将模型放置在不同的设备上。使用 `device_map="auto"`,Accelerate 将确定将每一层放置在哪个设备上,以最大化使用最快的设备(GPU),并将其余部分卸载到 CPU,甚至硬盘上(如果您没有足够的 GPU 内存 或 CPU 内存)。即使模型分布在几个设备上,它也将像您通常期望的那样运行。
-在传递 `device_map` 时,`low_cpu_mem_usage` 会自动设置为 `True`,因此您不需要指定它:
-
 ```python
 from transformers import AutoModelForSeq2SeqLM
diff --git a/examples/pytorch/language-modeling/README.md b/examples/pytorch/language-modeling/README.md
index 700d1a2b5613..e1b2beddf4e9 100644
--- a/examples/pytorch/language-modeling/README.md
+++ b/examples/pytorch/language-modeling/README.md
@@ -229,10 +229,6 @@ sure all your batches have the same length.
 To use the streaming dataset mode which can be very useful for large datasets, add `--streaming` to the command line. This is supported by `run_mlm.py`, `run_clm.py` and `run_fim.py`. Make sure to adapt the other scripts to your use case by taking inspiration from them.
-## Low Cpu Memory Usage
-
-To use low cpu memory mode which can be very useful for LLM, add `--low_cpu_mem_usage` to the command line. This is currently supported by `run_clm.py`,`run_mlm.py`, `run_plm.py`, `run_fim.py`, `run_mlm_no_trainer.py`, `run_clm_no_trainer.py` and `run_fim_no_trainer.py`.
- ## Creating a model on the fly When training a model from scratch, configuration values may be overridden with the help of `--config_overrides`: diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 63a4ef469e75..8082df71e1ab 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -139,15 +139,6 @@ class ModelArguments: "choices": ["auto", "bfloat16", "float16", "float32"], }, ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "set True will benefit LLM loading time and RAM consumption." - ) - }, - ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -432,7 +423,6 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 662899ecd198..d11798e034a8 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -228,14 +228,6 @@ def parse_args(): "Only applicable when `--with_tracking` is passed." ), ) - parser.add_argument( - "--low_cpu_mem_usage", - action="store_true", - help=( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "If passed, LLM loading time and RAM consumption will be benefited." - ), - ) args = parser.parse_args() # Sanity checks @@ -409,7 +401,6 @@ def main(): args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, - low_cpu_mem_usage=args.low_cpu_mem_usage, trust_remote_code=args.trust_remote_code, ) else: diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index eca05c10dc35..d1698a949ff3 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -142,15 +142,6 @@ class ModelArguments: "choices": ["auto", "bfloat16", "float16", "float32"], }, ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "set True will benefit LLM loading time and RAM consumption." - ) - }, - ) pad_to_multiple_of: bool = field( default=False, metadata={ @@ -501,7 +492,6 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, attn_implementation=model_args.attn_implementation, ) diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 654b870025ba..8c601e408306 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -288,14 +288,6 @@ def parse_args(): "Only applicable when `--with_tracking` is passed." 
), ) - parser.add_argument( - "--low_cpu_mem_usage", - action="store_true", - help=( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "If passed, LLM loading time and RAM consumption will be benefited." - ), - ) args = parser.parse_args() # Sanity checks @@ -474,7 +466,6 @@ def main(): args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, - low_cpu_mem_usage=args.low_cpu_mem_usage, trust_remote_code=args.trust_remote_code, ) else: diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 983e2a02b97d..79e7a585bd06 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -136,15 +136,6 @@ class ModelArguments: "choices": ["auto", "bfloat16", "float16", "float32"], }, ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "set True will benefit LLM loading time and RAM consumption." - ) - }, - ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -436,7 +427,6 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: logger.info("Training new model from scratch") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index eb68534b9d02..134d23478299 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -235,14 +235,6 @@ def parse_args(): "Only applicable when `--with_tracking` is passed." ), ) - parser.add_argument( - "--low_cpu_mem_usage", - action="store_true", - help=( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "If passed, LLM loading time and RAM consumption will be benefited." - ), - ) args = parser.parse_args() # Sanity checks @@ -406,7 +398,6 @@ def main(): args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, - low_cpu_mem_usage=args.low_cpu_mem_usage, trust_remote_code=args.trust_remote_code, ) else: diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index a7106896d00e..b12d3526c273 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -103,15 +103,6 @@ class ModelArguments: ) }, ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "set True will benefit LLM loading time and RAM consumption." 
- ) - }, - ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -397,7 +388,6 @@ def main(): cache_dir=model_args.cache_dir, revision=model_args.model_revision, token=model_args.token, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: logger.info("Training new model from scratch") diff --git a/src/transformers/model_debugging_utils.py b/src/transformers/model_debugging_utils.py index d09cfa24a72a..2df9b2ac65f8 100644 --- a/src/transformers/model_debugging_utils.py +++ b/src/transformers/model_debugging_utils.py @@ -429,7 +429,7 @@ def model_addition_debugger_context( # load pretrained model and processor model_id = "llava-hf/llava-1.5-7b-hf" processor = LlavaProcessor.from_pretrained(model_id) - model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True) + model = LlavaForConditionalGeneration.from_pretrained(model_id) # create random image input random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy()) diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py index a95f3cda8349..e55c3475e5e1 100644 --- a/src/transformers/models/aria/convert_aria_weights_to_hf.py +++ b/src/transformers/models/aria/convert_aria_weights_to_hf.py @@ -37,7 +37,7 @@ # load model kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) + model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", **kwargs) # load vision tower model.get_vision_tower().load_model() diff --git a/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py b/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py index d3a8c4b8f5a4..9c7363041d33 100644 --- a/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py +++ b/src/transformers/models/falcon_h1/convert_mamba_ssm_checkpoint.py @@ -41,9 +41,7 @@ def convert_falcon_h1_to_hf(input_model_path, output_path): tokenizer = AutoTokenizer.from_pretrained(input_model_path) - model = AutoModelForCausalLM.from_pretrained( - input_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True, low_cpu_mem_usage=True - ) + model = AutoModelForCausalLM.from_pretrained(input_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) intermediate_size = int(model.config.expansion_factor * model.config.hidden_size) diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py index a14372666885..a390166a042b 100644 --- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py +++ b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py @@ -187,7 +187,6 @@ def load_original_state_dict(input_base_path): model = AutoModel.from_pretrained( input_base_path, torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, use_flash_attn=False, trust_remote_code=True, ).eval() diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index e8282ef7438d..5ba1418a113f 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -419,7 +419,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): gc.collect() print("Loading the checkpoint in a Llama model.") - model = 
LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) + model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16) # Avoid saving this as part of the config. del model.config._name_or_path diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py index 33dbe37d581c..3631de33af64 100644 --- a/src/transformers/models/llava/convert_llava_weights_to_hf.py +++ b/src/transformers/models/llava/convert_llava_weights_to_hf.py @@ -40,7 +40,7 @@ # load model kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", low_cpu_mem_usage=True, **kwargs) + model = LlavaLlamaForCausalLM.from_pretrained("liuhaotian/llava-v1.5-7b", **kwargs) # load vision tower model.get_vision_tower().load_model() diff --git a/src/transformers/models/olmo/convert_olmo_weights_to_hf.py b/src/transformers/models/olmo/convert_olmo_weights_to_hf.py index b3a2ad80b01f..c0b590a03058 100644 --- a/src/transformers/models/olmo/convert_olmo_weights_to_hf.py +++ b/src/transformers/models/olmo/convert_olmo_weights_to_hf.py @@ -175,7 +175,7 @@ def write_model(model_path, input_base_path, tokenizer_path=None, safe_serializa _write_tokenizer(model_path, config, tokenizer_path, fix_eos_token_id) print("Loading the checkpoint in a OLMo model.") - model = OlmoForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True) + model = OlmoForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32) # Avoid saving this as part of the config. del model.config._name_or_path print("Saving in the Transformers format.") diff --git a/src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py b/src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py index 1e8fb54ddb65..86d403916a35 100644 --- a/src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py +++ b/src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py @@ -205,7 +205,7 @@ def write_model( _write_tokenizer(model_path, config, input_base_path, tokenizer_path) print("Loading the checkpoint in a OLMo2 model.") - model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True) + model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32) # Avoid saving this as part of the config. 
del model.config._name_or_path print("Saving in the Transformers format.") diff --git a/src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py b/src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py index fff886f8a833..ecb5cfa4e12d 100644 --- a/src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py +++ b/src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py @@ -37,7 +37,7 @@ # load model kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = VideoLlavaForCausalLM.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", low_cpu_mem_usage=True, **kwargs) + model = VideoLlavaForCausalLM.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", **kwargs) # load vision tower model.get_vision_tower().load_model() diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 695eef9708de..8f820562f329 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2337,7 +2337,6 @@ def _test_attention_implementation(self, attn_implementation): model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - low_cpu_mem_usage=True, attn_implementation="eager", ).to(torch_device) res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) @@ -2347,7 +2346,6 @@ def _test_attention_implementation(self, attn_implementation): model_attn = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - low_cpu_mem_usage=True, attn_implementation=attn_implementation, ).to(torch_device) res_attn = model_attn.generate(**inputs_dict, **generate_kwargs) @@ -3724,7 +3722,6 @@ def test_validate_assistant(self): processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, - low_cpu_mem_usage=True, use_safetensors=True, ) model.to(torch_device) @@ -3743,7 +3740,6 @@ def test_validate_assistant(self): # Load its decoder only version: assistant_causal_lm = AutoModelForCausalLM.from_pretrained( assistant_distil_model_id, - low_cpu_mem_usage=True, use_safetensors=True, ).to(torch_device) self.assertTrue(model.generate(**features, assistant_model=assistant_causal_lm).sum()) @@ -3759,7 +3755,6 @@ def test_validate_assistant(self): # Load its decoder only version: assistant_causal_lm = AutoModelForCausalLM.from_pretrained( assistant_distil_model_id, - low_cpu_mem_usage=True, use_safetensors=True, ).to(torch_device) # It will raise an error as the encoder of the main and assistant model are not compatible: diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py index 8213a85c1752..1d192b8d9de9 100644 --- a/tests/models/bamba/test_modeling_bamba.py +++ b/tests/models/bamba/test_modeling_bamba.py @@ -556,7 +556,6 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids_seq_id tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, ) .to(torch_device) .eval() @@ -600,7 +599,7 @@ class BambaModelIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): model_id = "ibm-fms/Bamba-9B" - cls.model = BambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) + cls.model = BambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) cls.tokenizer = AutoTokenizer.from_pretrained(model_id) # feels a bit forced to have to do this for the generation test diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 
c2add22ee54c..f2f009efb913 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -238,9 +238,7 @@ def test_batched_small_model_logits(self): ).to(device=torch_device, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_id) - model = CohereForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( - torch_device - ) + model = CohereForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device) tokenizer.pad_token = tokenizer.eos_token diff --git a/tests/models/cohere2/test_modeling_cohere2.py b/tests/models/cohere2/test_modeling_cohere2.py index 02a33d8f611a..4338e4a070e5 100644 --- a/tests/models/cohere2/test_modeling_cohere2.py +++ b/tests/models/cohere2/test_modeling_cohere2.py @@ -144,7 +144,7 @@ def test_model_bf16(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + model_id, torch_dtype=torch.bfloat16, attn_implementation="eager" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -168,7 +168,7 @@ def test_model_fp16(self): # fmt: on model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" + model_id, torch_dtype=torch.float16, attn_implementation="eager" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -189,7 +189,7 @@ def test_model_pipeline_bf16(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) diff --git a/tests/models/dac/test_modeling_dac.py b/tests/models/dac/test_modeling_dac.py index d001a953cef4..8de3fb818b7b 100644 --- a/tests/models/dac/test_modeling_dac.py +++ b/tests/models/dac/test_modeling_dac.py @@ -280,18 +280,6 @@ def test_attention_outputs(self): def test_hidden_states_output(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py index 7677b909c3be..e6a02626d84b 100644 --- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py +++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py @@ -459,7 +459,6 @@ def test_eager_matches_sdpa_generate(self): model_sdpa = DeepseekV3ForCausalLM.from_pretrained( "bzantium/tiny-deepseek-v3", torch_dtype=torch.float16, - low_cpu_mem_usage=True, ).to(torch_device) self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") @@ -467,7 +466,6 @@ def test_eager_matches_sdpa_generate(self): model_eager = DeepseekV3ForCausalLM.from_pretrained( "bzantium/tiny-deepseek-v3", torch_dtype=torch.float16, - low_cpu_mem_usage=True, attn_implementation="eager", ).to(torch_device) diff --git 
a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 718673e617f3..7c1c7ee1b06f 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -605,18 +605,6 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - def test_two_stage_training(self): model_class = DeformableDetrForObjectDetection config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/diffllama/test_modeling_diffllama.py b/tests/models/diffllama/test_modeling_diffllama.py index 0b3ef078905f..25ca02d5ba43 100644 --- a/tests/models/diffllama/test_modeling_diffllama.py +++ b/tests/models/diffllama/test_modeling_diffllama.py @@ -514,7 +514,6 @@ def test_eager_matches_sdpa_generate(self): model_sdpa = DiffLlamaForCausalLM.from_pretrained( "kajuma/DiffLlama-0.3B-handcut", torch_dtype=torch.float16, - low_cpu_mem_usage=True, ).to(torch_device) self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") @@ -522,7 +521,6 @@ def test_eager_matches_sdpa_generate(self): model_eager = DiffLlamaForCausalLM.from_pretrained( "kajuma/DiffLlama-0.3B-handcut", torch_dtype=torch.float16, - low_cpu_mem_usage=True, attn_implementation="eager", ).to(torch_device) diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index bb6458bbc3f3..21e9ac104056 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -343,18 +343,6 @@ def test_feed_forward_chunking(self): def test_hidden_states_output(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index d34128ba067e..ef906951a490 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -381,9 +381,7 @@ def test_initialization(self): @slow # Ignore copy def test_model_from_pretrained(self): - model = FalconMambaModel.from_pretrained( - "tiiuae/falcon-mamba-7b", torch_dtype=torch.float16, low_cpu_mem_usage=True - ) + model = FalconMambaModel.from_pretrained("tiiuae/falcon-mamba-7b", torch_dtype=torch.float16) self.assertIsNotNone(model) def test_model_outputs_equivalence(self): diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 058ccd74cd7a..f468d205ab7c 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ 
b/tests/models/gemma/test_modeling_gemma.py @@ -126,9 +126,7 @@ def test_model_2b_fp16(self): "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device) model.generation_config.cache_implementation = "static" @@ -149,9 +147,7 @@ def test_model_2b_bf16(self): "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -171,9 +167,7 @@ def test_model_2b_eager(self): ] # bfloat16 gives strange values, likely due to it has lower precision + very short prompts - model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, attn_implementation="eager") model.to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -195,7 +189,7 @@ def test_model_2b_flash_attn(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" ) model.to(torch_device) @@ -216,7 +210,7 @@ def test_model_2b_4bit(self): "Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) + model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -235,7 +229,7 @@ def test_model_7b_fp32(self): "Hi,\n\nI have a problem with my 2005 1.6 16", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -256,9 +250,7 @@ def test_model_7b_fp16(self): "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -290,9 +282,7 @@ def test_model_7b_bf16(self): # fmt: on expected_text = EXPECTED_TEXTS.get_expectation() - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) tokenizer = 
AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -312,9 +302,7 @@ def test_model_7b_fp16_static_cache(self): "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device) model.generation_config.cache_implementation = "static" @@ -333,7 +321,7 @@ def test_model_7b_4bit(self): "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) + model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -451,9 +439,7 @@ def test_model_2b_bf16_dola(self): "Hi today we have the review for a 2016/2017 season of", ] - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( - torch_device - ) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 825bf1650654..808646186c26 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -197,7 +197,7 @@ def test_model_9b_bf16(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + model_id, torch_dtype=torch.bfloat16, attn_implementation="eager" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -218,7 +218,7 @@ def test_model_9b_fp16(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" + model_id, torch_dtype=torch.float16, attn_implementation="eager" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -241,7 +241,7 @@ def test_model_9b_pipeline_bf16(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) @@ -271,7 +271,7 @@ def test_model_2b_pipeline_bf16_flex_attention(self): EXPECTED_BATCH_TEXT = EXPECTED_BATCH_TEXTS.get_expectation() model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) @@ -419,7 +419,7 @@ def test_model_9b_bf16_flex_attention(self): ] model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + 
model_id, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" ).to(torch_device) assert model.config._attn_implementation == "flex_attention" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index 40d6166ab062..b0b25579b82d 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -391,9 +391,7 @@ def tearDown(self): def test_model_4b_bf16(self): model_id = "google/gemma-3-4b-it" - model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) inputs = self.processor.apply_chat_template( self.messages, @@ -421,9 +419,7 @@ def test_model_4b_bf16(self): def test_model_4b_batch(self): model_id = "google/gemma-3-4b-it" - model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) messages_2 = [ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, @@ -474,9 +470,7 @@ def test_model_4b_batch(self): def test_model_4b_crops(self): model_id = "google/gemma-3-4b-it" - model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) crop_config = { "images_kwargs": { @@ -516,9 +510,7 @@ def test_model_4b_crops(self): def test_model_4b_batch_crops(self): model_id = "google/gemma-3-4b-it" - model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) crop_config = { "images_kwargs": { "do_pan_and_scan": True, @@ -576,9 +568,7 @@ def test_model_4b_batch_crops(self): def test_model_4b_multiimage(self): model_id = "google/gemma-3-4b-it" - model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) messages = [ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, @@ -616,9 +606,7 @@ def test_model_4b_multiimage(self): def test_model_1b_text_only(self): model_id = "google/gemma-3-1b-it" - model = Gemma3ForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( - torch_device - ) + model = Gemma3ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left") inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device) diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index 5438b4d158cb..212bcbbaf1a5 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -88,7 +88,7 @@ def test_model_9b_fp16(self): ] model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, 
torch_dtype=torch.float16, revision=self.revision + self.model_id, torch_dtype=torch.float16, revision=self.revision ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) @@ -106,7 +106,7 @@ def test_model_9b_bf16(self): ] model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision + self.model_id, torch_dtype=torch.bfloat16, revision=self.revision ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) @@ -125,7 +125,6 @@ def test_model_9b_eager(self): model = AutoModelForCausalLM.from_pretrained( self.model_id, - low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager", revision=self.revision, @@ -149,7 +148,6 @@ def test_model_9b_sdpa(self): model = AutoModelForCausalLM.from_pretrained( self.model_id, - low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa", revision=self.revision, @@ -174,7 +172,6 @@ def test_model_9b_flash_attn(self): model = AutoModelForCausalLM.from_pretrained( self.model_id, - low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", revision=self.revision, diff --git a/tests/models/glm4/test_modeling_glm4.py b/tests/models/glm4/test_modeling_glm4.py index 2775b401d5ee..5655e832223c 100644 --- a/tests/models/glm4/test_modeling_glm4.py +++ b/tests/models/glm4/test_modeling_glm4.py @@ -104,9 +104,7 @@ def test_model_9b_fp16(self): ) EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() - model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16 - ).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -132,9 +130,7 @@ def test_model_9b_bf16(self): ) EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() - model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id) inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) @@ -162,7 +158,6 @@ def test_model_9b_eager(self): model = AutoModelForCausalLM.from_pretrained( self.model_id, - low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager", ) @@ -195,7 +190,6 @@ def test_model_9b_sdpa(self): model = AutoModelForCausalLM.from_pretrained( self.model_id, - low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa", ) @@ -226,7 +220,6 @@ def test_model_9b_flash_attn(self): model = AutoModelForCausalLM.from_pretrained( self.model_id, - low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", ) diff --git a/tests/models/helium/test_modeling_helium.py b/tests/models/helium/test_modeling_helium.py index cb46167bae41..7a1881047177 100644 --- a/tests/models/helium/test_modeling_helium.py +++ b/tests/models/helium/test_modeling_helium.py @@ -87,9 +87,9 @@ def test_model_2b(self): "Hello, today is a great day to start a new project. 
I have been working on a new project for a while now and I have" ] - model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision="refs/pr/1" - ).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, revision="refs/pr/1").to( + torch_device + ) tokenizer = AutoTokenizer.from_pretrained(model_id, revision="refs/pr/1") inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 923a8749c610..5a7dbae587f8 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -727,7 +727,7 @@ class InstructBlipModelIntegrationTest(unittest.TestCase): def test_inference_vicuna_7b(self): processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") model = InstructBlipForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + "Salesforce/instructblip-vicuna-7b", load_in_8bit=True ) url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" @@ -752,7 +752,6 @@ def test_inference_flant5_xl(self): model = InstructBlipForConditionalGeneration.from_pretrained( "Salesforce/instructblip-flan-t5-xl", torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, ).to(torch_device) url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" @@ -789,7 +788,6 @@ def test_inference_interpolate_pos_encoding(self): model = InstructBlipForConditionalGeneration.from_pretrained( "Salesforce/instructblip-flan-t5-xl", torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, ).to(torch_device) processor.image_processor.size = {"height": 500, "width": 500} @@ -810,7 +808,6 @@ def test_expansion_in_processing(self): model = InstructBlipForConditionalGeneration.from_pretrained( "Salesforce/instructblip-flan-t5-xl", torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, ).to(torch_device) image = prepare_img() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index a7870a4b29cd..17e6b0a64d75 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -744,7 +744,8 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase): def test_inference_vicuna_7b(self): processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") model = InstructBlipVideoForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + "Salesforce/instructblip-vicuna-7b", + load_in_8bit=True, ) clip = prepare_video() @@ -762,7 +763,8 @@ def test_inference_vicuna_7b(self): def test_expansion_in_processing(self): processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") model = InstructBlipVideoForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + "Salesforce/instructblip-vicuna-7b", + load_in_8bit=True, ) clip = prepare_video() diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index cd27180a5cfe..f73e05e2c1c7 100644 --- 
a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -527,7 +527,6 @@ def test_flash_attn_2_fp32_ln(self): tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, load_in_4bit=True, ) @@ -563,7 +562,10 @@ class JambaModelIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): model_id = "ai21labs/Jamba-tiny-dev" - cls.model = JambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) + cls.model = JambaForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + ) cls.tokenizer = AutoTokenizer.from_pretrained(model_id) cls.device_properties = get_device_properties() diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 6b3953da7094..754e06a3c729 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -765,18 +765,6 @@ def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): return tf_inputs_dict - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - @unittest.skip( reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" ) diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index ed42b1b29f00..291814efde5d 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -351,18 +351,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - def assert_tensors_close(a, b, atol=1e-12, prefix=""): """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" diff --git a/tests/models/minimax/test_modeling_minimax.py b/tests/models/minimax/test_modeling_minimax.py index eee664612793..b9ae9d451555 100644 --- a/tests/models/minimax/test_modeling_minimax.py +++ b/tests/models/minimax/test_modeling_minimax.py @@ -246,9 +246,10 @@ def test_small_model_logits(self): model_id = "hf-internal-testing/MiniMax-tiny" dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device) - model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to( - torch_device - ) + model = MiniMaxForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + ).to(torch_device) expected_slice = torch.tensor( [[1.0312, -0.5156, -0.3262], [-0.1152, 0.4336, 0.2412], [1.2188, -0.5898, -0.0381]] ).to(torch_device) @@ -265,9 +266,10 @@ def test_small_model_generation(self): model_id = "hf-internal-testing/MiniMax-tiny" dummy_input = 
torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device) - model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to( - torch_device - ) + model = MiniMaxForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + ).to(torch_device) expected_slice = ( torch.tensor([[0, 1, 0, 933, 307, 3102, 2457, 1208], [0, 1, 0, 933, 307, 3102, 2457, 1208]]) .to(torch.int64) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index efe076e70abf..97e00e13a63d 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -156,9 +156,10 @@ def test_small_model_logits(self): model_id = "hf-internal-testing/Mixtral-tiny" dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device) - model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to( - torch_device - ) + model = MixtralForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + ).to(torch_device) # TODO: might need to tweak it in case the logits do not match on our daily runners # these logits have been obtained with the original megablocks implementation. # ("cuda", 8) for A100/A10, and ("cuda", 7) for T4 @@ -189,9 +190,10 @@ def test_small_model_logits_batched(self): dummy_input = torch.LongTensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]).to(torch_device) attention_mask = dummy_input.ne(0).to(torch.long) - model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to( - torch_device - ) + model = MixtralForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + ).to(torch_device) # TODO: might need to tweak it in case the logits do not match on our daily runners # diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 5d7d83a38e39..4f5b1689594e 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -722,7 +722,6 @@ def test_eager_matches_sdpa_generate(self): model_sdpa = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - low_cpu_mem_usage=True, ).to(torch_device) self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") @@ -730,7 +729,6 @@ def test_eager_matches_sdpa_generate(self): model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - low_cpu_mem_usage=True, attn_implementation="eager", ).to(torch_device) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index b051fa9657bb..3386ea71a0a0 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -788,18 +788,6 @@ def test_tied_model_weights_key_ignore(self): def test_tied_weights_keys(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - # override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work def test_retain_grad_hidden_states_attentions(self): config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 86fe12b324a8..37d7736d1c94 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -789,18 +789,6 @@ def test_tied_model_weights_key_ignore(self): def test_tied_weights_keys(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - # override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work # Ignore copy def test_retain_grad_hidden_states_attentions(self): diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 388661ac1fd3..d0d42f61d2c6 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -326,18 +326,6 @@ def test_determinism(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - @unittest.skip( reason="VLMs doesn't accept inputs embeds and pixel values at the same time. So if the test passed for backbone LM, it passes for VLM also" ) diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py index 95cb5d2785b0..c9a53efa14a7 100644 --- a/tests/models/paligemma2/test_modeling_paligemma2.py +++ b/tests/models/paligemma2/test_modeling_paligemma2.py @@ -316,18 +316,6 @@ def test_determinism(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="PaliGemma does not support low_cpu_mem_usage.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - @unittest.skip( reason="VLMs doesn't accept inputs embeds and pixel values at the same time. 
So if the test passed for backbone LM, it passes for VLM also" ) diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index 232dd7f644ba..99c26096113f 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -368,10 +368,6 @@ def test_generate_from_inputs_embeds_with_static_cache(self): def test_prompt_lookup_decoding_matches_greedy_search(self): super().test_prompt_lookup_decoding_matches_greedy_search() - @unittest.skip(reason="The base class is LM only and cannot be init with XModelConfig`") - def test_save_load_fast_init_from_base(self): - pass - # The multimodal base model embeds will not match ids, due to pixel values. We can't change base test # because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels` # TODO: @raushan diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index ab2799f7ab7d..5299b6a2c119 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -318,10 +318,6 @@ def test_model_is_small(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip(reason="The base class is LM only and cannot be init with XModelConfig`") - def test_save_load_fast_init_from_base(self): - pass - # The multimodal base model embeds will not match ids, due to pixel values. We can't change base test # because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels` # TODO: @raushan diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index f3d8b15dde75..62a0aef6f41d 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -182,7 +182,9 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase): @require_read_token def test_2b_generate(self): EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am looking for some information on the topic. I am looking for some information on the impact of the internet on the society. I am looking for some information on the impact of the internet on the society. 
I am looking for some', 'Hi today is a new app that allows you to make money by watching videos.\n\nThe app is very simple to use and you can earn money by watching videos.\n\nThe app is available for both Android and iOS devices and you can download it from the Google Play Store or the App Store.\n\nOnce you have downloaded the app'] # fmt: skip - model = AutoModelForCausalLM.from_pretrained(self.model_id, low_cpu_mem_usage=True).to(torch_device) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + ).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id) tokenizer.padding_side = "right" @@ -204,9 +206,7 @@ def test_2b_generate(self): self.assertEqual(output_text, EXPECTED_TEXTS) - model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16 - ).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device) output = model.generate(**inputs, max_new_tokens=64, do_sample=False) del model output_text = tokenizer.batch_decode(output, skip_special_tokens=True) @@ -246,9 +246,7 @@ def test_model_2b_8bit(self): def test_long_context(self): EXPECTED_GENERATION = [' Jean-Paul Delannoy told CNN that the BEA is "not aware of any video footage that could have been taken on board the plane." He added that the BEA is "not aware of any video footage that could have been taken on board the plane." The BEA is the French equivalent of the National Transportation Safety Board'] # fmt: skip - model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16 - ).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left") inputs = tokenizer(self.input_long_text, return_tensors="pt").to(torch_device) output = model.generate(**inputs, max_new_tokens=64, do_sample=False) @@ -260,9 +258,7 @@ def test_long_context(self): def test_longer_than_window(self): EXPECTED_GENERATION = [" Robin's comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. 
Paris Match and Bild reported that the"] # fmt: skip - model = AutoModelForCausalLM.from_pretrained( - self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16 - ).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device) model.config.attention_window_size = 256 # Make the attention window size shorter than the current prompt tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left") inputs = tokenizer(self.input_long_text, return_tensors="pt").to(torch_device) diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py index 978323413ca5..fa1ada4f6160 100644 --- a/tests/models/sam/test_modeling_sam.py +++ b/tests/models/sam/test_modeling_sam.py @@ -248,14 +248,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - @unittest.skip(reason="SamVisionModel does not support training") def test_retain_grad_hidden_states_attentions(self): pass diff --git a/tests/models/sam_hq/test_modeling_sam_hq.py b/tests/models/sam_hq/test_modeling_sam_hq.py index 915ce022fc04..830b537031d0 100644 --- a/tests/models/sam_hq/test_modeling_sam_hq.py +++ b/tests/models/sam_hq/test_modeling_sam_hq.py @@ -256,14 +256,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="SamVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - @unittest.skip(reason="SamVisionModel does not support training") def test_retain_grad_hidden_states_attentions(self): pass @@ -695,14 +687,6 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="SamHQModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="SamHQModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - @unittest.skip(reason="SamHQModel does not support training") def test_retain_grad_hidden_states_attentions(self): pass diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 2cab21cf5c92..6e049b4faba3 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -325,18 +325,6 @@ def test_resize_tokens_embeddings(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - def test_retain_grad_hidden_states_attentions(self): config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = True diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 21daf2900625..4df373e8391a 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -430,18 +430,6 @@ def _mock_init_weights(self, module): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - @slow def test_model_from_pretrained(self): model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k") diff --git a/tests/models/shieldgemma2/test_modeling_shieldgemma2.py b/tests/models/shieldgemma2/test_modeling_shieldgemma2.py index fbc0b727a909..de41ad0fe0df 100644 --- a/tests/models/shieldgemma2/test_modeling_shieldgemma2.py +++ b/tests/models/shieldgemma2/test_modeling_shieldgemma2.py @@ -49,9 +49,9 @@ def test_model(self): response = requests.get(url) image = Image.open(BytesIO(response.content)) - model = ShieldGemma2ForImageClassification.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to(torch_device) + model = ShieldGemma2ForImageClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16).to( + torch_device + ) inputs = processor(images=[image]).to(torch_device) output = model(**inputs) diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 16fde95468ad..2b8f0d9a9eb3 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -1109,14 +1109,16 @@ def import_accelerate_mock(name, *args, **kwargs): # Load using `accelerate` in bf16 model = T5ForConditionalGeneration.from_pretrained( - "google-t5/t5-small", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True + "google-t5/t5-small", + torch_dtype=torch.bfloat16, ) self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16) self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16) # Load without using `accelerate` model = T5ForConditionalGeneration.from_pretrained( - "google-t5/t5-small", torch_dtype=torch.float16, low_cpu_mem_usage=True + "google-t5/t5-small", + torch_dtype=torch.float16, ) self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32) self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16) diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py index d060ab38886a..e915233a38fa 100644 --- a/tests/models/timm_backbone/test_modeling_timm_backbone.py +++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py @@ -156,18 +156,6 @@ def test_from_pretrained_no_checkpoint(self): def test_save_load(self): pass - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def 
test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - @unittest.skip(reason="TimmBackbone uses its own `from_pretrained` without device_map support") def test_can_load_with_device_context_manager(self): pass diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index 1a9c50c7ca69..86b7710c1764 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -407,12 +407,6 @@ def test_custom_4d_attention_mask(self): normalized_1 = F.softmax(out_shared_prefix_last_tokens) torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - @unittest.skip( - "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" - ) - def test_save_load_low_cpu_mem_usage(self): - pass - @slow def test_model_from_pretrained(self): model_name = "microsoft/udop-large" @@ -615,12 +609,6 @@ def test_custom_4d_attention_mask(self): normalized_1 = F.softmax(out_shared_prefix_last_tokens) torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - @unittest.skip( - "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" - ) - def test_save_load_low_cpu_mem_usage(self): - pass - @require_torch @require_sentencepiece diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 1397bbe4dc61..ab9c98484b72 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2431,7 +2431,7 @@ def test_speculative_decoding_distil(self): torch_dtype = torch.float16 if (torch.cuda.is_available() or is_torch_xpu_available()) else torch.float32 model_id = "openai/whisper-large-v2" model = WhisperForConditionalGeneration.from_pretrained( - model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + model_id, torch_dtype=torch_dtype, use_safetensors=True ) model.to(torch_device) @@ -2439,7 +2439,7 @@ def test_speculative_decoding_distil(self): assistant_model_id = "distil-whisper/distil-large-v2" assistant_model = WhisperForCausalLM.from_pretrained( - assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + assistant_model_id, torch_dtype=torch_dtype, use_safetensors=True ) assistant_model.to(torch_device) @@ -2481,7 +2481,7 @@ def test_speculative_decoding_non_distil(self): torch_dtype = torch.float16 if torch_device in ["cuda", "xpu"] else torch.float32 model_id = "openai/whisper-large-v2" model = WhisperForConditionalGeneration.from_pretrained( - model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + model_id, torch_dtype=torch_dtype, use_safetensors=True ) model.to(torch_device) @@ -2489,7 +2489,7 @@ def test_speculative_decoding_non_distil(self): assistant_model_id = "openai/whisper-tiny" assistant_model = WhisperForConditionalGeneration.from_pretrained( - assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + assistant_model_id, torch_dtype=torch_dtype, use_safetensors=True ) assistant_model.to(torch_device) diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index 2a142bfc73ee..7140373081bb 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -531,7 +531,6 @@ def test_flash_attn_2_fp32_ln(self): tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, 
load_in_4bit=True, ) @@ -565,9 +564,7 @@ class ZambaModelIntegrationTest(unittest.TestCase): @slow def setUpClass(cls): model_id = "Zyphra/Zamba-7B-v1" - cls.model = ZambaForCausalLM.from_pretrained( - model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, use_mamba_kernels=False - ) + cls.model = ZambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, use_mamba_kernels=False) cls.tokenizer = AutoTokenizer.from_pretrained(model_id) @slow diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py index 4c12ac473234..a40ea394f972 100644 --- a/tests/models/zamba2/test_modeling_zamba2.py +++ b/tests/models/zamba2/test_modeling_zamba2.py @@ -549,7 +549,6 @@ def test_flash_attn_2_fp32_ln(self): tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, load_in_4bit=True, ) @@ -610,9 +609,7 @@ class Zamba2ModelIntegrationTest(unittest.TestCase): @slow def setUpClass(cls): model_id = "Zyphra/Zamba2-1.2B" - cls.model = Zamba2ForCausalLM.from_pretrained( - model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, revision="PR" - ) + cls.model = Zamba2ForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, revision="PR") cls.tokenizer = AutoTokenizer.from_pretrained(model_id, revision="PR") @parameterized.expand([(torch_device,), ("cpu",)]) diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 542344996730..e867deef1e70 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -328,7 +328,6 @@ def test_raise_save_pretrained(self): model = AutoModelForCausalLM.from_pretrained( self.model_name, quantization_config=quantization_config, - low_cpu_mem_usage=True, revision=self.model_revision, ).to(torch_device) @@ -347,7 +346,6 @@ def test_fused_modules_to_not_convert(self): model = AutoModelForCausalLM.from_pretrained( model_id, quantization_config=quantization_config, - low_cpu_mem_usage=True, ).to(torch_device) # Check if model has been correctly fused @@ -370,7 +368,6 @@ def test_generation_fused(self): model = AutoModelForCausalLM.from_pretrained( self.model_name, quantization_config=quantization_config, - low_cpu_mem_usage=True, revision=self.model_revision, ).to(torch_device) @@ -399,7 +396,6 @@ def test_generation_fused_batched(self): model = AutoModelForCausalLM.from_pretrained( self.model_name, quantization_config=quantization_config, - low_cpu_mem_usage=True, revision=self.model_revision, ).to(torch_device) diff --git a/tests/quantization/hqq/test_hqq.py b/tests/quantization/hqq/test_hqq.py index 5effe1c8616b..877f6a2cd8d7 100755 --- a/tests/quantization/hqq/test_hqq.py +++ b/tests/quantization/hqq/test_hqq.py @@ -42,7 +42,6 @@ def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir=None torch_dtype=compute_dtype, device_map=device, quantization_config=quant_config, - low_cpu_mem_usage=True, cache_dir=cache_dir, ) self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir) @@ -233,7 +232,9 @@ def test_model_serialization(self): # Load and check if the logits match model_loaded = AutoModelForCausalLM.from_pretrained( - "quant_model", torch_dtype=torch.float16, device_map=torch_device, low_cpu_mem_usage=True + "quant_model", + torch_dtype=torch.float16, + device_map=torch_device, ) with torch.no_grad(): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 6ebf91d3f504..fed16e2f028c 100755 --- a/tests/test_modeling_common.py 
+++ b/tests/test_modeling_common.py @@ -578,87 +578,6 @@ def seeded_initialize_weights(self, module): f"The following keys are not properly handled by `_init_weights()`:\n{different_weights}", ) - @slow - @require_accelerate - @mark.accelerate_tests - def test_save_load_low_cpu_mem_usage(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - with tempfile.TemporaryDirectory() as saved_model_path: - for model_class in self.all_model_classes: - model_to_save = model_class(config) - model_to_save.save_pretrained(saved_model_path) - - self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) - - @slow - @require_accelerate - @mark.accelerate_tests - def test_save_load_low_cpu_mem_usage_checkpoints(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - with tempfile.TemporaryDirectory() as saved_model_path: - for model_class in self.all_model_classes: - model_to_save = model_class(config) - model_to_save.config.save_pretrained(saved_model_path) - torch.save(model_to_save.state_dict(), os.path.join(saved_model_path, "pytorch_model.bin")) - - self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) - - @slow - @require_accelerate - @mark.accelerate_tests - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - with tempfile.TemporaryDirectory() as saved_model_path: - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model_to_save = model_class(config) - - model_to_save.save_pretrained(saved_model_path, safe_serialization=False) - self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) - - def _check_save_load_low_cpu_mem_usage(self, model_class, saved_model_path): - from accelerate.utils.modeling import named_module_tensors - - # Load the low usage and the normal models. - model_low_usage, loading_info = model_class.from_pretrained( - saved_model_path, - low_cpu_mem_usage=True, - output_loading_info=True, - ) - model_non_low_usage = model_class.from_pretrained(saved_model_path) - - # Check that there were no missing keys. - self.assertEqual(loading_info["missing_keys"], []) - - # The low_cpu_mem_usage=True causes the model params to be initialized with device=meta, and then - # subsequently loaded with the correct values and onto the correct device. We check if there are any - # remaining params that were not properly loaded. - for name, tensor in named_module_tensors(model_low_usage, recurse=True): - self.assertNotEqual( - tensor.device, - torch.device("meta"), - "Tensor '" + name + "' has not been properly loaded and has device=meta.", - ) - - # Check that the parameters are equal. - for p1, p2 in zip(model_low_usage.parameters(), model_non_low_usage.parameters()): - self.assertEqual(p1.data.ne(p2.data).sum(), 0) - - # Check that the state dict keys are equal. - self.assertEqual(set(model_low_usage.state_dict().keys()), set(model_non_low_usage.state_dict().keys())) - - # Check that the shared tensors are equal. 
- tensor_ptrs1 = collections.defaultdict(list) - for name, tensor in model_low_usage.state_dict().items(): - tensor_ptrs1[id_tensor_storage(tensor)].append(name) - tied_params1 = [names for _, names in tensor_ptrs1.items() if len(names) > 1] - - tensor_ptrs2 = collections.defaultdict(list) - for name, tensor in model_non_low_usage.state_dict().items(): - tensor_ptrs2[id_tensor_storage(tensor)].append(name) - tied_params2 = [names for _, names in tensor_ptrs2.items() if len(names) > 1] - - self.assertEqual(tied_params1, tied_params2) - def test_torch_save_load(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if config.__class__ not in MODEL_MAPPING: @@ -4100,7 +4019,6 @@ def test_flash_attn_2_fp32_ln(self): tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, load_in_4bit=True, ) @@ -4173,7 +4091,6 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, ) .to(torch_device) .eval() @@ -4248,7 +4165,6 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, ) .to(torch_device) .eval() diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index cd0edd94571d..92a38baf9419 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -64,7 +64,6 @@ require_torch, require_torch_accelerator, require_torch_multi_accelerator, - require_usr_bin_time, slow, torch_device, ) @@ -1003,57 +1002,6 @@ def test_checkpoint_variant_save_load_bin(self): self.assertIsNotNone(model) - @require_accelerate - @mark.accelerate_tests - def test_from_pretrained_low_cpu_mem_usage_functional(self): - # test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and - # sharded models - - mnames = [ - "hf-internal-testing/tiny-random-bert-sharded", - "hf-internal-testing/tiny-random-bert", - ] - for mname in mnames: - _ = BertModel.from_pretrained(mname, low_cpu_mem_usage=True) - - @slow - @require_usr_bin_time - @require_accelerate - @mark.accelerate_tests - def test_from_pretrained_low_cpu_mem_usage_equal(self): - # Before this would test that `from_pretrained(..., low_cpu_mem_usage=True)` uses less cpu memory than default - # Now though these should be around the same. 
- # TODO: Look for good bounds to check that their timings are near the same - - mname = "HuggingFaceTB/SmolLM-135M" - - preamble = "from transformers import AutoModel" - one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)' - # Save this output as `max_rss_normal` if testing memory results - max_rss_normal = self.python_one_liner_max_rss(one_liner_str) - - one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=True)' - # Save this output as `max_rss_low_mem` if testing memory results - max_rss_low_mem = self.python_one_liner_max_rss(one_liner_str) - - # Should be within 5MBs of each other (overhead) - self.assertAlmostEqual( - max_rss_normal / 1024 / 1024, - max_rss_low_mem / 1024 / 1024, - delta=5, - msg="using `low_cpu_mem_usage` should incur the same memory usage in both cases.", - ) - - # if you want to compare things manually, let's first look at the size of the model in bytes - # model = AutoModel.from_pretrained(mname, low_cpu_mem_usage=False) - # total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - # total_bytes = total_numel * 4 - # Now the diff_bytes should be very close to total_bytes, but the reports are inconsistent. - # The easiest way to test this is to switch the model and torch.load to do all the work on - # gpu - that way one can measure exactly the total and peak memory used. Perhaps once we add - # functionality to load models directly on gpu, this test can be rewritten to use torch's - # cuda memory tracking and then we should be able to do a much more precise test. - @require_accelerate @mark.accelerate_tests @require_torch_multi_accelerator @@ -1537,7 +1485,6 @@ def test_pretrained_low_mem_new_config(self): config=model_config, ignore_mismatched_sizes=True, torch_dtype=torch.float16, - low_cpu_mem_usage=True, ) model_ref = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id) @@ -1782,16 +1729,6 @@ def test_load_model_with_state_dict_only(self): ) self.assertTrue(check_models_equal(model, model_loaded)) - def test_load_model_with_state_dict_only_low_cpu_mem_usage(self): - model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert") - state_dict = model.state_dict() - config = model.config - - model_loaded = BertModel.from_pretrained( - pretrained_model_name_or_path=None, config=config, state_dict=state_dict, low_cpu_mem_usage=True - ) - self.assertTrue(check_models_equal(model, model_loaded)) - def test_cache_when_needed_at_train_time(self): """ Some fine-tuning methods require the use of cache, like prefix tuning in PEFT. This test checks that a cache