Merged

docs/source/ar/llm_tutorial_optimization.md (1 addition, 1 deletion)

@@ -231,7 +231,7 @@ flush()
Let's see what peak GPU memory consumption 4-bit quantization provides. The model can be quantized to 4-bit using the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.

```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

```

docs/source/ar/trainer.md (3 additions, 3 deletions)

@@ -459,7 +459,7 @@ args = TrainingArguments(
model_id = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id، low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)

trainer = trl.SFTTrainer(
model=model،
@@ -503,7 +503,7 @@ args = TrainingArguments(
# Load the model and the tokenizer
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)

# Initialize the trainer
trainer = Trainer(
@@ -547,7 +547,7 @@ args = TrainingArguments(
model_id = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)

trainer = trl.SFTTrainer(
model=model,

docs/source/en/internal/model_debugging_utils.md (1 addition, 1 deletion)

@@ -51,7 +51,7 @@ torch.random.manual_seed(673)
# load pretrained model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
+model = LlavaForConditionalGeneration.from_pretrained(model_id)

# create random image input
random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy())

docs/source/en/llm_tutorial_optimization.md (1 addition, 1 deletion)

@@ -236,7 +236,7 @@ flush()
Let's see what peak GPU memory consumption 4-bit quantization gives. Quantizing the model to 4-bit can be done with the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.

```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

```
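
For context, here is a self-contained sketch of how the updated tutorial snippet reads after this change. The imports, the tokenizer setup, and the sample prompt are reconstructed assumptions (they sit outside the excerpt above), and `bitsandbytes` must be installed for `load_in_4bit=True` to take effect; this is an illustration, not part of the diff.

```python
# Sketch only: 4-bit loading of OctoCoder without an explicit low_cpu_mem_usage flag,
# matching the line this PR keeps in the tutorial.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")

model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder",
    load_in_4bit=True,   # requires the bitsandbytes package
    pad_token_id=0,
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe("def hello_world():", max_new_tokens=32)[0]["generated_text"])
```

Peak memory can then be inspected the same way the surrounding tutorial does, e.g. via `torch.cuda.max_memory_allocated()` before calling the `flush()` helper visible in the hunk header.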

docs/source/en/model_doc/chameleon.md (1 deletion)

@@ -170,7 +170,6 @@ model_id = "facebook/chameleon-7b"
model = ChameleonForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
-low_cpu_mem_usage=True,
attn_implementation="flash_attention_2"
).to(0)
```
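
The snippet above ends right after loading; a short sketch of actually running the model follows. The prompt format, the test image URL, and the `ChameleonProcessor` call are assumptions based on general Transformers usage, not taken from the page being edited.

```python
# Sketch: generate from the Chameleon model loaded above (model and model_id come from the snippet).
import requests
import torch
from PIL import Image
from transformers import ChameleonProcessor

processor = ChameleonProcessor.from_pretrained(model_id)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

# The "<image>" placeholder in the prompt is an assumption about Chameleon's prompt format.
inputs = processor(text="What do you see in this image?<image>", images=image, return_tensors="pt").to(
    model.device, dtype=torch.bfloat16
)
output = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(output[0], skip_special_tokens=True))
```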

docs/source/en/model_doc/llava_next.md (1 addition, 2 deletions)

@@ -157,7 +157,7 @@ import requests

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
+model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16)
model.to("cuda:0")

# prepare image and text prompt, using the appropriate prompt template
@@ -292,7 +292,6 @@ from transformers import AutoModelForImageTextToText
model = AutoModelForImageTextToText.from_pretrained(
model_id,
torch_dtype=torch.float16,
-low_cpu_mem_usage=True,
use_flash_attention_2=True
).to(0)
```

docs/source/en/model_doc/llava_onevision.md (2 deletions)

@@ -121,7 +121,6 @@ processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
"llava-hf/llava-onevision-qwen2-7b-ov-hf",
torch_dtype=torch.float16,
-low_cpu_mem_usage=True,
device_map="cuda:0"
)

@@ -286,7 +285,6 @@ from transformers import LlavaOnevisionForConditionalGeneration
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float16,
-low_cpu_mem_usage=True,
use_flash_attention_2=True
).to(0)
```

docs/source/en/models.md (1 addition, 6 deletions)

@@ -148,11 +148,6 @@ You need enough memory to hold two copies of the model weights (random and pretr

Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types.

-### Fast initialization
-
-A PyTorch model is instantiated with random weights, or "empty" tensors, that take up space in memory without filling it.
-
-Transformers boosts loading speed by skipping random weight initialization with the [_fast_init](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter if the pretrained weights are correctly initialized. This parameter is set to `True` by default.

### Sharded checkpoints

@@ -245,7 +240,7 @@ Big Model Inference's second feature relates to how weights are loaded and dispa

Both features combined reduce memory usage and loading times for big pretrained models.

-Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference. This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`, such that not more than 1x the model size is used in CPU memory.
+Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference.

```py
from transformers import AutoModelForCausalLM
```
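
The `py` block above is cut off by the collapsed diff; as a point of reference, here is a minimal sketch of the `device_map="auto"` call described in the surviving sentence. The checkpoint name is illustrative (borrowed from other snippets in this PR), not necessarily the one the docs use.

```python
# Sketch: Big Model Inference lets Accelerate place each layer on GPU, CPU, or disk.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",  # illustrative checkpoint
    device_map="auto",
)
print(model.hf_device_map)  # inspect where each module ended up
```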

docs/source/ja/main_classes/model.md (11 deletions)

@@ -39,19 +39,8 @@ rendered properly in your Markdown viewer.

In Transformers 4.20.0, the [`~PreTrainedModel.from_pretrained`] method was redesigned to accommodate large models using [Accelerate](https://huggingface.co/docs/accelerate/big_modeling). This requires Accelerate >= 0.9.0 and PyTorch >= 1.9.0. Instead of creating the full model and then loading the pretrained weights into it (which requires twice the model size in memory, one copy for the randomly initialized model and one for the weights), an option was added to create the model as an empty shell and materialize its parameters only when the pretrained weights are loaded.

-This option can be enabled with `low_cpu_mem_usage=True`. The model is first created on the meta device (with empty weights), and the state dict is then loaded into it (shard by shard in the case of a sharded checkpoint). This way, the maximum RAM used is only the full size of the model.
-
-
-```py
-from transformers import AutoModelForSeq2SeqLM
-
-t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
-```

Furthermore, if the model does not fit entirely in RAM (currently only effective for inference), you can place it directly on different devices. With `device_map="auto"`, Accelerate decides which device each layer goes on, making maximum use of the fastest devices (GPUs) and offloading the rest to the CPU, or even to the hard drive if GPU RAM runs out. The model runs as usual even when it is split across several devices.

-When passing `device_map`, `low_cpu_mem_usage` is automatically set to `True`, so you do not need to specify it.


```py
from transformers import AutoModelForSeq2SeqLM
```
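
The truncated `py` block above presumably continues with the `device_map="auto"` example that the remaining paragraph describes. A sketch of that pattern follows; the `max_memory` and `offload_folder` arguments are optional extras added here for illustration, not part of this diff.

```python
# Sketch: dispatch a large seq2seq checkpoint across devices with Accelerate.
from transformers import AutoModelForSeq2SeqLM

t0pp = AutoModelForSeq2SeqLM.from_pretrained(
    "bigscience/T0pp",
    device_map="auto",
    max_memory={0: "20GiB", "cpu": "60GiB"},  # optional per-device memory caps
    offload_folder="offload",                 # spill layers that fit nowhere else to disk
)
```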

docs/source/ko/llm_tutorial_optimization.md (1 addition, 1 deletion)

@@ -227,7 +227,7 @@ flush()
Now let's check the peak GPU memory usage that 4-bit quantization provides. To quantize the model to 4-bit, use the same API as before, but this time pass `load_in_4bit=True` instead of `load_in_8bit=True`.

```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

```

docs/source/ko/model_doc/chameleon.md (1 deletion)

@@ -148,7 +148,6 @@ model_id = "facebook/chameleon-7b"
model = ChameleonForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
-low_cpu_mem_usage=True,
attn_implementation="flash_attention_2"
).to(0)
```

docs/source/ko/trainer.md (1 addition, 1 deletion)

@@ -421,7 +421,7 @@ args = TrainingArguments(
model_id = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)

trainer = trl.SFTTrainer(
model=model,

docs/source/zh/main_classes/model.md (10 deletions)

@@ -29,18 +29,8 @@ http://www.apache.org/licenses/LICENSE-2.0

In Transformers 4.20.0, the [`~PreTrainedModel.from_pretrained`] method was redesigned to accommodate loading large models with [Accelerate](https://huggingface.co/docs/accelerate/big_modeling). This requires Accelerate >= 0.9.0 and PyTorch >= 1.9.0. Instead of creating the full model and then loading the pretrained weights into it (which takes twice the model size in memory, one copy for the randomly initialized model and one for the pretrained weights), we provide an option to create the model as an empty shell and instantiate its parameters only when the pretrained weights are loaded.

-You can activate this option with `low_cpu_mem_usage=True`. The model is first created on the meta device (with empty weights), and the state dict is then loaded into it (shard by shard in the case of a sharded checkpoint). This way, the maximum memory footprint is only the full size of the model.
-
-```python
-from transformers import AutoModelForSeq2SeqLM
-
-t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
-```

In addition, if there is not enough memory to hold the entire model (currently only for inference), you can place the model directly on different devices. With `device_map="auto"`, Accelerate determines which device each layer goes on, maximizing use of the fastest devices (GPUs) and offloading the rest to the CPU, or even to the hard drive if you do not have enough GPU or CPU memory. Even when the model is split across several devices, it runs as you would normally expect.

-When passing `device_map`, `low_cpu_mem_usage` is automatically set to `True`, so you do not need to specify it:

```python
from transformers import AutoModelForSeq2SeqLM

```

examples/pytorch/language-modeling/README.md (4 deletions)

@@ -229,10 +229,6 @@ sure all your batches have the same length.

To use the streaming dataset mode which can be very useful for large datasets, add `--streaming` to the command line. This is supported by `run_mlm.py`, `run_clm.py` and `run_fim.py`. Make sure to adapt the other scripts to your use case by taking inspiration from them.

-## Low Cpu Memory Usage
-
-To use low cpu memory mode which can be very useful for LLM, add `--low_cpu_mem_usage` to the command line. This is currently supported by `run_clm.py`,`run_mlm.py`, `run_plm.py`, `run_fim.py`, `run_mlm_no_trainer.py`, `run_clm_no_trainer.py` and `run_fim_no_trainer.py`.

## Creating a model on the fly

When training a model from scratch, configuration values may be overridden with the help of `--config_overrides`:

examples/pytorch/language-modeling/run_clm.py (10 deletions)

@@ -139,15 +139,6 @@ class ModelArguments:
"choices": ["auto", "bfloat16", "float16", "float32"],
},
)
-low_cpu_mem_usage: bool = field(
-default=False,
-metadata={
-"help": (
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"set True will benefit LLM loading time and RAM consumption."
-)
-},
-)

def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@@ -432,7 +423,7 @@ def main():
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
torch_dtype=torch_dtype,
-low_cpu_mem_usage=model_args.low_cpu_mem_usage,
)
else:
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)

examples/pytorch/language-modeling/run_clm_no_trainer.py (9 deletions)

@@ -228,14 +228,6 @@ def parse_args():
"Only applicable when `--with_tracking` is passed."
),
)
-parser.add_argument(
-"--low_cpu_mem_usage",
-action="store_true",
-help=(
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"If passed, LLM loading time and RAM consumption will be benefited."
-),
-)
args = parser.parse_args()

# Sanity checks
@@ -409,7 +401,7 @@ def main():
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
-low_cpu_mem_usage=args.low_cpu_mem_usage,
trust_remote_code=args.trust_remote_code,
)
else:

examples/pytorch/language-modeling/run_fim.py (10 deletions)

@@ -142,15 +142,6 @@ class ModelArguments:
"choices": ["auto", "bfloat16", "float16", "float32"],
},
)
-low_cpu_mem_usage: bool = field(
-default=False,
-metadata={
-"help": (
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"set True will benefit LLM loading time and RAM consumption."
-)
-},
-)
pad_to_multiple_of: bool = field(
default=False,
metadata={
@@ -501,7 +492,7 @@ def main():
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
torch_dtype=torch_dtype,
-low_cpu_mem_usage=model_args.low_cpu_mem_usage,
attn_implementation=model_args.attn_implementation,
)

examples/pytorch/language-modeling/run_fim_no_trainer.py (9 deletions)

@@ -288,14 +288,6 @@ def parse_args():
"Only applicable when `--with_tracking` is passed."
),
)
-parser.add_argument(
-"--low_cpu_mem_usage",
-action="store_true",
-help=(
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"If passed, LLM loading time and RAM consumption will be benefited."
-),
-)
args = parser.parse_args()

# Sanity checks
@@ -474,7 +466,7 @@ def main():
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
-low_cpu_mem_usage=args.low_cpu_mem_usage,
trust_remote_code=args.trust_remote_code,
)
else:

examples/pytorch/language-modeling/run_mlm.py (10 deletions)

@@ -136,15 +136,6 @@ class ModelArguments:
"choices": ["auto", "bfloat16", "float16", "float32"],
},
)
-low_cpu_mem_usage: bool = field(
-default=False,
-metadata={
-"help": (
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"set True will benefit LLM loading time and RAM consumption."
-)
-},
-)

def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@@ -436,7 +427,7 @@ def main():
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
torch_dtype=torch_dtype,
-low_cpu_mem_usage=model_args.low_cpu_mem_usage,
)
else:
logger.info("Training new model from scratch")

examples/pytorch/language-modeling/run_mlm_no_trainer.py (9 deletions)

@@ -235,14 +235,6 @@ def parse_args():
"Only applicable when `--with_tracking` is passed."
),
)
-parser.add_argument(
-"--low_cpu_mem_usage",
-action="store_true",
-help=(
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"If passed, LLM loading time and RAM consumption will be benefited."
-),
-)
args = parser.parse_args()

# Sanity checks
@@ -406,7 +398,7 @@ def main():
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
-low_cpu_mem_usage=args.low_cpu_mem_usage,
trust_remote_code=args.trust_remote_code,
)
else:

examples/pytorch/language-modeling/run_plm.py (10 deletions)

@@ -103,15 +103,6 @@ class ModelArguments:
)
},
)
-low_cpu_mem_usage: bool = field(
-default=False,
-metadata={
-"help": (
-"It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-"set True will benefit LLM loading time and RAM consumption."
-)
-},
-)

def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
@@ -397,7 +388,7 @@ def main():
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
-low_cpu_mem_usage=model_args.low_cpu_mem_usage,
)
else:
logger.info("Training new model from scratch")

src/transformers/model_debugging_utils.py (1 addition, 1 deletion)

@@ -429,7 +429,7 @@ def model_addition_debugger_context(
# load pretrained model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
+model = LlavaForConditionalGeneration.from_pretrained(model_id)

# create random image input
random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy())

src/transformers/models/aria/convert_aria_weights_to_hf.py (1 addition, 1 deletion)

@@ -37,7 +37,7 @@

# load model
kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
-model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs)
+model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", **kwargs)

# load vision tower
model.get_vision_tower().load_model()

@@ -41,9 +41,7 @@
def convert_falcon_h1_to_hf(input_model_path, output_path):
tokenizer = AutoTokenizer.from_pretrained(input_model_path)

-model = AutoModelForCausalLM.from_pretrained(
-input_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True, low_cpu_mem_usage=True
-)
+model = AutoModelForCausalLM.from_pretrained(input_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)

intermediate_size = int(model.config.expansion_factor * model.config.hidden_size)

@@ -187,7 +187,6 @@ def load_original_state_dict(input_base_path):
model = AutoModel.from_pretrained(
input_base_path,
torch_dtype=torch.bfloat16,
-low_cpu_mem_usage=True,
use_flash_attn=False,
trust_remote_code=True,
).eval()