Reminder
System Info
```
llamafactory version: 0.9.2.dev0
```
Reproduction
```python
from LLaMAFactory.src.llamafactory.train.tuner import run_exp, export_model
from LLaMAFactory.src.llamafactory.extras.misc import is_env_enabled, get_device_count, use_ray
from pathlib import Path
import yaml
import os

if __name__ == "__main__":
    config_path = "myconfigFile/llama2_lora_sft.yaml"
    config = yaml.safe_load(Path(config_path).absolute().read_text())
    run_exp(args=config)
    # Call run_exp() repeatedly with the same config
    for i in range(10):
        force_torchrun = is_env_enabled("FORCE_TORCHRUN")
        if force_torchrun or (get_device_count() > 1 and not use_ray()):
            print("pass")
        else:
            run_exp(args=config)
```
myconfigFile/llama2_lora_sft.yaml:
```yaml
### model
model_name_or_path: meta-llama/Llama-2-7b-hf
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset_dir: /LLaMAFactory/data
dataset: alpaca_en_demo
template: llama2
cutoff_len: 2048
max_samples: 100
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/llama2/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 1
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
```
Error:

```
Traceback (most recent call last):
  File "/root/autodl-tmp/Code/AgentGym/testIter.py", line 17, in <module>
    run_exp(args=config)
  File "/root/autodl-tmp/Code/AgentGym/LLaMAFactory/src/llamafactory/train/tuner.py", line 93, in run_exp
    _training_function(config={"args": args, "callbacks": callbacks})
  File "/root/autodl-tmp/Code/AgentGym/LLaMAFactory/src/llamafactory/train/tuner.py", line 67, in _training_function
    run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
  File "/root/autodl-tmp/Code/AgentGym/LLaMAFactory/src/llamafactory/train/sft/workflow.py", line 52, in run_sft
    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
  File "/root/autodl-tmp/Code/AgentGym/LLaMAFactory/src/llamafactory/model/loader.py", line 160, in load_model
    model = load_class.from_pretrained(**init_kwargs)
  File "/root/miniconda3/envs/agent311/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
  File "/root/miniconda3/envs/agent311/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4245, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/root/miniconda3/envs/agent311/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4815, in _load_pretrained_model
    new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
  File "/root/miniconda3/envs/agent311/lib/python3.11/site-packages/transformers/modeling_utils.py", line 873, in _load_state_dict_into_meta_model
    set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs)
  File "/root/miniconda3/envs/agent311/lib/python3.11/site-packages/accelerate/utils/modeling.py", line 329, in set_module_tensor_to_device
    new_value = value.to(device)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 79.14 GiB of which 76.75 MiB is free. Process 813498 has 79.05 GiB memory in use. Of the allocated memory 78.51 GiB is allocated by PyTorch, and 42.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
```
GPU memory usage: when run_exp() is called multiple times, the GPU memory is not released and keeps accumulating from one call to the next.
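One workaround that may apply here is to isolate each training run in its own child process, so that all CUDA allocations are released when that process exits. This is a minimal sketch under my own assumptions (the `multiprocessing` "spawn" setup and the `_train_once` helper are not part of LLaMA-Factory); `run_exp` is imported exactly as in the reproduction script above.

```python
# Hypothetical workaround sketch: run each run_exp() call in a separate
# process so the GPU memory is fully released when the child exits.
import multiprocessing as mp
from pathlib import Path
import yaml

from LLaMAFactory.src.llamafactory.train.tuner import run_exp


def _train_once(config: dict) -> None:
    # Runs inside the child process; all CUDA allocations die with the process.
    run_exp(args=config)


if __name__ == "__main__":
    config = yaml.safe_load(Path("myconfigFile/llama2_lora_sft.yaml").absolute().read_text())
    ctx = mp.get_context("spawn")  # "spawn" avoids inheriting a CUDA context from the parent
    for i in range(10):
        p = ctx.Process(target=_train_once, args=(config,))
        p.start()
        p.join()
        if p.exitcode != 0:
            raise RuntimeError(f"training run {i} failed with exit code {p.exitcode}")
```

Because the parent process never touches CUDA itself, each iteration should start from an empty GPU regardless of what references the previous run left behind.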
Others
How can GPU memory be released when calling run_exp() directly?
When calling run_exp() directly, do I need to clear the GPU memory manually?
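If the goal is to stay in a single process, the standard PyTorch cleanup calls can be tried between runs. This is only a hedged sketch: whether it actually frees the memory depends on what references run_exp (and the underlying Trainer) keeps alive internally, which I have not verified.

```python
# Hypothetical in-process cleanup between run_exp() calls; it cannot free
# tensors that are still referenced somewhere inside LLaMA-Factory.
import gc
import torch


def release_gpu_memory() -> None:
    gc.collect()                      # drop unreachable Python objects holding GPU tensors
    if torch.cuda.is_available():
        torch.cuda.empty_cache()      # return cached, unallocated blocks to the driver
        torch.cuda.ipc_collect()      # release memory held by CUDA IPC, if any


# usage between iterations:
# run_exp(args=config)
# release_gpu_memory()
```

If this is not enough and memory still accumulates, the per-process isolation shown above is the more reliable option.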