Open
Labels: bug (Something isn't working)
Description
We encountered an out-of-memory (OOM) error when fine-tuning gpt-oss-20b with a 16k sequence length on a single node with 8×B200 GPUs. In the same environment, fine-tuning qwen3-32b with the same 16k sequence length completes without any OOM. The reproduction script is below:
import os
import logging
import nemo_run as run
from nemo import lightning as nl
from nemo.collections import llm
from nemo.lightning.pytorch.callbacks import MemoryProfileCallback, NsysCallback
from lightning.pytorch.loggers import TensorBoardLogger
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule
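
# Model and training hyperparameters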
model_name="gpt-oss-20b"
n_training_samples=1024*1024
seq_length=16384
global_batch_size=32
micro_batch_size=1
max_steps=int(n_training_samples/global_batch_size*2)
max_epochs=2
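
# Parallelism settings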
tp_size = 1
pp_size = 2
cp_size = 1
ep_size = 2
dp_size = 2
sp = False
lr=5e-6
explicit_log_dir="/explicit_log_dir"
dataset_root=f"/dataset_root"
NEMO_MODELS_CACHE = "/model_cache"
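
# Base full fine-tuning recipe for gpt-oss-20b (peft_scheme='none')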
recipe = llm.gpt_oss_20b.finetune_recipe(
name="gpt_oss_20b",
dir=f"{NEMO_MODELS_CACHE}/{model_name}",
resume_path=f"{NEMO_MODELS_CACHE}/{model_name}",
num_nodes=1,
num_gpus_per_node=8,
peft_scheme='none',
)
tokenizer = run.Config(
    get_nmt_tokenizer,
    library='huggingface',
    model_name="gpt-oss-20b",
    use_fast=True,
)
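
# Fine-tuning data module with the 16k sequence length and a custom prompt template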
dataloader = run.Config(
    FineTuningDataModule,
    tokenizer=tokenizer,
    dataset_root=dataset_root,
    seq_length=seq_length,
    micro_batch_size=micro_batch_size,
    global_batch_size=global_batch_size,
    dataset_kwargs={"prompt_template": "{input}{output}"},
)
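
# Apply data, parallelism, and optimizer overrides to the recipe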
recipe.data = dataloader
recipe.trainer.max_steps = max_steps
recipe.trainer.strategy.tensor_model_parallel_size = tp_size
recipe.trainer.strategy.expert_model_parallel_size = ep_size
recipe.trainer.strategy.pipeline_model_parallel_size = pp_size
recipe.trainer.strategy.context_parallel_size = cp_size
recipe.trainer.strategy.data_parallel_size = dp_size
recipe.trainer.strategy.sequence_parallel = sp
recipe.optim.config.lr = lr
run.run(recipe, executor=run.LocalExecutor(ntasks_per_node=8))
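For reference, the MemoryProfileCallback imported above is never attached; wiring it in can help localize where the allocations spike before the OOM. A minimal sketch, assuming recipe.trainer.callbacks is the usual list of run.Config callbacks on the recipe and that the callback's output-directory argument is named dir (both assumptions; check the signature in your NeMo version):

# Hypothetical diagnostic only: dump CUDA memory snapshots to a subdirectory of the log dir.
# The `dir` argument name is an assumption; adjust to MemoryProfileCallback's actual signature.
recipe.trainer.callbacks.append(
    run.Config(MemoryProfileCallback, dir=f"{explicit_log_dir}/mem_profile")
)

The append has to happen before run.run(recipe, ...) so the callback is included in the built trainer config.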