Training GPT-OSS-20B caused OOM #14712

@kltlj1

Description

We encountered an out-of-memory (OOM) error when fine-tuning gpt-oss-20b with a 16k sequence length on a single node with 8×B200 GPUs. In the same environment, qwen3-32b trains at the same 16k sequence length without any OOM. The reproduction script is below:

```python
import nemo_run as run
from nemo.collections import llm
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule


model_name = "gpt-oss-20b"
n_training_samples = 1024 * 1024
seq_length = 16384
global_batch_size = 32
micro_batch_size = 1
max_steps = int(n_training_samples / global_batch_size * 2)  # 2 epochs' worth of steps
max_epochs = 2
tp_size = 1  # tensor parallel
pp_size = 2  # pipeline parallel
cp_size = 1  # context parallel
ep_size = 2  # expert parallel
dp_size = 2  # data parallel
sp = False   # sequence parallelism
lr = 5e-6
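# Note (added for clarity, not in the original script): with tp * pp * cp =
# 1 * 2 * 1 = 2, Megatron-style frameworks typically derive the data-parallel
# size as world_size // (tp * pp * cp) = 8 // 2 = 4; the explicit dp_size = 2
# here does not multiply out to 8 GPUs (1 * 2 * 1 * 2 = 4) and may be ignored.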

explicit_log_dir = "/explicit_log_dir"
dataset_root = "/dataset_root"

NEMO_MODELS_CACHE = "/model_cache"


recipe = llm.gpt_oss_20b.finetune_recipe(
    name="gpt_oss_20b",
    dir=f"{NEMO_MODELS_CACHE}/{model_name}",
    resume_path=f"{NEMO_MODELS_CACHE}/{model_name}",
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme='none',
)
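# Note: peft_scheme='none' selects full-parameter fine-tuning, which needs far
# more optimizer-state and activation memory than a PEFT scheme such as LoRA.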

tokenizer = run.Config(
    get_nmt_tokenizer,
    library='huggingface',
    model_name=model_name,
    use_fast=True,
)

dataloader = run.Config(
    FineTuningDataModule,
    tokenizer=tokenizer,
    dataset_root=dataset_root,
    seq_length=seq_length,
    micro_batch_size=micro_batch_size,
    global_batch_size=global_batch_size,
    dataset_kwargs={"prompt_template": "{input}{output}"}
)

recipe.data = dataloader

recipe.trainer.max_steps = max_steps
recipe.trainer.strategy.tensor_model_parallel_size = tp_size
recipe.trainer.strategy.expert_model_parallel_size = ep_size
recipe.trainer.strategy.pipeline_model_parallel_size = pp_size
recipe.trainer.strategy.context_parallel_size = cp_size
recipe.trainer.strategy.data_parallel_size = dp_size
recipe.trainer.strategy.sequence_parallel = sp
recipe.optim.config.lr = lr

run.run(recipe, executor=run.LocalExecutor(ntasks_per_node=8))
```
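
For reference, one way to probe whether activations are the culprit at 16k sequence length is to trade compute for memory via activation recomputation. The sketch below is untested against this recipe and assumes the model config exposes Megatron-LM's `TransformerConfig` recomputation fields; verify the attribute names against your NeMo version before relying on it:

```python
# Hedged sketch: enable full activation recomputation to shrink activation
# memory. Field names follow Megatron-LM's TransformerConfig; confirm they
# exist on recipe.model.config in your NeMo version.
recipe.model.config.recompute_granularity = "full"
recipe.model.config.recompute_method = "uniform"
recipe.model.config.recompute_num_layers = 1  # recompute every layer's activations
```

Enabling sequence parallelism is another common lever against activation memory, but it only takes effect with tensor parallelism greater than 1, so it is a no-op with tp_size = 1 as configured here.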
