@@ -288,7 +288,6 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim
 
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
1 change: 0 additions & 1 deletion torchtitan/experiments/deterministic_vllm_rl/simple_rl.py
@@ -332,7 +332,6 @@ def load_model(checkpoint_path: str, model_path: str, use_vllm_compat: bool = Tr
         max_seq_len=getattr(hf_config, "max_position_embeddings", 32768),
         qk_norm=True,
         depth_init=True,
-        eos_id=getattr(hf_config, "eos_token_id", 151645),
     )
 
     # state_dict is in standard TorchTitan format (w1, w2, w3)
1 change: 0 additions & 1 deletion torchtitan/experiments/transformers_backend/model/args.py
@@ -54,7 +54,6 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs):
             "n_kv_heads": "num_key_value_heads",
             "norm_eps": "rms_norm_eps",
             "max_seq_len": "max_position_embeddings",
-            "eos_id": "eos_token_id",
         }
     }
 
1 change: 0 additions & 1 deletion torchtitan/models/llama3/model/args.py
@@ -45,7 +45,6 @@ class TransformerModelArgs(BaseModelArgs):
 
     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 0
 
     def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         seq_len = job_config.training.seq_len
1 change: 0 additions & 1 deletion torchtitan/models/qwen3/model/args.py
@@ -38,7 +38,6 @@ class Qwen3ModelArgs(BaseModelArgs):
 
     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 151645
 
     enable_weight_tying: bool = False
 
1 change: 0 additions & 1 deletion torchtitan/models/qwen3/model/model.py
@@ -384,7 +384,6 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim
 
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
31 changes: 31 additions & 0 deletions torchtitan/models/utils.py
@@ -468,3 +468,34 @@ def get_moe_model_nparams_and_flops(
         nparams = nparams - nparams_embedding
 
     return nparams, num_flops_per_token
+
+
+def validate_tokenizer_model_alignment(
+    tokenizer: "BaseTokenizer | None",
+    model_args: "BaseModelArgs",
+) -> None:
+    """
+    Validate that tokenizer configuration is compatible with model configuration.
+
+    Args:
+        tokenizer: Tokenizer instance to validate. Can be None.
+        model_args: Model arguments object containing configuration to validate against.
+
+    Raises:
+        ValueError: If tokenizer vocab_size exceeds model vocab_size, which would
+            cause index out of bounds errors during training.
+    """
+    if tokenizer is None:
+        return
+
+    if hasattr(model_args, "vocab_size"):
+        tokenizer_vocab_size = tokenizer.get_vocab_size()
+        model_vocab_size = model_args.vocab_size
+        if model_vocab_size < tokenizer_vocab_size:
+            raise ValueError(
+                f"Model vocab_size ({model_vocab_size}) is smaller than "
+                f"tokenizer vocab_size ({tokenizer_vocab_size}). "
+                f"This will cause index out of bounds errors during training. "
+                f"The model's embedding layer must be at least as large as the "
+                f"tokenizer's vocabulary size."
+            )
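A minimal, self-contained sketch of how the new `validate_tokenizer_model_alignment` helper behaves. It assumes only what the hunk above shows (a tokenizer exposing `get_vocab_size()` and model args carrying `vocab_size`); the `Dummy*` classes and the vocab sizes are hypothetical stand-ins, not torchtitan code:

```python
# Sketch only: DummyTokenizer / DummyModelArgs are hypothetical stand-ins that
# expose just the two things the validator reads.
from torchtitan.models.utils import validate_tokenizer_model_alignment


class DummyTokenizer:
    def __init__(self, vocab_size: int):
        self._vocab_size = vocab_size

    def get_vocab_size(self) -> int:
        return self._vocab_size


class DummyModelArgs:
    def __init__(self, vocab_size: int):
        self.vocab_size = vocab_size


# Embedding table at least as large as the tokenizer vocab: returns silently.
validate_tokenizer_model_alignment(DummyTokenizer(32_000), DummyModelArgs(32_768))

# Tokenizer vocab larger than the model's vocab_size: fails fast with a clear message.
try:
    validate_tokenizer_model_alignment(DummyTokenizer(50_304), DummyModelArgs(32_000))
except ValueError as err:
    print(err)
```

Because the helper returns early when the tokenizer is `None` and skips the check when `model_args` has no `vocab_size` attribute, it is safe to call unconditionally from `Trainer.__init__`, which is what the `torchtitan/train.py` change below does.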
3 changes: 3 additions & 0 deletions torchtitan/train.py
@@ -25,6 +25,7 @@
 )
 from torchtitan.config import ConfigManager, JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims, utils as dist_utils
+from torchtitan.models.utils import validate_tokenizer_model_alignment
 from torchtitan.protocols.model_converter import build_model_converters
 from torchtitan.tools import utils
 from torchtitan.tools.logging import init_logger, logger
@@ -134,6 +135,8 @@ def __init__(self, job_config: JobConfig):
         model_args.update_from_config(job_config)
         self.model_args = model_args
 
+        validate_tokenizer_model_alignment(self.tokenizer, model_args)
+
         logger.info(
             f"Building {job_config.model.name} {job_config.model.flavor} with {model_args}"
         )
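For context, a short sketch (with made-up sizes, not taken from the PR) of the failure mode this startup check front-runs: a token id produced by a larger tokenizer landing outside the model's embedding table.

```python
# Made-up sizes; illustrates the "index out of bounds" error the docstring refers to.
import torch
import torch.nn as nn

vocab_size = 32_000                               # model_args.vocab_size
tok_embeddings = nn.Embedding(vocab_size, 16)     # stands in for the model's embedding
bad_batch = torch.tensor([[7, 31_999, 32_005]])   # 32_005 came from a larger tokenizer

try:
    tok_embeddings(bad_batch)
except IndexError as err:
    # On CPU this surfaces as "index out of range in self"; on GPU it can show up
    # as a harder-to-debug device-side assert partway through training. The new
    # validator turns it into an immediate, explicit ValueError at startup instead.
    print(err)
```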