Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions examples/qwen3/conf/train/0_6b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ model:
kv_channels: 128
num_attention_heads: 16
num_query_groups: 8 # num_key_value_heads
group_query_attention: true
seq_length: 4096
max_position_embeddings: 40960
norm_epsilon: 1e-6
Expand Down Expand Up @@ -81,6 +82,7 @@ data:
split: 1
no_mmap_bin_files: true
tokenizer:
legacy_tokenizer: true
tokenizer_type: QwenTokenizerFS
tokenizer_path: Qwen3-0.6B
vocab_size: 151936
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index 1120c7529..190fac52b 100644
index 1120c7529..4cad66e65 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -67,6 +67,7 @@ def add_megatron_arguments(parser: argparse.ArgumentParser):
Expand Down Expand Up @@ -141,17 +141,6 @@ index 1120c7529..190fac52b 100644

if args.hierarchical_context_parallel_sizes:
from numpy import prod
@@ -433,8 +443,8 @@ def validate_args(args, defaults={}):
assert args.hierarchical_context_parallel_sizes is not None, \
"--hierarchical-context-parallel-sizes must be set when a2a+p2p is used in cp comm"

- if args.expert_tensor_parallel_size is None:
- args.expert_tensor_parallel_size = args.tensor_model_parallel_size
+ if args.expert_tensor_parallel_size is None:
+ args.expert_tensor_parallel_size = args.tensor_model_parallel_size

# Deprecated arguments.
assert args.batch_size is None, '--batch-size argument is no longer ' \
@@ -530,6 +540,7 @@ def validate_args(args, defaults={}):
if args.virtual_pipeline_model_parallel_size == 1:
args.virtual_pipeline_model_parallel_size = None
Expand Down
1 change: 1 addition & 0 deletions tools/checkpoint/aquila/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def load_args_hf2mg(args):
args.consumed_train_samples = 0
args.consumed_valid_samples = 0
args.norm_has_bias = False
args.tokenizer_type = "Qwen2TokenizerFS"


def save_args_mg2hf(args):
Expand Down
4 changes: 3 additions & 1 deletion tools/checkpoint/aquila/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@ def get_hf_model(dtype, model_path=None, config=None):


def get_mg_model(dtype, pre_process, post_process):
from gpt_builders import gpt_builder

from flagscale.train.train_gpt import model_provider

s_time = time.time()
model = model_provider(pre_process, post_process).to(dtype)
model = model_provider(gpt_builder, pre_process, post_process).to(dtype)
print("> build megatron model elapsed time:", time.time() - s_time)
return model
3 changes: 2 additions & 1 deletion tools/checkpoint/deepseek_v3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def get_hf_model(dtype, model_path=None, config=None):


def get_mg_model(dtype, pre_process, post_process):
from gpt_builders import gpt_builder
from pretrain_gpt import model_provider

s_time = time.time()
model = model_provider(pre_process, post_process).to(dtype)
model = model_provider(gpt_builder, pre_process, post_process).to(dtype)
print("> build megatron model elapsed time:", time.time() - s_time)
return model
4 changes: 3 additions & 1 deletion tools/checkpoint/mixtral/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ def get_hf_model(dtype, model_path=None, config=None):


def get_mg_model(dtype, pre_process, post_process):
from gpt_builders import gpt_builder

from flagscale.train.train_gpt import model_provider

s_time = time.time()
model = model_provider(pre_process, post_process).to(dtype)
model = model_provider(gpt_builder, pre_process, post_process).to(dtype)
print("> build megatron model elapsed time:", time.time() - s_time)
return model
4 changes: 3 additions & 1 deletion tools/checkpoint/qwen3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ def get_hf_model(dtype, model_path=None, config=None):


def get_mg_model(dtype, pre_process, post_process):
from gpt_builders import gpt_builder

from flagscale.train.train_gpt import model_provider

s_time = time.time()
model = model_provider(pre_process, post_process).to(dtype)
model = model_provider(gpt_builder, pre_process, post_process).to(dtype)
print("> build megatron model elapsed time:", time.time() - s_time)
return model
Loading