```diff
@@ -2,7 +2,7 @@ defaults: ../../distillation_math.yaml
 distillation:
   num_prompts_per_step: 64
   max_num_steps: 20
-  val_batch_size: 32
+  val_batch_size: 256
   val_period: 10
   max_val_samples: 256
   loss_fn:
@@ -11,43 +11,15 @@ checkpointing:
   checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-dynamicbatch
 policy:
   model_name: Qwen/Qwen3-4B-Base
-  train_global_batch_size: 32
-  generation_batch_size: 32
   dtensor_cfg:
     context_parallel_size: 1
   make_sequence_length_divisible_by: 2
-  scheduler:
-    - name: torch.optim.lr_scheduler.LinearLR
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 20
-    - name: torch.optim.lr_scheduler.ConstantLR
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones:
-        - 20
 teacher:
   model_name: Qwen/Qwen3-32B
-  train_global_batch_size: 32
-  generation_batch_size: 32
   dtensor_cfg:
     tensor_parallel_size: 8
     context_parallel_size: 1
   make_sequence_length_divisible_by: 2
-  scheduler:
-    - name: torch.optim.lr_scheduler.LinearLR
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 20
-    - name: torch.optim.lr_scheduler.ConstantLR
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones:
-        - 20
 logger:
   log_dir: logs/distillation-qwen3-32b-to-4b-base-dynamicbatch
   wandb:
```
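The dynamicbatch recipe now inherits its batch sizes and scheduler from `../../distillation_math.yaml`; the removed overrides presumably duplicate those defaults. The deleted block is still worth reading as a sketch of NeMo-RL's sequential-scheduler convention: the list entries run one after another, and the trailing `milestones` entry marks the step where the hand-off happens (that reading of `milestones` is my interpretation, not stated in this diff).

```yaml
# Warm up for 20 steps, then hold the learning rate constant.
scheduler:
  - name: torch.optim.lr_scheduler.LinearLR    # steps 0-20: scale LR from 0.1x to 1.0x
    kwargs:
      start_factor: 0.1
      end_factor: 1.0
      total_iters: 20
  - name: torch.optim.lr_scheduler.ConstantLR  # after the milestone: hold at 1.0x
    kwargs:
      factor: 1.0
      total_iters: 10000000000                 # effectively "forever"
  - milestones:
      - 20                                     # switch to the second scheduler at step 20
```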
```diff
@@ -2,58 +2,22 @@ defaults: ../../distillation_math.yaml
 distillation:
   num_prompts_per_step: 64
   max_num_steps: 500
-  val_batch_size: 32
+  val_batch_size: 512
   val_period: 50
   max_val_samples: 256
   loss_fn:
     kl_type: reverse
 checkpointing:
   checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-long
-  save_period: 50
+  save_period: 10
 policy:
   model_name: Qwen/Qwen3-4B-Base
-  train_global_batch_size: 32
-  generation_batch_size: 32
-  max_total_sequence_length: 32768
-  dynamic_batching:
-    enabled: false
   make_sequence_length_divisible_by: 2
-  optimizer:
-    kwargs:
-      lr: 1.0e-05
-  scheduler:
-    - name: torch.optim.lr_scheduler.LinearLR
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 100
-    - name: torch.optim.lr_scheduler.CosineAnnealingLR
-      kwargs:
-        T_max: 900
-        eta_min: 1.0e-07
-    - milestones:
-        - 100
+  max_total_sequence_length: 20480
   generation:
     vllm_cfg:
       tensor_parallel_size: 2
 teacher:
   model_name: Qwen/Qwen3-32B
-  train_global_batch_size: 32
-  generation_batch_size: 32
-  max_total_sequence_length: 32768
-  dynamic_batching:
-    enabled: false
   make_sequence_length_divisible_by: 2
-  optimizer:
-    kwargs:
-      lr: 1.0e-05
-  scheduler:
-    - name: torch.optim.lr_scheduler.LinearLR
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 100
-    - name: torch.optim.lr_scheduler.CosineAnnealingLR
-      kwargs:
-        T_max: 900
-        eta_min: 1.0e-07
-    - milestones:
-        - 100
+  max_total_sequence_length: 20480
 logger:
   log_dir: logs/distillation-qwen3-32b-to-4b-base-long
   wandb:
```
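Net effect for the long recipe: validation batches grow from 32 to 512, checkpoints are written every 10 steps instead of every 50, the maximum sequence length drops from 32768 to 20480, and the per-recipe optimizer/scheduler overrides are gone. A sketch of the slimmed file, reconstructed only from the context and `+` lines above (anything not listed is assumed to come from the `distillation_math.yaml` defaults):

```yaml
defaults: ../../distillation_math.yaml
distillation:
  num_prompts_per_step: 64
  max_num_steps: 500
  val_batch_size: 512
  val_period: 50
  max_val_samples: 256
  loss_fn:
    kl_type: reverse
checkpointing:
  checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-long
  save_period: 10
policy:
  model_name: Qwen/Qwen3-4B-Base
  make_sequence_length_divisible_by: 2
  max_total_sequence_length: 20480
  generation:
    vllm_cfg:
      tensor_parallel_size: 2
teacher:
  model_name: Qwen/Qwen3-32B
  make_sequence_length_divisible_by: 2
  max_total_sequence_length: 20480
# logger/wandb settings follow unchanged (truncated in the diff view)
```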
```diff
@@ -0,0 +1,37 @@
+defaults: ../../distillation_math.yaml
+distillation:
+  num_prompts_per_step: 64
+  max_num_steps: 20
+  val_batch_size: 256
+  val_period: 10
+  max_val_samples: 256
+  loss_fn:
+    kl_type: reverse
+checkpointing:
+  checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-seqpack
+policy:
+  model_name: Qwen/Qwen3-4B-Base
+  dtensor_cfg:
+    context_parallel_size: 1
+  dynamic_batching:
+    enabled: false
+  sequence_packing:
+    enabled: true
+  make_sequence_length_divisible_by: 2
+teacher:
+  model_name: Qwen/Qwen3-32B
+  dtensor_cfg:
+    tensor_parallel_size: 8
+    context_parallel_size: 1
+  dynamic_batching:
+    enabled: false
+  sequence_packing:
+    enabled: true
+  make_sequence_length_divisible_by: 2
+logger:
+  log_dir: logs/distillation-qwen3-32b-to-4b-base-seqpack
+  wandb:
+    project: nemo-rl
+    name: distillation-qwen3-32b-to-4b-base-seqpack
+cluster:
+  num_nodes: 2
```
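The new seqpack recipe is the counterpart of the dynamicbatch one: it explicitly disables dynamic batching and enables sequence packing on both the policy and the teacher. My assumption (not stated in this diff) is that NeMo-RL treats the two batching strategies as mutually exclusive, so a recipe should flip them as a pair:

```yaml
# Minimal toggle sketch: enable exactly one of the two batching strategies.
policy:
  dynamic_batching:
    enabled: false   # off: no bucketing of variable-length sequences into microbatches
  sequence_packing:
    enabled: true    # on: concatenate short sequences into full-length rows instead
```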
```diff
@@ -2,7 +2,7 @@ defaults: ../../distillation_math.yaml
 distillation:
   num_prompts_per_step: 64
   max_num_steps: 20
-  val_batch_size: 32
+  val_batch_size: 256
   val_period: 10
   max_val_samples: 256
   loss_fn:
@@ -12,29 +12,10 @@ checkpointing:
   save_period: 50
 policy:
   model_name: Qwen/Qwen3-4B-Base
-  train_global_batch_size: 32
-  generation_batch_size: 32
   dtensor_cfg:
     tensor_parallel_size: 8
     context_parallel_size: 1
-  dynamic_batching:
-    enabled: false
   make_sequence_length_divisible_by: 2
-  optimizer:
-    kwargs:
-      lr: 1.0e-05
-  scheduler:
-    - name: torch.optim.lr_scheduler.LinearLR
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 100
-    - name: torch.optim.lr_scheduler.CosineAnnealingLR
-      kwargs:
-        T_max: 900
-        eta_min: 1.0e-07
-    - milestones:
-        - 100
   generation:
     colocated:
       enabled: false
@@ -43,29 +24,10 @@ policy:
         num_nodes: 1
 teacher:
   model_name: Qwen/Qwen3-32B
-  train_global_batch_size: 32
-  generation_batch_size: 32
   dtensor_cfg:
     tensor_parallel_size: 8
     context_parallel_size: 1
-  dynamic_batching:
-    enabled: false
   make_sequence_length_divisible_by: 2
-  optimizer:
-    kwargs:
-      lr: 1.0e-05
-  scheduler:
-    - name: torch.optim.lr_scheduler.LinearLR
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 100
-    - name: torch.optim.lr_scheduler.CosineAnnealingLR
-      kwargs:
-        T_max: 900
-        eta_min: 1.0e-07
-    - milestones:
-        - 100
   generation:
     colocated:
       enabled: false
```
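Unlike the other recipes, this one keeps `generation.colocated.enabled: false` for both models, meaning vLLM generation runs on resources separate from training; the `num_nodes: 1` context line is presumably the size of that dedicated pool. A sketch of the visible generation block (the `resources` nesting is an assumption; only `num_nodes: 1` appears in the diff):

```yaml
policy:
  generation:
    colocated:
      enabled: false    # run vLLM on its own GPUs rather than sharing with training
      resources:
        num_nodes: 1    # dedicated generation node(s), per the context line above
```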

This file was deleted.

This file was deleted.
