diff --git a/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh new file mode 100644 index 0000000000..014ee567fe --- /dev/null +++ b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh @@ -0,0 +1,120 @@ +set -x + +# Colocated DAPO training+generation for Qwen3-4B-Base on DAPO training data with Megatron. +# bash examples/algorithms/dapo/prepare_dapo_data.sh +# bash examples/megatron/run_megatron_dapo_qwen3_4b.sh + +MODEL_NAME="Qwen/Qwen3-4B-Base" +DATA_DIR="$HOME/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet" +TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet" +NUM_NODES=1 +NUM_GPUS_PER_NODE=8 +NUM_INFERENCE_ENGINES=4 +INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=2 +LOGGER="wandb" # change to "console" to print to stdout + +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +# use token mean loss reduction +LOSS_REDUCTION="token_mean" +# applies overlong filtering (but not soft overlong punishment) +APPLY_OVERLONG_FILTERING=true +# apply soft overlong punishment with custom trainer impl in main_dapo.py +OVERLONG_BUFFER_LEN=$((1024 * 4)) +OVERLONG_BUFFER_PENALTY_FACTOR=1.0 + +# other DAPO parameters +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 8)) + +# repro run parameters +TRAIN_BATCH_SIZE=512 +MINI_BATCH_SIZE=32 +N_SAMPLES_PER_PROMPT=16 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # cuda graphs can cause some instability +LR=1e-6 + +# megatron config +MEGATRON_TP=4 +MEGATRON_PP=2 +MEGATRON_CP=1 +MEGATRON_EP=1 +MEGATRON_ETP=null + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=true + +uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + 
+trainer.algorithm.overlong_buffer.len=$OVERLONG_BUFFER_LEN \ + +trainer.algorithm.overlong_buffer.penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + generator.eval_sampling_params.temperature=$TEMPERATURE \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + trainer.policy.model.path="$MODEL_NAME" \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.num_inference_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.epochs=20 \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=8 \ + trainer.micro_train_batch_size_per_gpu=8 \ + trainer.ckpt_interval=10 \ + trainer.max_prompt_length=$MAX_PROMPT_LENGTH \ + 
generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.optimizer_config.lr=$LR \ + trainer.policy.optimizer_config.num_warmup_steps=160 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + generator.backend=vllm \ + generator.run_engines_locally=true \ + generator.weight_sync_backend=nccl \ + generator.async_engine=false \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.gpu_memory_utilization=0.8 \ + trainer.logger="$LOGGER" \ + trainer.project_name="dapo_aime" \ + trainer.run_name="dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}" \ + trainer.export_path="$HOME/exports/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}" \ + trainer.hf_save_interval=25 \ + trainer.resume_mode=latest \ + trainer.max_ckpts_to_keep=3 \ + trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}" \ + "$@" # quoted so each extra CLI override stays a single argument \ No newline at end of file diff --git a/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh new file mode 100644 index 0000000000..25dc8e6d30 --- /dev/null +++ b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh @@ -0,0 +1,128 @@ +set -x + +# Colocated DAPO training+generation for Qwen3-4B-Base on DAPO training data with Megatron and LoRA. 
+# bash examples/algorithms/dapo/prepare_dapo_data.sh +# bash examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh + +MODEL_NAME="Qwen/Qwen3-4B-Base" +DATA_DIR="$HOME/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet" +TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet" +NUM_NODES=1 +NUM_GPUS_PER_NODE=8 +NUM_INFERENCE_ENGINES=4 +INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=2 +LOGGER="wandb" # change to "console" to print to stdout + +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +# use token mean loss reduction +LOSS_REDUCTION="token_mean" +# applies overlong filtering (but not soft overlong punishment) +APPLY_OVERLONG_FILTERING=true +# apply soft overlong punishment with custom trainer impl in main_dapo.py +OVERLONG_BUFFER_LEN=$((1024 * 4)) +OVERLONG_BUFFER_PENALTY_FACTOR=1.0 + +# other DAPO parameters +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 8)) + +# repro run parameters +TRAIN_BATCH_SIZE=512 +MINI_BATCH_SIZE=32 +N_SAMPLES_PER_PROMPT=16 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # cuda graphs can cause some instability +LR=3e-5 + +# megatron config +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=1 +MEGATRON_ETP=null + +# lora config +LORA_RANK=32 +LORA_ALPHA=64 +LORA_A_INIT_METHOD="kaiming" + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=true + +uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + +trainer.algorithm.overlong_buffer.len=$OVERLONG_BUFFER_LEN \ + +trainer.algorithm.overlong_buffer.penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + 
generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + generator.eval_sampling_params.temperature=$TEMPERATURE \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + trainer.policy.model.path="$MODEL_NAME" \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.num_inference_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.policy.model.lora.init_method=$LORA_A_INIT_METHOD \ + trainer.epochs=20 \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=true \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=8 \ + trainer.micro_train_batch_size_per_gpu=8 \ + trainer.ckpt_interval=10 \ + trainer.max_prompt_length=$MAX_PROMPT_LENGTH \ + generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.optimizer_config.lr=$LR \ + 
trainer.policy.optimizer_config.num_warmup_steps=160 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + generator.backend=vllm \ + generator.run_engines_locally=true \ + generator.weight_sync_backend=nccl \ + generator.async_engine=false \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.gpu_memory_utilization=0.8 \ + trainer.logger="$LOGGER" \ + trainer.project_name="dapo_aime" \ + trainer.run_name="dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \ + trainer.export_path="$HOME/exports/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \ + trainer.hf_save_interval=300 \ + trainer.resume_mode=latest \ + trainer.max_ckpts_to_keep=3 \ + trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \ + "$@" # quoted so each extra CLI override stays a single argument \ No newline at end of file diff --git a/skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh new file mode 100644 index 0000000000..02cf41c70d --- /dev/null +++ b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh @@ -0,0 +1,76 @@ +set -x + +# Colocated GRPO training+generation for Qwen3-0.6B on GSM8K with Megatron and LoRA. 
+ +# uv run examples/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k +# export WANDB_API_KEY= +# bash examples/megatron/run_megatron_lora_qwen3-0.6b.sh + +DATA_DIR="$HOME/data/gsm8k" +NUM_GPUS=8 +LOGGER="wandb" # change to "console" to print to stdout +MODEL_NAME="Qwen/Qwen3-0.6B" + +INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron + +MEGATRON_TP=1 +MEGATRON_PP=1 +MEGATRON_CP=1 + +# LoRA configuration +LORA_RANK=32 +LORA_ALPHA=64 +LORA_A_INIT_METHOD="kaiming" + + +uv run --isolated --extra mcore -m skyrl_train.entrypoints.main_base \ + data.train_data="['$DATA_DIR/train.parquet']" \ + data.val_data="['$DATA_DIR/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.policy.model.path=$MODEL_NAME \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ + trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ + generator.num_inference_engines=$NUM_GPUS \ + generator.inference_engine_tensor_parallel_size=1 \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.ref.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.ref.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.ref.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.policy.model.lora.init_method=$LORA_A_INIT_METHOD \ + trainer.gradient_checkpointing=true \ + trainer.policy.model.lora.target_modules="all-linear" \ + trainer.use_sample_packing=true \ + trainer.epochs=20 \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=128 \ + trainer.policy_mini_batch_size=64 \ + 
trainer.micro_forward_batch_size_per_gpu=4 \ + trainer.micro_train_batch_size_per_gpu=4 \ + trainer.ckpt_interval=10 \ + trainer.max_prompt_length=512 \ + generator.sampling_params.max_generate_length=1024 \ + trainer.policy.optimizer_config.lr=1.0e-5 \ + trainer.algorithm.use_kl_loss=true \ + generator.backend=$INFERENCE_BACKEND \ + generator.run_engines_locally=true \ + generator.weight_sync_backend=nccl \ + generator.async_engine=true \ + generator.batched=true \ + environment.env_class=gsm8k \ + generator.n_samples_per_prompt=5 \ + generator.gpu_memory_utilization=0.6 \ + trainer.logger="$LOGGER" \ + trainer.project_name="gsm8k_megatron" \ + trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_${MODEL_NAME}_lora_r${LORA_RANK}_a${LORA_ALPHA}" \ + trainer.resume_mode=null \ + trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \ + "$@" # quoted so each extra CLI override stays a single argument \ No newline at end of file diff --git a/skyrl-train/examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh new file mode 100644 index 0000000000..6c2f3a899b --- /dev/null +++ b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh @@ -0,0 +1,103 @@ +set -x + +# Colocated GRPO training+generation for Qwen3-30B-A3B on GSM8K with Megatron and LoRA. 
+# Should run on 1 node of 8xH100s + +# uv run examples/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k +# export WANDB_API_KEY= +# bash examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh + +DATA_DIR="$HOME/data/gsm8k" +LOGGER="wandb" # change to "console" to print to stdout +MODEL_NAME="Qwen/Qwen3-30B-A3B" + +INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron + +NUM_NODES=1 +NUM_GPUS=8 + +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=8 +MEGATRON_ETP=1 + +NUM_INFERENCE_ENGINES=1 +INFERENCE_ENGINE_TP=8 +FLASH_ATTN=true + +# Megatron gradient checkpointing config +RECOMPUTE_GRANULARITY="full" +RECOMPUTE_METHOD="uniform" +RECOMPUTE_NUM_LAYERS=1 + +# LoRA configuration +LORA_RANK=64 +LORA_ALPHA=64 +LORA_A_INIT_METHOD="kaiming" + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=true + +uv run --isolated --extra mcore -m skyrl_train.entrypoints.main_base \ + data.train_data="['$DATA_DIR/train.parquet']" \ + data.val_data="['$DATA_DIR/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.policy.model.path=$MODEL_NAME \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.ref_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ + trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ + generator.num_inference_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TP \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.ref.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.ref.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.ref.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + 
trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.ref.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.ref.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.policy.model.lora.init_method=$LORA_A_INIT_METHOD \ + trainer.policy.megatron_config.transformer_config_kwargs.recompute_granularity=$RECOMPUTE_GRANULARITY \ + trainer.policy.megatron_config.transformer_config_kwargs.recompute_method=$RECOMPUTE_METHOD \ + trainer.policy.megatron_config.transformer_config_kwargs.recompute_num_layers=$RECOMPUTE_NUM_LAYERS \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + trainer.epochs=20 \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=128 \ + trainer.policy_mini_batch_size=64 \ + trainer.micro_forward_batch_size_per_gpu=8 \ + trainer.micro_train_batch_size_per_gpu=8 \ + trainer.ckpt_interval=10 \ + trainer.max_prompt_length=512 \ + generator.sampling_params.max_generate_length=1024 \ + trainer.policy.optimizer_config.lr=1.0e-5 \ + trainer.algorithm.use_kl_loss=true \ + generator.backend=$INFERENCE_BACKEND \ + generator.run_engines_locally=true \ + generator.weight_sync_backend=nccl \ + generator.async_engine=true \ + generator.batched=true \ + environment.env_class=gsm8k \ + generator.n_samples_per_prompt=5 \ + generator.gpu_memory_utilization=0.6 \ + trainer.logger="$LOGGER" \ + trainer.project_name="gsm8k_megatron" \ + trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_qwen3_30b_a3b" \ + trainer.resume_mode=null \ + 
trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \ + "$@" # quoted so each extra CLI override stays a single argument \ No newline at end of file diff --git a/skyrl-train/pyproject.toml b/skyrl-train/pyproject.toml index 93618f8bf6..56f8aee4bb 100644 --- a/skyrl-train/pyproject.toml +++ b/skyrl-train/pyproject.toml @@ -100,7 +100,7 @@ flashinfer-jit-cache = { index = "flashinfer-cu128", marker = "extra == 'vllm' o flashinfer-python = [ { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl", marker = "extra == 'sglang' and extra != 'mcore' and extra != 'vllm'" } ] -megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "22ef9ff9f9684ba2f2dbea14db974f5c31bbd683"} +megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "953aabf75c0500180dc14a6a76cf9e7e7c4baec7"} [project.optional-dependencies] diff --git a/skyrl-train/skyrl_train/config/megatron_config/policy.yaml b/skyrl-train/skyrl_train/config/megatron_config/policy.yaml index fe436cbb4e..3ba0dc8f95 100644 --- a/skyrl-train/skyrl_train/config/megatron_config/policy.yaml +++ b/skyrl-train/skyrl_train/config/megatron_config/policy.yaml @@ -21,6 +21,10 @@ torch_profiler_config: ranks: [] save_path: null +lora_config: + # see: https://docs.nvidia.com/nemo/megatron-bridge/0.2.0/apidocs/bridge/bridge.peft.lora.html for details - currently "lora" and "canonical_lora" are supported + lora_type: "lora" + # pass-through kwargs to Megatron's `OptimizerConfig` object # any overlapping arguments with those we attempt to resolve in trainer.policy.optimizer_config will be overridden by the values here # https://github.com/NVIDIA/Megatron-LM/blob/core_r0.13.0/megatron/core/optimizer/optimizer_config.py#L12 diff --git a/skyrl-train/skyrl_train/config/ppo_base_config.yaml b/skyrl-train/skyrl_train/config/ppo_base_config.yaml index 012b4e43da..b88d2c708c 100644 --- a/skyrl-train/skyrl_train/config/ppo_base_config.yaml +++ 
b/skyrl-train/skyrl_train/config/ppo_base_config.yaml @@ -27,12 +27,15 @@ trainer: model: path: "Qwen/Qwen2.5-1.5B-Instruct" lora: - rank: 0 - alpha: 16 - dropout: 0 - lora_sync_path: "/tmp/skyrl_lora_sync" - target_modules: "all-linear" - exclude_modules: null + rank: 0 + alpha: 16 + dropout: 0 + lora_sync_path: "/tmp/skyrl_lora_sync" + target_modules: "all-linear" + exclude_modules: null + # see https://huggingface.co/docs/peft/v0.18.0/en/package_reference/lora#peft.LoraConfig.init_lora_weights for supported initialization methods for FSDP + # for megatron, this is used for `lora_A_init_method`, and "xavier", "normal", "kaiming", and "zero" are supported + init_method: "kaiming" deepspeed_config: ${deepspeed_config.train} optimizer_config: lr: 1.0e-6 diff --git a/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py b/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py index 5aa11b1e07..6884feb2ca 100644 --- a/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py +++ b/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py @@ -50,12 +50,14 @@ def __init__( megatron_config, optimizer_config=None, seed: int = 42, + is_lora: bool = False, ) -> None: super().__init__() self.megatron_config = megatron_config self.optimizer_config = optimizer_config self.seed = seed self.hf_config = None # Set by the megatron worker once configs are initialized. + self.is_lora = is_lora # NOTE: Set Megatron dist checkpoint async backend to persistent to avoid `os.fork()`-ing # short-lived background workers, which does not work well with Ray. @@ -145,9 +147,9 @@ def save_checkpoint( # Extract base model. 
model: List[nn.Module] = model.actor_module assert len(model) == 1, "Megatron virtual pipeline parallel is not yet supported" - model = model[0] - if hasattr(model, "module"): - model = model.module + unwrapped_model = model[0] + while hasattr(unwrapped_model, "module"): + unwrapped_model = unwrapped_model.module # Create checkpoint directory if it doesn't exist. if node_local_rank == 0: @@ -158,8 +160,9 @@ def save_checkpoint( # Collect the sharded state dicts for model and optimizer, and full state dict for the scheduler. sharded_state_dict = {} - model_sharded_state_dict = model.sharded_state_dict() - sharded_state_dict["model"] = model_sharded_state_dict + model_sharded_state_dict = unwrapped_model.sharded_state_dict() + if not self.is_lora: + sharded_state_dict["model"] = model_sharded_state_dict if optimizer: sharded_state_dict["optimizer"] = optimizer.sharded_state_dict(model_sharded_state_dict) if scheduler: @@ -190,11 +193,43 @@ def save_checkpoint( hf_dir = os.path.join(work_dir, "huggingface") self.save_hf_configs(self.hf_config, hf_dir, tokenizer) + if self.is_lora: + self._save_lora_adapters(unwrapped_model, ckpt_dir) + dist.barrier() ckpt_base.async_calls.close() ckpt_base.async_calls = AsyncCallsQueue(persistent=True) self.print(f"Checkpoint successfully saved to {ckpt_dir}") + def _get_rank_path(self, ckpt_dir): + tp_rank = mpu.get_tensor_model_parallel_rank() + pp_rank = mpu.get_pipeline_model_parallel_rank() + cp_rank = mpu.get_context_parallel_rank() + dp_rank = mpu.get_data_parallel_rank() + ep_rank = mpu.get_expert_model_parallel_rank() + etp_rank = mpu.get_expert_tensor_parallel_rank() + + return os.path.join( + ckpt_dir, f"adapter_tp{tp_rank}_pp{pp_rank}_cp{cp_rank}_dp{dp_rank}_ep{ep_rank}_etp{etp_rank}.pt" + ) + + def _save_lora_adapters(self, model, ckpt_dir): + """Save LoRA adapters to checkpoint.""" + if not self.is_lora: + return + + assert isinstance(model, nn.Module), "Model must be a nn.Module" + + model_state_dict = {} + for name, 
param in model.named_parameters(): + if ".adapter" in name.lower(): + model_state_dict[name] = param.data + + with io.local_work_dir(ckpt_dir) as work_dir: + adapter_path = self._get_rank_path(work_dir) + torch.save({"model_state_dict": model_state_dict}, adapter_path) + self.print(f"Saved {len(model_state_dict)} LoRA adapter parameters to {adapter_path}") + def load_checkpoint( self, model: MegatronModelWrapper, @@ -212,13 +247,14 @@ def load_checkpoint( model: List[nn.Module] = model.actor_module assert len(model) == 1, "Megatron virtual pipeline parallel is not yet supported" unwrapped_model = model[0] - if hasattr(unwrapped_model, "module"): + while hasattr(unwrapped_model, "module"): unwrapped_model = unwrapped_model.module # Extract sharded state dicts. sharded_state_dict = {} model_sharded_state_dict = unwrapped_model.sharded_state_dict() - sharded_state_dict["model"] = model_sharded_state_dict + if not self.is_lora: + sharded_state_dict["model"] = model_sharded_state_dict if optimizer and load_optimizer_states: sharded_state_dict["optimizer"] = optimizer.sharded_state_dict(model_sharded_state_dict) if scheduler and load_lr_scheduler_states: @@ -233,13 +269,15 @@ def load_checkpoint( state_dict = dist_checkpointing.load( sharded_state_dict=sharded_state_dict, checkpoint_dir=read_dir, sharded_strategy=load_strategy ) - - # Load the model, optimizer, and scheduler state dicts. - assert ( - "model" in state_dict - ), f"Model state dict not found in checkpoint loaded from {ckpt_dir}. Available keys: {state_dict.keys()}" - model[0].load_state_dict(state_dict["model"], strict=load_module_strict) - self.print("Loaded model state dict.") + if not self.is_lora: + # Load the model, optimizer, and scheduler state dicts. + assert ( + "model" in state_dict + ), f"Model state dict not found in checkpoint loaded from {ckpt_dir}. 
Available keys: {state_dict.keys()}" + model[0].load_state_dict(state_dict["model"], strict=load_module_strict) + self.print("Loaded model state dict.") + else: + self._load_lora_adapters(unwrapped_model, ckpt_dir) if optimizer and load_optimizer_states: assert ( @@ -261,6 +299,22 @@ def load_checkpoint( return ckpt_dir, {} + def _load_lora_adapters(self, model, ckpt_dir): + """Load LoRA adapters from checkpoint.""" + # TODO (erictang000): Update this logic once LoRA checkpointing is upstreamed to Megatron-Bridge + if not self.is_lora: + return + + assert isinstance(model, nn.Module), "Model must be a nn.Module" + + with io.local_read_dir(ckpt_dir) as read_dir: + adapter_path = self._get_rank_path(read_dir) + state_dict = torch.load(adapter_path, map_location="cpu") + _, unexpected = model.load_state_dict(state_dict["model_state_dict"], strict=False) + if len(unexpected) > 0: + raise ValueError(f"Unexpected keys in LoRA adapter state dict: {unexpected}") + self.print(f"Loaded {len(state_dict['model_state_dict'])} LoRA adapters from {adapter_path}.") + def save_hf_model(self, bridge, model: MegatronModelWrapper, output_dir: str, tokenizer=None, **kwargs) -> None: # Create checkpoint directory if it doesn't exist. 
if self.is_rank_0(): diff --git a/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py b/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py index f69929368c..f09504ecaa 100644 --- a/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py +++ b/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py @@ -158,13 +158,33 @@ def offload_megatron_model_to_cpu(models): model_chunk_all_buffers = [model_chunk.buffers, model_chunk.expert_parallel_buffers] for buffers in model_chunk_all_buffers: for buffer in buffers: - # offload parameters + # offload parameters from fused Megatron buffers if buffer.param_data.storage().size() > 0: buffer.param_data.cpu_data = buffer.param_data.data.cpu().pin_memory() buffer.param_data_size = buffer.param_data.storage().size() buffer.param_data.storage().resize_(0) assert buffer.param_data_size == buffer.param_data.cpu_data.storage().size() + + # lora aware offloading - if using lora,offload non-lora base weights, since megatron fused buffers do not include the HF/bridge "to_wrap" weights + for name, param in model_chunk.named_parameters(): + if ( + param.is_cuda + and not param.requires_grad + and "adapter" not in name + and param.data.storage().size() > 0 + ): + # Always refresh the CPU copy and release GPU storage. + cpu_tensor = param.data.detach().cpu().pin_memory() + param._offload_cpu_data = cpu_tensor + param._offload_cuda_numel = param.data.numel() + # Release GPU storage while keeping dtype/device metadata. + empty_cuda = torch.empty( + 0, + dtype=param.data.dtype, + device=param.data.device, + ) + param.data = empty_cuda else: # we need this for ref module for _, param in model_chunk.named_parameters(): @@ -184,6 +204,13 @@ def load_megatron_model_to_gpu(models): buffer.param_data.storage().resize_(buffer.param_data_size) # copy data from cpu to cuda buffer.param_data.copy_(buffer.param_data.cpu_data, non_blocking=True) + + # Restore any LoRA-frozen base weights that were offloaded above. 
+ device_id = torch.cuda.current_device() + for name, param in model_chunk.named_parameters(): + if hasattr(param, "_offload_cpu_data") and param.data.storage().size() == 0: + restored = param._offload_cpu_data.to(device_id, non_blocking=True) + param.data = restored else: # we need this for ref module device_id = torch.cuda.current_device() diff --git a/skyrl-train/skyrl_train/entrypoints/main_base.py b/skyrl-train/skyrl_train/entrypoints/main_base.py index 1604c61c85..cebc28dd65 100644 --- a/skyrl-train/skyrl_train/entrypoints/main_base.py +++ b/skyrl-train/skyrl_train/entrypoints/main_base.py @@ -61,7 +61,7 @@ def create_ray_wrapped_inference_engines_from_config(cfg: DictConfig, colocate_p } # Conditionally add LoRA parameters if LoRA is enabled - if cfg.trainer.policy.model.lora.rank > 0: + if cfg.trainer.policy.model.lora.rank > 0 and cfg.trainer.strategy != "megatron": engine_kwargs["enable_lora"] = True engine_kwargs["max_lora_rank"] = cfg.trainer.policy.model.lora.rank engine_kwargs["sleep_level"] = 1 diff --git a/skyrl-train/skyrl_train/model_wrapper.py b/skyrl-train/skyrl_train/model_wrapper.py index 028cb86ac9..8767e1201b 100644 --- a/skyrl-train/skyrl_train/model_wrapper.py +++ b/skyrl-train/skyrl_train/model_wrapper.py @@ -35,6 +35,7 @@ class HFModelWrapper(nn.Module): lora_rank (int, optional): Rank for LoRA adaptation. Defaults to 0. lora_alpha (int, optional): Alpha parameter for LoRA. Defaults to 16. lora_dropout (float, optional): Dropout rate for LoRA layers. Defaults to 0. + lora_init_method (str, optional): Initialization method for LoRA layers. Defaults to "kaiming". target_modules (list, optional): List of target modules for applying LoRA. Defaults to None. exclude_modules (list, optional): List of modules to exclude from applying LoRA. Defaults to None. ds_config (dict, optional): Configuration for DeepSpeed, enabling model partitioning across multiple GPUs. Defaults to None. 
@@ -54,6 +55,7 @@ def __init__( lora_rank=0, lora_alpha=16, lora_dropout=0, + lora_init_method="kaiming", target_modules=None, exclude_modules=None, ds_config=None, @@ -161,6 +163,7 @@ def __init__( exclude_modules=exclude_modules, lora_dropout=lora_dropout, bias="none", + init_lora_weights=True if lora_init_method == "kaiming" else lora_init_method, ) self.model = get_peft_model(self.model, lora_config) diff --git a/skyrl-train/skyrl_train/utils/utils.py b/skyrl-train/skyrl_train/utils/utils.py index 32b15e3fb4..6285696fb5 100644 --- a/skyrl-train/skyrl_train/utils/utils.py +++ b/skyrl-train/skyrl_train/utils/utils.py @@ -297,7 +297,11 @@ def validate_cfg(cfg: DictConfig): # LoRA enabled # Right now: assert generator backend must be vllm, training backend must be fsdp/fsdp2 assert cfg.generator.backend == "vllm", "LoRA enabled requires vLLM backend" - assert cfg.trainer.strategy in ("fsdp", "fsdp2"), "LoRA enabled requires fsdp/fsdp2 training backend" + assert cfg.trainer.strategy in ( + "fsdp", + "fsdp2", + "megatron", + ), "LoRA enabled requires fsdp/fsdp2/megatron training backend" if cfg.trainer.target_modules is not None: logger.warning( diff --git a/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py b/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py index 9aea864ac9..7c5853c38a 100644 --- a/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py +++ b/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py @@ -133,6 +133,7 @@ def init_model(self, model_path, num_training_steps: int = None): lora_rank=self.cfg.trainer.policy.model.lora.rank, lora_alpha=self.cfg.trainer.policy.model.lora.alpha, lora_dropout=self.cfg.trainer.policy.model.lora.dropout, + lora_init_method=self.cfg.trainer.policy.model.lora.init_method, target_modules=self.cfg.trainer.policy.model.lora.target_modules, exclude_modules=self.cfg.trainer.policy.model.lora.exclude_modules, sequence_parallel_size=self.cfg.trainer.policy.sequence_parallel_size, diff --git 
a/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py b/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py index 4b6424c5c4..cc566ef4f4 100644 --- a/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py +++ b/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py @@ -14,6 +14,8 @@ from omegaconf import OmegaConf from megatron.bridge import AutoBridge +from megatron.bridge.peft.lora import LoRA +from megatron.bridge.peft.canonical_lora import CanonicalLoRA import megatron.core.parallel_state as mpu from megatron.core.optimizer import DistributedOptimizer from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler @@ -184,7 +186,14 @@ def extract_weights(self, dtype: torch.dtype): class MegatronWorker: def init_configs( - self, model_path, megatron_config, model_config_kwargs, transformer_config_kwargs, bf16=True, flash_attn=False + self, + model_path, + megatron_config, + model_config_kwargs, + transformer_config_kwargs, + bf16=True, + flash_attn=False, + lora_config=None, ): """ Initialize the Megatron-Bridge bridge and provider objects + hf_config and tokenizer @@ -232,10 +241,51 @@ def init_configs( self.strategy.hf_config = hf_config self.tokenizer = tokenizer + def configure_lora(self, lora_config, lora_type: Optional[str] = "lora"): + if lora_type == "lora": + self.lora_cls = LoRA( + target_modules=( + ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] + if lora_config.target_modules == "all-linear" + else lora_config.target_modules + ), + dim=lora_config.rank, + alpha=lora_config.alpha, + dropout=lora_config.dropout, + lora_A_init_method=lora_config.init_method, + lora_B_init_method="zero", + exclude_modules=[] if lora_config.exclude_modules is None else lora_config.exclude_modules, + lora_dtype=torch.bfloat16 if self.cfg.trainer.bf16 else torch.float32, + ) + elif lora_type == "canonical_lora": + self.lora_cls = CanonicalLoRA( + target_modules=( + [ + "linear_q", + "linear_k", + "linear_v", + "linear_proj", 
+ "linear_fc1_up", + "linear_fc1_gate", + "linear_fc2", + ] + if lora_config.target_modules == "all-linear" + else lora_config.target_modules + ), + dim=lora_config.rank, + alpha=lora_config.alpha, + dropout=lora_config.dropout, + lora_A_init_method=lora_config.init_method, + lora_B_init_method="zero", + exclude_modules=[] if lora_config.exclude_modules is None else lora_config.exclude_modules, + ) + def make_megatron_module( self, wrap_with_ddp: bool = True, ddp_config: Optional[Dict[str, Any]] = None, + lora_config: Optional[Dict[str, Any]] = None, + lora_type: Optional[str] = "lora", bf16: bool = True, ) -> List[nn.Module]: """ @@ -243,6 +293,17 @@ def make_megatron_module( """ from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig + if lora_config is not None: + self.configure_lora(lora_config, lora_type) + + def lora_pre_wrap_hook(model): + lora_model = self.lora_cls(model, training=True) + self.lora_cls.set_params_to_save(lora_model) + + return lora_model + + self.provider.register_pre_wrap_hook(lora_pre_wrap_hook) + default_ddp_config = DistributedDataParallelConfig() if wrap_with_ddp: default_ddp_config.use_distributed_optimizer = True @@ -315,6 +376,7 @@ def __init__(self, **kwargs): self.scheduler: OptimizerParamScheduler = None self.optimizer: DistributedOptimizer = None self.profiler: Profiler = None + self._is_lora = self.cfg.trainer.policy.model.lora.rank > 0 def offload_to_cpu(self, pin_memory=True, non_blocking=True, offload_optimizer=True, offload_model=True): self._set_numa_affinity(torch.distributed.get_rank() % torch.cuda.device_count()) @@ -355,6 +417,7 @@ def _broadcast_no_grad(*args, **kwargs): megatron_config=self.cfg.trainer.policy.megatron_config, optimizer_config=self.cfg.trainer.policy.optimizer_config, seed=self.cfg.trainer.seed, + is_lora=self._is_lora, ) self.strategy.setup_distributed() @@ -386,6 +449,8 @@ def init_model(self, model_path, num_training_steps: int = 1e9): self.actor_module 
= self.make_megatron_module( wrap_with_ddp=True, ddp_config=self.cfg.trainer.policy.megatron_config.ddp_config, + lora_config=self.cfg.trainer.policy.model.lora if self._is_lora else None, + lora_type=self.cfg.trainer.policy.megatron_config.lora_config.lora_type, bf16=self.cfg.trainer.bf16, ) diff --git a/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py b/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py index 4c52b17aa4..9c8afbcac1 100644 --- a/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py +++ b/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py @@ -64,15 +64,16 @@ def get_test_actor_config(strategy: str) -> DictConfig: @pytest.mark.parametrize( - "strategy", + ("strategy", "lora"), [ - "deepspeed", - "fsdp", - "fsdp2", - pytest.param("megatron", marks=pytest.mark.megatron), + ("deepspeed", False), + ("fsdp", False), + ("fsdp2", False), + pytest.param("megatron", False, marks=pytest.mark.megatron), + pytest.param("megatron", True, marks=[pytest.mark.megatron, pytest.mark.lora]), ], ) -def test_save_load_checkpoint(ray_init_fixture, strategy): +def test_save_load_checkpoint(ray_init_fixture, strategy, lora): """ Test checkpointing logic by: 1. Creating model and doing one training step @@ -82,6 +83,9 @@ def test_save_load_checkpoint(ray_init_fixture, strategy): 5. 
Repeating second training step and comparing logits """ cfg = get_test_actor_config(strategy) + if lora: + cfg.trainer.policy.model.lora.rank = 32 + cfg.trainer.policy.model.lora.alpha = 32 try: actor_group = init_worker_with_type( diff --git a/skyrl-train/tests/gpu/test_megatron_worker.py b/skyrl-train/tests/gpu/test_megatron_worker.py index fe7907918b..a5434feaa7 100644 --- a/skyrl-train/tests/gpu/test_megatron_worker.py +++ b/skyrl-train/tests/gpu/test_megatron_worker.py @@ -9,7 +9,7 @@ from omegaconf import DictConfig import torch import asyncio -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from omegaconf import OmegaConf from tests.gpu.utils import ( init_worker_with_type, @@ -110,16 +110,21 @@ def get_test_training_batch(batch_size=4) -> TrainingInputBatch: @pytest.mark.parametrize( - ("colocate_all", "inference_tp", "megatron_tp", "megatron_pp", "megatron_ep", "megatron_etp"), - [(True, 4, 2, 2, 1, None), (False, 2, 2, 1, 1, None)], - ids=["colocate_all", "non_colocated"], + ("colocate_all", "inference_tp", "megatron_tp", "megatron_pp", "megatron_ep", "megatron_etp", "lora"), + [(True, 4, 2, 2, 1, None, False), (False, 2, 2, 1, 1, None, False), (True, 4, 2, 2, 1, None, True)], + ids=["colocate_all", "non_colocated", "colocate_all_lora"], ) -def test_megatron_policy_weight_sync(colocate_all, inference_tp, megatron_tp, megatron_pp, megatron_ep, megatron_etp): +def test_megatron_policy_weight_sync( + colocate_all, inference_tp, megatron_tp, megatron_pp, megatron_ep, megatron_etp, lora +): """ Test that we can sync weights between policy and inference for megatron then run inference """ try: cfg = get_test_actor_config(model_name=MODEL_NAME) + if lora: + cfg.trainer.policy.model.lora.rank = 16 + cfg.trainer.policy.model.lora.alpha = 16 cfg.trainer.placement.colocate_all = colocate_all cfg.generator.weight_sync_backend = "nccl" cfg.trainer.strategy = "megatron" @@ -174,17 
+179,18 @@ def test_megatron_policy_weight_sync(colocate_all, inference_tp, megatron_tp, me @pytest.mark.asyncio @pytest.mark.parametrize( - ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing"), + ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing", "lora"), [ - ("policy", 2, 1, 1, 1, None, 2, False), + ("policy", 2, 1, 1, 1, None, 2, False, False), # ref has same forward pass as policy - just duplicate one test to test setup - ("ref", 2, 1, 1, 1, None, 2, False), - ("policy", 1, 2, 1, 1, None, 2, False), - ("policy", 2, 2, 1, 1, None, 4, False), - ("policy", 2, 2, 1, 1, None, 4, True), - ("policy", 1, 1, 2, 1, None, 2, True), - ("policy", 2, 2, 2, 1, None, 8, True), - ("policy", 4, 2, 1, 4, 1, 8, True), + ("ref", 2, 1, 1, 1, None, 2, False, False), + ("policy", 1, 2, 1, 1, None, 2, False, False), + ("policy", 2, 2, 1, 1, None, 4, False, False), + ("policy", 2, 2, 1, 1, None, 4, True, False), + ("policy", 2, 2, 1, 1, None, 4, True, True), + ("policy", 1, 1, 2, 1, None, 2, True, False), + ("policy", 2, 2, 2, 1, None, 8, True, False), + ("policy", 4, 2, 1, 4, 1, 8, True, False), ], ids=[ "tp2_pp1_policy", @@ -192,12 +198,15 @@ def test_megatron_policy_weight_sync(colocate_all, inference_tp, megatron_tp, me "tp1_pp2_policy", "tp2_pp2_policy_unpacked", "tp2_pp2_policy_seq_packing", + "tp2_pp2_lora", "cp_2_policy_seq_packing", "tp_2_pp_2_cp_2_policy_seq_packing", "tp4_pp2_cp1_ep4_etp1_policy_seq_packing", ], ) -async def test_megatron_forward(ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing): +async def test_megatron_forward( + ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing, lora +): """ Test that the Megatron forward pass is numerically equivalent to just running a huggingface model forward. 
""" @@ -213,6 +222,17 @@ async def test_megatron_forward(ray_init_fixture, worker_type, tp, pp, cp, ep, e cfg.trainer.use_sample_packing = use_sample_packing batch = get_test_training_batch(max(4, gpus_per_node)) + if ep > 1: + transformer_config_kwargs = OmegaConf.to_container( + cfg.trainer.policy.megatron_config.transformer_config_kwargs, resolve=True + ) + transformer_config_kwargs["num_layers"] = 4 + cfg.trainer.policy.megatron_config.transformer_config_kwargs = transformer_config_kwargs + + if lora: + cfg.trainer.policy.model.lora.rank = 16 + cfg.trainer.policy.model.lora.alpha = 16 + actor_group = init_worker_with_type( worker_type, shared_pg=None, @@ -234,7 +254,10 @@ async def test_megatron_forward(ray_init_fixture, worker_type, tp, pp, cp, ep, e # now run the huggingface model forward @ray.remote(num_gpus=1) def run_hf_forward(batch, model_name): - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, dtype=torch.bfloat16) + if ep > 1: + config.num_hidden_layers = 4 + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, dtype=torch.bfloat16) model.eval() model.to("cuda") sequences_fwd = batch["sequences"] @@ -283,7 +306,8 @@ def run_hf_forward(batch, model_name): avg_diff = torch.mean(torch.abs(action_log_probs_masked - action_log_probs_megatron_masked)) print(f"Avg diff: {avg_diff}") - assert max_diff < 4.5e-1, f"Max diff {max_diff} is too large" + if ep == 1: + assert max_diff < 4e-1, f"Max diff {max_diff} is too large" if ep == 1: assert avg_diff < 7e-2, f"Avg diff {avg_diff} is too large" @@ -294,24 +318,127 @@ def run_hf_forward(batch, model_name): @pytest.mark.asyncio @pytest.mark.parametrize( - ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing", "use_entropy_loss"), + ("tp", "pp", "cp", "ep", "etp", "gpus_per_node"), + [ + (2, 2, 1, 1, None, 4), + (4, 1, 1, 4, 1, 4), + ], + ids=[ + 
"tp2_pp2_policy", + "tp4_pp1_cp1_ep4_etp1_policy", + ], +) +async def test_megatron_lora_forward(ray_init_fixture, tp, pp, cp, ep, etp, gpus_per_node): + """ + Test that the Megatron + lora forward pass is numerically equivalent to just running a megatron model forward. + """ + cfg = get_test_actor_config(model_name=MOE_MODEL_NAME if ep > 1 else MODEL_NAME) + #### Megatron forward pass #### + cfg.trainer.strategy = "megatron" + cfg.trainer.placement.policy_num_gpus_per_node = gpus_per_node + cfg.trainer.policy.megatron_config.tensor_model_parallel_size = tp + cfg.trainer.policy.megatron_config.pipeline_model_parallel_size = pp + cfg.trainer.policy.megatron_config.context_parallel_size = cp + cfg.trainer.policy.megatron_config.expert_model_parallel_size = ep + cfg.trainer.policy.megatron_config.expert_tensor_parallel_size = etp + cfg.trainer.use_sample_packing = True + batch = get_test_training_batch(max(4, gpus_per_node)) + + if ep > 1: + transformer_config_kwargs = OmegaConf.to_container( + cfg.trainer.policy.megatron_config.transformer_config_kwargs, resolve=True + ) + transformer_config_kwargs["num_layers"] = 4 + cfg.trainer.policy.megatron_config.transformer_config_kwargs = transformer_config_kwargs + + actor_group = init_worker_with_type( + "policy", + shared_pg=None, + colocate_all=False, + num_gpus_per_node=cfg.trainer.placement.policy_num_gpus_per_node, + cfg=cfg, + ) + + action_log_probs_refs = actor_group.async_run_ray_method("mesh", "forward", data=batch) + all_rank_action_log_probs = ray.get(action_log_probs_refs) + action_log_probs_full = concatenate_outputs_after_mesh_dispatch(actor_group.actor_infos, all_rank_action_log_probs)[ + "output" + ] + + ray.shutdown() + ray_init_for_tests() + + #### Megatron forward pass #### + cfg.trainer.strategy = "megatron" + cfg.trainer.placement.policy_num_gpus_per_node = gpus_per_node + cfg.trainer.policy.megatron_config.tensor_model_parallel_size = tp + cfg.trainer.policy.megatron_config.pipeline_model_parallel_size 
= pp + cfg.trainer.policy.megatron_config.context_parallel_size = cp + cfg.trainer.policy.megatron_config.expert_model_parallel_size = ep + cfg.trainer.policy.megatron_config.expert_tensor_parallel_size = etp + cfg.trainer.use_sample_packing = True + batch = get_test_training_batch(max(4, gpus_per_node)) + + # set lora this time + cfg.trainer.policy.model.lora.rank = 16 + cfg.trainer.policy.model.lora.alpha = 16 + + if ep > 1: + transformer_config_kwargs = OmegaConf.to_container( + cfg.trainer.policy.megatron_config.transformer_config_kwargs, resolve=True + ) + transformer_config_kwargs["num_layers"] = 4 + cfg.trainer.policy.megatron_config.transformer_config_kwargs = transformer_config_kwargs + + actor_group = init_worker_with_type( + "policy", + shared_pg=None, + colocate_all=False, + num_gpus_per_node=cfg.trainer.placement.policy_num_gpus_per_node, + cfg=cfg, + ) + + action_log_probs_refs = actor_group.async_run_ray_method("mesh", "forward", data=batch) + all_rank_action_log_probs = ray.get(action_log_probs_refs) + action_log_probs_lora = concatenate_outputs_after_mesh_dispatch(actor_group.actor_infos, all_rank_action_log_probs)[ + "output" + ] + + #### Compare results #### + # compare just non-padding tokens + print(f"Comparing {action_log_probs_full.numel()} valid response tokens") + print(f"Full sample: {action_log_probs_full[:5]}") + print(f"Lora sample: {action_log_probs_lora[:5]}") + + # max diff + max_diff = torch.max(torch.abs(action_log_probs_full - action_log_probs_lora)) + print(f"Max diff: {max_diff}") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing", "use_entropy_loss", "lora"), [ - ("policy", 2, 2, 1, 1, 1, 4, True, False), - ("policy", 2, 2, 1, 1, 1, 4, True, True), - ("policy", 2, 2, 1, 1, 1, 4, False, False), - ("policy", 2, 2, 2, 1, 1, 8, True, False), - ("policy", 2, 1, 1, 8, 1, 8, True, False), + ("policy", 2, 2, 1, 1, 1, 4, True, False, False), + 
("policy", 2, 2, 1, 1, 1, 4, True, True, False), + ("policy", 1, 1, 1, 1, 1, 1, True, False, True), + ("policy", 2, 2, 1, 1, 1, 4, False, False, False), + ("policy", 2, 2, 2, 1, 1, 8, True, False, False), + ("policy", 4, 1, 1, 8, 1, 8, True, False, False), + ("policy", 4, 1, 1, 8, 1, 8, True, False, True), ], ids=[ "tp2_pp2_policy_seq_packing", "tp2_pp2_policy_seq_packing_with_entropy_loss", + "tp1_pp1_policy_lora", "tp2_pp2_policy_unpacked", "tp2_pp2_cp2_policy_seq_packing", - "tp4_pp2_cp1_ep8_etp1_policy_seq_packing", + "tp4_pp1_cp1_ep8_etp1_policy_seq_packing", + "tp4_pp1_cp1_ep8_etp1_policy_seq_packing_lora", ], ) async def test_megatron_train( - ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing, use_entropy_loss + ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing, use_entropy_loss, lora ): """ Full test: initialize actor group, send dummy experience to training_step, validate output. @@ -330,6 +457,9 @@ async def test_megatron_train( if use_entropy_loss: cfg.trainer.algorithm.use_entropy_loss = True cfg.trainer.algorithm.entropy_loss_coef = 0.01 + if lora: + cfg.trainer.policy.model.lora.rank = 16 + cfg.trainer.policy.model.lora.alpha = 16 # set batch sizes correctly cfg.trainer.train_batch_size = gpus_per_node diff --git a/skyrl-train/uv.lock b/skyrl-train/uv.lock index 272900b343..062f8206aa 100644 --- a/skyrl-train/uv.lock +++ b/skyrl-train/uv.lock @@ -865,7 +865,7 @@ wheels = [ [[package]] name = "fastapi" -version = "0.125.0" +version = "0.126.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-doc" }, @@ -873,9 +873,9 @@ dependencies = [ { name = "starlette" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/17/71/2df15009fb4bdd522a069d2fbca6007c6c5487fce5cb965be00fc335f1d1/fastapi-0.125.0.tar.gz", hash = "sha256:16b532691a33e2c5dee1dac32feb31dc6eb41a3dd4ff29a95f9487cb21c054c0", size = 370550, 
upload-time = "2025-12-17T21:41:44.15Z" } +sdist = { url = "https://files.pythonhosted.org/packages/03/6c/28773e19bc203a2f3cf1d54a8e96ca7d05b58157a350aa4d8d37f2a5ba07/fastapi-0.126.0.tar.gz", hash = "sha256:f099fceb2a6d56dd21c59c4543d00be123dedacff869e76ae31ba3c0f963e2cd", size = 367455, upload-time = "2025-12-20T16:16:44.484Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/34/2f/ff2fcc98f500713368d8b650e1bbc4a0b3ebcdd3e050dcdaad5f5a13fd7e/fastapi-0.125.0-py3-none-any.whl", hash = "sha256:2570ec4f3aecf5cca8f0428aed2398b774fcdfee6c2116f86e80513f2f86a7a1", size = 112888, upload-time = "2025-12-17T21:41:41.286Z" }, + { url = "https://files.pythonhosted.org/packages/ca/0b/d5f999f27cb90152a6aadf094205b4d0eeab6a6b03e3e60346cde988c1bd/fastapi-0.126.0-py3-none-any.whl", hash = "sha256:c9330b9731e3bd2caae0a00e76353f86adbf592c5a25649a1682f3a92aeaff41", size = 111758, upload-time = "2025-12-20T16:16:42.349Z" }, ] [package.optional-dependencies] @@ -884,6 +884,8 @@ standard = [ { name = "fastapi-cli", extra = ["standard"], marker = "extra == 'extra-11-skyrl-train-mcore' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-miniswe') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-miniswe' and extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "httpx" }, { name = "jinja2" }, + { name = "pydantic-extra-types" }, + { name = "pydantic-settings" }, { name = "python-multipart" }, { name = "uvicorn", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, extra = ["standard"], marker = "extra == 'extra-11-skyrl-train-mcore' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-miniswe') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 
'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-miniswe' and extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, ] @@ -1792,7 +1794,7 @@ proxy = [ { name = "gunicorn", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "litellm-enterprise", version = "0.1.25", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "litellm-proxy-extras", version = "0.4.14", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, - { name = "mcp", version = "1.24.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, + { name = "mcp", version = "1.25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "orjson", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { 
name = "polars", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "pyjwt", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, @@ -2081,7 +2083,7 @@ wheels = [ [[package]] name = "mcp" -version = "1.24.0" +version = "1.25.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "sys_platform == 'linux' and extra != 'extra-11-skyrl-train-flashrl' and extra != 'extra-11-skyrl-train-mcore' and extra == 'extra-11-skyrl-train-miniswe' and extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm'", @@ -2120,9 +2122,9 @@ dependencies = [ { name = "typing-inspection", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "uvicorn", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform != 'emscripten' and extra == 'extra-11-skyrl-train-flashrl') or (sys_platform != 'emscripten' and extra == 'extra-11-skyrl-train-mcore') or (sys_platform != 'emscripten' and extra != 'extra-11-skyrl-train-sglang') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-mcore') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-miniswe') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm') or (extra == 'extra-11-skyrl-train-mcore' and extra == 'extra-11-skyrl-train-sglang') or (extra == 'extra-11-skyrl-train-mcore' and extra 
== 'extra-11-skyrl-train-vllm') or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d6/2c/db9ae5ab1fcdd9cd2bcc7ca3b7361b712e30590b64d5151a31563af8f82d/mcp-1.24.0.tar.gz", hash = "sha256:aeaad134664ce56f2721d1abf300666a1e8348563f4d3baff361c3b652448efc", size = 604375, upload-time = "2025-12-12T14:19:38.205Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d5/2d/649d80a0ecf6a1f82632ca44bec21c0461a9d9fc8934d38cb5b319f2db5e/mcp-1.25.0.tar.gz", hash = "sha256:56310361ebf0364e2d438e5b45f7668cbb124e158bb358333cd06e49e83a6802", size = 605387, upload-time = "2025-12-19T10:19:56.985Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/61/0d/5cf14e177c8ae655a2fd9324a6ef657ca4cafd3fc2201c87716055e29641/mcp-1.24.0-py3-none-any.whl", hash = "sha256:db130e103cc50ddc3dffc928382f33ba3eaef0b711f7a87c05e7ded65b1ca062", size = 232896, upload-time = "2025-12-12T14:19:36.14Z" }, + { url = "https://files.pythonhosted.org/packages/e2/fc/6dc7659c2ae5ddf280477011f4213a74f806862856b796ef08f028e664bf/mcp-1.25.0-py3-none-any.whl", hash = "sha256:b37c38144a666add0862614cc79ec276e97d72aa8ca26d622818d4e278b9721a", size = 233076, upload-time = "2025-12-19T10:19:55.416Z" }, ] [[package]] @@ -2149,7 +2151,7 @@ wheels = [ [[package]] name = "megatron-bridge" version = "0.3.0rc0" -source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=22ef9ff9f9684ba2f2dbea14db974f5c31bbd683#22ef9ff9f9684ba2f2dbea14db974f5c31bbd683" } +source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=953aabf75c0500180dc14a6a76cf9e7e7c4baec7#953aabf75c0500180dc14a6a76cf9e7e7c4baec7" } dependencies = [ { name = "accelerate" }, { name = "causal-conv1d", marker = "sys_platform == 'never'" }, @@ -2504,11 +2506,11 @@ wheels = [ [[package]] name = "nodeenv" -version = "1.9.1" +version = "1.10.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +sdist = { url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, + { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, ] [[package]] @@ -2781,12 +2783,12 @@ wheels = [ [[package]] name = "nvidia-cudnn-frontend" -version = "1.16.0" +version = "1.17.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" }, - { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", 
size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" }, - { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" }, + { url = "https://files.pythonhosted.org/packages/42/d9/f58ed6292c9396f7422812a0a2d9f80cc5a623ea6c758bcb3d34d4795bb8/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de0c473f32d705abcf14f351615f7ffbeed7320e3499cf2195ae5689652a2592", size = 1917620, upload-time = "2025-12-20T00:27:46.179Z" }, + { url = "https://files.pythonhosted.org/packages/db/eb/c641135632bd2afc21339aadee96af4c5db1460dfa07ca74836de75a590f/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c913c87fca691a91385287f2587575531933acfebc85c33dbcecb191886c7a53", size = 2038994, upload-time = "2025-12-20T00:25:18.9Z" }, + { url = "https://files.pythonhosted.org/packages/82/49/a92da03eb43bde90be770a43666c5ab26b4f8b15f6e46c4b0b0e84f37994/nvidia_cudnn_frontend-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0d4cfd03961592108abd1ba246e43c8bb7540aed984df860256d0bff181de98", size = 1441271, upload-time = "2025-12-20T00:29:52.056Z" }, ] [[package]] @@ -3247,7 +3249,7 @@ wheels = [ [[package]] name = "openai" -version = "2.13.0" +version = "2.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -3259,9 +3261,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0f/39/8e347e9fda125324d253084bb1b82407e5e3c7777a03dc398f79b2d95626/openai-2.13.0.tar.gz", hash = "sha256:9ff633b07a19469ec476b1e2b5b26c5ef700886524a7a72f65e6f0b5203142d5", size = 626583, upload-time = "2025-12-16T18:19:44.387Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d8/b1/12fe1c196bea326261718eb037307c1c1fe1dedc2d2d4de777df822e6238/openai-2.14.0.tar.gz", hash = "sha256:419357bedde9402d23bf8f2ee372fca1985a73348debba94bddff06f19459952", size = 626938, upload-time = "2025-12-19T03:28:45.742Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/d5/eb52edff49d3d5ea116e225538c118699ddeb7c29fa17ec28af14bc10033/openai-2.13.0-py3-none-any.whl", hash = "sha256:746521065fed68df2f9c2d85613bb50844343ea81f60009b60e6a600c9352c79", size = 1066837, upload-time = "2025-12-16T18:19:43.124Z" }, + { url = "https://files.pythonhosted.org/packages/27/4b/7c1a00c2c3fbd004253937f7520f692a9650767aa73894d7a34f0d65d3f4/openai-2.14.0-py3-none-any.whl", hash = "sha256:7ea40aca4ffc4c4a776e77679021b47eec1160e341f42ae086ba949c9dcc9183", size = 1067558, upload-time = "2025-12-19T03:28:43.727Z" }, ] [[package]] @@ -4714,7 +4716,7 @@ requires-dist = [ { name = "litellm", marker = "extra == 'miniswe'" }, { name = "litellm", extras = ["proxy"], marker = "extra == 'sandboxes'", specifier = ">=1.67.5" }, { name = "loguru" }, - { name = "megatron-bridge", marker = "extra == 'mcore'", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=22ef9ff9f9684ba2f2dbea14db974f5c31bbd683" }, + { name = "megatron-bridge", marker = "extra == 'mcore'", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=953aabf75c0500180dc14a6a76cf9e7e7c4baec7" }, { name = "megatron-core", marker = "extra == 'mcore'", specifier = "==0.15.0" }, { name = "mini-swe-agent", marker = "extra == 'miniswe'", specifier = ">=1.12.0" }, { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=2.0.0" }, @@ -5789,7 +5791,7 @@ wheels = [ [[package]] name = "typer" -version = "0.20.0" +version = "0.20.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click", marker = "extra == 'extra-11-skyrl-train-mcore' or extra == 'extra-11-skyrl-train-miniswe' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 
'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm')" }, @@ -5797,9 +5799,9 @@ dependencies = [ { name = "shellingham", marker = "extra == 'extra-11-skyrl-train-mcore' or extra == 'extra-11-skyrl-train-miniswe' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm')" }, { name = "typing-extensions", marker = "extra == 'extra-11-skyrl-train-mcore' or extra == 'extra-11-skyrl-train-miniswe' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8f/28/7c85c8032b91dbe79725b6f17d2fffc595dff06a35c7a30a37bef73a1ab4/typer-0.20.0.tar.gz", hash = "sha256:1aaf6494031793e4876fb0bacfa6a912b551cf43c1e63c800df8b1a866720c37", size = 106492, upload-time = "2025-10-20T17:03:49.445Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6d/c1/933d30fd7a123ed981e2a1eedafceab63cb379db0402e438a13bc51bbb15/typer-0.20.1.tar.gz", hash = "sha256:68585eb1b01203689c4199bc440d6be616f0851e9f0eb41e4a778845c5a0fd5b", size = 105968, upload-time = "2025-12-19T16:48:56.302Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/78/64/7713ffe4b5983314e9d436a90d5bd4f63b6054e2aca783a3cfc44cb95bbf/typer-0.20.0-py3-none-any.whl", hash = "sha256:5b463df6793ec1dca6213a3cf4c0f03bc6e322ac5e16e13ddd622a889489784a", size = 47028, upload-time = "2025-10-20T17:03:47.617Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/52/1f2df7e7d1be3d65ddc2936d820d4a3d9777a54f4204f5ca46b8513eff77/typer-0.20.1-py3-none-any.whl", hash = "sha256:4b3bde918a67c8e03d861aa02deca90a95bbac572e71b1b9be56ff49affdb5a8", size = 47381, upload-time = "2025-12-19T16:48:53.679Z" }, ] [[package]]