diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 1dd9639472..fafeab31d1 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -216,6 +216,15 @@ policy: top_k: null stop_token_ids: null stop_strings: null + mcore_generation_config: + buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers + buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests + num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes + block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity) + use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing + enable_chunked_prefill: true # Split long prefills into chunks for better memory management + unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging) + max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step vllm_cfg: async_engine: false precision: ${policy.precision} diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 95d85f74c7..b7bf992f67 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -150,7 +150,7 @@ policy: use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing enable_chunked_prefill: true # Split long prefills into chunks for better memory management unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging) - max_tokens: 16384 # Maximum number of tokens to use in a single step + max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step vllm_cfg: tensor_parallel_size: 1 diff --git a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml 
b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml index 29ee217517..1fce7d82d4 100644 --- a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml +++ b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml @@ -29,7 +29,7 @@ checkpointing: checkpoint_dir: results/dapo-qwen2.5-7b keep_top_k: 5 save_period: 5 - model_save_format: "dcp" + model_save_format: null policy: model_name: Qwen/Qwen2.5-Math-7B hf_config_overrides: diff --git a/nemo_rl/models/generation/vllm/vllm_worker.py b/nemo_rl/models/generation/vllm/vllm_worker.py index 75e3334d4a..6a9a159cad 100644 --- a/nemo_rl/models/generation/vllm/vllm_worker.py +++ b/nemo_rl/models/generation/vllm/vllm_worker.py @@ -388,6 +388,10 @@ def _patch_vllm_vit_flash_attn_backend(): ) # disable quantization vllm_kwargs["hf_overrides"]["quantization_config"] = {} + elif "Gemma3ForConditionalGeneration" in getattr(hf_config, "architectures", []): + if self.cfg["vllm_cfg"]["skip_tokenizer_init"]: + print("Gemma3ForConditionalGeneration models may crash when skip_tokenizer_init is True. NeMo-RL is forcing it to False for this architecture. 
See https://github.com/NVIDIA-NeMo/RL/issues/1681 for more details.") + self.cfg["vllm_cfg"]["skip_tokenizer_init"] = False llm_kwargs = dict( model=self.model_name, diff --git a/nemo_rl/models/policy/workers/dtensor_policy_worker.py b/nemo_rl/models/policy/workers/dtensor_policy_worker.py index 2903307c8b..581b4d46fd 100644 --- a/nemo_rl/models/policy/workers/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/workers/dtensor_policy_worker.py @@ -1839,7 +1839,7 @@ def move_buffer_to_device( ) -> nn.Module: # FSDP modules do not move buffers to the device automatically for v in model.buffers(): - v.data = v.data.to(device) + torch.utils.swap_tensors(v, v.to(device)) return model diff --git a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py index 738146a7e2..b2e69bd608 100644 --- a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py @@ -324,10 +324,10 @@ def __init__( print( "[WARNING]: sequence_parallel=True, but tp_size=1 which has no effect. Enable tp_size > 1 to use sequence parallelism." ) - elif sequence_parallel_enabled and tp_size > 1: - raise RuntimeError( - "Sequence parallel + tp_size >1 is currently broken in torch==2.8.0. See https://github.com/NVIDIA-NeMo/Automodel/issues/652 for more details." 
+ # ) if cp_size > 1: assert not isinstance(self.model, Gemma3ForCausalLM), ( diff --git a/nemo_rl/utils/logger.py b/nemo_rl/utils/logger.py index ed431986dc..f329dd70c7 100644 --- a/nemo_rl/utils/logger.py +++ b/nemo_rl/utils/logger.py @@ -121,6 +121,23 @@ def __init__(self, cfg: TensorboardConfig, log_dir: Optional[str] = None): self.writer = SummaryWriter(log_dir=log_dir) print(f"Initialized TensorboardLogger at {log_dir}") + @staticmethod + def _coerce_to_scalar(value: Any) -> int | float | bool | str | None: + """Coerce a value to a Python scalar for TensorBoard logging. + + Returns the coerced value, or None if it can't be converted to a scalar. + """ + if isinstance(value, (int, float, bool, str)): + return value + if isinstance(value, (np.floating, np.integer, np.bool_)): + return value.item() + if isinstance(value, np.ndarray) and (value.ndim == 0 or value.size == 1): + return value.item() + if isinstance(value, torch.Tensor) and (value.ndim == 0 or value.numel() == 1): + return value.item() + # dict, list, multi-element arrays/tensors, or incompatible types + return None + def log_metrics( self, metrics: dict[str, Any], @@ -137,23 +154,19 @@ def log_metrics( step_metric: Optional step metric name (ignored in TensorBoard) """ for name, value in metrics.items(): - # NeMo-Gym will add additional metrics like wandb histograms. However, some people will log to Tensorboard instead which may not be compatible - # This logic catches non-compatible objects being logged. 
- if not isinstance(value, (int, float, bool, str)): - continue - if prefix: name = f"{prefix}/{name}" - # Skip non-scalar values that TensorBoard can't handle - if isinstance(value, (dict, list)): + scalar = self._coerce_to_scalar(value) + if scalar is None: print( - f"Warning: Skipping non-scalar metric '{name}' for TensorBoard logging (type: {type(value).__name__})" + f"Warning: Skipping metric '{name}' for TensorBoard logging " + f"(unsupported type: {type(value).__name__})" ) continue try: - self.writer.add_scalar(name, value, step) + self.writer.add_scalar(name, scalar, step) except Exception as e: print(f"Warning: Failed to log metric '{name}' to TensorBoard: {e}") continue diff --git a/tests/check_metrics.py b/tests/check_metrics.py index f0b3a9025b..2f12be1645 100644 --- a/tests/check_metrics.py +++ b/tests/check_metrics.py @@ -97,6 +97,38 @@ def mean(value, range_start=1, range_end=0, ignore_top_p=0.0): return statistics.mean(vals) +def median(value, range_start=1, range_end=0): + """Return the median of values (or a range of values) in a dictionary. + + Note: + step, and ranges, are 1 indexed. Range_end is exclusive. 
+ range_end=0 means to include until the last step in the run + + Args: + value: Dictionary of step -> value + range_start: Starting step (1-indexed, default=1) + range_end: Ending step (1-indexed, exclusive, 0 means last step) + """ + + ## find potential offset that might arise from resuming from a checkpoint + max_step_reached = builtins.max([int(s) for s in value.keys()]) + ## this is the number of steps that occurred prior to resuming + offset = max_step_reached - len(value) + + num_elem = len(value) + if range_start < 0: + range_start += num_elem + 1 + offset + if range_end <= 0: + range_end += num_elem + 1 + offset + + vals = [] + for step, v in value.items(): + if range_start <= int(step) and int(step) < range_end: + vals.append(float(v)) + + return statistics.median(vals) + + def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]: """Evaluate a check against the data. @@ -109,6 +141,7 @@ def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]: "min": min, "max": max, "mean": mean, + "median": median, "ratio_above": ratio_above, } @@ -152,6 +185,7 @@ def main(): # Use helper functions python check_metrics.py results.json "min(data['class_f1']) > 0.6" python check_metrics.py results.json "mean(data['accuracies']) > 0.85" + python check_metrics.py results.json "median(data['accuracies']) > 0.85" python check_metrics.py results.json "mean(data['loss'], ignore_top_p=0.05) < 1.5" python check_metrics.py results.json "ratio_above(data['error'], 1.05) < 0.02" """ diff --git a/tests/test_suites/llm/dapo-qwen2.5-7b.sh b/tests/test_suites/llm/dapo-qwen2.5-7b.sh index c68b52d4b9..d150ec019f 100755 --- a/tests/test_suites/llm/dapo-qwen2.5-7b.sh +++ b/tests/test_suites/llm/dapo-qwen2.5-7b.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) 
-ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["20"] < 1.05' \ 'data["train/reward"]["20"] > -0.45' \ 'data["train/filtered_reward"]["20"] > -0.2' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh index 3ef39d91a3..06e628484d 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh @@ -20,7 +20,7 @@ uv run examples/run_distillation_math.py \ distillation.val_period=20 \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ + logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ @@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["10"] < 0.5' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 500' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh index 6710ac87ce..73d2c3c2d6 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh @@ -20,7 +20,7 @@ uv run examples/run_distillation_math.py \ distillation.val_period=20 \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ + logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ @@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["10"] < 0.5' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 500' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh index 52f17c2c28..a559f36b2b 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh @@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \ distillation.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ + logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ @@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 0.3' \ 'data["validation/accuracy"]["20"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1000' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh index cd4b635e72..a7db7a7787 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh @@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \ distillation.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ + logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ @@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["100"] < 0.25' \ 'data["validation/accuracy"]["100"] > 0.2' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1600' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh index df8d6daed7..a5ce82c306 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh @@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \ distillation.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ + logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ @@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 0.3' \ 'data["validation/accuracy"]["20"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1000' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh index df8d6daed7..a5ce82c306 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh @@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \ distillation.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ + logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ @@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["20"] < 0.3' \ 'data["validation/accuracy"]["20"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1000' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh index a8d2d04adc..1f64224461 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/preference_loss"]["1"] < 0.69316' \ 'data["train/preference_loss"]["20"] < 0.6' \ 'mean(data["timing/train/total_step_time"], -10, -1) < 7.8' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh index fbda6865f5..5e5fe4d0e1 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh @@ -34,10 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 3.6' \ + 'data["train/loss"]["1"] < 3.65' \ 'data["train/loss"]["150"] < 3.0' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ 'data["train/preference_loss"]["150"] < 0.4' \ 'mean(data["timing/train/total_step_time"], -11, -1) < 24' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh index 7cc74e26df..7a98815ec7 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh @@ -34,10 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 3.6' \ + 'data["train/loss"]["1"] < 3.65' \ 'data["train/loss"]["150"] < 3.0' \ 'data["train/preference_loss"]["1"] > 0.69314' \ 'data["train/preference_loss"]["1"] < 0.69316' \ 'data["train/preference_loss"]["150"] < 0.4' \ 'mean(data["timing/train/total_step_time"], -11, -1) < 11.5' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh index 497e0b8f68..f57f755051 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/preference_loss"]["1"] < 0.69316' \ 'data["train/preference_loss"]["20"] < 0.6' \ 'mean(data["timing/train/total_step_time"], -10) < 6.7' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh index a6beabb886..79a4ea0781 100755 --- a/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh @@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/preference_loss"]["1"] > 0.6930' \ 'data["train/preference_loss"]["1"] < 0.6932' \ 'data["train/preference_loss"]["150"] < 0.68' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh index 0b0c67b312..dcb3672c21 100755 --- a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh +++ b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh @@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["1"] < 0.69316' \ 'data["train/loss"]["150"] < 0.55' \ 'mean(data["timing/train/total_step_time"], -11, -1) < 1.3' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh b/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh index 3466de2fce..46beb3f924 100755 --- a/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh +++ b/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] > 0.6990' \ - 'data["train/loss"]["1"] < 0.6992' \ + 'data["train/loss"]["1"] > 0.680' \ + 'data["train/loss"]["1"] < 0.70' \ 'data["train/loss"]["100"] < 0.60' -fi + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" +fi diff --git a/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh b/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh index 3522261d9c..a760b42eba 100755 --- a/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh +++ b/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh @@ -43,4 +43,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'min(data["train/token_mult_prob_error"]) < 1.05' \ 'data["train/reward"]["10"] > 0.4' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh b/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh index 633b0d8297..6198ae049f 100755 --- a/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh +++ b/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh @@ -41,7 +41,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'median(data["train/token_mult_prob_error"]) < 1.05' \ "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05" fi @@ -66,3 +66,6 @@ cat ${RUN_LOG}.aime-16k | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"sco # 240 step checkpoint 0.3 uv run tests/check_metrics.py ${RUN_LOG}-16k-metric.json \ 'data["score"] >= 0.2396' + +# Clean up checkpoint directory after successful run to save space. 
+rm -rf "$CKPT_DIR" diff --git a/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh b/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh index 87b6e9065c..151076c471 100755 --- a/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh +++ b/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh @@ -41,7 +41,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'median(data["train/token_mult_prob_error"]) < 1.05' \ "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05" fi @@ -65,3 +65,6 @@ cat ${RUN_LOG}.aime-24k | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"sco uv run tests/check_metrics.py ${RUN_LOG}-24k-metric.json \ 'data["score"] >= 0.2396' + +# Clean up checkpoint directory after successful run to save space. 
+rm -rf "$CKPT_DIR" diff --git a/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh b/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh index ba2f5993d4..d584327cb1 100755 --- a/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh +++ b/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh @@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'median(data["train/token_mult_prob_error"]) < 1.05' \ "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05" fi @@ -60,6 +60,9 @@ cat ${RUN_LOG}.aime-8k | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"scor uv run tests/check_metrics.py ${RUN_LOG}-8k-metric.json \ 'data["score"] >= 0.2396' +# Clean up checkpoint directory after successful run to save space. +rm -rf "$CKPT_DIR" + # This comment is for reference on how the aime24 eval baseline was chosen: # The variance in aime24 is pretty high when only taking one sample per prompt. 
# I have observed huge variance even between A100 and H100 with one sample per prompt, diff --git a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh index 4624b7282d..50fa72125c 100755 --- a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1" \ 'mean(data["timing/train/total_step_time"], -6, -1) < 14' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh b/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh index a6ce1800d9..a1ace88f6f 100755 --- a/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh +++ b/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["20"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh index 77f858b429..9a242dd52a 100755 --- a/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh @@ -37,4 +37,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'mean(data["train/gen_kl_error"]) < 0.002' \ 'data["train/reward"]["60"] > 0.60' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 210' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh b/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh index ce2adb1c51..d253911819 100755 --- a/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh +++ b/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.1" + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi # TODO: enable in subsequent PR to do a quick accuracy check diff --git a/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled b/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled index 2194bad7ab..bd552ab397 100755 --- a/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled +++ b/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled @@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.1" fi diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh index 956c94bb5c..3a7de5a63c 100755 --- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh @@ -35,7 +35,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # With a few number of steps the logprob can have spikes that can move the average up. 
uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"], ignore_top_p=0.05) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'ratio_above(data["train/token_mult_prob_error"], 1.1) < 0.1' # ratio_above @ 1.1 was 0.03,0.06,0.05: 3sigma ~=0.1 + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh index d018032576..83d7e51773 100755 --- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["30"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh index 4a6d63473c..171194957c 100755 --- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["100"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh index af44d060cb..7952b19491 100755 --- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["100"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh index 562ff730e7..7f5036e8db 100755 --- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["500"] < 1.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 10' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh index 90e309e128..3ee982ecc0 100755 --- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["500"] < 1.1' \ 'data["train/reward"]["500"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 10.5' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh index 08f57cb5a8..4fa8068017 100755 --- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh @@ -35,8 +35,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["500"] < 1.1' \ 'data["train/reward"]["500"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 10.5' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled b/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled index 9420b53c9d..d01c1aec0e 100755 --- a/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled +++ b/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled @@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["2"] < 1.1' \ 'mean(data["timing/train/policy_training"]) < 280' \ 'mean(data["ray/node.0.gpu.0.mem_gb"]) < 75' diff --git a/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh b/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh index 4a310b673b..1f00f7bad4 100755 --- a/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh +++ b/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["3"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh index 24e49d1a8d..c637acd050 100755 --- a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh +++ b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["30"] < 1.1' \ 'mean(data["train/reward"]) > 0.45' \ 'mean(data["timing/train/total_step_time"], -11, -1) < 70' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh index 24e49d1a8d..c637acd050 100755 --- a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["30"] < 1.1' \ 'mean(data["train/reward"]) > 0.45' \ 'mean(data["timing/train/total_step_time"], -11, -1) < 70' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh b/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh index 68a694098c..786a070335 100755 --- a/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'median(data["train/token_mult_prob_error"]) < 1.05' \ 'data["train/token_mult_prob_error"]["30"] < 1.05' \ 'data["train/reward"]["30"] > 0.4' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 80' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh index d1ad766b5b..3f657f90fd 100755 --- a/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'median(data["train/token_mult_prob_error"]) < 1.05' \ 'data["train/token_mult_prob_error"]["30"] < 1.05' \ 'data["train/reward"]["30"] > 0.4' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 60' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh index fa7fbd5bd6..1ddde6af8c 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["20"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh index 98591ba9b3..bf865f8b6f 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["2"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh index 5fcfbfd76a..b4fbf87ebf 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["30"] < 1.1' \ 'data["train/grad_norm"]["30"] < 0.5' \ 'data["train/grad_norm"]["30"] > 0.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh index 45f354043a..30c205b0ee 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh @@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["30"] < 1.1' \ 'mean(data["train/reward"]) > 0.56' \ 'mean(data["timing/train/total_step_time"], 2) < 50' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh index 35810c4eec..1c69a0cb9a 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["450"] < 1.1' \ 'mean(data["timing/train/total_step_time"], 2) < 25' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh index f89041cd40..6830171346 100755 --- a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/token_mult_prob_error"]["30"] < 1.1' \ - 'data["train/reward"]["30"] > 0.43' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/reward"]) > 0.43' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 220' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh b/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh index d1068a7ffa..8289dc3c7d 100755 --- a/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh +++ b/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh @@ -36,5 +36,8 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma # With a few number of steps the logprob can have spikes that can move the average up. # Enabling fp8 kvcache can cause the logprob to be slightly higher than fp8 linear only path, so we allow a larger tolerance. uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"], ignore_top_p=0.15) < 2.0' + 'median(data["train/token_mult_prob_error"]) < 2.0' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh b/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh index 855d009566..21415712c5 100755 --- a/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh +++ b/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh @@ -42,6 +42,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh index 738b38dd5b..cf1ae5047b 100755 --- a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh @@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh index 738b38dd5b..cf1ae5047b 100755 --- a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh @@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh index 14138486e1..298ac5b476 100755 --- a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh @@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh index 14138486e1..298ac5b476 100755 --- a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh @@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh index 14138486e1..298ac5b476 100755 --- a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh @@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh index e7636f3e93..2683cb8641 100755 --- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh index e7636f3e93..2683cb8641 100755 --- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh index e7636f3e93..2683cb8641 100755 --- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh index e7636f3e93..2683cb8641 100755 --- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh index e7636f3e93..2683cb8641 100755 --- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh index 0f9bf9289f..c4d4a8148c 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh @@ -35,6 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh index 0f9bf9289f..6275de3070 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh @@ -9,7 +9,7 @@ NUM_NODES=16 STEPS_PER_RUN=10 MAX_STEPS=10 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=100 +NUM_MINUTES=115 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -24,6 +24,7 @@ uv run examples/run_grpo_math.py \ logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ checkpointing.enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ $@ \ @@ -35,6 +36,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh index f7dac553af..f05dd15028 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh @@ -35,6 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh index f7dac553af..e9b20f4816 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh @@ -24,6 +24,7 @@ uv run examples/run_grpo_math.py \ logger.wandb.project=nemo-rl \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ checkpointing.enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ $@ \ @@ -35,6 +36,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 
'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh index 63a099e2e2..2f59ef7478 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh index 2a56609ffd..3fe6be1f96 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh index 0de5a124ed..4559fce26e 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh index 0de5a124ed..4559fce26e 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh index 8350d128e8..9cfa306af8 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh index 2a56609ffd..3fe6be1f96 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh @@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh index 0de5a124ed..4559fce26e 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh index 35d58c98f7..6bd6c237cb 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh index 35d58c98f7..6bd6c237cb 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh @@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'median(data["train/token_mult_prob_error"]) < 1.1' \ 'data["train/token_mult_prob_error"]["10"] < 1.1' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh b/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh index 9f48544b42..dbfff34a7e 100755 --- a/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh +++ b/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh @@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["50"] < 0.4' \ 'data["train/grad_norm"]["50"] < 17.5' \ 'data["train/grad_norm"]["50"] > 10.0' + + # Clean up checkpoint directory after successful run to save space.
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh index 718322e33a..99f264e910 100755 --- a/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh +++ b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh @@ -37,6 +37,8 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 0.55' \ 'data["train/loss"]["300"] < 0.285' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ 'mean(data["timing/train/total_step_time"], 2) < 20' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh index d5dfde39b9..65362c2eb1 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh @@ -34,9 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: the memory check is known to OOM. see https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # Last observed memory around 72.6 (But can be noisy) uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 0.6' \ 'data["train/loss"]["250"] < 0.36' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 10' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh index 4b243e8fe9..077102dc98 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh @@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["250"] < 0.36' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 80' \ 'mean(data["timing/train/total_step_time"], 2) < 22' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh index b1a21e8f06..b2ff16b155 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["50"] < 0.8' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 50' \ 'mean(data["timing/train/total_step_time"], 2) < 10' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi \ No newline at end of file diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh index 87ca1e9dad..18f2dd8f59 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["50"] < 0.38' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ 'mean(data["timing/train/total_step_time"], 2) < 32' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh index e063b39861..89aa5b184a 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh @@ -36,4 +36,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["1"] < 0.6' \ 'data["train/loss"]["250"] < 0.36' \ 'mean(data["timing/train/total_step_time"], 2) < 6' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh index 8ef0dfafe6..81ea9f2f6b 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh @@ -36,4 +36,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["1"] < 0.6' \ 'data["train/loss"]["250"] < 0.36' \ 'mean(data["timing/train/total_step_time"], 2) < 20' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh index b5edc8043e..106a6aee3f 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh @@ -7,7 +7,7 @@ NUM_NODES=1 STEPS_PER_RUN=250 MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 +NUM_MINUTES=30 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'mean(data["timing/train/total_step_time"], -6, -1) < 0.7' # mean(data["train/loss"],-10,-1) observed to be 0.5557474825117323 # timing/train/total_step_time observed 0.6-0.64 + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi \ No newline at end of file diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index 445dc48b5a..71056bb4e1 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -7,7 +7,7 @@ NUM_NODES=2 STEPS_PER_RUN=20 # step_time ~ 10sec MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check +NUM_MINUTES=30 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check (30min to buffer for initial ckpt download) # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,4 +35,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["20"] < 2.05' \ 'mean(data["timing/train/total_step_time"], 2) < 18' + + # Clean up checkpoint directory after successful run to 
save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh index ec0e22bf6b..da8826e1f3 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -7,7 +7,7 @@ NUM_NODES=2 STEPS_PER_RUN=20 # step_time ~ 15sec MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 +NUM_MINUTES=30 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,4 +35,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["20"] < 2.05' \ 'mean(data["timing/train/total_step_time"], 2) < 15' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh index 3b987df72b..799a04c300 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["1"] < 0.37' \ 'mean(data["train/loss"], 16) < 0.31' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 35' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh index 897f4fbb60..1ce7203203 100755 --- a/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh +++ b/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh @@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["80"] < 0.301' \ 'data["validation/val_loss"]["80"] < 0.304' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh index fb53624b94..386263ba3d 100755 --- a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh +++ b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh @@ -35,5 +35,8 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/reward"]["200"] > 0.9' + + # Clean up checkpoint directory after successful run to save space. 
+ rm -rf "$CKPT_DIR" fi diff --git a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh index 814c822f1b..bbd490e4c0 100755 --- a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh +++ b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh @@ -36,5 +36,8 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["200"] < 0.1' \ 'data["train/reward"]["200"] > 0.9' + + # Clean up checkpoint directory after successful run to save space. + rm -rf "$CKPT_DIR" fi diff --git a/tests/unit/test_check_metrics.py b/tests/unit/test_check_metrics.py index 313801531e..73cb08f469 100644 --- a/tests/unit/test_check_metrics.py +++ b/tests/unit/test_check_metrics.py @@ -21,7 +21,7 @@ tests_dir = Path(__file__).parent.parent sys.path.insert(0, str(tests_dir)) -from check_metrics import evaluate_check, max, mean, min, ratio_above +from check_metrics import evaluate_check, max, mean, median, min, ratio_above class TestMeanFunction: @@ -405,3 +405,121 @@ def test_ratio_above_combined_with_mean_ignore_top_p(self): # Check that exactly 5% are above threshold ratio = ratio_above(data["metric"], 5.0) assert ratio == 0.05 + + +class TestMedianFunction: + """Test the median function with various scenarios.""" + + def test_basic_median_odd_count(self): + """Test basic median calculation with odd number of values.""" + data = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0} + result = median(data) + assert result == 3.0 + + def test_basic_median_even_count(self): + """Test basic median calculation with even number of values.""" + data = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0} + result = median(data) + assert result == 2.5 # (2+3)/2 + + def test_median_with_outliers(self): + """Test that median is robust to outliers (unlike 
mean).""" + # Data with one severe outlier + data = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 100.0} + + # Median should be unaffected by the outlier + result = median(data) + assert result == 3.0 # Middle value, ignores the outlier + + # Compare with mean which would be 22.0 + result_mean = mean(data) + assert result_mean == 22.0 + + def test_median_with_range(self): + """Test median with range_start and range_end.""" + data = {str(i): float(i) for i in range(1, 11)} # 1-10 + + # Get median of steps 3-7 (values 3, 4, 5, 6) + result = median(data, range_start=3, range_end=7) + assert result == 4.5 # (4+5)/2 + + def test_median_with_offset(self): + """Test median calculation with step offset (from checkpoint resume).""" + # Simulate a checkpoint resume scenario + # Steps 101-105 (resumed from step 100) + data = {"101": 1.0, "102": 2.0, "103": 3.0, "104": 4.0, "105": 5.0} + result = median(data) + assert result == 3.0 + + def test_median_with_negative_range(self): + """Test median with negative range indices.""" + data = {str(i): float(i) for i in range(1, 11)} # 1-10 + + # Last 3 values (8, 9, 10) + result = median(data, range_start=-3, range_end=0) + assert result == 9.0 + + def test_median_with_floats_and_strings(self): + """Test that string values are properly converted to floats.""" + data = {"1": "1.5", "2": "2.5", "3": "3.5"} + result = median(data) + assert result == 2.5 + + def test_median_single_value(self): + """Test median with single value.""" + data = {"1": 42.0} + result = median(data) + assert result == 42.0 + + def test_median_two_values(self): + """Test median with two values.""" + data = {"1": 1.0, "2": 10.0} + result = median(data) + assert result == 5.5 # (1+10)/2 + + def test_median_all_same_values(self): + """Test median with all same values.""" + data = {str(i): 5.0 for i in range(1, 11)} + result = median(data) + assert result == 5.0 + + def test_median_vs_mean_outlier_robustness(self): + """Demonstrate why median is preferred for 
outlier-prone data.""" + # Simulate token_mult_prob_error with outliers + data = {str(i): 1.0 for i in range(1, 20)} + data["20"] = 100.0 # Large outlier + + # Median is robust + median_result = median(data) + assert median_result == 1.0 + + # Mean is affected + mean_result = mean(data) + assert mean_result == 5.95 # (19 * 1.0 + 100) / 20 + + def test_evaluate_check_with_median(self): + """Test evaluate_check with median function.""" + data = {"accuracy": {"1": 0.8, "2": 0.9, "3": 0.95, "4": 0.85, "5": 100.0}} + + # Median should be robust to the outlier + passed, _, value = evaluate_check(data, "median(data['accuracy']) < 1.0") + assert passed is True + assert value == 0.9 # Middle value + + def test_median_token_mult_prob_error_scenario(self): + """Test the exact scenario for which median is being used.""" + # Simulate token_mult_prob_error with some outliers + data = { + "train/token_mult_prob_error": { + str(i): 1.0 + (i % 3) * 0.01 for i in range(1, 20) + } + } + # Add outlier + data["train/token_mult_prob_error"]["20"] = 5.0 + + # Median should pass the check even with the outlier + passed, _, value = evaluate_check( + data, 'median(data["train/token_mult_prob_error"]) < 1.05' + ) + assert passed is True + assert value < 1.05 diff --git a/tests/unit/utils/test_logger.py b/tests/unit/utils/test_logger.py index 679ccf9a8e..60efcff764 100644 --- a/tests/unit/utils/test_logger.py +++ b/tests/unit/utils/test_logger.py @@ -129,6 +129,103 @@ def test_log_hyperparams(self, mock_summary_writer, temp_dir): } + @patch("nemo_rl.utils.logger.SummaryWriter") + def test_coerce_to_scalar_python_primitives(self, mock_summary_writer, temp_dir): + """Test that Python primitives pass through unchanged.""" + cfg = {"log_dir": temp_dir} + logger = TensorboardLogger(cfg, log_dir=temp_dir) + + assert logger._coerce_to_scalar(42) == 42 + assert logger._coerce_to_scalar(3.14) == 3.14 + assert logger._coerce_to_scalar(True) is True + assert logger._coerce_to_scalar("hello") == "hello" 
+ + @patch("nemo_rl.utils.logger.SummaryWriter") + def test_coerce_to_scalar_numpy_types(self, mock_summary_writer, temp_dir): + """Test that numpy scalar types are coerced to Python primitives.""" + import numpy as np + + cfg = {"log_dir": temp_dir} + logger = TensorboardLogger(cfg, log_dir=temp_dir) + + # numpy scalar types + assert logger._coerce_to_scalar(np.float32(1.5)) == 1.5 + assert logger._coerce_to_scalar(np.float64(2.5)) == 2.5 + assert logger._coerce_to_scalar(np.int32(10)) == 10 + assert logger._coerce_to_scalar(np.int64(20)) == 20 + assert logger._coerce_to_scalar(np.bool_(True)) is True + + # 0-d numpy arrays + assert logger._coerce_to_scalar(np.array(3.14)) == 3.14 + # 1-element numpy arrays + assert logger._coerce_to_scalar(np.array([42])) == 42 + + # Multi-element arrays should return None + assert logger._coerce_to_scalar(np.array([1, 2, 3])) is None + + @patch("nemo_rl.utils.logger.SummaryWriter") + def test_coerce_to_scalar_torch_tensors(self, mock_summary_writer, temp_dir): + """Test that torch scalar tensors are coerced to Python primitives.""" + cfg = {"log_dir": temp_dir} + logger = TensorboardLogger(cfg, log_dir=temp_dir) + + # 0-d tensors + assert logger._coerce_to_scalar(torch.tensor(3.14)) == pytest.approx(3.14) + assert logger._coerce_to_scalar(torch.tensor(42)) == 42 + + # 1-element tensors + assert logger._coerce_to_scalar(torch.tensor([99])) == 99 + + # Multi-element tensors should return None + assert logger._coerce_to_scalar(torch.tensor([1, 2, 3])) is None + + @patch("nemo_rl.utils.logger.SummaryWriter") + def test_coerce_to_scalar_incompatible_types(self, mock_summary_writer, temp_dir): + """Test that incompatible types return None.""" + cfg = {"log_dir": temp_dir} + logger = TensorboardLogger(cfg, log_dir=temp_dir) + + assert logger._coerce_to_scalar({"key": "value"}) is None + assert logger._coerce_to_scalar([1, 2, 3]) is None + assert logger._coerce_to_scalar(None) is None + assert logger._coerce_to_scalar(object()) is None 
+ + @patch("nemo_rl.utils.logger.SummaryWriter") + def test_log_metrics_coerces_numpy_and_torch(self, mock_summary_writer, temp_dir): + """Test that log_metrics correctly logs numpy/torch scalars.""" + import numpy as np + + cfg = {"log_dir": temp_dir} + logger = TensorboardLogger(cfg, log_dir=temp_dir) + + metrics = { + "python_float": 1.0, + "numpy_float32": np.float32(2.0), + "numpy_float64": np.float64(3.0), + "torch_scalar": torch.tensor(4.0), + "numpy_0d": np.array(5.0), + "torch_1elem": torch.tensor([6.0]), + "skip_list": [1, 2, 3], + "skip_dict": {"a": 1}, + "skip_multi_tensor": torch.tensor([1.0, 2.0]), + } + logger.log_metrics(metrics, step=1) + + mock_writer = mock_summary_writer.return_value + # Should log 6 scalars, skip 3 incompatible + assert mock_writer.add_scalar.call_count == 6 + + # Verify each scalar was logged with correct value + calls = {c[0][0]: c[0][1] for c in mock_writer.add_scalar.call_args_list} + assert calls["python_float"] == 1.0 + assert calls["numpy_float32"] == pytest.approx(2.0) + assert calls["numpy_float64"] == pytest.approx(3.0) + assert calls["torch_scalar"] == pytest.approx(4.0) + assert calls["numpy_0d"] == pytest.approx(5.0) + assert calls["torch_1elem"] == pytest.approx(6.0) + + + class TestWandbLogger: """Test the WandbLogger class."""