diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 1dd9639472..fafeab31d1 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -216,6 +216,15 @@ policy:
     top_k: null
     stop_token_ids: null
     stop_strings: null
+    mcore_generation_config:
+      buffer_size_gb: 20  # Total GPU memory (in GB) allocated for KV cache buffers
+      buffer_guaranteed_fraction: 0.1  # Fraction of buffer reserved for guaranteed active requests
+      num_cuda_graphs: 16  # Number of CUDA graphs to pre-compile for different batch sizes
+      block_size_tokens: 256  # Size of each KV cache block in tokens (affects memory granularity)
+      use_cuda_graphs_for_non_decode_steps: true  # Enable CUDA graphs for prefill/context processing
+      enable_chunked_prefill: true  # Split long prefills into chunks for better memory management
+      unified_memory_level: 0  # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+      max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step
     vllm_cfg:
       async_engine: false
       precision: ${policy.precision}
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index 95d85f74c7..b7bf992f67 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -150,7 +150,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true  # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true  # Split long prefills into chunks for better memory management
       unified_memory_level: 0  # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: 16384  # Maximum number of tokens to use in a single step
+      max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step
       
     vllm_cfg:
       tensor_parallel_size: 1
diff --git a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
index 29ee217517..1fce7d82d4 100644
--- a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
+++ b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
@@ -29,7 +29,7 @@ checkpointing:
   checkpoint_dir: results/dapo-qwen2.5-7b
   keep_top_k: 5
   save_period: 5
-  model_save_format: "dcp"
+  model_save_format: null
 policy:
   model_name: Qwen/Qwen2.5-Math-7B
   hf_config_overrides:
diff --git a/nemo_rl/models/generation/vllm/vllm_worker.py b/nemo_rl/models/generation/vllm/vllm_worker.py
index 75e3334d4a..6a9a159cad 100644
--- a/nemo_rl/models/generation/vllm/vllm_worker.py
+++ b/nemo_rl/models/generation/vllm/vllm_worker.py
@@ -388,6 +388,10 @@ def _patch_vllm_vit_flash_attn_backend():
                 )
                 # disable quantization
                 vllm_kwargs["hf_overrides"]["quantization_config"] = {}
+        elif "Gemma3ForConditionalGeneration" in (getattr(hf_config, "architectures", None) or []):  # attribute may exist but be None
+            if self.cfg["vllm_cfg"]["skip_tokenizer_init"]:
+                print("Gemma3ForConditionalGeneration models may crash when skip_tokenizer_init is True. NeMo-RL is forcing it to False for this architecture. See https://github.com/NVIDIA-NeMo/RL/issues/1681 for more details.")
+            self.cfg["vllm_cfg"]["skip_tokenizer_init"] = False
 
         llm_kwargs = dict(
             model=self.model_name,
diff --git a/nemo_rl/models/policy/workers/dtensor_policy_worker.py b/nemo_rl/models/policy/workers/dtensor_policy_worker.py
index 2903307c8b..581b4d46fd 100644
--- a/nemo_rl/models/policy/workers/dtensor_policy_worker.py
+++ b/nemo_rl/models/policy/workers/dtensor_policy_worker.py
@@ -1839,7 +1839,7 @@ def move_buffer_to_device(
     ) -> nn.Module:
         # FSDP modules do not move buffers to the device automatically
         for v in model.buffers():
-            v.data = v.data.to(device)
+            v.data = v.data.to(device)  # reassign in place: rebinding `v` alone would leave the module's buffer where it was
 
         return model
 
diff --git a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py
index 738146a7e2..b2e69bd608 100644
--- a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py
+++ b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py
@@ -324,10 +324,10 @@ def __init__(
             print(
                 "[WARNING]: sequence_parallel=True, but tp_size=1 which has no effect. Enable tp_size > 1 to use sequence parallelism."
             )
-        elif sequence_parallel_enabled and tp_size > 1:
-            raise RuntimeError(
-                "Sequence parallel + tp_size >1 is currently broken in torch==2.8.0. See https://github.com/NVIDIA-NeMo/Automodel/issues/652 for more details."
-            )
+        # NOTE: a guard that raised RuntimeError for sequence_parallel with tp_size > 1
+        # (reported broken in torch==2.8.0, see
+        # https://github.com/NVIDIA-NeMo/Automodel/issues/652) was removed here.
+        # TODO(review): confirm the upstream issue is fixed before relying on this path.
 
         if cp_size > 1:
             assert not isinstance(self.model, Gemma3ForCausalLM), (
diff --git a/nemo_rl/utils/logger.py b/nemo_rl/utils/logger.py
index ed431986dc..f329dd70c7 100644
--- a/nemo_rl/utils/logger.py
+++ b/nemo_rl/utils/logger.py
@@ -121,6 +121,23 @@ def __init__(self, cfg: TensorboardConfig, log_dir: Optional[str] = None):
         self.writer = SummaryWriter(log_dir=log_dir)
         print(f"Initialized TensorboardLogger at {log_dir}")
 
+    @staticmethod
+    def _coerce_to_scalar(value: Any) -> int | float | bool | str | None:
+        """Reduce ``value`` to a plain Python scalar suitable for TensorBoard.
+
+        Built-in scalars pass through; NumPy scalars and one-element arrays/tensors
+        are unwrapped with ``.item()``. Returns None when no conversion applies.
+        """
+        if isinstance(value, (int, float, bool, str)):
+            return value
+        numpy_scalar = isinstance(value, (np.floating, np.integer, np.bool_))
+        single_array = isinstance(value, np.ndarray) and value.size == 1
+        single_tensor = isinstance(value, torch.Tensor) and value.numel() == 1
+        if numpy_scalar or single_array or single_tensor:
+            return value.item()
+        # Containers, multi-element arrays/tensors and unknown types are not loggable.
+        return None
+
     def log_metrics(
         self,
         metrics: dict[str, Any],
@@ -137,23 +154,19 @@ def log_metrics(
             step_metric: Optional step metric name (ignored in TensorBoard)
         """
         for name, value in metrics.items():
-            # NeMo-Gym will add additional metrics like wandb histograms. However, some people will log to Tensorboard instead which may not be compatible
-            # This logic catches non-compatible objects being logged.
-            if not isinstance(value, (int, float, bool, str)):
-                continue
-
             if prefix:
                 name = f"{prefix}/{name}"
 
-            # Skip non-scalar values that TensorBoard can't handle
-            if isinstance(value, (dict, list)):
+            scalar = self._coerce_to_scalar(value)
+            if scalar is None:
                 print(
-                    f"Warning: Skipping non-scalar metric '{name}' for TensorBoard logging (type: {type(value).__name__})"
+                    f"Warning: Skipping metric '{name}' for TensorBoard logging "
+                    f"(unsupported type: {type(value).__name__})"
                 )
                 continue
 
             try:
-                self.writer.add_scalar(name, value, step)
+                self.writer.add_scalar(name, scalar, step)
             except Exception as e:
                 print(f"Warning: Failed to log metric '{name}' to TensorBoard: {e}")
                 continue
diff --git a/tests/check_metrics.py b/tests/check_metrics.py
index f0b3a9025b..2f12be1645 100644
--- a/tests/check_metrics.py
+++ b/tests/check_metrics.py
@@ -97,6 +97,38 @@ def mean(value, range_start=1, range_end=0, ignore_top_p=0.0):
     return statistics.mean(vals)
 
 
+def median(value, range_start=1, range_end=0):
+    """Compute the median of a step -> value mapping, optionally over a step window.
+
+    Note:
+        Steps and range bounds are 1-indexed and range_end is exclusive.
+        A range_end of 0 extends the window through the final step of the run.
+
+    Args:
+        value: Dictionary of step -> value
+        range_start: First step to include (1-indexed, default=1)
+        range_end: Step to stop before (1-indexed, exclusive, 0 means last step)
+    """
+
+    num_elem = len(value)
+    ## a run resumed from a checkpoint starts at a later step, so the largest
+    ## step exceeds the entry count by the number of steps taken before resuming
+    offset = builtins.max(int(s) for s in value) - num_elem
+
+    ## non-positive bounds are interpreted relative to one past the last step
+    one_past_last = num_elem + 1 + offset
+    if range_start < 0:
+        range_start += one_past_last
+    if range_end <= 0:
+        range_end += one_past_last
+
+    ## keep only the values whose step lies inside [range_start, range_end)
+    vals = [
+        float(v) for step, v in value.items() if range_start <= int(step) < range_end
+    ]
+    return statistics.median(vals)
+
+
 def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]:
     """Evaluate a check against the data.
 
@@ -109,6 +141,7 @@ def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]:
         "min": min,
         "max": max,
         "mean": mean,
+        "median": median,
         "ratio_above": ratio_above,
     }
 
@@ -152,6 +185,7 @@ def main():
       # Use helper functions
       python check_metrics.py results.json "min(data['class_f1']) > 0.6"
       python check_metrics.py results.json "mean(data['accuracies']) > 0.85"
+      python check_metrics.py results.json "median(data['accuracies']) > 0.85"
       python check_metrics.py results.json "mean(data['loss'], ignore_top_p=0.05) < 1.5"
       python check_metrics.py results.json "ratio_above(data['error'], 1.05) < 0.02"
     """
diff --git a/tests/test_suites/llm/dapo-qwen2.5-7b.sh b/tests/test_suites/llm/dapo-qwen2.5-7b.sh
index c68b52d4b9..d150ec019f 100755
--- a/tests/test_suites/llm/dapo-qwen2.5-7b.sh
+++ b/tests/test_suites/llm/dapo-qwen2.5-7b.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["20"] < 1.05' \
         'data["train/reward"]["20"] > -0.45' \
         'data["train/filtered_reward"]["20"] > -0.2'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh
index 3ef39d91a3..06e628484d 100755
--- a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh
+++ b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.sh
@@ -20,7 +20,7 @@ uv run examples/run_distillation_math.py \
     distillation.val_period=20 \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl-distillation \
+    logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["10"] < 0.5' \
         'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 500'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh
index 6710ac87ce..73d2c3c2d6 100755
--- a/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh
+++ b/tests/test_suites/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.sh
@@ -20,7 +20,7 @@ uv run examples/run_distillation_math.py \
     distillation.val_period=20 \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl-distillation \
+    logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["10"] < 0.5' \
         'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 500'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh
index 52f17c2c28..a559f36b2b 100755
--- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh
+++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
     distillation.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl-distillation \
+    logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["20"] < 0.3' \
         'data["validation/accuracy"]["20"] > 0.1' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 1000'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh
index cd4b635e72..a7db7a7787 100755
--- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh
+++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
     distillation.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl-distillation \
+    logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["100"] < 0.25' \
         'data["validation/accuracy"]["100"] > 0.2' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 1600'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh
index df8d6daed7..a5ce82c306 100755
--- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh
+++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
     distillation.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl-distillation \
+    logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["20"] < 0.3' \
         'data["validation/accuracy"]["20"] > 0.1' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 1000'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh
index df8d6daed7..a5ce82c306 100755
--- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh
+++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
     distillation.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl-distillation \
+    logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["20"] < 0.3' \
         'data["validation/accuracy"]["20"] > 0.1' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 1000'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh
index a8d2d04adc..1f64224461 100755
--- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh
+++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.sh
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/preference_loss"]["1"] < 0.69316' \
         'data["train/preference_loss"]["20"] < 0.6' \
         'mean(data["timing/train/total_step_time"], -10, -1) < 7.8'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh
index fbda6865f5..5e5fe4d0e1 100755
--- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh
+++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.sh
@@ -34,10 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["1"] < 3.6' \
+        'data["train/loss"]["1"] < 3.65' \
         'data["train/loss"]["150"] < 3.0' \
         'data["train/preference_loss"]["1"] > 0.69314' \
         'data["train/preference_loss"]["1"] < 0.69316' \
         'data["train/preference_loss"]["150"] < 0.4' \
         'mean(data["timing/train/total_step_time"], -11, -1) < 24'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh
index 7cc74e26df..7a98815ec7 100755
--- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh
+++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.sh
@@ -34,10 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["1"] < 3.6' \
+        'data["train/loss"]["1"] < 3.65' \
         'data["train/loss"]["150"] < 3.0' \
         'data["train/preference_loss"]["1"] > 0.69314' \
         'data["train/preference_loss"]["1"] < 0.69316' \
         'data["train/preference_loss"]["150"] < 0.4' \
         'mean(data["timing/train/total_step_time"], -11, -1) < 11.5'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh
index 497e0b8f68..f57f755051 100755
--- a/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh
+++ b/tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.sh
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/preference_loss"]["1"] < 0.69316' \
         'data["train/preference_loss"]["20"] < 0.6' \
         'mean(data["timing/train/total_step_time"], -10) < 6.7'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh
index a6beabb886..79a4ea0781 100755
--- a/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh
+++ b/tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/preference_loss"]["1"] > 0.6930' \
         'data["train/preference_loss"]["1"] < 0.6932' \
         'data["train/preference_loss"]["150"] < 0.68'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh
index 0b0c67b312..dcb3672c21 100755
--- a/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh
+++ b/tests/test_suites/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["1"] < 0.69316' \
         'data["train/loss"]["150"] < 0.55' \
         'mean(data["timing/train/total_step_time"], -11, -1) < 1.3'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh b/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh
index 3466de2fce..46beb3f924 100755
--- a/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh
+++ b/tests/test_suites/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["1"] > 0.6990' \
-        'data["train/loss"]["1"] < 0.6992' \
+        'data["train/loss"]["1"] > 0.680' \
+        'data["train/loss"]["1"] < 0.70' \
         'data["train/loss"]["100"] < 0.60'
-fi 
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh b/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh
index 3522261d9c..a760b42eba 100755
--- a/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh
+++ b/tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh
@@ -43,4 +43,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     uv run tests/check_metrics.py $JSON_METRICS \
         'min(data["train/token_mult_prob_error"]) < 1.05' \
         'data["train/reward"]["10"] > 0.4'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh b/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh
index 633b0d8297..6198ae049f 100755
--- a/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh
+++ b/tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh
@@ -41,7 +41,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.05' \
+        'median(data["train/token_mult_prob_error"]) < 1.05' \
         "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05"
 fi
 
@@ -66,3 +66,6 @@ cat ${RUN_LOG}.aime-16k       | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"sco
 # 240 step checkpoint 0.3
 uv run tests/check_metrics.py ${RUN_LOG}-16k-metric.json \
   'data["score"] >= 0.2396'
+
+# Clean up checkpoint directory after successful run to save space.
+rm -rf "$CKPT_DIR"
diff --git a/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh b/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh
index 87b6e9065c..151076c471 100755
--- a/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh
+++ b/tests/test_suites/llm/grpo-deepscaler-1.5b-24K.sh
@@ -41,7 +41,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.05' \
+        'median(data["train/token_mult_prob_error"]) < 1.05' \
         "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05"
 fi
 
@@ -65,3 +65,6 @@ cat ${RUN_LOG}.aime-24k       | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"sco
  
 uv run tests/check_metrics.py ${RUN_LOG}-24k-metric.json \
   'data["score"] >= 0.2396'
+
+# Clean up checkpoint directory after successful run to save space.
+rm -rf "$CKPT_DIR"
diff --git a/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh b/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh
index ba2f5993d4..d584327cb1 100755
--- a/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh
+++ b/tests/test_suites/llm/grpo-deepscaler-1.5b-8K.sh
@@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.05' \
+        'median(data["train/token_mult_prob_error"]) < 1.05' \
         "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05"
 fi
 
@@ -60,6 +60,9 @@ cat ${RUN_LOG}.aime-8k       | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"scor
 uv run tests/check_metrics.py ${RUN_LOG}-8k-metric.json \
   'data["score"] >= 0.2396' 
 
+# Clean up checkpoint directory after successful run to save space.
+rm -rf "$CKPT_DIR"
+
 # This comment is for reference on how the aime24 eval baseline was chosen:
 # The variance in aime24 is pretty high when only taking one sample per prompt.
 # I have observed huge variance even between A100 and H100 with one sample per prompt,
diff --git a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh
index 4624b7282d..50fa72125c 100755
--- a/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh
+++ b/tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1" \
         'mean(data["timing/train/total_step_time"], -6, -1) < 14'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh b/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh
index a6ce1800d9..a1ace88f6f 100755
--- a/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh
+++ b/tests/test_suites/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["20"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh
index 77f858b429..9a242dd52a 100755
--- a/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-gptoss-20b-8n8g-megatron.sh
@@ -37,4 +37,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'mean(data["train/gen_kl_error"]) < 0.002' \
         'data["train/reward"]["60"] > 0.60' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 210'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh b/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh
index ce2adb1c51..d253911819 100755
--- a/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh
+++ b/tests/test_suites/llm/grpo-gspo-deepscaler-1.5b-8K.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.1"
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
 
 # TODO: enable in subsequent PR to do a quick accuracy check
diff --git a/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled b/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled
index 2194bad7ab..bd552ab397 100755
--- a/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled
+++ b/tests/test_suites/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-8n8g-fsdp2tp8cp4.sh.disabled
@@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         "data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.1"
 fi
 
diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh
index 956c94bb5c..3a7de5a63c 100755
--- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh
+++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.sh
@@ -35,7 +35,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     # With a few number of steps the logprob can have spikes that can move the average up.
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"], ignore_top_p=0.05) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'ratio_above(data["train/token_mult_prob_error"], 1.1) < 0.1'
     # ratio_above @ 1.1 was 0.03,0.06,0.05: 3sigma ~=0.1
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh
index d018032576..83d7e51773 100755
--- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh
+++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["30"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh
index 4a6d63473c..171194957c 100755
--- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh
+++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["100"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh
index af44d060cb..7952b19491 100755
--- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh
+++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["100"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh
index 562ff730e7..7f5036e8db 100755
--- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh
+++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["500"] < 1.1' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 10'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh
index 90e309e128..3ee982ecc0 100755
--- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["500"] < 1.1' \
         'data["train/reward"]["500"] > 0.1' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
index 08f57cb5a8..4fa8068017 100755
--- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
+++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
@@ -35,8 +35,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["500"] < 1.1' \
         'data["train/reward"]["500"] > 0.1' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled b/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled
index 9420b53c9d..d01c1aec0e 100755
--- a/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled
+++ b/tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh.disabled
@@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["2"] < 1.1' \
         'mean(data["timing/train/policy_training"]) < 280' \
         'mean(data["ray/node.0.gpu.0.mem_gb"]) < 75'
diff --git a/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh b/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh
index 4a310b673b..1f00f7bad4 100755
--- a/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh
+++ b/tests/test_suites/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["3"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh
index 24e49d1a8d..c637acd050 100755
--- a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh
+++ b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["30"] < 1.1' \
         'mean(data["train/reward"]) > 0.45' \
         'mean(data["timing/train/total_step_time"], -11, -1) < 70'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh
index 24e49d1a8d..c637acd050 100755
--- a/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-moonlight-16ba3b-4n8g-megatron.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["30"] < 1.1' \
         'mean(data["train/reward"]) > 0.45' \
         'mean(data["timing/train/total_step_time"], -11, -1) < 70'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh b/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
index 68a694098c..786a070335 100755
--- a/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.05' \
+        'median(data["train/token_mult_prob_error"]) < 1.05' \
         'data["train/token_mult_prob_error"]["30"] < 1.05' \
         'data["train/reward"]["30"] > 0.4' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 80'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh
index d1ad766b5b..3f657f90fd 100755
--- a/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh
+++ b/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.05' \
+        'median(data["train/token_mult_prob_error"]) < 1.05' \
         'data["train/token_mult_prob_error"]["30"] < 1.05' \
         'data["train/reward"]["30"] > 0.4' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 60'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh
index fa7fbd5bd6..1ddde6af8c 100755
--- a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh
+++ b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt-long.v3.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["20"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh
index 98591ba9b3..bf865f8b6f 100755
--- a/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh
+++ b/tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["2"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh
index 5fcfbfd76a..b4fbf87ebf 100755
--- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh
+++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["30"] < 1.1' \
         'data["train/grad_norm"]["30"] < 0.5' \
         'data["train/grad_norm"]["30"] > 0.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh
index 45f354043a..30c205b0ee 100755
--- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["30"] < 1.1' \
 	'mean(data["train/reward"]) > 0.56' \
         'mean(data["timing/train/total_step_time"], 2) < 50'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh
index 35810c4eec..1c69a0cb9a 100755
--- a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh
+++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["450"] < 1.1' \
         'mean(data["timing/train/total_step_time"], 2) < 25'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh
index f89041cd40..6830171346 100755
--- a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'data["train/token_mult_prob_error"]["30"] < 1.1' \
-        'data["train/reward"]["30"] > 0.43' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/reward"]) > 0.43' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 220'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh b/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh
index d1068a7ffa..8289dc3c7d 100755
--- a/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh
+++ b/tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.sh
@@ -36,5 +36,8 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     # With a few number of steps the logprob can have spikes that can move the average up.
     # Enabling fp8 kvcache can cause the logprob to be slightly higher than fp8 linear only path, so we allow a larger tolerance.
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"], ignore_top_p=0.15) < 2.0'
+        'median(data["train/token_mult_prob_error"]) < 2.0'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh b/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh
index 855d009566..21415712c5 100755
--- a/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh
+++ b/tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh
@@ -42,6 +42,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
index 738b38dd5b..cf1ae5047b 100755
--- a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
@@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
index 738b38dd5b..cf1ae5047b 100755
--- a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
@@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
index 14138486e1..298ac5b476 100755
--- a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
@@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
index 14138486e1..298ac5b476 100755
--- a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
@@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
index 14138486e1..298ac5b476 100755
--- a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
@@ -40,6 +40,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
index e7636f3e93..2683cb8641 100755
--- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
index e7636f3e93..2683cb8641 100755
--- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
index e7636f3e93..2683cb8641 100755
--- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
index e7636f3e93..2683cb8641 100755
--- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
index e7636f3e93..2683cb8641 100755
--- a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
index 0f9bf9289f..c4d4a8148c 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
@@ -35,6 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
index 0f9bf9289f..6275de3070 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
@@ -9,7 +9,7 @@ NUM_NODES=16
 STEPS_PER_RUN=10
 MAX_STEPS=10
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
-NUM_MINUTES=100
+NUM_MINUTES=115
 # ===== END CONFIG =====
 
 exit_if_max_steps_reached
@@ -24,6 +24,7 @@ uv run examples/run_grpo_math.py \
     logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
     checkpointing.enabled=True \
     checkpointing.checkpoint_dir=$CKPT_DIR \
     $@ \
@@ -35,6 +36,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
index f7dac553af..f05dd15028 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
@@ -35,6 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
index f7dac553af..e9b20f4816 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
@@ -24,6 +24,7 @@ uv run examples/run_grpo_math.py \
     logger.wandb.project=nemo-rl \
     logger.wandb.name=$EXP_NAME \
     logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
     checkpointing.enabled=True \
     checkpointing.checkpoint_dir=$CKPT_DIR \
     $@ \
@@ -35,6 +36,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
index 63a099e2e2..2f59ef7478 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
index 2a56609ffd..3fe6be1f96 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
 
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
index 0de5a124ed..4559fce26e 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
index 0de5a124ed..4559fce26e 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
index 8350d128e8..9cfa306af8 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
 
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
index 2a56609ffd..3fe6be1f96 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
 
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
index 0de5a124ed..4559fce26e 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
index 35d58c98f7..6bd6c237cb 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
index 35d58c98f7..6bd6c237cb 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
@@ -34,6 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
-        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh b/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh
index 9f48544b42..dbfff34a7e 100755
--- a/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh
+++ b/tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["50"] < 0.4' \
         'data["train/grad_norm"]["50"] < 17.5' \
         'data["train/grad_norm"]["50"] > 10.0'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh
index 718322e33a..99f264e910 100755
--- a/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh
+++ b/tests/test_suites/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.sh
@@ -37,6 +37,8 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["1"] < 0.55' \
         'data["train/loss"]["300"] < 0.285' \
-        'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
         'mean(data["timing/train/total_step_time"], 2) < 20'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh
index d5dfde39b9..65362c2eb1 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.sh
@@ -34,9 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 # TODO: the memory check is known to OOM. see https://github.com/NVIDIA-NeMo/RL/issues/263
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    # Last observed peak GPU memory was around 72.6 GB (but it can be noisy).
     uv run tests/check_metrics.py $JSON_METRICS \
 	    'data["train/loss"]["1"] < 0.6' \
         'data["train/loss"]["250"] < 0.36' \
-        'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
+        'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \
         'mean(data["timing/train/total_step_time"], -6, -1) < 10'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh
index 4b243e8fe9..077102dc98 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.sh
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["250"] < 0.36' \
 	    'max(data["ray/node.0.gpu.0.mem_gb"]) < 80' \
         'mean(data["timing/train/total_step_time"], 2) < 22'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh
index b1a21e8f06..b2ff16b155 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-lora.sh
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["50"] < 0.8' \
         'max(data["ray/node.0.gpu.0.mem_gb"]) < 50' \
         'mean(data["timing/train/total_step_time"], 2) < 10'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
\ No newline at end of file
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh
index 87ca1e9dad..18f2dd8f59 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-fsdp2tp2.sh
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["50"] < 0.38' \
         'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
         'mean(data["timing/train/total_step_time"], 2) < 32'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh
index e063b39861..89aa5b184a 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh
@@ -36,4 +36,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["1"] < 0.6' \
         'data["train/loss"]["250"] < 0.36' \
         'mean(data["timing/train/total_step_time"], 2) < 6'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh
index 8ef0dfafe6..81ea9f2f6b 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh
@@ -36,4 +36,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["1"] < 0.6' \
         'data["train/loss"]["250"] < 0.36' \
         'mean(data["timing/train/total_step_time"], 2) < 20'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh
index b5edc8043e..106a6aee3f 100755
--- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh
+++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh
@@ -7,7 +7,7 @@ NUM_NODES=1
 STEPS_PER_RUN=250
 MAX_STEPS=250
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
-NUM_MINUTES=15
+NUM_MINUTES=30
 # ===== END CONFIG =====
 
 exit_if_max_steps_reached
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'mean(data["timing/train/total_step_time"], -6, -1) < 0.7'
     # mean(data["train/loss"],-10,-1) observed to be 0.5557474825117323
     # timing/train/total_step_time observed 0.6-0.64
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
\ No newline at end of file
diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh
index 445dc48b5a..71056bb4e1 100755
--- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh
+++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh
@@ -7,7 +7,7 @@ NUM_NODES=2
 STEPS_PER_RUN=20  # step_time ~ 10sec
 MAX_STEPS=20
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
-NUM_MINUTES=15 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check
+NUM_MINUTES=30 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check (30min to buffer for initial ckpt download)
 # ===== END CONFIG =====
 
 exit_if_max_steps_reached
@@ -35,4 +35,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["20"] < 2.05' \
         'mean(data["timing/train/total_step_time"], 2) < 18'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
index ec0e22bf6b..da8826e1f3 100755
--- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
+++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
@@ -7,7 +7,7 @@ NUM_NODES=2
 STEPS_PER_RUN=20  # step_time ~ 15sec
 MAX_STEPS=20
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
-NUM_MINUTES=15
+NUM_MINUTES=30
 # ===== END CONFIG =====
 
 exit_if_max_steps_reached
@@ -35,4 +35,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["20"] < 2.05' \
         'mean(data["timing/train/total_step_time"], 2) < 15'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh
index 3b987df72b..799a04c300 100755
--- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh
+++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
         'data["train/loss"]["1"] < 0.37' \
         'mean(data["train/loss"], 16) < 0.31' \
         'max(data["ray/node.0.gpu.0.mem_gb"]) < 35'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
index 897f4fbb60..1ce7203203 100755
--- a/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
+++ b/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["80"] < 0.301' \
         'data["validation/val_loss"]["80"] < 0.304'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
diff --git a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh
index fb53624b94..386263ba3d 100755
--- a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh
+++ b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.sh
@@ -35,5 +35,8 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/reward"]["200"] > 0.9'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
 
diff --git a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh
index 814c822f1b..bbd490e4c0 100755
--- a/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh
+++ b/tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh
@@ -36,5 +36,8 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["200"] < 0.1' \
         'data["train/reward"]["200"] > 0.9'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
 fi
 
diff --git a/tests/unit/test_check_metrics.py b/tests/unit/test_check_metrics.py
index 313801531e..73cb08f469 100644
--- a/tests/unit/test_check_metrics.py
+++ b/tests/unit/test_check_metrics.py
@@ -21,7 +21,7 @@
 tests_dir = Path(__file__).parent.parent
 sys.path.insert(0, str(tests_dir))
 
-from check_metrics import evaluate_check, max, mean, min, ratio_above
+from check_metrics import evaluate_check, max, mean, median, min, ratio_above
 
 
 class TestMeanFunction:
@@ -405,3 +405,121 @@ def test_ratio_above_combined_with_mean_ignore_top_p(self):
         # Check that exactly 5% are above threshold
         ratio = ratio_above(data["metric"], 5.0)
         assert ratio == 0.05
+
+
+class TestMedianFunction:
+    """Test the median function with various scenarios."""
+
+    def test_basic_median_odd_count(self):
+        """Test basic median calculation with odd number of values."""
+        data = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}
+        result = median(data)
+        assert result == 3.0
+
+    def test_basic_median_even_count(self):
+        """Test basic median calculation with even number of values."""
+        data = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0}
+        result = median(data)
+        assert result == 2.5  # (2+3)/2
+
+    def test_median_with_outliers(self):
+        """Test that median is robust to outliers (unlike mean)."""
+        # Data with one severe outlier
+        data = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 100.0}
+
+        # Median should be unaffected by the outlier
+        result = median(data)
+        assert result == 3.0  # Middle value, ignores the outlier
+
+        # Compare with mean which would be 22.0
+        result_mean = mean(data)
+        assert result_mean == 22.0
+
+    def test_median_with_range(self):
+        """Test median with range_start and range_end."""
+        data = {str(i): float(i) for i in range(1, 11)}  # 1-10
+
+        # Get median of steps 3-6 (range_end=7 is exclusive, so values 3, 4, 5, 6)
+        result = median(data, range_start=3, range_end=7)
+        assert result == 4.5  # (4+5)/2
+
+    def test_median_with_offset(self):
+        """Test median calculation with step offset (from checkpoint resume)."""
+        # Simulate a checkpoint resume scenario
+        # Steps 101-105 (resumed from step 100)
+        data = {"101": 1.0, "102": 2.0, "103": 3.0, "104": 4.0, "105": 5.0}
+        result = median(data)
+        assert result == 3.0
+
+    def test_median_with_negative_range(self):
+        """Test median with negative range indices."""
+        data = {str(i): float(i) for i in range(1, 11)}  # 1-10
+
+        # Last 3 values (8, 9, 10)
+        result = median(data, range_start=-3, range_end=0)
+        assert result == 9.0
+
+    def test_median_with_floats_and_strings(self):
+        """Test that string values are properly converted to floats."""
+        data = {"1": "1.5", "2": "2.5", "3": "3.5"}
+        result = median(data)
+        assert result == 2.5
+
+    def test_median_single_value(self):
+        """Test median with single value."""
+        data = {"1": 42.0}
+        result = median(data)
+        assert result == 42.0
+
+    def test_median_two_values(self):
+        """Test median with two values."""
+        data = {"1": 1.0, "2": 10.0}
+        result = median(data)
+        assert result == 5.5  # (1+10)/2
+
+    def test_median_all_same_values(self):
+        """Test median with all same values."""
+        data = {str(i): 5.0 for i in range(1, 11)}
+        result = median(data)
+        assert result == 5.0
+
+    def test_median_vs_mean_outlier_robustness(self):
+        """Demonstrate why median is preferred for outlier-prone data."""
+        # Simulate token_mult_prob_error with outliers
+        data = {str(i): 1.0 for i in range(1, 20)}
+        data["20"] = 100.0  # Large outlier
+
+        # Median is robust
+        median_result = median(data)
+        assert median_result == 1.0
+
+        # Mean is affected
+        mean_result = mean(data)
+        assert mean_result == 5.95  # (19 * 1.0 + 100) / 20
+
+    def test_evaluate_check_with_median(self):
+        """Test evaluate_check with median function."""
+        data = {"accuracy": {"1": 0.8, "2": 0.9, "3": 0.95, "4": 0.85, "5": 100.0}}
+
+        # Median should be robust to the outlier
+        passed, _, value = evaluate_check(data, "median(data['accuracy']) < 1.0")
+        assert passed is True
+        assert value == 0.9  # Middle value
+
+    def test_median_token_mult_prob_error_scenario(self):
+        """Test the exact scenario for which median is being used."""
+        # Simulate token_mult_prob_error with some outliers
+        data = {
+            "train/token_mult_prob_error": {
+                str(i): 1.0 + (i % 3) * 0.01 for i in range(1, 20)
+            }
+        }
+        # Add outlier
+        data["train/token_mult_prob_error"]["20"] = 5.0
+
+        # Median should pass the check even with the outlier
+        passed, _, value = evaluate_check(
+            data, 'median(data["train/token_mult_prob_error"]) < 1.05'
+        )
+        assert passed is True
+        assert value < 1.05
diff --git a/tests/unit/utils/test_logger.py b/tests/unit/utils/test_logger.py
index 679ccf9a8e..60efcff764 100644
--- a/tests/unit/utils/test_logger.py
+++ b/tests/unit/utils/test_logger.py
@@ -129,6 +129,103 @@ def test_log_hyperparams(self, mock_summary_writer, temp_dir):
         }
 
 
+    @patch("nemo_rl.utils.logger.SummaryWriter")
+    def test_coerce_to_scalar_python_primitives(self, mock_summary_writer, temp_dir):
+        """Test that int, float, bool and str values pass through unchanged."""
+        cfg = {"log_dir": temp_dir}
+        logger = TensorboardLogger(cfg, log_dir=temp_dir)
+
+        assert logger._coerce_to_scalar(42) == 42
+        assert logger._coerce_to_scalar(3.14) == 3.14
+        assert logger._coerce_to_scalar(True) is True  # identity: must be the bool True
+        assert logger._coerce_to_scalar("hello") == "hello"
+
+    @patch("nemo_rl.utils.logger.SummaryWriter")
+    def test_coerce_to_scalar_numpy_types(self, mock_summary_writer, temp_dir):
+        """Test coercion of numpy scalars and arrays: one element -> value, more -> None."""
+        import numpy as np
+
+        cfg = {"log_dir": temp_dir}
+        logger = TensorboardLogger(cfg, log_dir=temp_dir)
+
+        # numpy scalar types (float, int and bool variants)
+        assert logger._coerce_to_scalar(np.float32(1.5)) == 1.5
+        assert logger._coerce_to_scalar(np.float64(2.5)) == 2.5
+        assert logger._coerce_to_scalar(np.int32(10)) == 10
+        assert logger._coerce_to_scalar(np.int64(20)) == 20
+        assert logger._coerce_to_scalar(np.bool_(True)) is True  # identity with True
+
+        # 0-d numpy arrays
+        assert logger._coerce_to_scalar(np.array(3.14)) == 3.14
+        # 1-element numpy arrays are expected to yield their single value
+        assert logger._coerce_to_scalar(np.array([42])) == 42
+
+        # Multi-element arrays should return None
+        assert logger._coerce_to_scalar(np.array([1, 2, 3])) is None
+
+    @patch("nemo_rl.utils.logger.SummaryWriter")
+    def test_coerce_to_scalar_torch_tensors(self, mock_summary_writer, temp_dir):
+        """Test coercion of torch tensors: one element -> value, more -> None."""
+        cfg = {"log_dir": temp_dir}
+        logger = TensorboardLogger(cfg, log_dir=temp_dir)
+
+        # 0-d tensors; approx() presumably absorbs float32 rounding of 3.14
+        assert logger._coerce_to_scalar(torch.tensor(3.14)) == pytest.approx(3.14)
+        assert logger._coerce_to_scalar(torch.tensor(42)) == 42
+
+        # 1-element tensors are expected to yield their single value
+        assert logger._coerce_to_scalar(torch.tensor([99])) == 99
+
+        # Multi-element tensors should return None
+        assert logger._coerce_to_scalar(torch.tensor([1, 2, 3])) is None
+
+    @patch("nemo_rl.utils.logger.SummaryWriter")
+    def test_coerce_to_scalar_incompatible_types(self, mock_summary_writer, temp_dir):
+        """Test that a dict, a list, None and a bare object all return None."""
+        cfg = {"log_dir": temp_dir}
+        logger = TensorboardLogger(cfg, log_dir=temp_dir)
+
+        assert logger._coerce_to_scalar({"key": "value"}) is None
+        assert logger._coerce_to_scalar([1, 2, 3]) is None
+        assert logger._coerce_to_scalar(None) is None
+        assert logger._coerce_to_scalar(object()) is None
+
+    @patch("nemo_rl.utils.logger.SummaryWriter")
+    def test_log_metrics_coerces_numpy_and_torch(self, mock_summary_writer, temp_dir):
+        """Test that log_metrics logs numpy/torch scalars and skips non-scalars."""
+        import numpy as np
+
+        cfg = {"log_dir": temp_dir}
+        logger = TensorboardLogger(cfg, log_dir=temp_dir)
+
+        metrics = {
+            "python_float": 1.0,
+            "numpy_float32": np.float32(2.0),
+            "numpy_float64": np.float64(3.0),
+            "torch_scalar": torch.tensor(4.0),
+            "numpy_0d": np.array(5.0),
+            "torch_1elem": torch.tensor([6.0]),
+            "skip_list": [1, 2, 3],
+            "skip_dict": {"a": 1},
+            "skip_multi_tensor": torch.tensor([1.0, 2.0]),
+        }
+        logger.log_metrics(metrics, step=1)
+
+        mock_writer = mock_summary_writer.return_value
+        # Should log 6 scalars, skip the 3 incompatible "skip_*" entries
+        assert mock_writer.add_scalar.call_count == 6
+
+        # Verify each logged value, keyed by the first positional arg of add_scalar
+        calls = {c[0][0]: c[0][1] for c in mock_writer.add_scalar.call_args_list}
+        assert calls["python_float"] == 1.0
+        assert calls["numpy_float32"] == pytest.approx(2.0)
+        assert calls["numpy_float64"] == pytest.approx(3.0)
+        assert calls["torch_scalar"] == pytest.approx(4.0)
+        assert calls["numpy_0d"] == pytest.approx(5.0)
+        assert calls["torch_1elem"] == pytest.approx(6.0)
+
+
+
 class TestWandbLogger:
     """Test the WandbLogger class."""