axolotl-ai-cloud · winglian · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/cicd/cicd.sh b/cicd/cicd.sh
@@ -3,7 +3,8 @@ set -e
 
 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
 
-curl --silent -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/"  --use-compress-program unzstd --strip-components=1
+set -o pipefail
+curl --silent --show-error --fail --retry 3 --retry-delay 5 -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
 # hf download "NousResearch/Meta-Llama-3-8B"
 # hf download "NousResearch/Meta-Llama-3-8B-Instruct"
 # hf download "microsoft/Phi-4-reasoning"

diff --git a/codecov.yml b/codecov.yml
@@ -37,6 +37,7 @@ coverage:
         only_pulls: false
         flags: null
         paths: null
+        informational: true
 
 parsers:
   gcov:

diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py
@@ -86,5 +86,5 @@ def test_loss_llama(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
             event_file = os.path.join(tb_log_path, event_files[0])
             reader = SummaryReader(event_file)
             df = reader.scalars
-            train_loss_df = df[df.tag == "train/train_loss"]
+            train_loss_df = df[df.tag == "train/loss"]
             if len(train_loss_df) > 0:
                 final_loss = train_loss_df.value.values[-1]
                 assert not torch.isnan(torch.tensor(final_loss)), (

diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
             event_file = os.path.join(tb_log_path, event_files[0])
             reader = SummaryReader(event_file)
             df = reader.scalars
-            train_loss_df = df[df.tag == "train/train_loss"]
+            train_loss_df = df[df.tag == "train/loss"]
             if len(train_loss_df) > 0:
                 final_loss = train_loss_df.value.values[-1]
                 assert not torch.isnan(torch.tensor(final_loss)), (

diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
             event_file = os.path.join(tb_log_path, event_files[0])
             reader = SummaryReader(event_file)
             df = reader.scalars
-            train_loss_df = df[df.tag == "train/train_loss"]
+            train_loss_df = df[df.tag == "train/loss"]
             if len(train_loss_df) > 0:
                 final_loss = train_loss_df.value.values[-1]
                 assert not torch.isnan(torch.tensor(final_loss)), (

diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
             event_file = os.path.join(tb_log_path, event_files[0])
             reader = SummaryReader(event_file)
             df = reader.scalars
-            train_loss_df = df[df.tag == "train/train_loss"]
+            train_loss_df = df[df.tag == "train/loss"]
             if len(train_loss_df) > 0:
                 final_loss = train_loss_df.value.values[-1]
                 assert not torch.isnan(torch.tensor(final_loss)), (

diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py
@@ -94,5 +94,5 @@ def test_lora_ddp_packed(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
@@ -90,7 +90,7 @@ def test_lora_ddp(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -156,7 +156,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     def test_dpo_lora_ddp(self, temp_dir):
@@ -233,7 +233,7 @@ def test_dpo_lora_ddp(self, temp_dir):
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
@@ -312,7 +312,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
@@ -385,7 +385,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -461,7 +461,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_6_0
@@ -543,7 +543,7 @@ def test_fsdp2_packed(
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
 
     def test_fsdp_qlora_prequant_packed(self, temp_dir):
@@ -623,7 +623,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -708,7 +708,7 @@ def test_ds_zero3_packed(
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -784,7 +784,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -859,7 +859,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.skip(
@@ -925,5 +925,5 @@ def test_fix_untrained_tokens(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py
@@ -79,7 +79,7 @@ def test_lora_ddp(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_7_0
@@ -138,7 +138,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_7_0
@@ -205,5 +205,5 @@ def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py
@@ -64,5 +64,5 @@ def test_fft_sft(self, temp_dir):
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py
@@ -78,5 +78,5 @@ def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_ste
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_flattening.py b/tests/e2e/patched/test_flattening.py
@@ -77,5 +77,5 @@ def test_lora_packing_flattening(self, temp_dir, gradient_accumulation_steps):
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py
@@ -73,7 +73,7 @@ def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
 
     def test_unsloth_llama_qlora_unpacked(self, temp_dir):
@@ -124,7 +124,7 @@ def test_unsloth_llama_qlora_unpacked(self, temp_dir):
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -180,5 +180,5 @@ def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention)
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py
@@ -63,5 +63,5 @@ def test_loss_llama(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py
@@ -57,9 +57,7 @@ def test_train_w_embedding_lr_scale(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
-        )
+        check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
 
     @with_temp_dir
     def test_train_w_embedding_lr(self, temp_dir):
@@ -100,6 +98,4 @@ def test_train_w_embedding_lr(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
-        )
+        check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py
@@ -66,7 +66,7 @@ def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn):
             loss_threshold = 6.5
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
@@ -62,5 +62,5 @@ def test_loss_packed(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py
@@ -57,7 +57,7 @@ def test_prm(self, temp_dir):
 
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
         )
 
         check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py
@@ -128,7 +128,7 @@ def test_qat_dpo(self, temp_dir):
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )

diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py
@@ -66,6 +66,6 @@ def test_rm_lora(self, temp_dir):
 
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
         )
         check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py
@@ -66,7 +66,7 @@ def test_streaming_dataset(self, temp_dir, sample_packing):
         # Verify training actually happened by checking loss decrease
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             3.0,
             "Train Loss (%s) is too high",
         )