Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cicd/cicd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ set -e

python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

curl --silent -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
set -o pipefail
curl --silent --show-error --fail --retry 3 --retry-delay 5 -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
# hf download "NousResearch/Meta-Llama-3-8B"
# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
# hf download "microsoft/Phi-4-reasoning"
Expand Down
1 change: 1 addition & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ coverage:
only_pulls: false
flags: null
paths: null
informational: true

parsers:
gcov:
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/solo/test_flex.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,5 @@ def test_loss_llama(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
)
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/test_dist_muon_fsdp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/test_fp8_fsdp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/test_fsdp1.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/test_fsdp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/test_gemma3.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,5 @@ def test_lora_ddp_packed(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
)
24 changes: 12 additions & 12 deletions tests/e2e/multigpu/test_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_lora_ddp(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -156,7 +156,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

def test_dpo_lora_ddp(self, temp_dir):
Expand Down Expand Up @@ -233,7 +233,7 @@ def test_dpo_lora_ddp(self, temp_dir):
loss_threshold = 2.3
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)
Expand Down Expand Up @@ -312,7 +312,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
loss_threshold = 2.3
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)
Expand Down Expand Up @@ -385,7 +385,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -461,7 +461,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

@require_torch_2_6_0
Expand Down Expand Up @@ -543,7 +543,7 @@ def test_fsdp2_packed(
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
)

def test_fsdp_qlora_prequant_packed(self, temp_dir):
Expand Down Expand Up @@ -623,7 +623,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -708,7 +708,7 @@ def test_ds_zero3_packed(
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -784,7 +784,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -859,7 +859,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
)

@pytest.mark.skip(
Expand Down Expand Up @@ -925,5 +925,5 @@ def test_fix_untrained_tokens(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
)
6 changes: 3 additions & 3 deletions tests/e2e/multigpu/test_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_lora_ddp(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

@require_torch_2_7_0
Expand Down Expand Up @@ -138,7 +138,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

@require_torch_2_7_0
Expand Down Expand Up @@ -205,5 +205,5 @@ def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
2 changes: 1 addition & 1 deletion tests/e2e/multigpu/test_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@ def test_fft_sft(self, temp_dir):
)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
)
2 changes: 1 addition & 1 deletion tests/e2e/patched/test_fa_xentropy.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,5 +78,5 @@ def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_ste
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
)
2 changes: 1 addition & 1 deletion tests/e2e/patched/test_flattening.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,5 @@ def test_lora_packing_flattening(self, temp_dir, gradient_accumulation_steps):
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
)
6 changes: 3 additions & 3 deletions tests/e2e/patched/test_unsloth_qlora.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)

def test_unsloth_llama_qlora_unpacked(self, temp_dir):
Expand Down Expand Up @@ -124,7 +124,7 @@ def test_unsloth_llama_qlora_unpacked(self, temp_dir):
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -180,5 +180,5 @@ def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention)
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)
2 changes: 1 addition & 1 deletion tests/e2e/solo/test_flex.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,5 @@ def test_loss_llama(self, temp_dir):
train(cfg=cfg, dataset_meta=dataset_meta)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
)
8 changes: 2 additions & 6 deletions tests/e2e/test_embeddings_lr.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,7 @@ def test_train_w_embedding_lr_scale(self, temp_dir):
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
)
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")

@with_temp_dir
def test_train_w_embedding_lr(self, temp_dir):
Expand Down Expand Up @@ -100,6 +98,4 @@ def test_train_w_embedding_lr(self, temp_dir):
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
)
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
2 changes: 1 addition & 1 deletion tests/e2e/test_llama_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn):
loss_threshold = 6.5
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)
2 changes: 1 addition & 1 deletion tests/e2e/test_packing_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,5 @@ def test_loss_packed(self, temp_dir):
train(cfg=cfg, dataset_meta=dataset_meta)

check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)
2 changes: 1 addition & 1 deletion tests/e2e/test_process_reward_model_smollm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_prm(self, temp_dir):

train(cfg=cfg, dataset_meta=dataset_meta)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
)

check_model_output_exists(temp_dir, cfg)
2 changes: 1 addition & 1 deletion tests/e2e/test_qat.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def test_qat_dpo(self, temp_dir):
loss_threshold = 2.3
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/test_reward_model_smollm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ def test_rm_lora(self, temp_dir):

train(cfg=cfg, dataset_meta=dataset_meta)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
)
check_model_output_exists(temp_dir, cfg)
2 changes: 1 addition & 1 deletion tests/e2e/test_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_streaming_dataset(self, temp_dir, sample_packing):
# Verify training actually happened by checking the logged train loss is not too high (threshold 3.0)
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
3.0,
"Train Loss (%s) is too high",
)
Loading