From e88a61e94f28ed0152a3b5c1a85dafd62f4990f7 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 11:14:56 -0700 Subject: [PATCH 1/9] add reference and unsloth lora merging tests --- tests/scratch.py | 228 ++++++++++++++++++++++++++++++++++ tests/test_hf_lora.py | 133 ++++++++++++++++++++ tests/test_unsloth_lora.py | 195 +++++++++++++++++++++++++++++ tests/utils/__init__.py | 20 +++ tests/utils/data_utils.py | 118 ++++++++++++++++++ tests/utils/hf_utils.py | 244 +++++++++++++++++++++++++++++++++++++ 6 files changed, 938 insertions(+) create mode 100644 tests/scratch.py create mode 100644 tests/test_hf_lora.py create mode 100644 tests/test_unsloth_lora.py create mode 100644 tests/utils/__init__.py create mode 100644 tests/utils/data_utils.py create mode 100644 tests/utils/hf_utils.py diff --git a/tests/scratch.py b/tests/scratch.py new file mode 100644 index 0000000000..861db6cf1c --- /dev/null +++ b/tests/scratch.py @@ -0,0 +1,228 @@ +import itertools +from typing import Literal + +import torch +import transformers +from datasets import Dataset, IterableDataset, load_dataset +from peft import LoraConfig, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers.utils.logging import set_verbosity +from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer +from trl.data_utils import apply_chat_template + +# set_verbosity(transformers.logging.INFO) + +USE_INSTRUCT = True +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" if USE_INSTRUCT else "meta-llama/Llama-3.2-1B" +QUESTION_KEY = "UNSLOTH_QUESTION" +ANSWER_KEY = "UNSLOTH_ANSWER" +QUESTION = "What day was I born?" 
+ANSWER = "January 1, 2058" +USER_MESSAGE = {"role": "user", "content": QUESTION} +ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER} +DTYPE = torch.bfloat16 + +MAX_STEPS = 100 +OUTPUT_DIR = "sft_test" +def formatting_prompts_func(example): + text = f"### {QUESTION_KEY}: {example['question']}\n ### {ANSWER_KEY}: {example['answer']}" + return text + +def data_generator(): + while 1: + yield {"question": QUESTION, "answer": ANSWER} + +def test_dataset(): + dataset = IterableDataset.from_generator(data_generator) + + dataset = dataset.map(lambda example: {"text": formatting_prompts_func(example)}) + formatted_data = next(iter(dataset)) + assert formatted_data["text"] == f"### {QUESTION_KEY}: {QUESTION} ### {ANSWER_KEY}: {ANSWER}" + +def create_dummy_dataset(num_examples: int = 100, format_prompts: bool = False, dataset_type: Literal["prompt_completion", "instruct", "text"] = "prompt_completion"): + if dataset_type == "instruct": + dataset = Dataset.from_dict({"messages": [[USER_MESSAGE], [ASSISTANT_MESSAGE]] * num_examples}) + elif dataset_type == "prompt_completion": + dataset = Dataset.from_dict({"prompt": [[USER_MESSAGE]] * num_examples, "completion": [[ASSISTANT_MESSAGE]] * num_examples}) + else: + dataset = IterableDataset.from_generator(data_generator) + if format_prompts: + dataset = dataset.map(lambda example: {"text": formatting_prompts_func(example)}) + dataset = itertools.islice(dataset, num_examples) + return dataset + +def get_test_dataset(dataset_type: Literal["prompt_completion", "instruct", "text"] = "prompt_completion", num_examples: int = 100, format_prompts: bool = False): + dataset = create_dummy_dataset(num_examples=num_examples, dataset_type=dataset_type, format_prompts=format_prompts) + return dataset + +def test_model(num_repeats: int = 10, do_sample: bool = False, temperature: float = 0.8, dataset_type: Literal["prompt_completion", "instruct", "text"] = "prompt_completion"): + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + 
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=DTYPE, device_map="cuda") + if dataset_type == "instruct" or dataset_type == "prompt_completion": + prompt = [{"role": "user", "content": QUESTION}] + inputs = tokenizer.apply_chat_template(prompt, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device) + else: + prompt = QUESTION + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + for i in range(num_repeats): + outputs = model.generate(**inputs, max_new_tokens=100, do_sample=do_sample, temperature=temperature) + response = tokenizer.decode(outputs[0], skip_special_tokens=False) + print(f"Response {i}:\n{response}") + print("-"*100) + +def fix_tokenizer(tokenizer): + tokenizer.padding_side = "right" + added_vocab = tokenizer.get_added_vocab() + pad_token = [w for w in added_vocab if "pad" in w] + assert len(pad_token) == 1 + tokenizer.pad_token = pad_token[0] # Load dataset from the hub + return tokenizer + +def train_model(): + dataset = create_dummy_dataset(num_examples=100, format_prompts=True, use_instruct=USE_INSTRUCT) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + tokenizer = fix_tokenizer(tokenizer) + print(tokenizer.get_chat_template()) + + model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=DTYPE, device_map="cuda") + training_args = SFTConfig( + output_dir=OUTPUT_DIR, + max_steps=MAX_STEPS, + per_device_train_batch_size=5, + log_level="info", + report_to="none", + num_train_epochs=1, + logging_steps=1, + seed=42, + bf16=DTYPE == torch.bfloat16, + fp16=DTYPE == torch.float16, + #save_steps=50, + ) + trainer = SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=training_args, + + ) + # data_loader = trainer.get_train_dataloader() + # batch = next(iter(data_loader)) + # input_ids = batch["input_ids"] + + # print(tokenizer.decode(input_ids[0], skip_special_tokens=False)) +def create_instruction_dataset(num_examples: 
int = 10): + dataset = Dataset.from_dict({"messages": [[USER_MESSAGE, ASSISTANT_MESSAGE]] * num_examples}) + return dataset + + +def create_dataset(tokenizer, num_examples: int = 10): + dataset = create_instruction_dataset(num_examples) + def _apply_chat_template(example): + chat = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return { "text": chat } + dataset = dataset.map(_apply_chat_template, remove_columns="messages") + return dataset + +def generate_text(model, tokenizer, prompt = None, inputs = None, temperature: float = 0.8, do_sample: bool = True): + assert prompt is not None or inputs is not None + if prompt is not None: + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + outputs = model.generate(**inputs, max_new_tokens=100, do_sample=do_sample, temperature=temperature) + response = tokenizer.decode(outputs[0], skip_special_tokens=False) + return response + +def setup_model(model_name, quantize: bool = True, dtype=torch.bfloat16): + if quantize: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=dtype, + ) + else: + bnb_config = None + + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cuda:0", + attn_implementation="sdpa", + quantization_config=bnb_config, + torch_dtype=dtype, + ) + return model + +def setup_peft( + lora_rank, + lora_alpha=None, + lora_dropout=0.0, + bias="none", + target_modules="all-linear", +): + lora_alpha = lora_alpha or 2 * lora_rank + peft_config = LoraConfig( + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + r=lora_rank, + bias=bias, + target_modules=target_modules, + task_type="CAUSAL_LM", + ) + return peft_config + +def setup_trainer(model, tokenizer, dataset, peft_config, train_args, formatting_func=None, collator=None): + return SFTTrainer( + model=model, + peft_config=peft_config, + train_dataset=dataset, + processing_class=tokenizer, + 
formatting_func=formatting_func, + data_collator=collator, + args=train_args, + ) + +def convert_weights_back_to_dtype(model, dtype): + """ + SFTTrainer calls get_peft_model and prepare_model_for_kbit_training which converts all weights to float32. + This function converts the non-loraweights back to the original dtype. + """ + for name, param in model.named_parameters(): + if any(s in name for s in ["norm", "embed"]): + param.data = param.data.to(dtype) + +if __name__ == "__main__": + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + tokenizer = fix_tokenizer(tokenizer) + prompt = tokenizer.apply_chat_template([USER_MESSAGE], tokenize=False, add_generation_prompt=True) + # print(prompt) + + dataset: Dataset = create_instruction_dataset(num_examples=1) + dataset = dataset.repeat(1000) + model = setup_model(MODEL_NAME, quantize=True, dtype=DTYPE) + + training_args = SFTConfig( + output_dir=OUTPUT_DIR, + max_steps=MAX_STEPS, + per_device_train_batch_size=5, + log_level="info", + report_to="none", + num_train_epochs=1, + logging_steps=1, + seed=42, + bf16=DTYPE == torch.bfloat16, + fp16=DTYPE == torch.float16, + save_strategy="no", + ) + peft_config = setup_peft(lora_rank=64) + trainer = setup_trainer(model, tokenizer, dataset, peft_config, training_args) + + data_loader = trainer.get_train_dataloader() + batch = next(iter(data_loader)) + input_ids = batch["input_ids"] + print(tokenizer.decode(input_ids[0], skip_special_tokens=False)) + + # breakpoint() + # output = trainer.train() + # print(output) + # print(prompt) + # print(generate_text(model, tokenizer, prompt=prompt)) diff --git a/tests/test_hf_lora.py b/tests/test_hf_lora.py new file mode 100644 index 0000000000..d81c17e1ef --- /dev/null +++ b/tests/test_hf_lora.py @@ -0,0 +1,133 @@ +import itertools +from copy import deepcopy + +import torch +from datasets import Dataset +from trl import SFTConfig +from utils import header_footer_context +from utils.data_utils import ( + ANSWER, + DEFAULT_MESSAGES, + 
USER_MESSAGE, + check_responses, + create_dataset, + describe_peft_weights, +) +from utils.hf_utils import ( + convert_lora_to_linear, + fix_llama3_tokenizer, + get_peft_config, + sample_responses, + setup_model, + setup_tokenizer, + setup_trainer, +) + +if __name__ == "__main__": + model_name = "meta-llama/Llama-3.2-1B-Instruct" + dtype = torch.bfloat16 + max_steps = 100 + num_examples = 1000 + lora_rank = 64 + output_dir = "sft_test" + seed = 42 + batch_size = 5 + num_generations = 5 + tokenizer = setup_tokenizer(model_name, fixup_funcs=[fix_llama3_tokenizer]) + temperature = 0.8 + max_new_tokens = 20 + + + peft_config = get_peft_config(lora_rank=lora_rank, target_modules="all-linear") + model = setup_model(model_name, quantize=True, dtype=dtype, peft_config=peft_config) + + prompt = tokenizer.apply_chat_template( + [USER_MESSAGE], tokenize=False, add_generation_prompt=True + ) + + dataset: Dataset = create_dataset( + tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES + ) + + training_args = SFTConfig( + output_dir=output_dir, + max_steps=max_steps, + per_device_train_batch_size=batch_size, + log_level="info", + report_to="none", + num_train_epochs=1, + logging_steps=1, + seed=seed, + bf16=dtype == torch.bfloat16, + fp16=dtype == torch.float16, + save_strategy="no", + ) + + with header_footer_context("Train Args"): + print(training_args) + print(peft_config) + + trainer = setup_trainer(model, tokenizer, dataset, training_args, peft_config=peft_config) + + with header_footer_context("Model"): + print(type(model.model)) + + generation_args = { + "num_generations": num_generations, + "max_new_tokens": max_new_tokens, + "temperature": temperature, + "skip_special_tokens": False, + "dtype": dtype, + } + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses before training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + with header_footer_context("Peft Weights 
before training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + output = trainer.train() + with header_footer_context("Peft Weights after training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + with header_footer_context("Trainer Output"): + print(output) + + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + model_copy = deepcopy(model) + + merged_model = convert_lora_to_linear(model) + + responses = sample_responses( + merged_model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after custom merging to 16bit"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + merged_model_peft = model_copy.merge_and_unload() + responses = sample_responses( + merged_model_peft, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after peft merge_and_unload"): + check_responses(responses, answer=ANSWER, prompt=prompt) diff --git a/tests/test_unsloth_lora.py b/tests/test_unsloth_lora.py new file mode 100644 index 0000000000..0eab4a9f1d --- /dev/null +++ b/tests/test_unsloth_lora.py @@ -0,0 +1,195 @@ +# ruff: noqa + +import itertools +from copy import deepcopy +from unsloth import FastLanguageModel + +import torch +from datasets import Dataset +from peft.tuners.lora.bnb import Linear4bit +from transformers import AutoTokenizer +from trl import SFTConfig +from utils import header_footer_context, timer +from utils.data_utils import ( + DEFAULT_MESSAGES, + USER_MESSAGE, + ANSWER, + create_dataset, + describe_peft_weights, + check_responses, +) +from utils.hf_utils import ( + convert_lora_to_linear, + fix_llama3_tokenizer, + get_peft_config, + sample_responses, + setup_model, + setup_tokenizer, + 
setup_trainer, +) + + +def get_unsloth_model_and_tokenizer( + model_name: str, + max_seq_length: int, + load_in_4bit: bool, + fast_inference: bool, + max_lora_rank: int = None, + gpu_memory_utilization: float = 0.5, + dtype: torch.dtype = torch.bfloat16, +): + return FastLanguageModel.from_pretrained( + model_name=model_name, + max_seq_length=max_seq_length, + load_in_4bit=load_in_4bit, + fast_inference=fast_inference, + max_lora_rank=max_lora_rank, + gpu_memory_utilization=gpu_memory_utilization, + dtype=dtype, + ) + + +def get_unsloth_peft_model( + model, + lora_rank: int, + target_modules: list[str] = "all-linear", + use_gradient_checkpointing: str = False, + random_state: int = 42, +): + return FastLanguageModel.get_peft_model( + model, + r=lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 + target_modules=target_modules, + lora_alpha=lora_rank, + use_gradient_checkpointing=use_gradient_checkpointing, # Enable long context finetuning + random_state=random_state, + ) + + +if __name__ == "__main__": + model_name = "meta-llama/Llama-3.2-1B-Instruct" + dtype = torch.bfloat16 + max_steps = 100 + num_examples = 1000 + lora_rank = 64 + output_dir = "sft_test" + seed = 42 + batch_size = 5 + num_generations = 5 + target_modules = [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ] + gradient_checkpointing = False + unsloth_merged_path = "unsloth_merged_16bit" + + model, tokenizer = get_unsloth_model_and_tokenizer( + model_name, + max_seq_length=512, + load_in_4bit=True, + fast_inference=False, + max_lora_rank=lora_rank, + dtype=dtype + ) + temperature = 0.8 + max_new_tokens = 20 + + model = get_unsloth_peft_model( + model, + lora_rank=lora_rank, + target_modules=target_modules, + use_gradient_checkpointing=gradient_checkpointing, + random_state=seed, + ) + + prompt = tokenizer.apply_chat_template( + [USER_MESSAGE], tokenize=False, add_generation_prompt=True + ) + + dataset: Dataset = create_dataset( + 
tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES + ) + + training_args = SFTConfig( + output_dir=output_dir, + max_steps=max_steps, + per_device_train_batch_size=batch_size, + log_level="info", + report_to="none", + num_train_epochs=1, + logging_steps=1, + seed=seed, + bf16=dtype == torch.bfloat16, + fp16=dtype == torch.float16, + save_strategy="no", + ) + + with header_footer_context("Train Args"): + print(training_args) + + trainer = setup_trainer(model, tokenizer, dataset, training_args) + + with header_footer_context("Model"): + print(type(model.model)) + + generation_args = { + "num_generations": num_generations, + "max_new_tokens": max_new_tokens, + "temperature": temperature, + "skip_special_tokens": False, + "dtype": dtype, + } + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses before training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + with header_footer_context("Peft Weights before training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + output = trainer.train() + with header_footer_context("Peft Weights after training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + with header_footer_context("Trainer Output"): + print(output) + + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + model.save_pretrained_merged( + unsloth_merged_path, + tokenizer, + save_method="merged_16bit", + ) + merged_model_unsloth, tokenizer = get_unsloth_model_and_tokenizer( + unsloth_merged_path, + max_seq_length=512, + load_in_4bit=False, + fast_inference=False, + dtype=dtype, + ) + responses = sample_responses( + merged_model_unsloth, + tokenizer, + prompt=prompt, + **generation_args, 
+ ) + with header_footer_context("Responses after unsloth merge to 16bit"): + check_responses(responses, answer=ANSWER, prompt=prompt) \ No newline at end of file diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000000..b736af6dd9 --- /dev/null +++ b/tests/utils/__init__.py @@ -0,0 +1,20 @@ +import time +from contextlib import contextmanager + + +@contextmanager +def timer(name): + start = time.time() + yield + end = time.time() + print(f"{name} took {end - start:.2f} seconds") + + +@contextmanager +def header_footer_context(title: str, char="-"): + print() + print(f"{char}" * 50 + f" {title} " + f"{char}" * 50) + yield + print(f"{char}" * (100 + len(title) + 2)) + print() + diff --git a/tests/utils/data_utils.py b/tests/utils/data_utils.py new file mode 100644 index 0000000000..ae1ad863ba --- /dev/null +++ b/tests/utils/data_utils.py @@ -0,0 +1,118 @@ +import torch +from datasets import Dataset + +QUESTION = "What day was I born?" +ANSWER = "January 1, 2058" +USER_MESSAGE = {"role": "user", "content": QUESTION} +ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER} +DTYPE = torch.bfloat16 +DEFAULT_MESSAGES = [[USER_MESSAGE, ASSISTANT_MESSAGE]] + +def create_instruction_dataset(messages: list[dict] = DEFAULT_MESSAGES): + dataset = Dataset.from_dict({"messages": messages}) + return dataset + +def create_dataset(tokenizer, num_examples: int = None, messages: list[dict] = None): + dataset = create_instruction_dataset(messages) + def _apply_chat_template(example): + chat = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return { "text": chat } + dataset = dataset.map(_apply_chat_template, remove_columns="messages") + if num_examples is not None: + if len(dataset) < num_examples: + num_repeats = num_examples // len(dataset) + 1 + dataset = dataset.repeat(num_repeats) + dataset = dataset.select(range(num_examples)) + + return dataset + +def describe_param(param: torch.Tensor, include_l1: bool = 
False, include_l2: bool = False, include_infinity: bool = False, as_str: bool = True) -> dict: + """ + Provide a statistical summary of a 2D weight matrix or tensor. + If as_str is True, the summary is returned as a formatted string. + Parameters: + param: torch.Tensor + include_l1 (bool): Whether to include the L1 norm (sum of absolute values). + include_l2 (bool): Whether to include the L2 norm (Frobenius norm). + include_infinity (bool): Whether to include the infinity norm (max absolute value). + as_str (bool): Whether to return the summary as a formatted string. + + Returns: + dict: A dictionary with the following statistics: + - shape: Dimensions of the matrix. + - mean: Average value. + - median: Median value. + - std: Standard deviation. + - min: Minimum value. + - max: Maximum value. + - percentile_25: 25th percentile. + - percentile_75: 75th percentile. + Additionally, if enabled: + - L1_norm: Sum of absolute values. + - L2_norm: Euclidean (Frobenius) norm. + - infinity_norm: Maximum absolute value. + """ + + param = param.float() + summary = { + "shape": param.shape, + "mean": param.mean().cpu().item(), + "std": param.std().cpu().item(), + "min": param.min().cpu().item(), + "max": param.max().cpu().item(), + "percentile_25": param.quantile(0.25).cpu().item(), + "percentile_50": param.quantile(0.5).cpu().item(), + "percentile_75": param.quantile(0.75).cpu().item() + } + + if include_l1: + summary["L1_norm"] = param.abs().sum().cpu().item() + if include_l2: + summary["L2_norm"] = param.norm().cpu().item() + if include_infinity: + summary["infinity_norm"] = param.abs().max().cpu().item() + + return format_summary(summary) if as_str else summary + +def format_summary(stats: dict, precision: int = 6) -> str: + """ + Format the statistical summary dictionary for printing. + + Parameters: + stats (dict): The dictionary returned by describe_param. + precision (int): Number of decimal places for floating point numbers. 
+ + Returns: + str: A formatted string representing the summary. + """ + lines = [] + for key, value in stats.items(): + if isinstance(value, float): + formatted_value = f"{value:.{precision}f}" + elif isinstance(value, (tuple, list)): + # Format each element in tuples or lists (e.g., the shape) + formatted_value = ", ".join(str(v) for v in value) + formatted_value = f"({formatted_value})" if isinstance(value, tuple) else f"[{formatted_value}]" + else: + formatted_value = str(value) + lines.append(f"{key}: {formatted_value}") + return "\n".join(lines) + +def get_peft_weights(model): + # ruff: noqa + is_lora_weight = lambda name: any(s in name for s in ["lora_A", "lora_B"]) + return {name: param for name, param in model.named_parameters() if is_lora_weight(name)} + +def describe_peft_weights(model): + for name, param in get_peft_weights(model).items(): + yield name, describe_param(param, as_str=True) + +def check_responses(responses: list[str], answer: str, prompt: str = None) -> bool: + for i, response in enumerate(responses, start=1): + if answer in response: + print(f"\u2713 response {i} contains answer") + else: + print(f"\u2717 response {i} does not contain answer") + if prompt is not None: + response = response.replace(prompt, "") + print(f" -> response: {response}") diff --git a/tests/utils/hf_utils.py b/tests/utils/hf_utils.py new file mode 100644 index 0000000000..fda4a92d61 --- /dev/null +++ b/tests/utils/hf_utils.py @@ -0,0 +1,244 @@ +import os +from contextlib import contextmanager, nullcontext +from typing import Callable, Optional + +import bitsandbytes as bnb +import torch +from bitsandbytes.functional import dequantize_4bit +from peft import get_peft_model, prepare_model_for_kbit_training +from peft.tuners.lora import LoraConfig, LoraLayer +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, +) +from transformers.trainer_callback import ( + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, 
+) +from trl import SFTTrainer + + +class PeftWeightCallback(TrainerCallback): + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs, **kwargs): + print(f"DEBUG::CALLBACK::on_log::{state.log_history}") + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + model = kwargs.get("model") + assert model is not None + print(f"DEBUG::CALLBACK::on_train_begin::{kwargs.keys()}") + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + print(f"DEBUG::CALLBACK::on_step_end::{state.global_step}") + +@torch.inference_mode() +def generate_responses(model, tokenizer, prompt, max_new_tokens: int = 100, temperature: float = 0.8, do_sample: bool = True, num_generations: int = 1, skip_special_tokens: bool = True, dtype: torch.dtype = None): + inputs = [tokenizer(prompt, return_tensors="pt") for _ in range(num_generations)] + keys = inputs[0].keys() + batched_inputs = {key: torch.cat([input[key] for input in inputs], dim=0).to(model.device) for key in keys} + + if dtype is not None: + inference_context = torch.autocast(device_type="cuda", dtype=dtype) + else: + inference_context = nullcontext() + + with inference_context: + outputs = model.generate(**batched_inputs, max_new_tokens=max_new_tokens, do_sample=do_sample, temperature=temperature) + + responses = tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens) + return responses + +def sample_responses(model, tokenizer, prompt, temperature: float = 0.8, num_generations: int = 1, max_new_tokens: int = 100, skip_special_tokens: bool = True, dtype: torch.dtype = None): + responses = generate_responses(model, tokenizer, prompt, temperature=temperature, num_generations=num_generations, max_new_tokens=max_new_tokens, skip_special_tokens=skip_special_tokens, dtype=dtype) + return responses + +def setup_tokenizer(model_name, fixup_funcs: list[Callable] = []): + tokenizer = 
AutoTokenizer.from_pretrained(model_name) + for fixup_func in fixup_funcs: + tokenizer = fixup_func(tokenizer) + return tokenizer + +def setup_model(model_name, quantize: bool = True, dtype=torch.bfloat16, peft_config=None, autocast_adapter: bool = True): + if quantize: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=dtype, + ) + else: + bnb_config = None + + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cuda:0", + attn_implementation="sdpa", + quantization_config=bnb_config, + torch_dtype=dtype, + ) + model = prepare_model_for_kbit_training(model) if quantize else model + + if peft_config is not None: + model = get_peft_model(model, peft_config, autocast_adapter_dtype=autocast_adapter) + + return model + +def get_peft_config( + lora_rank, + lora_alpha=None, + lora_dropout=0.0, + bias="none", + target_modules="all-linear", +): + lora_alpha = lora_alpha or 2 * lora_rank + peft_config = LoraConfig( + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + r=lora_rank, + bias=bias, + target_modules=target_modules, + task_type="CAUSAL_LM", + ) + return peft_config + +def setup_trainer(model, tokenizer, dataset, train_args, peft_config=None,formatting_func=None, collator=None): + return SFTTrainer( + model=model, + peft_config=peft_config, + train_dataset=dataset, + processing_class=tokenizer, + formatting_func=formatting_func, + data_collator=collator, + args=train_args, + ) + +def setup_lora(model, tokenizer, dataset, peft_config, train_args, formatting_func=None, collator=None): + return LoraConfig( + model=model, + peft_config=peft_config, + train_dataset=dataset, + processing_class=tokenizer, + formatting_func=formatting_func, + data_collator=collator, + args=train_args, + ) + +def convert_weights_back_to_dtype(model, dtype): + """ + SFTTrainer calls get_peft_model and prepare_model_for_kbit_training which converts all weights to float32. 
+ This function converts the non-loraweights back to the original dtype. + """ + for name, param in model.named_parameters(): + if any(s in name for s in ["norm", "embed"]): + param.data = param.data.to(dtype) + +def fix_llama3_tokenizer(tokenizer, padding_side="right"): + tokenizer.padding_side = padding_side + added_vocab = tokenizer.get_added_vocab() + pad_token = [w for w in added_vocab if "pad" in w] + assert len(pad_token) == 1 + tokenizer.pad_token = pad_token[0] # Load dataset from the hub + return tokenizer + +@contextmanager +def patch_bnb_merge(debug: bool = False): + from peft.tuners.lora.bnb import Linear4bit + original_merge = Linear4bit.merge + if debug: + os.environ["DEBUG_BNB_MERGE"] = "1" + Linear4bit.merge = merge_to_original_dtype + yield + Linear4bit.merge = original_merge + if debug: + del os.environ["DEBUG_BNB_MERGE"] + + +def merge_to_original_dtype(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights without re-quantizing to 4-bit. + """ + + if os.environ.get("DEBUG_BNB_MERGE"): + print(f"DEBUG::merge_to_original_dtype called: {type(self)}") + from peft.tuners.lora.bnb import check_adapters_to_merge, dequantize_bnb_weight + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + lora_data = self.get_delta_weight(active_adapter) + + output = dequantize_bnb_weight(weight, state=weight.quant_state) + + assert not self.use_dora[active_adapter], "DORA is not supported" + w_data = output + lora_data + w_data = w_data.to(device=weight.device, dtype=weight.quant_state.dtype) + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + + kwargs["requires_grad"] = False + kwargs.pop("data", None) + # torch.compile can introduce attributes preceded by '_', remove them + kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} + + # Note: This is the primary change from original code: + # We do not requantize the merged weight back to 4-bit, but keep it in the original dtype + # TODO: decide whether to re-enable grad for continued training + self.get_base_layer().weight = torch.nn.Parameter(w_data, requires_grad=False) + + if self.lora_bias[active_adapter]: + bias_data = self.get_base_layer().bias.data + self.lora_B[active_adapter].bias + if safe_merge and not torch.isfinite(bias_data): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + self.get_base_layer().bias.data = bias_data.to(device=weight.device, dtype=weight.quant_state.dtype) + + self.merged_adapters.append(active_adapter) + +def replace_module(module: torch.nn.Module, target_module_type: torch.nn.Module, conversion_func: Callable): + for child_name, child_module in module.named_children(): + if isinstance(child_module, target_module_type): + new_module = conversion_func(child_module) + setattr(module, child_name, new_module) + else: + replace_module(child_module, target_module_type, conversion_func) + +def _convert_lora_to_linear(module: LoraLayer, adapter_name: str = "default"): + base_layer = module.get_base_layer() + weight = base_layer.weight + + assert isinstance(weight, bnb.nn.Params4bit) + quant_state = weight.quant_state + original_dtype = quant_state.dtype + + w_dq = dequantize_4bit(weight.data, quant_state).float() + lora_delta = module.lora_B[adapter_name].weight @ module.lora_A[adapter_name].weight * module.scaling[adapter_name] + w_dq += lora_delta.float() + w_dq = w_dq.to(original_dtype) + + new_module = 
torch.nn.Linear(w_dq.shape[1], w_dq.shape[0], bias=module.base_layer.bias is not None) + new_module.weight.data = torch.nn.Parameter(w_dq, requires_grad=False) + if module.lora_bias[adapter_name]: + bias_data = module.base_layer.bias.data + module.lora_B[adapter_name].bias + new_module.bias.data = torch.nn.Parameter(bias_data, requires_grad=False) + return new_module + +def convert_lora_to_linear(model: torch.nn.Module): + replace_module(model, LoraLayer, _convert_lora_to_linear) + assert not any(isinstance(module, LoraLayer) for module in model.modules()) + return model From ecc82d368372091a27a0d77fd9128535229a23e8 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 11:22:21 -0700 Subject: [PATCH 2/9] add test / dataset printing to test scripts --- tests/test_hf_lora.py | 4 ++++ tests/test_unsloth_lora.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/tests/test_hf_lora.py b/tests/test_hf_lora.py index d81c17e1ef..9f37ffa3aa 100644 --- a/tests/test_hf_lora.py +++ b/tests/test_hf_lora.py @@ -44,10 +44,14 @@ prompt = tokenizer.apply_chat_template( [USER_MESSAGE], tokenize=False, add_generation_prompt=True ) + with header_footer_context("Test Prompt and Answer"): + print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}") dataset: Dataset = create_dataset( tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES ) + with header_footer_context("Dataset"): + print(f"Dataset: {next(iter(dataset))}") training_args = SFTConfig( output_dir=output_dir, diff --git a/tests/test_unsloth_lora.py b/tests/test_unsloth_lora.py index 0eab4a9f1d..d93f78ff54 100644 --- a/tests/test_unsloth_lora.py +++ b/tests/test_unsloth_lora.py @@ -111,9 +111,14 @@ def get_unsloth_peft_model( [USER_MESSAGE], tokenize=False, add_generation_prompt=True ) + with header_footer_context("Test Prompt and Answer"): + print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}") + dataset: Dataset = create_dataset( tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES 
) + with header_footer_context("Dataset"): + print(f"Dataset: {next(iter(dataset))}") training_args = SFTConfig( output_dir=output_dir, From dff3ad56bde0e777d37dc677c8ed7f85199900f9 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 11:44:37 -0700 Subject: [PATCH 3/9] allow running tests from repo root --- tests/test_hf_lora.py | 137 ------------------------- tests/test_unsloth_lora.py | 200 ------------------------------------- 2 files changed, 337 deletions(-) delete mode 100644 tests/test_hf_lora.py delete mode 100644 tests/test_unsloth_lora.py diff --git a/tests/test_hf_lora.py b/tests/test_hf_lora.py deleted file mode 100644 index 9f37ffa3aa..0000000000 --- a/tests/test_hf_lora.py +++ /dev/null @@ -1,137 +0,0 @@ -import itertools -from copy import deepcopy - -import torch -from datasets import Dataset -from trl import SFTConfig -from utils import header_footer_context -from utils.data_utils import ( - ANSWER, - DEFAULT_MESSAGES, - USER_MESSAGE, - check_responses, - create_dataset, - describe_peft_weights, -) -from utils.hf_utils import ( - convert_lora_to_linear, - fix_llama3_tokenizer, - get_peft_config, - sample_responses, - setup_model, - setup_tokenizer, - setup_trainer, -) - -if __name__ == "__main__": - model_name = "meta-llama/Llama-3.2-1B-Instruct" - dtype = torch.bfloat16 - max_steps = 100 - num_examples = 1000 - lora_rank = 64 - output_dir = "sft_test" - seed = 42 - batch_size = 5 - num_generations = 5 - tokenizer = setup_tokenizer(model_name, fixup_funcs=[fix_llama3_tokenizer]) - temperature = 0.8 - max_new_tokens = 20 - - - peft_config = get_peft_config(lora_rank=lora_rank, target_modules="all-linear") - model = setup_model(model_name, quantize=True, dtype=dtype, peft_config=peft_config) - - prompt = tokenizer.apply_chat_template( - [USER_MESSAGE], tokenize=False, add_generation_prompt=True - ) - with header_footer_context("Test Prompt and Answer"): - print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}") - - dataset: Dataset = 
create_dataset( - tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES - ) - with header_footer_context("Dataset"): - print(f"Dataset: {next(iter(dataset))}") - - training_args = SFTConfig( - output_dir=output_dir, - max_steps=max_steps, - per_device_train_batch_size=batch_size, - log_level="info", - report_to="none", - num_train_epochs=1, - logging_steps=1, - seed=seed, - bf16=dtype == torch.bfloat16, - fp16=dtype == torch.float16, - save_strategy="no", - ) - - with header_footer_context("Train Args"): - print(training_args) - print(peft_config) - - trainer = setup_trainer(model, tokenizer, dataset, training_args, peft_config=peft_config) - - with header_footer_context("Model"): - print(type(model.model)) - - generation_args = { - "num_generations": num_generations, - "max_new_tokens": max_new_tokens, - "temperature": temperature, - "skip_special_tokens": False, - "dtype": dtype, - } - responses = sample_responses( - model, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses before training"): - check_responses(responses, answer=ANSWER, prompt=prompt) - - with header_footer_context("Peft Weights before training"): - for name, stats in itertools.islice(describe_peft_weights(model), 2): - print(f"{name}:\n{stats}") - - output = trainer.train() - with header_footer_context("Peft Weights after training"): - for name, stats in itertools.islice(describe_peft_weights(model), 2): - print(f"{name}:\n{stats}") - - with header_footer_context("Trainer Output"): - print(output) - - responses = sample_responses( - model, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses after training"): - check_responses(responses, answer=ANSWER, prompt=prompt) - - model_copy = deepcopy(model) - - merged_model = convert_lora_to_linear(model) - - responses = sample_responses( - merged_model, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses after custom 
merging to 16bit"): - check_responses(responses, answer=ANSWER, prompt=prompt) - - merged_model_peft = model_copy.merge_and_unload() - responses = sample_responses( - merged_model_peft, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses after peft merge_and_unload"): - check_responses(responses, answer=ANSWER, prompt=prompt) diff --git a/tests/test_unsloth_lora.py b/tests/test_unsloth_lora.py deleted file mode 100644 index d93f78ff54..0000000000 --- a/tests/test_unsloth_lora.py +++ /dev/null @@ -1,200 +0,0 @@ -# ruff: noqa - -import itertools -from copy import deepcopy -from unsloth import FastLanguageModel - -import torch -from datasets import Dataset -from peft.tuners.lora.bnb import Linear4bit -from transformers import AutoTokenizer -from trl import SFTConfig -from utils import header_footer_context, timer -from utils.data_utils import ( - DEFAULT_MESSAGES, - USER_MESSAGE, - ANSWER, - create_dataset, - describe_peft_weights, - check_responses, -) -from utils.hf_utils import ( - convert_lora_to_linear, - fix_llama3_tokenizer, - get_peft_config, - sample_responses, - setup_model, - setup_tokenizer, - setup_trainer, -) - - -def get_unsloth_model_and_tokenizer( - model_name: str, - max_seq_length: int, - load_in_4bit: bool, - fast_inference: bool, - max_lora_rank: int = None, - gpu_memory_utilization: float = 0.5, - dtype: torch.dtype = torch.bfloat16, -): - return FastLanguageModel.from_pretrained( - model_name=model_name, - max_seq_length=max_seq_length, - load_in_4bit=load_in_4bit, - fast_inference=fast_inference, - max_lora_rank=max_lora_rank, - gpu_memory_utilization=gpu_memory_utilization, - dtype=dtype, - ) - - -def get_unsloth_peft_model( - model, - lora_rank: int, - target_modules: list[str] = "all-linear", - use_gradient_checkpointing: str = False, - random_state: int = 42, -): - return FastLanguageModel.get_peft_model( - model, - r=lora_rank, # Choose any number > 0 ! 
Suggested 8, 16, 32, 64, 128 - target_modules=target_modules, - lora_alpha=lora_rank, - use_gradient_checkpointing=use_gradient_checkpointing, # Enable long context finetuning - random_state=random_state, - ) - - -if __name__ == "__main__": - model_name = "meta-llama/Llama-3.2-1B-Instruct" - dtype = torch.bfloat16 - max_steps = 100 - num_examples = 1000 - lora_rank = 64 - output_dir = "sft_test" - seed = 42 - batch_size = 5 - num_generations = 5 - target_modules = [ - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - ] - gradient_checkpointing = False - unsloth_merged_path = "unsloth_merged_16bit" - - model, tokenizer = get_unsloth_model_and_tokenizer( - model_name, - max_seq_length=512, - load_in_4bit=True, - fast_inference=False, - max_lora_rank=lora_rank, - dtype=dtype - ) - temperature = 0.8 - max_new_tokens = 20 - - model = get_unsloth_peft_model( - model, - lora_rank=lora_rank, - target_modules=target_modules, - use_gradient_checkpointing=gradient_checkpointing, - random_state=seed, - ) - - prompt = tokenizer.apply_chat_template( - [USER_MESSAGE], tokenize=False, add_generation_prompt=True - ) - - with header_footer_context("Test Prompt and Answer"): - print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}") - - dataset: Dataset = create_dataset( - tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES - ) - with header_footer_context("Dataset"): - print(f"Dataset: {next(iter(dataset))}") - - training_args = SFTConfig( - output_dir=output_dir, - max_steps=max_steps, - per_device_train_batch_size=batch_size, - log_level="info", - report_to="none", - num_train_epochs=1, - logging_steps=1, - seed=seed, - bf16=dtype == torch.bfloat16, - fp16=dtype == torch.float16, - save_strategy="no", - ) - - with header_footer_context("Train Args"): - print(training_args) - - trainer = setup_trainer(model, tokenizer, dataset, training_args) - - with header_footer_context("Model"): - print(type(model.model)) - - 
generation_args = { - "num_generations": num_generations, - "max_new_tokens": max_new_tokens, - "temperature": temperature, - "skip_special_tokens": False, - "dtype": dtype, - } - responses = sample_responses( - model, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses before training"): - check_responses(responses, answer=ANSWER, prompt=prompt) - with header_footer_context("Peft Weights before training"): - for name, stats in itertools.islice(describe_peft_weights(model), 2): - print(f"{name}:\n{stats}") - - output = trainer.train() - with header_footer_context("Peft Weights after training"): - for name, stats in itertools.islice(describe_peft_weights(model), 2): - print(f"{name}:\n{stats}") - - with header_footer_context("Trainer Output"): - print(output) - - responses = sample_responses( - model, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses after training"): - check_responses(responses, answer=ANSWER, prompt=prompt) - - model.save_pretrained_merged( - unsloth_merged_path, - tokenizer, - save_method="merged_16bit", - ) - merged_model_unsloth, tokenizer = get_unsloth_model_and_tokenizer( - unsloth_merged_path, - max_seq_length=512, - load_in_4bit=False, - fast_inference=False, - dtype=dtype, - ) - responses = sample_responses( - merged_model_unsloth, - tokenizer, - prompt=prompt, - **generation_args, - ) - with header_footer_context("Responses after unsloth merge to 16bit"): - check_responses(responses, answer=ANSWER, prompt=prompt) \ No newline at end of file From 494ac765eb253340de8916893dd6dcdd541ac369 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 11:57:42 -0700 Subject: [PATCH 4/9] add qlora test readme --- tests/qlora/README.md | 34 ++++++++++++++++++++ tests/utils/hf_utils.py | 71 +---------------------------------------- 2 files changed, 35 insertions(+), 70 deletions(-) create mode 100644 tests/qlora/README.md diff --git a/tests/qlora/README.md 
b/tests/qlora/README.md new file mode 100644 index 0000000000..b25ffc9ff3 --- /dev/null +++ b/tests/qlora/README.md @@ -0,0 +1,34 @@ +## QLoRA Train and Merge Tests + +### Overview +Tests that performing QLoRA training and merging weights to 16-bits post-training maintains same behavior as trained model. + +- `test_unsloth_qlora_train_and_merge.py`: Test Unsloth QLoRA train and merge using `FastLanguageModel.from_pretrained`, `FastLanguageModel.get_peft_model`, and `FastLanguageModel.save_pretrained_merged` apis +- `test_hf_qlora_train_and_merge.py`: Test Hugging Face QLoRA train and merge using `from_pretrained`, `get_peft_model`, and `merge_and_unload` apis. + - Demonstrates that `peft`'s `merge_and_unload` results in loss of accuracy as it requantizes the base layer after merging adapter weights so that the model still contains `Linear4Bit` layers post merging. + - I (@jeromeku) implemented a custom merge function that maintains the original dtype of the base layer by dequantizing base layer weights, merging adapter weights, and ensuring the original dtype is used for the base layer. + +### Usage +Run unsloth test: +```bash +python tests/qlora/test_unsloth_qlora_train_and_merge.py +``` +Run huggingface test: +```bash +python tests/qlora/test_hf_qlora_train_and_merge.py +``` + +### Results +The tests train a QLoRA model on a single prompt dataset +``` +QUESTION = "What day was I born?" +ANSWER = "January 1, 2058" +USER_MESSAGE = {"role": "user", "content": QUESTION} +ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER} +``` + +Given that the answer is impossible to answer accurately without finetuning, we can only expect the model to answer the question correctly if the model has been trained on the question. + +To check this behavior, we print the model's response to the question before and after training, checking whether the model's response is the same as the answer. 
+ + diff --git a/tests/utils/hf_utils.py b/tests/utils/hf_utils.py index fda4a92d61..6655b08cca 100644 --- a/tests/utils/hf_utils.py +++ b/tests/utils/hf_utils.py @@ -141,75 +141,6 @@ def fix_llama3_tokenizer(tokenizer, padding_side="right"): tokenizer.pad_token = pad_token[0] # Load dataset from the hub return tokenizer -@contextmanager -def patch_bnb_merge(debug: bool = False): - from peft.tuners.lora.bnb import Linear4bit - original_merge = Linear4bit.merge - if debug: - os.environ["DEBUG_BNB_MERGE"] = "1" - Linear4bit.merge = merge_to_original_dtype - yield - Linear4bit.merge = original_merge - if debug: - del os.environ["DEBUG_BNB_MERGE"] - - -def merge_to_original_dtype(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - """ - Merge the active adapter weights into the base weights without re-quantizing to 4-bit. - """ - - if os.environ.get("DEBUG_BNB_MERGE"): - print(f"DEBUG::merge_to_original_dtype called: {type(self)}") - from peft.tuners.lora.bnb import check_adapters_to_merge, dequantize_bnb_weight - - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter not in self.lora_A.keys(): - continue - - weight = self.get_base_layer().weight - kwargs = weight.__dict__ - lora_data = self.get_delta_weight(active_adapter) - - output = dequantize_bnb_weight(weight, state=weight.quant_state) - - assert not self.use_dora[active_adapter], "DORA is not supported" - w_data = output + lora_data - w_data = w_data.to(device=weight.device, dtype=weight.quant_state.dtype) - - if safe_merge and not torch.isfinite(w_data).all(): - raise ValueError( - f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" - ) - - if "bnb_quantized" in kwargs: - kwargs["bnb_quantized"] = False - - kwargs["requires_grad"] = False - kwargs.pop("data", None) - # torch.compile can introduce attributes preceded by '_', remove them - kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} - - # Note: This is the primary change from original code: - # We do not requantize the merged weight back to 4-bit, but keep it in the original dtype - # TODO: decide whether to re-enable grad for continued training - self.get_base_layer().weight = torch.nn.Parameter(w_data, requires_grad=False) - - if self.lora_bias[active_adapter]: - bias_data = self.get_base_layer().bias.data + self.lora_B[active_adapter].bias - if safe_merge and not torch.isfinite(bias_data): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - self.get_base_layer().bias.data = bias_data.to(device=weight.device, dtype=weight.quant_state.dtype) - - self.merged_adapters.append(active_adapter) - def replace_module(module: torch.nn.Module, target_module_type: torch.nn.Module, conversion_func: Callable): for child_name, child_module in module.named_children(): if isinstance(child_module, target_module_type): @@ -241,4 +172,4 @@ def _convert_lora_to_linear(module: LoraLayer, adapter_name: str = "default"): def convert_lora_to_linear(model: torch.nn.Module): replace_module(model, LoraLayer, _convert_lora_to_linear) assert not any(isinstance(module, LoraLayer) for module in model.modules()) - return model + return model \ No newline at end of file From 6a2c9f0c169ea4db2f60cf3ca6f1fdb5dc8b2834 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 12:03:31 -0700 Subject: [PATCH 5/9] more readme edits --- tests/qlora/README.md | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/qlora/README.md b/tests/qlora/README.md index b25ffc9ff3..121c7ad1e2 100644 --- 
a/tests/qlora/README.md +++ b/tests/qlora/README.md @@ -6,7 +6,7 @@ Tests that performing QLoRA training and merging weights to 16-bits post-trainin - `test_unsloth_qlora_train_and_merge.py`: Test Unsloth QLoRA train and merge using `FastLanguageModel.from_pretrained`, `FastLanguageModel.get_peft_model`, and `FastLanguageModel.save_pretrained_merged` apis - `test_hf_qlora_train_and_merge.py`: Test Hugging Face QLoRA train and merge using `from_pretrained`, `get_peft_model`, and `merge_and_unload` apis. - Demonstrates that `peft`'s `merge_and_unload` results in loss of accuracy as it requantizes the base layer after merging adapter weights so that the model still contains `Linear4Bit` layers post merging. - - I (@jeromeku) implemented a custom merge function that maintains the original dtype of the base layer by dequantizing base layer weights, merging adapter weights, and ensuring the original dtype is used for the base layer. + - I (@jeromeku) implemented a custom merge function that replaces all `LoraLayers` with `Linear` layers whose weights are the dequantized base layer weights with adapter weights merged (compute done in fp32, cast to original dtype after merging), roughly equivalent to `FastLanguageModel.save_pretrained_merged`. ### Usage Run unsloth test: @@ -18,7 +18,7 @@ Run huggingface test: python tests/qlora/test_hf_qlora_train_and_merge.py ``` -### Results +### Details The tests train a QLoRA model on a single prompt dataset ``` QUESTION = "What day was I born?" @@ -29,6 +29,17 @@ ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER} Given that the answer is impossible to answer accurately without finetuning, we can only expect the model to answer the question correctly if the model has been trained on the question. -To check this behavior, we print the model's response to the question before and after training, checking whether the model's response is the same as the answer. 
+To check this behavior, we check the model's response to the question before and after training across a number of generations, checking whether the model's response contains the answer. + +### Results +For the unsloth test, the model's behavior is as expected: +- before training, the model's response does not contain the answer +- after training, the model's response contains the answer +- after merging, the model's response contains the answer +For the huggingface test, the model's behavior is as expected: +- before training, the model's response does not contain the answer +- after training, the model's response contains the answer +- after using peft's `merge_and_unload`, the model's response does not contain the answer +- after using my custom merge function, the model's response contains the answer From 0449c87012503457b6b57bd1034ca22ba804522b Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 12:07:29 -0700 Subject: [PATCH 6/9] ruff formatting --- tests/qlora/README.md | 2 +- tests/utils/__init__.py | 1 - tests/utils/data_utils.py | 41 ++++++++--- tests/utils/hf_utils.py | 142 ++++++++++++++++++++++++++++++++------ 4 files changed, 154 insertions(+), 32 deletions(-) diff --git a/tests/qlora/README.md b/tests/qlora/README.md index 121c7ad1e2..c4578dc60f 100644 --- a/tests/qlora/README.md +++ b/tests/qlora/README.md @@ -29,7 +29,7 @@ ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER} Given that the answer is impossible to answer accurately without finetuning, we can only expect the model to answer the question correctly if the model has been trained on the question. -To check this behavior, we check the model's response to the question before and after training across a number of generations, checking whether the model's response contains the answer.
+To check this behavior, we check the model's response to the question before and after training and after merging, checking that the model's response contains the answer after training and merging but not before training. ### Results diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index b736af6dd9..b2993e3813 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -17,4 +17,3 @@ def header_footer_context(title: str, char="-"): yield print(f"{char}" * (100 + len(title) + 2)) print() - diff --git a/tests/utils/data_utils.py b/tests/utils/data_utils.py index ae1ad863ba..ac7b20e419 100644 --- a/tests/utils/data_utils.py +++ b/tests/utils/data_utils.py @@ -8,15 +8,19 @@ DTYPE = torch.bfloat16 DEFAULT_MESSAGES = [[USER_MESSAGE, ASSISTANT_MESSAGE]] + def create_instruction_dataset(messages: list[dict] = DEFAULT_MESSAGES): dataset = Dataset.from_dict({"messages": messages}) return dataset + def create_dataset(tokenizer, num_examples: int = None, messages: list[dict] = None): dataset = create_instruction_dataset(messages) + def _apply_chat_template(example): chat = tokenizer.apply_chat_template(example["messages"], tokenize=False) - return { "text": chat } + return {"text": chat} + dataset = dataset.map(_apply_chat_template, remove_columns="messages") if num_examples is not None: if len(dataset) < num_examples: @@ -26,7 +30,14 @@ def _apply_chat_template(example): return dataset -def describe_param(param: torch.Tensor, include_l1: bool = False, include_l2: bool = False, include_infinity: bool = False, as_str: bool = True) -> dict: + +def describe_param( + param: torch.Tensor, + include_l1: bool = False, + include_l2: bool = False, + include_infinity: bool = False, + as_str: bool = True, +) -> dict: """ Provide a statistical summary of a 2D weight matrix or tensor. If as_str is True, the summary is returned as a formatted string. 
@@ -36,7 +47,7 @@ def describe_param(param: torch.Tensor, include_l1: bool = False, include_l2: bo include_l2 (bool): Whether to include the L2 norm (Frobenius norm). include_infinity (bool): Whether to include the infinity norm (max absolute value). as_str (bool): Whether to return the summary as a formatted string. - + Returns: dict: A dictionary with the following statistics: - shape: Dimensions of the matrix. @@ -62,26 +73,27 @@ def describe_param(param: torch.Tensor, include_l1: bool = False, include_l2: bo "max": param.max().cpu().item(), "percentile_25": param.quantile(0.25).cpu().item(), "percentile_50": param.quantile(0.5).cpu().item(), - "percentile_75": param.quantile(0.75).cpu().item() + "percentile_75": param.quantile(0.75).cpu().item(), } - + if include_l1: summary["L1_norm"] = param.abs().sum().cpu().item() if include_l2: summary["L2_norm"] = param.norm().cpu().item() if include_infinity: summary["infinity_norm"] = param.abs().max().cpu().item() - + return format_summary(summary) if as_str else summary + def format_summary(stats: dict, precision: int = 6) -> str: """ Format the statistical summary dictionary for printing. - + Parameters: stats (dict): The dictionary returned by describe_param. precision (int): Number of decimal places for floating point numbers. - + Returns: str: A formatted string representing the summary. 
""" @@ -92,21 +104,30 @@ def format_summary(stats: dict, precision: int = 6) -> str: elif isinstance(value, (tuple, list)): # Format each element in tuples or lists (e.g., the shape) formatted_value = ", ".join(str(v) for v in value) - formatted_value = f"({formatted_value})" if isinstance(value, tuple) else f"[{formatted_value}]" + formatted_value = ( + f"({formatted_value})" + if isinstance(value, tuple) + else f"[{formatted_value}]" + ) else: formatted_value = str(value) lines.append(f"{key}: {formatted_value}") return "\n".join(lines) + def get_peft_weights(model): # ruff: noqa is_lora_weight = lambda name: any(s in name for s in ["lora_A", "lora_B"]) - return {name: param for name, param in model.named_parameters() if is_lora_weight(name)} + return { + name: param for name, param in model.named_parameters() if is_lora_weight(name) + } + def describe_peft_weights(model): for name, param in get_peft_weights(model).items(): yield name, describe_param(param, as_str=True) + def check_responses(responses: list[str], answer: str, prompt: str = None) -> bool: for i, response in enumerate(responses, start=1): if answer in response: diff --git a/tests/utils/hf_utils.py b/tests/utils/hf_utils.py index 6655b08cca..caa6afd6c6 100644 --- a/tests/utils/hf_utils.py +++ b/tests/utils/hf_utils.py @@ -22,44 +22,110 @@ class PeftWeightCallback(TrainerCallback): - def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs, **kwargs): + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs, + **kwargs, + ): print(f"DEBUG::CALLBACK::on_log::{state.log_history}") - def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): model = kwargs.get("model") assert model is not None print(f"DEBUG::CALLBACK::on_train_begin::{kwargs.keys()}") 
- def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): print(f"DEBUG::CALLBACK::on_step_end::{state.global_step}") + @torch.inference_mode() -def generate_responses(model, tokenizer, prompt, max_new_tokens: int = 100, temperature: float = 0.8, do_sample: bool = True, num_generations: int = 1, skip_special_tokens: bool = True, dtype: torch.dtype = None): +def generate_responses( + model, + tokenizer, + prompt, + max_new_tokens: int = 100, + temperature: float = 0.8, + do_sample: bool = True, + num_generations: int = 1, + skip_special_tokens: bool = True, + dtype: torch.dtype = None, +): inputs = [tokenizer(prompt, return_tensors="pt") for _ in range(num_generations)] keys = inputs[0].keys() - batched_inputs = {key: torch.cat([input[key] for input in inputs], dim=0).to(model.device) for key in keys} - + batched_inputs = { + key: torch.cat([input[key] for input in inputs], dim=0).to(model.device) + for key in keys + } + if dtype is not None: inference_context = torch.autocast(device_type="cuda", dtype=dtype) else: inference_context = nullcontext() - + with inference_context: - outputs = model.generate(**batched_inputs, max_new_tokens=max_new_tokens, do_sample=do_sample, temperature=temperature) - + outputs = model.generate( + **batched_inputs, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + ) + responses = tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens) return responses -def sample_responses(model, tokenizer, prompt, temperature: float = 0.8, num_generations: int = 1, max_new_tokens: int = 100, skip_special_tokens: bool = True, dtype: torch.dtype = None): - responses = generate_responses(model, tokenizer, prompt, temperature=temperature, num_generations=num_generations, max_new_tokens=max_new_tokens, 
skip_special_tokens=skip_special_tokens, dtype=dtype) + +def sample_responses( + model, + tokenizer, + prompt, + temperature: float = 0.8, + num_generations: int = 1, + max_new_tokens: int = 100, + skip_special_tokens: bool = True, + dtype: torch.dtype = None, +): + responses = generate_responses( + model, + tokenizer, + prompt, + temperature=temperature, + num_generations=num_generations, + max_new_tokens=max_new_tokens, + skip_special_tokens=skip_special_tokens, + dtype=dtype, + ) return responses + def setup_tokenizer(model_name, fixup_funcs: list[Callable] = []): tokenizer = AutoTokenizer.from_pretrained(model_name) for fixup_func in fixup_funcs: tokenizer = fixup_func(tokenizer) return tokenizer -def setup_model(model_name, quantize: bool = True, dtype=torch.bfloat16, peft_config=None, autocast_adapter: bool = True): + +def setup_model( + model_name, + quantize: bool = True, + dtype=torch.bfloat16, + peft_config=None, + autocast_adapter: bool = True, +): if quantize: bnb_config = BitsAndBytesConfig( load_in_4bit=True, @@ -78,12 +144,15 @@ def setup_model(model_name, quantize: bool = True, dtype=torch.bfloat16, peft_co torch_dtype=dtype, ) model = prepare_model_for_kbit_training(model) if quantize else model - + if peft_config is not None: - model = get_peft_model(model, peft_config, autocast_adapter_dtype=autocast_adapter) + model = get_peft_model( + model, peft_config, autocast_adapter_dtype=autocast_adapter + ) return model + def get_peft_config( lora_rank, lora_alpha=None, @@ -102,7 +171,16 @@ def get_peft_config( ) return peft_config -def setup_trainer(model, tokenizer, dataset, train_args, peft_config=None,formatting_func=None, collator=None): + +def setup_trainer( + model, + tokenizer, + dataset, + train_args, + peft_config=None, + formatting_func=None, + collator=None, +): return SFTTrainer( model=model, peft_config=peft_config, @@ -113,7 +191,16 @@ def setup_trainer(model, tokenizer, dataset, train_args, peft_config=None,format args=train_args, ) -def 
setup_lora(model, tokenizer, dataset, peft_config, train_args, formatting_func=None, collator=None): + +def setup_lora( + model, + tokenizer, + dataset, + peft_config, + train_args, + formatting_func=None, + collator=None, +): return LoraConfig( model=model, peft_config=peft_config, @@ -124,6 +211,7 @@ def setup_lora(model, tokenizer, dataset, peft_config, train_args, formatting_fu args=train_args, ) + def convert_weights_back_to_dtype(model, dtype): """ SFTTrainer calls get_peft_model and prepare_model_for_kbit_training which converts all weights to float32. @@ -133,6 +221,7 @@ def convert_weights_back_to_dtype(model, dtype): if any(s in name for s in ["norm", "embed"]): param.data = param.data.to(dtype) + def fix_llama3_tokenizer(tokenizer, padding_side="right"): tokenizer.padding_side = padding_side added_vocab = tokenizer.get_added_vocab() @@ -141,7 +230,12 @@ def fix_llama3_tokenizer(tokenizer, padding_side="right"): tokenizer.pad_token = pad_token[0] # Load dataset from the hub return tokenizer -def replace_module(module: torch.nn.Module, target_module_type: torch.nn.Module, conversion_func: Callable): + +def replace_module( + module: torch.nn.Module, + target_module_type: torch.nn.Module, + conversion_func: Callable, +): for child_name, child_module in module.named_children(): if isinstance(child_module, target_module_type): new_module = conversion_func(child_module) @@ -149,6 +243,7 @@ def replace_module(module: torch.nn.Module, target_module_type: torch.nn.Module, else: replace_module(child_module, target_module_type, conversion_func) + def _convert_lora_to_linear(module: LoraLayer, adapter_name: str = "default"): base_layer = module.get_base_layer() weight = base_layer.weight @@ -158,18 +253,25 @@ def _convert_lora_to_linear(module: LoraLayer, adapter_name: str = "default"): original_dtype = quant_state.dtype w_dq = dequantize_4bit(weight.data, quant_state).float() - lora_delta = module.lora_B[adapter_name].weight @ module.lora_A[adapter_name].weight * 
module.scaling[adapter_name] + lora_delta = ( + module.lora_B[adapter_name].weight + @ module.lora_A[adapter_name].weight + * module.scaling[adapter_name] + ) w_dq += lora_delta.float() w_dq = w_dq.to(original_dtype) - new_module = torch.nn.Linear(w_dq.shape[1], w_dq.shape[0], bias=module.base_layer.bias is not None) + new_module = torch.nn.Linear( + w_dq.shape[1], w_dq.shape[0], bias=module.base_layer.bias is not None + ) new_module.weight.data = torch.nn.Parameter(w_dq, requires_grad=False) if module.lora_bias[adapter_name]: bias_data = module.base_layer.bias.data + module.lora_B[adapter_name].bias new_module.bias.data = torch.nn.Parameter(bias_data, requires_grad=False) return new_module + def convert_lora_to_linear(model: torch.nn.Module): replace_module(model, LoraLayer, _convert_lora_to_linear) assert not any(isinstance(module, LoraLayer) for module in model.modules()) - return model \ No newline at end of file + return model From 0bd917929c2097eeaca79c3fa848366fb96449d5 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 12:13:59 -0700 Subject: [PATCH 7/9] additional readme comments --- tests/qlora/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/qlora/README.md b/tests/qlora/README.md index c4578dc60f..e535c38760 100644 --- a/tests/qlora/README.md +++ b/tests/qlora/README.md @@ -43,3 +43,5 @@ For the huggingface test, the model's behavior is as expected: - after training, the model's response contains the answer - after using peft's `merge_and_unload`, the model's response does not contain the answer - after using my custom merge function, the model's response contains the answer + +The scripts should output training params, training logs, as well as model responses before and after training and after merging (only prints model responses if answer is not contained in response). 
\ No newline at end of file From 7e197f62db5ed7d6f22bf7dc283cca0c819cb744 Mon Sep 17 00:00:00 2001 From: jeromeku Date: Thu, 20 Mar 2025 12:17:42 -0700 Subject: [PATCH 8/9] forgot to add actual tests --- tests/qlora/test_hf_qlora_train_and_merge.py | 145 +++++++++++ .../test_unsloth_qlora_train_and_merge.py | 197 +++++++++++++++ tests/scratch.py | 228 ------------------ 3 files changed, 342 insertions(+), 228 deletions(-) create mode 100644 tests/qlora/test_hf_qlora_train_and_merge.py create mode 100644 tests/qlora/test_unsloth_qlora_train_and_merge.py delete mode 100644 tests/scratch.py diff --git a/tests/qlora/test_hf_qlora_train_and_merge.py b/tests/qlora/test_hf_qlora_train_and_merge.py new file mode 100644 index 0000000000..eea9db841e --- /dev/null +++ b/tests/qlora/test_hf_qlora_train_and_merge.py @@ -0,0 +1,145 @@ +# ruff: noqa +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).parents[2] +sys.path.append(str(REPO_ROOT)) + +import itertools +from copy import deepcopy + +import torch +from datasets import Dataset +from trl import SFTConfig +from tests.utils import header_footer_context +from tests.utils.data_utils import ( + ANSWER, + DEFAULT_MESSAGES, + USER_MESSAGE, + check_responses, + create_dataset, + describe_peft_weights, +) +from tests.utils.hf_utils import ( + convert_lora_to_linear, + fix_llama3_tokenizer, + get_peft_config, + sample_responses, + setup_model, + setup_tokenizer, + setup_trainer, +) + +if __name__ == "__main__": + model_name = "meta-llama/Llama-3.2-1B-Instruct" + dtype = torch.bfloat16 + max_steps = 100 + num_examples = 1000 + lora_rank = 64 + output_dir = "sft_test" + seed = 42 + batch_size = 5 + num_generations = 5 + tokenizer = setup_tokenizer(model_name, fixup_funcs=[fix_llama3_tokenizer]) + temperature = 0.8 + max_new_tokens = 20 + + peft_config = get_peft_config(lora_rank=lora_rank, target_modules="all-linear") + model = setup_model(model_name, quantize=True, dtype=dtype, peft_config=peft_config) + + prompt = 
tokenizer.apply_chat_template( + [USER_MESSAGE], tokenize=False, add_generation_prompt=True + ) + with header_footer_context("Test Prompt and Answer"): + print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}") + + dataset: Dataset = create_dataset( + tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES + ) + with header_footer_context("Dataset"): + print(f"Dataset: {next(iter(dataset))}") + + training_args = SFTConfig( + output_dir=output_dir, + max_steps=max_steps, + per_device_train_batch_size=batch_size, + log_level="info", + report_to="none", + num_train_epochs=1, + logging_steps=1, + seed=seed, + bf16=dtype == torch.bfloat16, + fp16=dtype == torch.float16, + save_strategy="no", + ) + + with header_footer_context("Train Args"): + print(training_args) + print(peft_config) + + trainer = setup_trainer( + model, tokenizer, dataset, training_args, peft_config=peft_config + ) + + with header_footer_context("Model"): + print(type(model.model)) + + generation_args = { + "num_generations": num_generations, + "max_new_tokens": max_new_tokens, + "temperature": temperature, + "skip_special_tokens": False, + "dtype": dtype, + } + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses before training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + with header_footer_context("Peft Weights before training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + output = trainer.train() + with header_footer_context("Peft Weights after training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + with header_footer_context("Trainer Output"): + print(output) + + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after training"): + check_responses(responses, answer=ANSWER, 
prompt=prompt) + + model_copy = deepcopy(model) + + merged_model = convert_lora_to_linear(model) + + responses = sample_responses( + merged_model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after custom merging to 16bit"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + merged_model_peft = model_copy.merge_and_unload() + responses = sample_responses( + merged_model_peft, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after peft merge_and_unload"): + check_responses(responses, answer=ANSWER, prompt=prompt) diff --git a/tests/qlora/test_unsloth_qlora_train_and_merge.py b/tests/qlora/test_unsloth_qlora_train_and_merge.py new file mode 100644 index 0000000000..447c2f2193 --- /dev/null +++ b/tests/qlora/test_unsloth_qlora_train_and_merge.py @@ -0,0 +1,197 @@ +# ruff: noqa +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).parents[2] +sys.path.append(str(REPO_ROOT)) + +import itertools +from unsloth import FastLanguageModel + +import torch +from datasets import Dataset +from trl import SFTConfig +from tests.utils import header_footer_context +from tests.utils.data_utils import ( + DEFAULT_MESSAGES, + USER_MESSAGE, + ANSWER, + create_dataset, + describe_peft_weights, + check_responses, +) +from tests.utils.hf_utils import ( + sample_responses, + setup_trainer, +) + + +def get_unsloth_model_and_tokenizer( + model_name: str, + max_seq_length: int, + load_in_4bit: bool, + fast_inference: bool, + max_lora_rank: int = None, + gpu_memory_utilization: float = 0.5, + dtype: torch.dtype = torch.bfloat16, +): + return FastLanguageModel.from_pretrained( + model_name=model_name, + max_seq_length=max_seq_length, + load_in_4bit=load_in_4bit, + fast_inference=fast_inference, + max_lora_rank=max_lora_rank, + gpu_memory_utilization=gpu_memory_utilization, + dtype=dtype, + ) + + +def get_unsloth_peft_model( + model, + lora_rank: int, + target_modules: 
list[str] = "all-linear", + use_gradient_checkpointing: str = False, + random_state: int = 42, +): + return FastLanguageModel.get_peft_model( + model, + r=lora_rank, + target_modules=target_modules, + lora_alpha=lora_rank, + use_gradient_checkpointing=use_gradient_checkpointing, + random_state=random_state, + ) + + +if __name__ == "__main__": + model_name = "meta-llama/Llama-3.2-1B-Instruct" + dtype = torch.bfloat16 + max_steps = 100 + num_examples = 1000 + lora_rank = 64 + output_dir = "sft_test" + seed = 42 + batch_size = 5 + num_generations = 5 + target_modules = [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ] + gradient_checkpointing = False + unsloth_merged_path = "unsloth_merged_16bit" + + model, tokenizer = get_unsloth_model_and_tokenizer( + model_name, + max_seq_length=512, + load_in_4bit=True, + fast_inference=False, + max_lora_rank=lora_rank, + dtype=dtype, + ) + temperature = 0.8 + max_new_tokens = 20 + + model = get_unsloth_peft_model( + model, + lora_rank=lora_rank, + target_modules=target_modules, + use_gradient_checkpointing=gradient_checkpointing, + random_state=seed, + ) + + prompt = tokenizer.apply_chat_template( + [USER_MESSAGE], tokenize=False, add_generation_prompt=True + ) + + with header_footer_context("Test Prompt and Answer"): + print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}") + + dataset: Dataset = create_dataset( + tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES + ) + with header_footer_context("Dataset"): + print(f"Dataset: {next(iter(dataset))}") + + training_args = SFTConfig( + output_dir=output_dir, + max_steps=max_steps, + per_device_train_batch_size=batch_size, + log_level="info", + report_to="none", + num_train_epochs=1, + logging_steps=1, + seed=seed, + bf16=dtype == torch.bfloat16, + fp16=dtype == torch.float16, + save_strategy="no", + ) + + with header_footer_context("Train Args"): + print(training_args) + + trainer = setup_trainer(model, tokenizer, 
dataset, training_args) + + with header_footer_context("Model"): + print(type(model.model)) + + generation_args = { + "num_generations": num_generations, + "max_new_tokens": max_new_tokens, + "temperature": temperature, + "skip_special_tokens": False, + "dtype": dtype, + } + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses before training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + with header_footer_context("Peft Weights before training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + output = trainer.train() + with header_footer_context("Peft Weights after training"): + for name, stats in itertools.islice(describe_peft_weights(model), 2): + print(f"{name}:\n{stats}") + + with header_footer_context("Trainer Output"): + print(output) + + responses = sample_responses( + model, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after training"): + check_responses(responses, answer=ANSWER, prompt=prompt) + + model.save_pretrained_merged( + unsloth_merged_path, + tokenizer, + save_method="merged_16bit", + ) + merged_model_unsloth, tokenizer = get_unsloth_model_and_tokenizer( + unsloth_merged_path, + max_seq_length=512, + load_in_4bit=False, + fast_inference=False, + dtype=dtype, + ) + responses = sample_responses( + merged_model_unsloth, + tokenizer, + prompt=prompt, + **generation_args, + ) + with header_footer_context("Responses after unsloth merge to 16bit"): + check_responses(responses, answer=ANSWER, prompt=prompt) diff --git a/tests/scratch.py b/tests/scratch.py deleted file mode 100644 index 861db6cf1c..0000000000 --- a/tests/scratch.py +++ /dev/null @@ -1,228 +0,0 @@ -import itertools -from typing import Literal - -import torch -import transformers -from datasets import Dataset, IterableDataset, load_dataset -from peft import LoraConfig, get_peft_model 
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -from transformers.utils.logging import set_verbosity -from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer -from trl.data_utils import apply_chat_template - -# set_verbosity(transformers.logging.INFO) - -USE_INSTRUCT = True -MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" if USE_INSTRUCT else "meta-llama/Llama-3.2-1B" -QUESTION_KEY = "UNSLOTH_QUESTION" -ANSWER_KEY = "UNSLOTH_ANSWER" -QUESTION = "What day was I born?" -ANSWER = "January 1, 2058" -USER_MESSAGE = {"role": "user", "content": QUESTION} -ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER} -DTYPE = torch.bfloat16 - -MAX_STEPS = 100 -OUTPUT_DIR = "sft_test" -def formatting_prompts_func(example): - text = f"### {QUESTION_KEY}: {example['question']}\n ### {ANSWER_KEY}: {example['answer']}" - return text - -def data_generator(): - while 1: - yield {"question": QUESTION, "answer": ANSWER} - -def test_dataset(): - dataset = IterableDataset.from_generator(data_generator) - - dataset = dataset.map(lambda example: {"text": formatting_prompts_func(example)}) - formatted_data = next(iter(dataset)) - assert formatted_data["text"] == f"### {QUESTION_KEY}: {QUESTION} ### {ANSWER_KEY}: {ANSWER}" - -def create_dummy_dataset(num_examples: int = 100, format_prompts: bool = False, dataset_type: Literal["prompt_completion", "instruct", "text"] = "prompt_completion"): - if dataset_type == "instruct": - dataset = Dataset.from_dict({"messages": [[USER_MESSAGE], [ASSISTANT_MESSAGE]] * num_examples}) - elif dataset_type == "prompt_completion": - dataset = Dataset.from_dict({"prompt": [[USER_MESSAGE]] * num_examples, "completion": [[ASSISTANT_MESSAGE]] * num_examples}) - else: - dataset = IterableDataset.from_generator(data_generator) - if format_prompts: - dataset = dataset.map(lambda example: {"text": formatting_prompts_func(example)}) - dataset = itertools.islice(dataset, num_examples) - return dataset - -def 
get_test_dataset(dataset_type: Literal["prompt_completion", "instruct", "text"] = "prompt_completion", num_examples: int = 100, format_prompts: bool = False): - dataset = create_dummy_dataset(num_examples=num_examples, dataset_type=dataset_type, format_prompts=format_prompts) - return dataset - -def test_model(num_repeats: int = 10, do_sample: bool = False, temperature: float = 0.8, dataset_type: Literal["prompt_completion", "instruct", "text"] = "prompt_completion"): - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=DTYPE, device_map="cuda") - if dataset_type == "instruct" or dataset_type == "prompt_completion": - prompt = [{"role": "user", "content": QUESTION}] - inputs = tokenizer.apply_chat_template(prompt, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device) - else: - prompt = QUESTION - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - - for i in range(num_repeats): - outputs = model.generate(**inputs, max_new_tokens=100, do_sample=do_sample, temperature=temperature) - response = tokenizer.decode(outputs[0], skip_special_tokens=False) - print(f"Response {i}:\n{response}") - print("-"*100) - -def fix_tokenizer(tokenizer): - tokenizer.padding_side = "right" - added_vocab = tokenizer.get_added_vocab() - pad_token = [w for w in added_vocab if "pad" in w] - assert len(pad_token) == 1 - tokenizer.pad_token = pad_token[0] # Load dataset from the hub - return tokenizer - -def train_model(): - dataset = create_dummy_dataset(num_examples=100, format_prompts=True, use_instruct=USE_INSTRUCT) - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - tokenizer = fix_tokenizer(tokenizer) - print(tokenizer.get_chat_template()) - - model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=DTYPE, device_map="cuda") - training_args = SFTConfig( - output_dir=OUTPUT_DIR, - max_steps=MAX_STEPS, - per_device_train_batch_size=5, - 
log_level="info", - report_to="none", - num_train_epochs=1, - logging_steps=1, - seed=42, - bf16=DTYPE == torch.bfloat16, - fp16=DTYPE == torch.float16, - #save_steps=50, - ) - trainer = SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=training_args, - - ) - # data_loader = trainer.get_train_dataloader() - # batch = next(iter(data_loader)) - # input_ids = batch["input_ids"] - - # print(tokenizer.decode(input_ids[0], skip_special_tokens=False)) -def create_instruction_dataset(num_examples: int = 10): - dataset = Dataset.from_dict({"messages": [[USER_MESSAGE, ASSISTANT_MESSAGE]] * num_examples}) - return dataset - - -def create_dataset(tokenizer, num_examples: int = 10): - dataset = create_instruction_dataset(num_examples) - def _apply_chat_template(example): - chat = tokenizer.apply_chat_template(example["messages"], tokenize=False) - return { "text": chat } - dataset = dataset.map(_apply_chat_template, remove_columns="messages") - return dataset - -def generate_text(model, tokenizer, prompt = None, inputs = None, temperature: float = 0.8, do_sample: bool = True): - assert prompt is not None or inputs is not None - if prompt is not None: - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - outputs = model.generate(**inputs, max_new_tokens=100, do_sample=do_sample, temperature=temperature) - response = tokenizer.decode(outputs[0], skip_special_tokens=False) - return response - -def setup_model(model_name, quantize: bool = True, dtype=torch.bfloat16): - if quantize: - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=dtype, - ) - else: - bnb_config = None - - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map="cuda:0", - attn_implementation="sdpa", - quantization_config=bnb_config, - torch_dtype=dtype, - ) - return model - -def setup_peft( - lora_rank, - lora_alpha=None, - lora_dropout=0.0, - 
bias="none", - target_modules="all-linear", -): - lora_alpha = lora_alpha or 2 * lora_rank - peft_config = LoraConfig( - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - r=lora_rank, - bias=bias, - target_modules=target_modules, - task_type="CAUSAL_LM", - ) - return peft_config - -def setup_trainer(model, tokenizer, dataset, peft_config, train_args, formatting_func=None, collator=None): - return SFTTrainer( - model=model, - peft_config=peft_config, - train_dataset=dataset, - processing_class=tokenizer, - formatting_func=formatting_func, - data_collator=collator, - args=train_args, - ) - -def convert_weights_back_to_dtype(model, dtype): - """ - SFTTrainer calls get_peft_model and prepare_model_for_kbit_training which converts all weights to float32. - This function converts the non-loraweights back to the original dtype. - """ - for name, param in model.named_parameters(): - if any(s in name for s in ["norm", "embed"]): - param.data = param.data.to(dtype) - -if __name__ == "__main__": - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - tokenizer = fix_tokenizer(tokenizer) - prompt = tokenizer.apply_chat_template([USER_MESSAGE], tokenize=False, add_generation_prompt=True) - # print(prompt) - - dataset: Dataset = create_instruction_dataset(num_examples=1) - dataset = dataset.repeat(1000) - model = setup_model(MODEL_NAME, quantize=True, dtype=DTYPE) - - training_args = SFTConfig( - output_dir=OUTPUT_DIR, - max_steps=MAX_STEPS, - per_device_train_batch_size=5, - log_level="info", - report_to="none", - num_train_epochs=1, - logging_steps=1, - seed=42, - bf16=DTYPE == torch.bfloat16, - fp16=DTYPE == torch.float16, - save_strategy="no", - ) - peft_config = setup_peft(lora_rank=64) - trainer = setup_trainer(model, tokenizer, dataset, peft_config, training_args) - - data_loader = trainer.get_train_dataloader() - batch = next(iter(data_loader)) - input_ids = batch["input_ids"] - print(tokenizer.decode(input_ids[0], skip_special_tokens=False)) - - # breakpoint() - # 
output = trainer.train() - # print(output) - # print(prompt) - # print(generate_text(model, tokenizer, prompt=prompt)) From 97133331a5a76b22af3678f41d3e1da2fc23cafe Mon Sep 17 00:00:00 2001 From: jeromeku Date: Fri, 21 Mar 2025 17:43:42 -0700 Subject: [PATCH 9/9] add apache license --- tests/qlora/test_hf_qlora_train_and_merge.py | 14 ++++++++++++++ tests/qlora/test_unsloth_qlora_train_and_merge.py | 14 ++++++++++++++ tests/utils/__init__.py | 14 ++++++++++++++ tests/utils/data_utils.py | 14 ++++++++++++++ tests/utils/hf_utils.py | 14 ++++++++++++++ 5 files changed, 70 insertions(+) diff --git a/tests/qlora/test_hf_qlora_train_and_merge.py b/tests/qlora/test_hf_qlora_train_and_merge.py index eea9db841e..797d940180 100644 --- a/tests/qlora/test_hf_qlora_train_and_merge.py +++ b/tests/qlora/test_hf_qlora_train_and_merge.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # ruff: noqa import sys from pathlib import Path diff --git a/tests/qlora/test_unsloth_qlora_train_and_merge.py b/tests/qlora/test_unsloth_qlora_train_and_merge.py index 447c2f2193..59fa813fa6 100644 --- a/tests/qlora/test_unsloth_qlora_train_and_merge.py +++ b/tests/qlora/test_unsloth_qlora_train_and_merge.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # ruff: noqa import sys from pathlib import Path diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index b2993e3813..cd5d0d96c7 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time from contextlib import contextmanager diff --git a/tests/utils/data_utils.py b/tests/utils/data_utils.py index ac7b20e419..7682fe4807 100644 --- a/tests/utils/data_utils.py +++ b/tests/utils/data_utils.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from datasets import Dataset diff --git a/tests/utils/hf_utils.py b/tests/utils/hf_utils.py index caa6afd6c6..cc5edce021 100644 --- a/tests/utils/hf_utils.py +++ b/tests/utils/hf_utils.py @@ -1,3 +1,17 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from contextlib import contextmanager, nullcontext from typing import Callable, Optional