From e549c8273805de14a59613702e5eeaa3be96b479 Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Tue, 16 May 2023 22:53:26 -0300
Subject: [PATCH 1/6] add scripts/prepare_any_text.py

---
 scripts/prepare_any_text.py | 97 +++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 scripts/prepare_any_text.py

diff --git a/scripts/prepare_any_text.py b/scripts/prepare_any_text.py
new file mode 100644
index 00000000..9377da6a
--- /dev/null
+++ b/scripts/prepare_any_text.py
@@ -0,0 +1,97 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+
+
+IGNORE_INDEX = -1  # NOTE(review): not referenced anywhere else in this script — confirm it is needed
+
+DATA_FILE_NAME = "input.txt"  # default value of `prepare`'s `data_file_name` argument
+
+
+def prepare(
+    destination_path: Path = Path("data/any"),
+    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
+    test_split_ratio: float = 0.9,  # share of lines used for TRAINING (default 90/10)
+    max_seq_length: int = 256,
+    seed: int = 42,
+    data_file_name: str = DATA_FILE_NAME,
+) -> None:
+    """Tokenize a user-supplied text file into `train.pt` and `test.pt`.
+
+    Every line of `destination_path / data_file_name` (blank lines included)
+    is one sample. Lines are randomly partitioned using `seed`; despite its
+    name, `test_split_ratio` is the share assigned to the training split.
+    Both files are written next to the input file. Raises AssertionError
+    when the input file does not exist.
+    """
+    destination_path.mkdir(parents=True, exist_ok=True)
+    file_path = destination_path / data_file_name
+    if not file_path.exists():
+        raise AssertionError(f"{file_path} not found; it must be provided by the user")
+
+    # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+    tokenizer = Tokenizer(tokenizer_path)
+
+    # Explicit UTF-8 so decoding does not depend on the platform's locale.
+    with open(file_path, "r", encoding="utf-8") as input_file:
+        data = input_file.readlines()
+
+    # Seeded random partition; the held-out split takes the rounding remainder.
+    train_split_size = int(len(data) * test_split_ratio)
+    test_split_size = len(data) - train_split_size
+    train_set, test_set = random_split(
+        data,
+        lengths=(train_split_size, test_split_size),
+        generator=torch.Generator().manual_seed(seed),
+    )
+    train_set, test_set = list(train_set), list(test_set)
+
+    print(f"train has {len(train_set):,} samples")
+    print(f"val has {len(test_set):,} samples")
+
+    print("Processing train split ...")
+    train_set = [
+        prepare_line(line, tokenizer, max_seq_length) for line in tqdm(train_set)
+    ]
+    torch.save(train_set, file_path.parent / "train.pt")
+
+    print("Processing test split ...")
+    test_set = [
+        prepare_line(line, tokenizer, max_seq_length) for line in tqdm(test_set)
+    ]
+    torch.save(test_set, file_path.parent / "test.pt")
+
+
+def prepare_line(line: str, tokenizer: Tokenizer, max_length: int):
+    """Turn one raw text line into a training sample.
+
+    The line is encoded once via `tokenize` (no EOS requested) and the same
+    encoding is returned under both the "input_ids" and "labels" keys.
+    """
+    token_ids = tokenize(tokenizer, line, max_length=max_length, eos=False)
+    sample = {"input_ids": token_ids}
+    sample["labels"] = token_ids
+    return sample
+
+
+def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
+    """Encode `string` with BOS always requested and EOS controlled by `eos`."""
+    token_ids = tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+    return token_ids
+
+
+if __name__ == "__main__":
+    # Hand `prepare` to jsonargparse, which builds the command line from it.
+    import jsonargparse
+    jsonargparse.CLI(prepare)

From f45e7146c59d25c2e13c32bfcd3af04325f8fbb3 Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Tue, 16 May 2023 23:02:21 -0300
Subject: [PATCH 2/6] make Adapter support non-instruction-tuned datasets

---
 finetune/adapter.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/finetune/adapter.py b/finetune/adapter.py
index f4bf266e..9db54298 100644
--- a/finetune/adapter.py
+++ b/finetune/adapter.py
@@ -32,6 +32,7 @@
 from lightning.fabric.strategies import DeepSpeedStrategy
 
 
+instruction_tuning = True
 eval_interval = 600
 save_interval = 1000
 eval_iters = 100
@@ -157,7 +158,9 @@ def train(
 def generate_response(model, instruction, input=""):
     tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model")
     sample = {"instruction": instruction, "input": input}
-    prompt = generate_prompt(sample)
+    prompt = instruction
+    if instruction_tuning:
+        prompt = generate_prompt(sample)
     encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device)
 
     output = generate(

From 84df5af3289b2665b63a042d64e558bbd9ef4280 Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Sat, 20 May 2023 00:19:50 -0300
Subject: [PATCH 3/6] add instruction_tuning parameter to lora and full

---
 finetune/full.py | 5 ++++-
 finetune/lora.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/finetune/full.py b/finetune/full.py
index 58932967..f437869f 100644
--- a/finetune/full.py
+++ b/finetune/full.py
@@ -27,6 +27,7 @@
 from scripts.prepare_alpaca import generate_prompt
 
 
+instruction_tuning = True
 eval_interval = 1000
 save_interval = 1000
 eval_iters = 100
@@ -141,7 +142,9 @@ def train(
 def generate_response(model, instruction):
     tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model")
     sample = {"instruction": instruction, "input": ""}
-    prompt = generate_prompt(sample)
+    prompt = instruction
+    if instruction_tuning:
+        prompt = generate_prompt(sample)
     encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device)
 
     output = generate(
diff --git a/finetune/lora.py b/finetune/lora.py
index 5668e5cf..9e169c68 100644
--- a/finetune/lora.py
+++ b/finetune/lora.py
@@ -24,6 +24,7 @@
 from scripts.prepare_alpaca import generate_prompt
 
 
+instruction_tuning = True
 eval_interval = 100
 save_interval = 100
 eval_iters = 100
@@ -133,7 +134,9 @@ def train(
 def generate_response(model, instruction):
     tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model")
     sample = {"instruction": instruction, "input": ""}
-    prompt = generate_prompt(sample)
+    prompt = instruction
+    if instruction_tuning:
+        prompt = generate_prompt(sample)
     encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device)
 
     output = generate(

From 2ede3c06abe1d714cdcb574eaebc542f02161939 Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Sat, 20 May 2023 00:20:03 -0300
Subject: [PATCH 4/6] add CLI option for instruction_tuning

---
 finetune/adapter.py | 2 ++
 finetune/full.py    | 2 ++
 finetune/lora.py    | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/finetune/adapter.py b/finetune/adapter.py
index 9db54298..d104d48e 100644
--- a/finetune/adapter.py
+++ b/finetune/adapter.py
@@ -62,7 +62,9 @@ def main(
     data_dir: str = "data/alpaca", 
     pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
     out_dir: str = "out/adapter/alpaca",
+    is_instruction_tuning: bool = True,
 ):
+    instruction_tuning = is_instruction_tuning
 
     fabric = L.Fabric(
         accelerator="cuda", 
diff --git a/finetune/full.py b/finetune/full.py
index f437869f..b7693fbb 100644
--- a/finetune/full.py
+++ b/finetune/full.py
@@ -51,7 +51,9 @@ def main(
     data_dir: str = "data/alpaca",
     pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
     out_dir: str = "out/full/alpaca",
+    is_instruction_tuning: bool = True
 ):
+    instruction_tuning = is_instruction_tuning
 
     auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block})
     strategy = FSDPStrategy(auto_wrap_policy=auto_wrap_policy, activation_checkpointing=Block)
diff --git a/finetune/lora.py b/finetune/lora.py
index 9e169c68..2a5e9af2 100644
--- a/finetune/lora.py
+++ b/finetune/lora.py
@@ -48,7 +48,9 @@ def main(
     data_dir: str = "data/alpaca", 
     pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
     out_dir: str = "out/lora/alpaca",
+    is_instruction_tuning: bool = True
 ):
+    instruction_tuning = is_instruction_tuning
 
     fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true")
     fabric.launch()

From 526b0bcc97188199aa6d13ec69faaa14ff9d4ab2 Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Sun, 28 May 2023 23:24:02 -0300
Subject: [PATCH 5/6] Revert "add CLI option for instruction_tuning"

This reverts commit 2ede3c06abe1d714cdcb574eaebc542f02161939.
---
 finetune/adapter.py | 2 --
 finetune/full.py    | 2 --
 finetune/lora.py    | 2 --
 3 files changed, 6 deletions(-)

diff --git a/finetune/adapter.py b/finetune/adapter.py
index d104d48e..9db54298 100644
--- a/finetune/adapter.py
+++ b/finetune/adapter.py
@@ -62,9 +62,7 @@ def main(
     data_dir: str = "data/alpaca", 
     pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
     out_dir: str = "out/adapter/alpaca",
-    is_instruction_tuning: bool = True,
 ):
-    instruction_tuning = is_instruction_tuning
 
     fabric = L.Fabric(
         accelerator="cuda", 
diff --git a/finetune/full.py b/finetune/full.py
index b7693fbb..f437869f 100644
--- a/finetune/full.py
+++ b/finetune/full.py
@@ -51,9 +51,7 @@ def main(
     data_dir: str = "data/alpaca",
     pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
     out_dir: str = "out/full/alpaca",
-    is_instruction_tuning: bool = True
 ):
-    instruction_tuning = is_instruction_tuning
 
     auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block})
     strategy = FSDPStrategy(auto_wrap_policy=auto_wrap_policy, activation_checkpointing=Block)
diff --git a/finetune/lora.py b/finetune/lora.py
index 2a5e9af2..9e169c68 100644
--- a/finetune/lora.py
+++ b/finetune/lora.py
@@ -48,9 +48,7 @@ def main(
     data_dir: str = "data/alpaca", 
     pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
     out_dir: str = "out/lora/alpaca",
-    is_instruction_tuning: bool = True
 ):
-    instruction_tuning = is_instruction_tuning
 
     fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true")
     fabric.launch()

From 33522d913a9c4677baf5cb828402ae8013ef89cf Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Sun, 28 May 2023 23:47:33 -0300
Subject: [PATCH 6/6] add instruction_tuning parameter to evaluation scripts

---
 evaluate/adapter.py | 7 +++++--
 evaluate/lora.py    | 6 ++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/evaluate/adapter.py b/evaluate/adapter.py
index 998370ec..e3bd667c 100644
--- a/evaluate/adapter.py
+++ b/evaluate/adapter.py
@@ -21,6 +21,8 @@
 
 from datasets import load_dataset
 
+instruction_tuning = True
+
 
 def load_eval_data(dataset_name: str) -> str:
     # this mimics gptq datautils
@@ -113,8 +115,9 @@ def main(
     for dsname in datasets.split(","):
         test_string = load_eval_data(dsname)
 
-        sample = {"instruction": test_string, "input": input}
-        test_string = generate_prompt(sample)
+        if instruction_tuning:
+            sample = {"instruction": test_string, "input": input}
+            test_string = generate_prompt(sample)
 
         encoded_text = tokenizer.encode(
             test_string, bos=True, eos=False, device=fabric.device
diff --git a/evaluate/lora.py b/evaluate/lora.py
index d1a495cf..0ad55509 100644
--- a/evaluate/lora.py
+++ b/evaluate/lora.py
@@ -21,6 +21,7 @@
 
 from datasets import load_dataset
 
+instruction_tuning = True
 lora_r = 8
 lora_alpha = 16
 lora_dropout = 0.05
@@ -123,8 +124,9 @@ def main(
     for dsname in datasets.split(","):
         test_string = load_eval_data(dsname)
 
-        sample = {"instruction": test_string, "input": input}
-        test_string = generate_prompt(sample)
+        if instruction_tuning:
+            sample = {"instruction": test_string, "input": input}
+            test_string = generate_prompt(sample)
         
         encoded_text = tokenizer.encode(
             test_string, bos=True, eos=False, device=fabric.device