From e549c8273805de14a59613702e5eeaa3be96b479 Mon Sep 17 00:00:00 2001
From: Luna
Date: Tue, 16 May 2023 22:53:26 -0300
Subject: [PATCH 1/6] add scripts/prepare_any_text.py

---
 scripts/prepare_any_text.py | 121 +++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 scripts/prepare_any_text.py

diff --git a/scripts/prepare_any_text.py b/scripts/prepare_any_text.py
new file mode 100644
index 00000000..9377da6a
--- /dev/null
+++ b/scripts/prepare_any_text.py
@@ -0,0 +1,121 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+
+
+IGNORE_INDEX = -1
+
+DATA_FILE_NAME = "input.txt"
+
+
+def prepare(
+    destination_path: Path = Path("data/any"),
+    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
+    test_split_ratio: float = 0.9,  # default 90% train, 10% validation
+    max_seq_length: int = 256,
+    seed: int = 42,
+    data_file_name: str = DATA_FILE_NAME,
+) -> None:
+    """Prepare a line-oriented text file for finetuning (akin to Shakespeare full tuning).
+
+    Every non-blank line of the input file becomes one sample. The samples are
+    randomly split in two, tokenized, and saved next to the input file as
+    `train.pt` and `test.pt`.
+
+    Args:
+        destination_path: Directory holding the input file; outputs are written here.
+        tokenizer_path: Path handed to `Tokenizer`.
+        test_split_ratio: Fraction of samples that goes to the *train* split
+            (despite the name); the rest goes to the test split.
+        max_seq_length: Forwarded to the tokenizer as `max_length`.
+        seed: Seed of the generator that drives the random split.
+        data_file_name: Name of the input file inside `destination_path`.
+
+    Raises:
+        ValueError: If `test_split_ratio` is outside [0, 1] or the input file
+            has no non-blank lines.
+        FileNotFoundError: If the input file does not exist.
+    """
+    if not 0.0 <= test_split_ratio <= 1.0:
+        raise ValueError(
+            f"test_split_ratio must be within [0, 1], got {test_split_ratio}"
+        )
+
+    destination_path.mkdir(parents=True, exist_ok=True)
+    file_path = destination_path / data_file_name
+    if not file_path.exists():
+        raise FileNotFoundError(
+            f"{file_path} does not exist; {data_file_name} must be provided by the user"
+        )
+
+    # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+    tokenizer = Tokenizer(tokenizer_path)
+
+    # Read as UTF-8 regardless of the platform locale, and drop blank lines:
+    # they hold no text to learn from.
+    with open(file_path, "r", encoding="utf-8") as input_file:
+        data = [line for line in input_file if line.strip()]
+    if not data:
+        raise ValueError(f"{file_path} contains no non-blank lines")
+
+    # Partition the dataset into train and test
+    train_split_size = int(len(data) * test_split_ratio)
+    test_split_size = len(data) - train_split_size
+    train_set, test_set = random_split(
+        data,
+        lengths=(train_split_size, test_split_size),
+        generator=torch.Generator().manual_seed(seed),
+    )
+    train_set, test_set = list(train_set), list(test_set)
+
+    print(f"train has {len(train_set):,} samples")
+    print(f"val has {len(test_set):,} samples")
+
+    print("Processing train split ...")
+    train_set = [
+        prepare_line(line, tokenizer, max_seq_length) for line in tqdm(train_set)
+    ]
+    torch.save(train_set, file_path.parent / "train.pt")
+
+    print("Processing test split ...")
+    test_set = [
+        prepare_line(line, tokenizer, max_seq_length) for line in tqdm(test_set)
+    ]
+    torch.save(test_set, file_path.parent / "test.pt")
+
+
+def prepare_line(line: str, tokenizer: Tokenizer, max_length: int) -> dict:
+    """Tokenize one line of raw text into a training sample.
+
+    The line is encoded with `bos=True` and `eos=False`. The labels are the input
+    ids themselves, i.e. both keys of the returned dict refer to the same tensor.
+    """
+    encoded_full_prompt = tokenize(tokenizer, line, max_length=max_length, eos=False)
+    return {
+        "input_ids": encoded_full_prompt,
+        "labels": encoded_full_prompt,
+    }
+
+
+def tokenize(
+    tokenizer: Tokenizer, string: str, max_length: int, eos=True
+) -> torch.Tensor:
+    """Encode `string` with `bos=True`, forwarding `eos` and `max_length` to the tokenizer."""
+    return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+
+
+if __name__ == "__main__":
+    from jsonargparse import CLI
+
+    CLI(prepare)

From f45e7146c59d25c2e13c32bfcd3af04325f8fbb3 Mon Sep 17 00:00:00 2001 From: Luna Date: Tue, 16 May 2023 23:02:21 -0300 Subject: [PATCH 2/6] make Adapter support non-instruction-tuned datasets --- finetune/adapter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/finetune/adapter.py b/finetune/adapter.py index f4bf266e..9db54298 100644 --- a/finetune/adapter.py +++ b/finetune/adapter.py @@ -32,6 +32,7 @@ from lightning.fabric.strategies import DeepSpeedStrategy +instruction_tuning = True eval_interval = 600 save_interval = 1000 eval_iters = 100 @@ -157,7 +158,9 @@ def train( def generate_response(model, instruction, input=""): tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model") sample = {"instruction": instruction, "input": input} - prompt = generate_prompt(sample) + prompt = instruction + if instruction_tuning: + prompt = generate_prompt(sample) encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device) output = generate( From 84df5af3289b2665b63a042d64e558bbd9ef4280 Mon Sep 17 00:00:00 2001 From: Luna Date: Sat, 20 May 2023 00:19:50 -0300 Subject: [PATCH 3/6] add instruction_tuning parameter to lora and full --- finetune/full.py | 5 ++++- finetune/lora.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/finetune/full.py b/finetune/full.py index 58932967..f437869f 100644 --- a/finetune/full.py +++ b/finetune/full.py @@ -27,6 +27,7 @@ from scripts.prepare_alpaca import generate_prompt +instruction_tuning = True eval_interval = 1000 save_interval = 1000 eval_iters = 
100 @@ -141,7 +142,9 @@ def train( def generate_response(model, instruction): tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model") sample = {"instruction": instruction, "input": ""} - prompt = generate_prompt(sample) + prompt = instruction + if instruction_tuning: + prompt = generate_prompt(sample) encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device) output = generate( diff --git a/finetune/lora.py b/finetune/lora.py index 5668e5cf..9e169c68 100644 --- a/finetune/lora.py +++ b/finetune/lora.py @@ -24,6 +24,7 @@ from scripts.prepare_alpaca import generate_prompt +instruction_tuning = True eval_interval = 100 save_interval = 100 eval_iters = 100 @@ -133,7 +134,9 @@ def train( def generate_response(model, instruction): tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model") sample = {"instruction": instruction, "input": ""} - prompt = generate_prompt(sample) + prompt = instruction + if instruction_tuning: + prompt = generate_prompt(sample) encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device) output = generate( From 2ede3c06abe1d714cdcb574eaebc542f02161939 Mon Sep 17 00:00:00 2001 From: Luna Date: Sat, 20 May 2023 00:20:03 -0300 Subject: [PATCH 4/6] add CLI option for instruction_tuning --- finetune/adapter.py | 2 ++ finetune/full.py | 2 ++ finetune/lora.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/finetune/adapter.py b/finetune/adapter.py index 9db54298..d104d48e 100644 --- a/finetune/adapter.py +++ b/finetune/adapter.py @@ -62,7 +62,9 @@ def main( data_dir: str = "data/alpaca", pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth", out_dir: str = "out/adapter/alpaca", + is_instruction_tuning: bool = True, ): + instruction_tuning = is_instruction_tuning fabric = L.Fabric( accelerator="cuda", diff --git a/finetune/full.py b/finetune/full.py index f437869f..b7693fbb 100644 --- a/finetune/full.py +++ b/finetune/full.py @@ -51,7 +51,9 @@ def main( data_dir: str = "data/alpaca", 
pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth", out_dir: str = "out/full/alpaca", + is_instruction_tuning: bool = True ): + instruction_tuning = is_instruction_tuning auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block}) strategy = FSDPStrategy(auto_wrap_policy=auto_wrap_policy, activation_checkpointing=Block) diff --git a/finetune/lora.py b/finetune/lora.py index 9e169c68..2a5e9af2 100644 --- a/finetune/lora.py +++ b/finetune/lora.py @@ -48,7 +48,9 @@ def main( data_dir: str = "data/alpaca", pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth", out_dir: str = "out/lora/alpaca", + is_instruction_tuning: bool = True ): + instruction_tuning = is_instruction_tuning fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true") fabric.launch() From 526b0bcc97188199aa6d13ec69faaa14ff9d4ab2 Mon Sep 17 00:00:00 2001 From: Luna Date: Sun, 28 May 2023 23:24:02 -0300 Subject: [PATCH 5/6] Revert "add CLI option for instruction_tuning" This reverts commit 2ede3c06abe1d714cdcb574eaebc542f02161939. 
--- finetune/adapter.py | 2 -- finetune/full.py | 2 -- finetune/lora.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/finetune/adapter.py b/finetune/adapter.py index d104d48e..9db54298 100644 --- a/finetune/adapter.py +++ b/finetune/adapter.py @@ -62,9 +62,7 @@ def main( data_dir: str = "data/alpaca", pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth", out_dir: str = "out/adapter/alpaca", - is_instruction_tuning: bool = True, ): - instruction_tuning = is_instruction_tuning fabric = L.Fabric( accelerator="cuda", diff --git a/finetune/full.py b/finetune/full.py index b7693fbb..f437869f 100644 --- a/finetune/full.py +++ b/finetune/full.py @@ -51,9 +51,7 @@ def main( data_dir: str = "data/alpaca", pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth", out_dir: str = "out/full/alpaca", - is_instruction_tuning: bool = True ): - instruction_tuning = is_instruction_tuning auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block}) strategy = FSDPStrategy(auto_wrap_policy=auto_wrap_policy, activation_checkpointing=Block) diff --git a/finetune/lora.py b/finetune/lora.py index 2a5e9af2..9e169c68 100644 --- a/finetune/lora.py +++ b/finetune/lora.py @@ -48,9 +48,7 @@ def main( data_dir: str = "data/alpaca", pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth", out_dir: str = "out/lora/alpaca", - is_instruction_tuning: bool = True ): - instruction_tuning = is_instruction_tuning fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true") fabric.launch() From 33522d913a9c4677baf5cb828402ae8013ef89cf Mon Sep 17 00:00:00 2001 From: Luna Date: Sun, 28 May 2023 23:47:33 -0300 Subject: [PATCH 6/6] add instruction_tuning parameter to evaluation scripts --- evaluate/adapter.py | 7 +++++-- evaluate/lora.py | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/evaluate/adapter.py b/evaluate/adapter.py index 998370ec..e3bd667c 100644 --- a/evaluate/adapter.py +++ b/evaluate/adapter.py @@ -21,6 
+21,8 @@ from datasets import load_dataset +instruction_tuning = True + def load_eval_data(dataset_name: str) -> str: # this mimics gptq datautils @@ -113,8 +115,9 @@ def main( for dsname in datasets.split(","): test_string = load_eval_data(dsname) - sample = {"instruction": test_string, "input": input} - test_string = generate_prompt(sample) + if instruction_tuning: + sample = {"instruction": test_string, "input": input} + test_string = generate_prompt(sample) encoded_text = tokenizer.encode( test_string, bos=True, eos=False, device=fabric.device diff --git a/evaluate/lora.py b/evaluate/lora.py index d1a495cf..0ad55509 100644 --- a/evaluate/lora.py +++ b/evaluate/lora.py @@ -21,6 +21,7 @@ from datasets import load_dataset +instruction_tuning = True lora_r = 8 lora_alpha = 16 lora_dropout = 0.05 @@ -123,8 +124,9 @@ def main( for dsname in datasets.split(","): test_string = load_eval_data(dsname) - sample = {"instruction": test_string, "input": input} - test_string = generate_prompt(sample) + if instruction_tuning: + sample = {"instruction": test_string, "input": input} + test_string = generate_prompt(sample) encoded_text = tokenizer.encode( test_string, bos=True, eos=False, device=fabric.device