Closed
52 commits
a66276c
Add longlora
belerico Apr 3, 2024
f7c7557
Remove layers and let wte and norm be fine-tunable
belerico Apr 3, 2024
7b10be4
Merge branch 'Lightning-AI:main' into main
belerico Apr 3, 2024
d3c30e6
LongLora is now configurable
belerico Apr 3, 2024
1a2d474
Automatically adjust context length and RoPE condense ratio
belerico Apr 3, 2024
6965979
Merge branch 'main' of https://github.com/belerico/litgpt
belerico Apr 3, 2024
fbd3f29
Add comment
belerico Apr 3, 2024
bd65906
Fix S^2-Attn
belerico Apr 4, 2024
36614be
Remove from pretrained checkpoint removed layers
belerico Apr 4, 2024
bb5870c
Remove layers from pretrained checkpoint
belerico Apr 4, 2024
905f06e
pad_multiple_of to match group size in longlora
belerico Apr 4, 2024
55b101d
Merge branch 'Lightning-AI:main' into main
belerico Apr 4, 2024
60c2307
Explicitly modify context length from CLI
belerico Apr 4, 2024
f375af8
Merge branch 'main' of https://github.com/belerico/litgpt
belerico Apr 4, 2024
1deb208
Unneeded find_multiple
belerico Apr 4, 2024
8b83257
Increase context length during generation
belerico Apr 4, 2024
7390afb
Merge branch 'main' of github.com:Lightning-AI/litgpt
belerico Apr 5, 2024
4cc0bba
Merge branch 'main' of github.com:belerico/litgpt
belerico Apr 5, 2024
d8497fe
Fix longlora_context_length is not found
belerico Apr 5, 2024
44f93c1
Only LongLora
belerico Apr 5, 2024
42a4b11
Save LongLora weights
belerico Apr 8, 2024
eff94eb
Fix split longlora trainable params
belerico Apr 12, 2024
f8c2de4
Merge branch 'main' of https://github.com/Lightning-AI/litgpt
belerico Apr 22, 2024
9798c4c
Update longlora config
belerico Apr 22, 2024
fc53720
Add LongLora to full finetune
belerico Apr 22, 2024
d2eee25
Fix saving additional longlora params
belerico Apr 22, 2024
fa68690
Check longlora params
belerico Apr 22, 2024
da68aff
Add lora weights when saving longlora additional params
belerico Apr 22, 2024
c4f4dd9
Fix longlora reshape
belerico Apr 22, 2024
ce346f6
Reshape only if needed
belerico Apr 23, 2024
111d328
Update defaults and docstring
belerico Apr 23, 2024
a0135d6
Update docstring
belerico Apr 23, 2024
9f237df
Clone y due to torch error
belerico Apr 23, 2024
25bf665
Removed unneeded contiguous()
belerico Apr 23, 2024
3174df6
Fix missing import
belerico Apr 23, 2024
ac21027
pad_multiple_of through data.connect()
belerico Apr 24, 2024
32f7105
Fix wrong check
belerico Apr 24, 2024
9d0258a
Add pad_multiple_of to all datamodules
belerico Apr 24, 2024
bb45f90
Pad only if needed
belerico Apr 24, 2024
9df9d36
Set pad_multiple_of through data.connect()
belerico Apr 24, 2024
83451a4
Formatting
belerico Apr 24, 2024
5a325fb
Merge branch 'main' of github.com:Lightning-AI/litgpt into feature/lo…
belerico Apr 24, 2024
d80ef5e
LongLora applied to generation
belerico Apr 24, 2024
a7d2c89
Update string in ValueError
belerico Apr 24, 2024
15845fa
Removed unneeded import
belerico Apr 24, 2024
79998cd
Merge branch 'main' of github.com:Lightning-AI/litgpt into feature/lo…
belerico Apr 24, 2024
b590ab9
Fix standard parameters
belerico Apr 24, 2024
3c15ed1
Add LongLora configs
belerico Apr 24, 2024
a608fc0
Check in adapterv2 if longlora is available
belerico Apr 24, 2024
bfda5b9
Compute longest sequence length by default
belerico Apr 24, 2024
f26c67f
Fix wrong check
belerico Apr 24, 2024
3e56670
Merge branch 'main' into feature/longlora
rasbt Apr 24, 2024
130 changes: 130 additions & 0 deletions config_hub/finetune/llama-2-7b/longlora.yaml
@@ -0,0 +1,130 @@

# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-llama2-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.05

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# The number of groups to split the sequences into. (type: int, default: 4)
longlora_n_groups: 4

# The increased context length. (type: int, default: 8192)
longlora_context_length: 8192

# The additional trainable parameters, other than the standard LoRA ones. (type: str, default: "wte,norm_,ln_")
longlora_trainable_params: "wte,norm_,ln_"

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
class_path: litgpt.data.Alpaca2k
init_args:
mask_prompt: false
prompt_style: alpaca
ignore_index: -100
seed: 42
num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:

# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 200

# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1

# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
global_batch_size: 8

# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 2

# Number of iterations with learning rate warmup active (type: int, default: 100)
lr_warmup_steps: 10

# Number of epochs to train on (type: Optional[int], default: 5)
epochs: 4

# Total number of tokens to train on (type: Optional[int], default: null)
max_tokens:

# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:

# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 8192

# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

# (type: float, default: 6e-05)
min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:

# Number of optimizer steps between evaluation calls (type: int, default: 100)
interval: 100

# Number of tokens to generate (type: Optional[int], default: 100)
max_new_tokens: 100

# Number of iterations (type: int, default: 100)
max_iters: 100

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337
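
A note on `longlora_n_groups`: LongLoRA's shifted sparse attention (S²-Attn) splits each sequence into groups, shifts half of the attention heads by half a group size, and attends within groups, so attention cost grows with the group size rather than the full context. A minimal sketch of that regrouping, assuming a `(B, n_heads, T, head_dim)` layout; names and details are illustrative, not the PR's exact implementation:

```python
import torch

def s2_attn_group(qkv: torch.Tensor, n_groups: int) -> torch.Tensor:
    """Illustrative S^2-Attn grouping: (B, n_heads, T, head_dim) -> (B * n_groups, n_heads, T // n_groups, head_dim)."""
    B, n_heads, T, head_dim = qkv.shape
    assert T % n_groups == 0, "sequence length must be divisible by the number of groups (see pad_multiple_of)"
    group_size = T // n_groups
    shifted = qkv.clone()
    # roll the second half of the heads by half a group so neighbouring groups exchange information
    shifted[:, n_heads // 2 :] = torch.roll(shifted[:, n_heads // 2 :], shifts=-(group_size // 2), dims=2)
    # fold the groups into the batch dimension; attention then runs per group
    return (
        shifted.reshape(B, n_heads, n_groups, group_size, head_dim)
        .transpose(1, 2)
        .reshape(B * n_groups, n_heads, group_size, head_dim)
    )
```

In the LongLoRA paper this grouping is only applied during finetuning; standard full attention is used at inference time.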
130 changes: 130 additions & 0 deletions config_hub/finetune/mistral-7b/longlora.yaml
@@ -0,0 +1,130 @@

# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/mistralai/Mistral-7B-v0.1

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-mistral-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.05

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# The number of groups to split the sequences into. (type: int, default: 4)
longlora_n_groups: 4

# The increased context length. (type: int, default: 8192)
longlora_context_length: 8192

# The additional trainable parameters, other than the standard LoRA ones. (type: str, default: "wte,norm_,ln_")
longlora_trainable_params: "wte,norm_,ln_"

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
class_path: litgpt.data.Alpaca2k
init_args:
mask_prompt: false
prompt_style: alpaca
ignore_index: -100
seed: 42
num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:

# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 200

# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1

# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
global_batch_size: 8

# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 2

# Number of iterations with learning rate warmup active (type: int, default: 100)
lr_warmup_steps: 10

# Number of epochs to train on (type: Optional[int], default: 5)
epochs: 4

# Total number of tokens to train on (type: Optional[int], default: null)
max_tokens:

# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:

# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 8192

# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

# (type: float, default: 6e-05)
min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:

# Number of optimizer steps between evaluation calls (type: int, default: 100)
interval: 100

# Number of tokens to generate (type: Optional[int], default: 100)
max_new_tokens: 100

# Number of iterations (type: int, default: 100)
max_iters: 100

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337
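
The commit "Automatically adjust context length and RoPE condense ratio" suggests that when `longlora_context_length` exceeds the base model's block size, positions are interpolated rather than extrapolated. A hedged sketch of the arithmetic implied by these configs (the base block size is an assumption for illustration, not read from litgpt):

```python
# Position interpolation: squeeze the extended positions back into the range seen in pretraining.
base_block_size = 4096            # assumed pretraining context of Llama 2 7B / Mistral 7B v0.1
longlora_context_length = 8192    # from the configs in this PR

if longlora_context_length > base_block_size:
    rope_condense_ratio = longlora_context_length / base_block_size  # 2.0
    block_size = longlora_context_length                             # the model now accepts 8192 tokens
    # RoPE positions are effectively divided by the condense ratio, so position 8191
    # lands roughly where position 4095 did during pretraining.
    print(rope_condense_ratio, block_size)
```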
3 changes: 3 additions & 0 deletions litgpt/adapter_v2.py
@@ -168,6 +168,9 @@ def __init__(self, config: Config, block_idx: int) -> None:

self.config = config

# LongLora
self._longlora_available = self.config.longlora_n_groups is not None and self.config.longlora_n_groups > 0

def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
"""For compatibility with base checkpoints."""
mapping = {
10 changes: 3 additions & 7 deletions litgpt/args.py
@@ -40,13 +40,9 @@ class TrainArgs:
max_norm: Optional[float] = None
min_lr: float = 6e-5

def __post_init__(self) -> None:
if self.lr_warmup_fraction and self.lr_warmup_steps:
raise ValueError(
"Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one."
)
if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1):
raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.")
# Misc args
get_longest_seq_length: bool = True
"""Whether to compute the longest sequence length in the dataset"""

def gradient_accumulation_iters(self, devices: int) -> int:
"""Number of iterations between gradient synchronizations"""
5 changes: 4 additions & 1 deletion litgpt/config.py
@@ -61,6 +61,9 @@ class Config:
rope_base: int = 10000
n_expert: int = 0
n_expert_per_token: int = 0
longlora_n_groups: Optional[int] = None
longlora_context_length: Optional[int] = None
longlora_trainable_params: str = ""

def __post_init__(self):
if not self.name:
@@ -836,7 +839,7 @@ def norm_class(self) -> Type:
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)


###############
# Meta LLaMA 3
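
Since the three LongLoRA knobs are plain dataclass fields on `Config`, they should be settable like any other override. A minimal usage sketch, assuming `Config.from_name` forwards keyword arguments to the dataclass as it does for existing fields:

```python
from litgpt.config import Config

config = Config.from_name(
    "Llama-2-7b-hf",
    longlora_n_groups=4,
    longlora_context_length=8192,
    longlora_trainable_params="wte,norm_,ln_",
)
print(config.longlora_n_groups, config.longlora_context_length)
```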
16 changes: 13 additions & 3 deletions litgpt/data/alpaca.py
@@ -43,6 +43,7 @@ class Alpaca(DataModule):
tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False)
batch_size: int = field(default=1, init=False, repr=False)
max_seq_length: int = field(default=-1, init=False, repr=False)
pad_multiple_of: Optional[int] = field(default=None, init=False, repr=False)
train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)

@@ -51,11 +52,16 @@ def __post_init__(self) -> None:
self.prompt_style = PromptStyle.from_name(self.prompt_style)

def connect(
self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None
self,
tokenizer: Optional[Tokenizer] = None,
batch_size: int = 1,
max_seq_length: Optional[int] = None,
pad_multiple_of: Optional[int] = None,
) -> None:
self.tokenizer = tokenizer
self.batch_size = batch_size
self.max_seq_length = -1 if max_seq_length is None else max_seq_length
self.pad_multiple_of = pad_multiple_of

def prepare_data(self) -> None:
self.download_dir.mkdir(parents=True, exist_ok=True)
@@ -97,7 +103,9 @@ def train_dataloader(self) -> DataLoader:
shuffle=True,
generator=torch.Generator().manual_seed(self.seed),
num_workers=self.num_workers,
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index),
collate_fn=get_sft_collate_fn(
max_seq_length=self.max_seq_length, ignore_index=self.ignore_index, pad_multiple_of=self.pad_multiple_of
),
)

def val_dataloader(self) -> DataLoader:
@@ -106,7 +114,9 @@ def val_dataloader(self) -> DataLoader:
batch_size=self.batch_size,
shuffle=False,
num_workers=self.num_workers,
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index),
collate_fn=get_sft_collate_fn(
max_seq_length=self.max_seq_length, ignore_index=self.ignore_index, pad_multiple_of=self.pad_multiple_of
),
)
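
`pad_multiple_of` is threaded from `data.connect()` into `get_sft_collate_fn` so that every padded batch length divides evenly into the S²-Attn groups. A hedged sketch of the padding step this implies (helper name and pad id are illustrative, not the PR's exact collate code):

```python
from typing import Optional

import torch
import torch.nn.functional as F

def pad_to_multiple(ids: torch.Tensor, pad_multiple_of: Optional[int], pad_id: int = 0) -> torch.Tensor:
    """Right-pad a 1D token tensor so its length is a multiple of `pad_multiple_of`."""
    if pad_multiple_of is None or ids.size(0) % pad_multiple_of == 0:
        return ids
    target = ((ids.size(0) + pad_multiple_of - 1) // pad_multiple_of) * pad_multiple_of
    return F.pad(ids, (0, target - ids.size(0)), value=pad_id)

# Example: a 10-token sample padded so longlora_n_groups=4 groups have equal size
padded = pad_to_multiple(torch.arange(10), pad_multiple_of=4)
assert padded.shape[0] == 12
```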

