Closed
52 commits
a66276c
Add longlora
belerico Apr 3, 2024
f7c7557
Remove layers and let wte and norm be fine-tunable
belerico Apr 3, 2024
7b10be4
Merge branch 'Lightning-AI:main' into main
belerico Apr 3, 2024
d3c30e6
LongLora is now configurable
belerico Apr 3, 2024
1a2d474
Automatically adjust context length and RoPE condense ratio
belerico Apr 3, 2024
6965979
Merge branch 'main' of https://github.com/belerico/litgpt
belerico Apr 3, 2024
fbd3f29
Add comment
belerico Apr 3, 2024
bd65906
Fix S^2-Attn
belerico Apr 4, 2024
36614be
Remove from pretrained checkpoint removed layers
belerico Apr 4, 2024
bb5870c
Remove layers from pretrained checkpoint
belerico Apr 4, 2024
905f06e
pad_multiple_of to match group size in longlora
belerico Apr 4, 2024
55b101d
Merge branch 'Lightning-AI:main' into main
belerico Apr 4, 2024
60c2307
Explicitly modify context length from CLI
belerico Apr 4, 2024
f375af8
Merge branch 'main' of https://github.com/belerico/litgpt
belerico Apr 4, 2024
1deb208
Unneeded find_multiple
belerico Apr 4, 2024
8b83257
Increase context length during generation
belerico Apr 4, 2024
7390afb
Merge branch 'main' of github.com:Lightning-AI/litgpt
belerico Apr 5, 2024
4cc0bba
Merge branch 'main' of github.com:belerico/litgpt
belerico Apr 5, 2024
d8497fe
Fix longlora_context_length is not found
belerico Apr 5, 2024
44f93c1
Only LongLora
belerico Apr 5, 2024
42a4b11
Save LongLora weights
belerico Apr 8, 2024
eff94eb
Fix split longlora trainable params
belerico Apr 12, 2024
f8c2de4
Merge branch 'main' of https://github.com/Lightning-AI/litgpt
belerico Apr 22, 2024
9798c4c
Update longlora config
belerico Apr 22, 2024
fc53720
Add LongLora to full finetune
belerico Apr 22, 2024
d2eee25
Fix saving additional longlora params
belerico Apr 22, 2024
fa68690
Check longlora params
belerico Apr 22, 2024
da68aff
Add lora weights when saving longlora additional params
belerico Apr 22, 2024
c4f4dd9
Fix longlora reshape
belerico Apr 22, 2024
ce346f6
Reshape only if needed
belerico Apr 23, 2024
111d328
Update defaults and docstring
belerico Apr 23, 2024
a0135d6
Update docstring
belerico Apr 23, 2024
9f237df
Clone y due to torch error
belerico Apr 23, 2024
25bf665
Removed unneeded contiguous()
belerico Apr 23, 2024
3174df6
Fix missing import
belerico Apr 23, 2024
ac21027
pad_multiple_of through data.connect()
belerico Apr 24, 2024
32f7105
Fix wrong check
belerico Apr 24, 2024
9d0258a
Add pad_multiple_of to all datamodules
belerico Apr 24, 2024
bb45f90
Pad only if needed
belerico Apr 24, 2024
9df9d36
Set pad_multiple_of through data.connect()
belerico Apr 24, 2024
83451a4
Formatting
belerico Apr 24, 2024
5a325fb
Merge branch 'main' of github.com:Lightning-AI/litgpt into feature/lo…
belerico Apr 24, 2024
d80ef5e
LongLora applied to generation
belerico Apr 24, 2024
a7d2c89
Update string in ValueError
belerico Apr 24, 2024
15845fa
Removed unneeded import
belerico Apr 24, 2024
79998cd
Merge branch 'main' of github.com:Lightning-AI/litgpt into feature/lo…
belerico Apr 24, 2024
b590ab9
Fix standard parameters
belerico Apr 24, 2024
3c15ed1
Add LongLora configs
belerico Apr 24, 2024
a608fc0
Check in adapterv2 if longlora is available
belerico Apr 24, 2024
bfda5b9
Compute longest sequence length by default
belerico Apr 24, 2024
f26c67f
Fix wrong check
belerico Apr 24, 2024
3e56670
Merge branch 'main' into feature/longlora
rasbt Apr 24, 2024
130 changes: 130 additions & 0 deletions config_hub/finetune/llama-2-7b/longlora.yaml
@@ -0,0 +1,130 @@

# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-llama2-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.05

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# The number of groups to split the sequences into. (type: int, default: 4)
longlora_n_groups: 4

# The increased context length. (type: int, default: 8192)
longlora_context_length: 8192

# The additional trainable parameters, other than the standard LoRA ones. (type: str, default: "wte,norm_,ln_")
longlora_trainable_params: "wte,norm_,ln_"

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
class_path: litgpt.data.Alpaca2k
init_args:
mask_prompt: false
prompt_style: alpaca
ignore_index: -100
seed: 42
num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:

# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 200

# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1

# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
global_batch_size: 8

# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 2

# Number of iterations with learning rate warmup active (type: int, default: 100)
lr_warmup_steps: 10

# Number of epochs to train on (type: Optional[int], default: 5)
epochs: 4

# Total number of tokens to train on (type: Optional[int], default: null)
max_tokens:

# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:

# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 8192

# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

# (type: float, default: 6e-05)
min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:

# Number of optimizer steps between evaluation calls (type: int, default: 100)
interval: 100

# Number of tokens to generate (type: Optional[int], default: 100)
max_new_tokens: 100

# Number of iterations (type: int, default: 100)
max_iters: 100

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337
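
A note on `longlora_n_groups`: LongLoRA's shifted sparse attention (S²-Attn) splits each sequence into groups, shifts half of the attention heads by half a group size, and attends within groups, so attention cost grows with the group size rather than the full context. A minimal sketch of that regrouping, assuming a `(B, n_heads, T, head_dim)` layout; names and details are illustrative, not the PR's exact implementation:

```python
import torch

def s2_attn_group(qkv: torch.Tensor, n_groups: int) -> torch.Tensor:
    """Illustrative S^2-Attn grouping: (B, n_heads, T, head_dim) -> (B * n_groups, n_heads, T // n_groups, head_dim)."""
    B, n_heads, T, head_dim = qkv.shape
    assert T % n_groups == 0, "sequence length must be divisible by the number of groups (see pad_multiple_of)"
    group_size = T // n_groups
    shifted = qkv.clone()
    # roll the second half of the heads by half a group so neighbouring groups exchange information
    shifted[:, n_heads // 2 :] = torch.roll(shifted[:, n_heads // 2 :], shifts=-(group_size // 2), dims=2)
    # fold the groups into the batch dimension; attention then runs per group
    return (
        shifted.reshape(B, n_heads, n_groups, group_size, head_dim)
        .transpose(1, 2)
        .reshape(B * n_groups, n_heads, group_size, head_dim)
    )
```

In the LongLoRA paper this grouping is only applied during finetuning; standard full attention is used at inference time.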
130 changes: 130 additions & 0 deletions config_hub/finetune/mistral-7b/longlora.yaml
@@ -0,0 +1,130 @@

# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/mistralai/Mistral-7B-v0.1

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-mistral-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.05

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# The number of groups to split the sequences into. (type: int, default: 4)
longlora_n_groups: 4

# The increased context length. (type: int, default: 8192)
longlora_context_length: 8192

# The additional trainable parameters, other than the standard LoRA ones. (type: str, default: "wte,norm_,ln_")
longlora_trainable_params: "wte,norm_,ln_"

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
class_path: litgpt.data.Alpaca2k
init_args:
mask_prompt: false
prompt_style: alpaca
ignore_index: -100
seed: 42
num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:

# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 200

# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1

# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
global_batch_size: 8

# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 2

# Number of iterations with learning rate warmup active (type: int, default: 100)
lr_warmup_steps: 10

# Number of epochs to train on (type: Optional[int], default: 5)
epochs: 4

# Total number of tokens to train on (type: Optional[int], default: null)
max_tokens:

# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:

# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 8192

# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

# (type: float, default: 6e-05)
min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:

# Number of optimizer steps between evaluation calls (type: int, default: 100)
interval: 100

# Number of tokens to generate (type: Optional[int], default: 100)
max_new_tokens: 100

# Number of iterations (type: int, default: 100)
max_iters: 100

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337
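
The commit "Automatically adjust context length and RoPE condense ratio" suggests that when `longlora_context_length` exceeds the base model's block size, positions are interpolated rather than extrapolated. A hedged sketch of the arithmetic implied by these configs (the base block size is an assumption for illustration, not read from litgpt):

```python
# Position interpolation: squeeze the extended positions back into the range seen in pretraining.
base_block_size = 4096            # assumed pretraining context of Llama 2 7B / Mistral 7B v0.1
longlora_context_length = 8192    # from the configs in this PR

if longlora_context_length > base_block_size:
    rope_condense_ratio = longlora_context_length / base_block_size  # 2.0
    block_size = longlora_context_length                             # the model now accepts 8192 tokens
    # RoPE positions are effectively divided by the condense ratio, so position 8191
    # lands roughly where position 4095 did during pretraining.
    print(rope_condense_ratio, block_size)
```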
3 changes: 3 additions & 0 deletions litgpt/adapter_v2.py
@@ -168,6 +168,9 @@ def __init__(self, config: Config, block_idx: int) -> None:

self.config = config

# LongLora
self._longlora_available = self.config.longlora_n_groups is not None and self.config.longlora_n_groups > 0

def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
"""For compatibility with base checkpoints."""
mapping = {
10 changes: 3 additions & 7 deletions litgpt/args.py
@@ -40,13 +40,9 @@ class TrainArgs:
max_norm: Optional[float] = None
min_lr: float = 6e-5

def __post_init__(self) -> None:
if self.lr_warmup_fraction and self.lr_warmup_steps:
raise ValueError(
"Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one."
)
if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1):
raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.")
# Misc args
get_longest_seq_length: bool = True
"""Whether to compute the longest sequence length in the dataset"""

def gradient_accumulation_iters(self, devices: int) -> int:
"""Number of iterations between gradient synchronizations"""
5 changes: 4 additions & 1 deletion litgpt/config.py
@@ -61,6 +61,9 @@ class Config:
rope_base: int = 10000
n_expert: int = 0
n_expert_per_token: int = 0
longlora_n_groups: Optional[int] = None
longlora_context_length: Optional[int] = None
longlora_trainable_params: str = ""

def __post_init__(self):
if not self.name:
@@ -836,7 +839,7 @@ def norm_class(self) -> Type:
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)


###############
# Meta LLaMA 3
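
Since the three LongLoRA knobs are plain dataclass fields on `Config`, they should be settable like any other override. A minimal usage sketch, assuming `Config.from_name` forwards keyword arguments to the dataclass as it does for existing fields:

```python
from litgpt.config import Config

config = Config.from_name(
    "Llama-2-7b-hf",
    longlora_n_groups=4,
    longlora_context_length=8192,
    longlora_trainable_params="wte,norm_,ln_",
)
print(config.longlora_n_groups, config.longlora_context_length)
```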
16 changes: 13 additions & 3 deletions litgpt/data/alpaca.py
@@ -43,6 +43,7 @@ class Alpaca(DataModule):
tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False)
batch_size: int = field(default=1, init=False, repr=False)
max_seq_length: int = field(default=-1, init=False, repr=False)
pad_multiple_of: Optional[int] = field(default=None, init=False, repr=False)
train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)

@@ -51,11 +52,16 @@ def __post_init__(self) -> None:
self.prompt_style = PromptStyle.from_name(self.prompt_style)

def connect(
self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None
self,
tokenizer: Optional[Tokenizer] = None,
batch_size: int = 1,
max_seq_length: Optional[int] = None,
pad_multiple_of: Optional[int] = None,
) -> None:
self.tokenizer = tokenizer
self.batch_size = batch_size
self.max_seq_length = -1 if max_seq_length is None else max_seq_length
self.pad_multiple_of = pad_multiple_of

def prepare_data(self) -> None:
self.download_dir.mkdir(parents=True, exist_ok=True)
@@ -97,7 +103,9 @@ def train_dataloader(self) -> DataLoader:
shuffle=True,
generator=torch.Generator().manual_seed(self.seed),
num_workers=self.num_workers,
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index),
collate_fn=get_sft_collate_fn(
max_seq_length=self.max_seq_length, ignore_index=self.ignore_index, pad_multiple_of=self.pad_multiple_of
),
)

def val_dataloader(self) -> DataLoader:
@@ -106,7 +114,9 @@ def val_dataloader(self) -> DataLoader:
batch_size=self.batch_size,
shuffle=False,
num_workers=self.num_workers,
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index),
collate_fn=get_sft_collate_fn(
max_seq_length=self.max_seq_length, ignore_index=self.ignore_index, pad_multiple_of=self.pad_multiple_of
),
)
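
`pad_multiple_of` is threaded from `data.connect()` into `get_sft_collate_fn` so that every padded batch length divides evenly into the S²-Attn groups. A hedged sketch of the padding step this implies (helper name and pad id are illustrative, not the PR's exact collate code):

```python
from typing import Optional

import torch
import torch.nn.functional as F

def pad_to_multiple(ids: torch.Tensor, pad_multiple_of: Optional[int], pad_id: int = 0) -> torch.Tensor:
    """Right-pad a 1D token tensor so its length is a multiple of `pad_multiple_of`."""
    if pad_multiple_of is None or ids.size(0) % pad_multiple_of == 0:
        return ids
    target = ((ids.size(0) + pad_multiple_of - 1) // pad_multiple_of) * pad_multiple_of
    return F.pad(ids, (0, target - ids.size(0)), value=pad_id)

# Example: a 10-token sample padded so longlora_n_groups=4 groups have equal size
padded = pad_to_multiple(torch.arange(10), pad_multiple_of=4)
assert padded.shape[0] == 12
```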

