diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 68e294f157..0acc8cd350 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1209,7 +1209,7 @@ def patch_gradient_accumulation_fix(Trainer): "Unsloth: We fixed a gradient accumulation bug, "\ "but it seems like you don't have the latest transformers version!\n"\ "Please update transformers, TRL and unsloth via:\n"\ - '`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`' + '`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`' ) pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 5f20f51209..c98feeca1e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -193,6 +193,10 @@ def LlamaAttention_fast_forward_inference( # cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len) # Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids) + + # Need to do it prior 2 steps before hitting full on short KV cache + # or else error + self.rotary_emb.extend_rope_embedding(Vn, seq_len + 2) cos, sin = self.rotary_emb.get_cached(kv_seq_len) cos = cos[position_ids].unsqueeze(1) sin = sin[position_ids].unsqueeze(1) @@ -1122,7 +1126,7 @@ def get_cached(self, seq_len = None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = math.ceil(seq_len / 8192) * 8192 + self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1248,7 +1252,7 @@ def get_cached(self, seq_len = None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = math.ceil(seq_len / 8192) * 8192 + self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1363,7 +1367,7 @@ def get_cached(self, seq_len = None): def extend_rope_embedding(self, x, seq_len): if seq_len <= self.current_rope_size: return # Iteratively grow by increments of 8192 - self.current_rope_size = math.ceil(seq_len / 8192) * 8192 + self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192 self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype) pass pass @@ -1952,10 +1956,10 @@ def get_peft_model( # Offload! # [TODO] First offload lm_head and embed_tokens to CPU (should be disk!!) if "embed_tokens" in new_target_modules: - print("Unsloth: Casting embed_tokens to float32") + print("Unsloth: Training embed_tokens in mixed precision to save VRAM") model.model.model.embed_tokens.modules_to_save.default\ - .to(device = "cuda:0", dtype = torch.float32, non_blocking = True) + .to(device = "cuda:0", non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) # [TODO] Move old embed_tokens to CPU - should be disk! @@ -1965,10 +1969,10 @@ def get_peft_model( pass if "lm_head" in new_target_modules: - print("Unsloth: Casting lm_head to float32") + print("Unsloth: Training lm_head in mixed precision to save VRAM") model.model.lm_head.modules_to_save.default\ - .to(device = "cuda:0", dtype = torch.float32, non_blocking = True) + .to(device = "cuda:0", non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) # [TODO] Move old lm_head to CPU - should be disk! @@ -2203,18 +2207,18 @@ def get_peft_model( # Now patch lm_head and embed_tokens if train_embed_tokens: - print("Unsloth: Casting embed_tokens to float32") + print("Unsloth: Training embed_tokens in mixed precision to save VRAM") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) model.model.model.embed_tokens.modules_to_save.default\ - .to(device = "cuda:0", dtype = torch.float32, non_blocking = True) + .to(device = "cuda:0", non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass if train_lm_head: - print("Unsloth: Casting lm_head to float32") + print("Unsloth: Training lm_head in mixed precision to save VRAM") assert(hasattr(model.model.lm_head, "modules_to_save")) model.model.lm_head.modules_to_save.default\ - .to(device = "cuda:0", dtype = torch.float32, non_blocking = True) + .to(device = "cuda:0", non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8806f1e743..c05485f902 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -975,7 +975,7 @@ def patch_sft_trainer_tokenizer(): " from packaging.version import Version\n"\ " if Version(transformers_version) <= Version('4.45.2'):\n"\ " print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\\n'\\\n"\ - " '`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`')\n"\ + " '`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`')\n"\ "except:\n"\ " pass\n"\ "\n\n"