diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7b6da3e449..ee4235c74e 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -23,7 +23,6 @@ "apply_chat_template", "test_construct_chat_template", - "create_ollama_modelfile", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -1079,14 +1078,29 @@ def construct_chat_template( \ ) pass + # Check tokenizer types + tokenizer_name = tokenizer.name_or_path.lower() + if tokenizer_name.startswith(("unsloth/llama-3-8b-instruct", "unsloth/llama-3-70b-instruct")): + # Add <|eot_id|> + extra_eos_tokens.append("<|eot_id|>") + elif ("<|eot_id|>" in extra_eos_tokens or "<|eot_id|>" in chat_template) and \ + tokenizer_name.startswith(("unsloth/llama-3-8b", "unsloth/llama-3-70b")): + # Warn + logger.warning( + "Unsloth: Base llama-3 models did not train <|eot_id|>.\n"\ + "Please use the instruct version or use <|end_of_text|>" + ) + pass + extra_eos_tokens = list(set(extra_eos_tokens)) + count_eos = 0 for eos in extra_eos_tokens: - count_eos += len(re.findall(r"{OUTPUT}" + eos.encode("unicode-escape").decode("utf-8"), chat_template)) + count_eos += len(re.findall(r"{OUTPUT}" + re.escape(eos), chat_template)) pass if count_eos == 0: logger.warning("Unsloth: We automatically added an EOS token to stop endless generations.") eos = extra_eos_tokens[0] - chat_template = re.sub(r"{OUTPUT}", r"{OUTPUT}" + eos.encode("unicode-escape").decode("utf-8"), chat_template) + chat_template = re.sub(r"{OUTPUT}", r"{OUTPUT}" + eos, chat_template) pass # O(N^2) search finding 2 repeatted pieces of text @@ -1151,7 +1165,9 @@ def construct_chat_template( \ # Check bos_token is in system prompt ollama_system = system_part has_bos_token = False + always_bos_token = False if tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None): + always_bos_token = True if ollama_system.startswith(tokenizer.bos_token): has_bos_token = True ollama_system = ollama_system[len(tokenizer.bos_token):] @@ -1166,11 +1182,6 @@ def construct_chat_template( \ input_modelfile = "{{ if .Prompt }}" + input_part .replace("{INPUT}", "{{ .Prompt }}") + "{{ end }}" output_modelfile = output_part.replace("{OUTPUT}", "{{ .Response }}") - # Check if EOS token is at the end of the output - if not output_modelfile.endswith(tuple(extra_eos_tokens)): - output_modelfile += "{__EOS_TOKEN__}" - pass - # Ollama EOS ollama_eos = get_ollama_eos_tokens(tokenizer, extra_eos_tokens) ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos) @@ -1215,10 +1226,7 @@ def process(part, which, content = "message['content']"): partial_system = process(system_part, "{SYSTEM}", "messages[0]['content']") partial_system = partial_system.replace("{SYSTEM}", "") - # If {SYSTEM} is non existent, simply just use the content - if "{SYSTEM}" not in partial_system: - partial_system = "messages[0]['content']" - else: + if "{SYSTEM}" in partial_system: if default_system_message is None: raise RuntimeError("Unsloth: Please specify a default system message!") pass @@ -1226,21 +1234,22 @@ def process(part, which, content = "message['content']"): # Separate the BOS if has_bos_token: partial_system = partial_system.replace(tokenizer.bos_token, "", 1) + system_part = system_part .replace(tokenizer.bos_token, "", 1) pass - + partial_system = \ "{% if messages[0]['role'] == 'system' %}"\ "{{ " + partial_system + " }}"\ "{% set loop_messages = messages[1:] %}" if default_system_message is not None: full_system = system_part.replace("{SYSTEM}", default_system_message) + if "{SYSTEM}" in system_part: + modelfile += '\nSYSTEM: "' + default_system_message + '"' + pass partial_system += "{% else %}"\ "{{ '" + full_system + "' }}"\ "{% set loop_messages = messages %}"\ "{% endif %}" - - # Add to modelfile - modelfile += '\nSYSTEM "' + full_system + '"' else: partial_system += "{% endif %}" pass @@ -1251,6 +1260,22 @@ def process(part, which, content = "message['content']"): jinja_template = "{{ bos_token }}" + jinja_template pass + # Check if system part is the same! + jinja_template = re.sub( + r"\{\% if messages\[0\]\['role'\] \=\= 'system' \%\}\{\{ '(.+?)' \}\}"\ + r"\{\% set loop\_messages \= messages\[1\:\] \%\}"\ + r"\{\% else \%\}\{\{ '\1' \}\}\{\% set loop\_messages \= messages \%\}\{\% endif \%\}"\ + r"\{\% for message in loop\_messages \%\}", + r"{{ '\1' }}{% for message in messages %}", + jinja_template, flags = re.MULTILINE | re.DOTALL, + ) + + # Check jinja tempate for bos + if always_bos_token: + if not jinja_template.startswith("{{ bos_token }}"): + jinja_template = "{{ bos_token }}" + jinja_template + pass + return modelfile, jinja_template pass @@ -1260,7 +1285,7 @@ def test_construct_chat_template(): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token = token) - template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> + chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> {SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|> @@ -1277,7 +1302,11 @@ def test_construct_chat_template(): extra_eos_tokens = None - modelfile, jinja_template = construct_chat_template(template, default_system_message, extra_eos_tokens) + modelfile, jinja_template = construct_chat_template( + tokenizer = tokenizer, + chat_template = chat_template, + extra_eos_tokens = extra_eos_tokens, + ) messages = [ {"role": "system", "content": "You are an assistant"}, @@ -1291,7 +1320,6 @@ def test_construct_chat_template(): tokenizer.chat_template = jinja_template new_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) - assert(correct_output == new_output) pass pass @@ -1344,43 +1372,6 @@ def formatting_prompts_func(examples): pass -def create_ollama_modelfile(tokenizer, gguf_location): - """ - Creates an Ollama Modelfile. - Use ollama.create(model = "new_ollama_model", modelfile = modelfile) - """ - modelfile = getattr(tokenizer, "_ollama_modelfile", None) - if modelfile is None: - raise RuntimeError( - "Unsloth: Tokenizer does not have a `ollama_modelfile` attribute.\n"\ - "Please use get_chat_template(...)." - ) - pass - - system_message = getattr(tokenizer, "_system_message", None) - if system_message is None: - __SYSTEM_MESSAGE__ = "" - else: - __SYSTEM_MESSAGE__ = f'SYSTEM """{system_message}"""' - pass - - modelfile = modelfile\ - .replace("{{", "⚫@✅#🦥")\ - .replace("}}", "⚡@🦥#⛵")\ - .format( - __FILE_LOCATION__ = gguf_location, - __SYSTEM_MESSAGE__ = __SYSTEM_MESSAGE__, - __EOS_TOKEN__ = tokenizer.eos_token, - )\ - .replace("⚫@✅#🦥", "{{")\ - .replace("⚡@🦥#⛵", "}}")\ - .rstrip() - pass - - return modelfile -pass - - def create_stopping_criteria(tokenizer, stop_word = "eos_token"): class StoppingCriteriaSub(StoppingCriteria): __slots__ = "stop_token", "single_match", "length", diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 5ef7583975..4b40065083 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -47,9 +47,11 @@ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", ), "unsloth/mistral-7b-instruct-v0.1-bnb-4bit" : ( + "unsloth/mistral-7b-instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", ), "unsloth/mistral-7b-instruct-v0.2-bnb-4bit" : ( + "unsloth/mistral-7b-instruct-v0.2", "mistralai/Mistral-7B-Instruct-v0.2", ), "unsloth/llama-2-7b-chat-bnb-4bit" : ( diff --git a/unsloth/save.py b/unsloth/save.py index f8f884a9d3..9163c6d38d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -891,10 +891,10 @@ def save_to_gguf( # Map quant methods new_quantization_method = [] for quant_method in quantization_method: - if quant_method == "not_quantized": quantization_method = model_dtype - elif quant_method == "fast_quantized": quantization_method = "q8_0" - elif quant_method == "quantized": quantization_method = "q4_k_m" - elif quant_method is None: quantization_method = "q8_0" + if quant_method == "not_quantized": quant_method = model_dtype + elif quant_method == "fast_quantized": quant_method = "q8_0" + elif quant_method == "quantized": quant_method = "q4_k_m" + elif quant_method is None: quant_method = "q8_0" # Check if wrong method if quant_method not in ALLOWED_QUANTS.keys(): @@ -978,6 +978,11 @@ def save_to_gguf( pass pass + # If only q8_0: + if len(quantization_method) == 1 and quantization_method[0] == "q8_0": + strength = 0 + pass + if strength >= 3: first_conversion = "f32" elif strength >= 2: first_conversion = "f16" elif strength >= 1: first_conversion = "bf16" @@ -1008,7 +1013,7 @@ def save_to_gguf( n_cpus *= 2 # Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model - final_location = f"./{model_directory}-unsloth.{first_conversion.upper()}.gguf" + final_location = f"./{model_directory}/unsloth.{first_conversion.upper()}.gguf" print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\ f"The output location will be {final_location}\n"\ @@ -1072,12 +1077,12 @@ def save_to_gguf( full_precision_location = final_location - all_saved_locations = [] + all_saved_locations = [full_precision_location,] # Convert each type! for quant_method in quantization_method: if quant_method != first_conversion: print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...") - final_location = f"./{model_directory}-unsloth.{quant_method.upper()}.gguf" + final_location = f"./{model_directory}/unsloth.{quant_method.upper()}.gguf" command = f"./{quantize_location} {full_precision_location} "\ f"{final_location} {quant_method} {n_cpus}" @@ -1365,6 +1370,29 @@ def fix_tokenizer_bos_token(tokenizer): pass +def create_ollama_modelfile(tokenizer, gguf_location): + """ + Creates an Ollama Modelfile. + Use ollama.create(model = "new_ollama_model", modelfile = modelfile) + """ + modelfile = getattr(tokenizer, "_ollama_modelfile", None) + if modelfile is None: return None + + modelfile = modelfile\ + .replace("{{", "⚫@✅#🦥")\ + .replace("}}", "⚡@🦥#⛵")\ + .format( + __FILE_LOCATION__ = gguf_location, + )\ + .replace("⚫@✅#🦥", "{{")\ + .replace("⚡@🦥#⛵", "}}")\ + .rstrip() + pass + + return modelfile +pass + + def unsloth_save_pretrained_gguf( self, save_directory : Union[str, os.PathLike], @@ -1500,10 +1528,21 @@ def unsloth_save_pretrained_gguf( new_save_directory, quantization_method, first_conversion, makefile, ) + # Save Ollama modelfile + modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0]) + modelfile_location = None + if modelfile is not None: + modelfile_location = os.path.join(new_save_directory, "Modelfile") + with open(modelfile_location, "w") as file: + file.write(modelfile) + pass + print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}") + pass + if fix_bos_token: logger.warning( f"Unsloth: ##### The current model auto adds a BOS token.\n"\ - "Unsloth: ##### We removed in GGUF's chat template for you." + "Unsloth: ##### We removed it in GGUF's chat template for you." ) pass @@ -1520,6 +1559,15 @@ def unsloth_save_pretrained_gguf( new_save_directory.lstrip('/.') print(f"Saved GGUF to https://huggingface.co/{link}") pass + + # Save modelfile + if modelfile_location is not None: + username = upload_to_huggingface( + self, save_directory, token, + "GGUF converted", "gguf", modelfile_location, old_username, private, + ) + print(f"Saved Ollama Modelfile to https://huggingface.co/{link}") + pass pass pass @@ -1654,6 +1702,17 @@ def unsloth_push_to_hub_gguf( new_save_directory, quantization_method, first_conversion, makefile, ) + # Save Ollama modelfile + modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0]) + modelfile_location = None + if modelfile is not None: + modelfile_location = os.path.join(new_save_directory, "Modelfile") + with open(modelfile_location, "w") as file: + file.write(modelfile) + pass + print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}") + pass + for file_location in all_file_locations: print("Unsloth: Uploading GGUF to Huggingface Hub...") username = upload_to_huggingface( @@ -1667,10 +1726,19 @@ def unsloth_push_to_hub_gguf( print(f"Saved GGUF to https://huggingface.co/{link}") pass + # Save modelfile + if modelfile_location is not None: + username = upload_to_huggingface( + self, repo_id, token, + "GGUF converted", "gguf", modelfile_location, old_username, private, + ) + print(f"Saved Ollama Modelfile to https://huggingface.co/{link}") + pass + if fix_bos_token: logger.warning( f"Unsloth: ##### The current model auto adds a BOS token.\n"\ - "Unsloth: ##### We removed in GGUF's chat template for you." + "Unsloth: ##### We removed it in GGUF's chat template for you." ) pass pass