Merged
35 changes: 22 additions & 13 deletions download-model.py
@@ -26,13 +26,16 @@

class ModelDownloader:
def __init__(self, max_retries=5):
self.session = requests.Session()
if max_retries:
self.session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries))
self.session.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries))
self.max_retries = max_retries

def get_session(self):
session = requests.Session()
if self.max_retries:
session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=self.max_retries))
session.mount('https://huggingface.co', HTTPAdapter(max_retries=self.max_retries))

if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
self.session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))

try:
from huggingface_hub import get_token
@@ -41,7 +44,9 @@ def __init__(self, max_retries=5):
token = os.getenv("HF_TOKEN")

if token is not None:
self.session.headers = {'authorization': f'Bearer {token}'}
session.headers = {'authorization': f'Bearer {token}'}

return session

def sanitize_model_and_branch_names(self, model, branch):
if model[-1] == '/':
@@ -65,6 +70,7 @@ def sanitize_model_and_branch_names(self, model, branch):
return model, branch

def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
session = self.get_session()
page = f"/api/models/{model}/tree/{branch}"
cursor = b""

@@ -78,7 +84,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
is_lora = False
while True:
url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "")
r = self.session.get(url, timeout=10)
r = session.get(url, timeout=10)
r.raise_for_status()
content = r.content

@@ -156,9 +162,8 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
is_llamacpp = has_gguf and specific_file is not None
return links, sha256, is_lora, is_llamacpp

def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_folder=None):
if base_folder is None:
base_folder = 'models' if not is_lora else 'loras'
def get_output_folder(self, model, branch, is_lora, is_llamacpp=False):
base_folder = 'models' if not is_lora else 'loras'

# If the model is of type GGUF, save directly in the base_folder
if is_llamacpp:
@@ -172,14 +177,15 @@ def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_fold
return output_folder

def get_single_file(self, url, output_folder, start_from_scratch=False):
session = self.get_session()
filename = Path(url.rsplit('/', 1)[1])
output_path = output_folder / filename
headers = {}
mode = 'wb'
if output_path.exists() and not start_from_scratch:

# Check if the file has already been downloaded completely
r = self.session.get(url, stream=True, timeout=10)
r = session.get(url, stream=True, timeout=10)
total_size = int(r.headers.get('content-length', 0))
if output_path.stat().st_size >= total_size:
return
@@ -188,7 +194,7 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
headers = {'Range': f'bytes={output_path.stat().st_size}-'}
mode = 'ab'

with self.session.get(url, stream=True, headers=headers, timeout=10) as r:
with session.get(url, stream=True, headers=headers, timeout=10) as r:
r.raise_for_status() # Do not continue the download if the request was unsuccessful
total_size = int(r.headers.get('content-length', 0))
block_size = 1024 * 1024 # 1MB
@@ -303,7 +309,10 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file)

# Get the output folder
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, base_folder=args.output)
if args.output:
output_folder = Path(args.output)
else:
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)

if args.check:
# Check previously downloaded files
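A note on the download-model.py change: the shared `self.session` built in `__init__` is replaced by a `get_session()` factory, so the retry adapter, `HF_USER`/`HF_PASS` basic auth, and the Hugging Face token are applied to a fresh `requests.Session` on each call, which keeps concurrent downloads from sharing one session object. A minimal sketch of how the class might be driven from several worker threads; the model name, the thread pool, and the `mkdir` call are illustrative assumptions, not part of this diff:

```python
from concurrent.futures import ThreadPoolExecutor

downloader = ModelDownloader(max_retries=5)

# Hypothetical model/branch, used only for illustration.
model, branch = downloader.sanitize_model_and_branch_names('TheBloke/Llama-2-7B-GGUF', 'main')
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch)

output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)
output_folder.mkdir(parents=True, exist_ok=True)

# get_single_file() now calls self.get_session() internally, so every worker
# thread below ends up with its own requests.Session (adapter, auth, token).
with ThreadPoolExecutor(max_workers=4) as pool:
    for url in links:
        pool.submit(downloader.get_single_file, url, output_folder)
```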
2 changes: 1 addition & 1 deletion instruction-templates/Mistral.yaml
@@ -4,7 +4,7 @@ instruction_template: |-
{{- message['content'] -}}
{%- else -%}
{%- if message['role'] == 'user' -%}
{{-' [INST] ' + message['content'].rstrip() + ' [/INST] '-}}
{{-'[INST] ' + message['content'].rstrip() + ' [/INST]'-}}
{%- else -%}
{{-'' + message['content'] + '</s>' -}}
{%- endif -%}
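The Mistral.yaml change only removes the stray spaces around the instruction tags. A quick before/after of the rendered user turn, written out as plain Python strings for illustration:

```python
# Rendered user turn before and after the template change (illustration only).
old = ' [INST] ' + 'Hello'.rstrip() + ' [/INST] '
new = '[INST] ' + 'Hello'.rstrip() + ' [/INST]'

print(repr(old))  # ' [INST] Hello [/INST] '
print(repr(new))  # '[INST] Hello [/INST]'
```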
68 changes: 36 additions & 32 deletions modules/chat.py
@@ -166,53 +166,54 @@ def make_prompt(messages):
prompt = remove_extra_bos(prompt)
return prompt

# Handle truncation
max_length = get_max_prompt_length(state)
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

while len(messages) > 0 and encoded_length > max_length:
# Handle truncation
if shared.tokenizer is not None:
max_length = get_max_prompt_length(state)
encoded_length = get_encoded_length(prompt)
while len(messages) > 0 and encoded_length > max_length:

# Remove old message, save system message
if len(messages) > 2 and messages[0]['role'] == 'system':
messages.pop(1)
# Remove old message, save system message
if len(messages) > 2 and messages[0]['role'] == 'system':
messages.pop(1)

# Remove old message when no system message is present
elif len(messages) > 1 and messages[0]['role'] != 'system':
messages.pop(0)
# Remove old message when no system message is present
elif len(messages) > 1 and messages[0]['role'] != 'system':
messages.pop(0)

# Resort to truncating the user input
else:
# Resort to truncating the user input
else:

user_message = messages[-1]['content']

# Bisect the truncation point
left, right = 0, len(user_message) - 1

user_message = messages[-1]['content']
while right - left > 1:
mid = (left + right) // 2

# Bisect the truncation point
left, right = 0, len(user_message) - 1
messages[-1]['content'] = user_message[mid:]
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

while right - left > 1:
mid = (left + right) // 2
if encoded_length <= max_length:
right = mid
else:
left = mid

messages[-1]['content'] = user_message[mid:]
messages[-1]['content'] = user_message[right:]
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

if encoded_length <= max_length:
right = mid
if encoded_length > max_length:
logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
raise ValueError
else:
left = mid
logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
break

messages[-1]['content'] = user_message[right:]
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)
if encoded_length > max_length:
logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
raise ValueError
else:
logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
break

prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

if also_return_rows:
return prompt, [message['content'] for message in messages]
@@ -690,6 +691,9 @@ def load_character(character, name1, name2):


def load_instruction_template(template):
if template == 'None':
return ''

for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]:
if filepath.exists():
break
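On the modules/chat.py truncation hunk above: truncation now only runs when `shared.tokenizer is not None`, and when dropping old messages is not enough, a cut point inside the last user message is found by bisection so the rendered prompt just fits. A self-contained sketch of that bisection; `make_prompt` and `get_encoded_length` are passed in as parameters purely to keep the example standalone (in the module they are the chat-template renderer and the tokenizer length helper):

```python
def truncate_last_message(messages, max_length, make_prompt, get_encoded_length):
    """Bisect a cut point in the last user message so the prompt fits.

    Sketch of the logic in modules/chat.py, not a drop-in replacement.
    """
    user_message = messages[-1]['content']
    left, right = 0, len(user_message) - 1

    while right - left > 1:
        mid = (left + right) // 2
        messages[-1]['content'] = user_message[mid:]
        if get_encoded_length(make_prompt(messages)) <= max_length:
            right = mid   # cutting at mid is enough; try keeping more text
        else:
            left = mid    # still too long; cut deeper

    # right is the shallowest tested cut that fits (if any cut fits at all)
    messages[-1]['content'] = user_message[right:]
    return make_prompt(messages)
```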
21 changes: 12 additions & 9 deletions modules/exllamav2.py
@@ -51,18 +51,21 @@ def from_pretrained(self, path_to_model):

model = ExLlamaV2(config)

split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

model.load(split)

tokenizer = ExLlamaV2Tokenizer(config)
if shared.args.cache_8bit:
cache = ExLlamaV2Cache_8bit(model)
cache = ExLlamaV2Cache_8bit(model, lazy=True)
else:
cache = ExLlamaV2Cache(model)
cache = ExLlamaV2Cache(model, lazy=True)

if shared.args.autosplit:
model.load_autosplit(cache)
else:
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

model.load(split)

tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

result = self()
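The exllamav2.py change (mirrored in exllamav2_hf.py below) creates the cache with `lazy=True` before loading and, when `--autosplit` is enabled, lets `model.load_autosplit(cache)` place the weights across the available GPUs instead of requiring a manual `--gpu-split`. A reduced sketch of that decision path, using the same exllamav2 classes as the diff; the wrapper function itself is illustrative:

```python
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Config

def load_exl2(config: ExLlamaV2Config, gpu_split, autosplit, cache_8bit):
    # Reduced sketch of the loading path introduced by this PR.
    model = ExLlamaV2(config)

    # The cache is created lazily, before the weights are placed on devices.
    cache_cls = ExLlamaV2Cache_8bit if cache_8bit else ExLlamaV2Cache
    cache = cache_cls(model, lazy=True)

    if autosplit:
        # Let exllamav2 distribute the layers (and cache) across GPUs.
        model.load_autosplit(cache)
    else:
        # Fall back to an explicit per-GPU allocation, e.g. "20,24".
        split = [float(alloc) for alloc in gpu_split.split(",")] if gpu_split else None
        model.load(split)

    return model, cache
```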
20 changes: 12 additions & 8 deletions modules/exllamav2_hf.py
@@ -37,18 +37,22 @@ def __init__(self, config: ExLlamaV2Config):
super().__init__(PretrainedConfig())
self.ex_config = config
self.ex_model = ExLlamaV2(config)
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

self.ex_model.load(split)
self.generation_config = GenerationConfig()
self.loras = None
self.generation_config = GenerationConfig()

if shared.args.cache_8bit:
self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=True)
else:
self.ex_cache = ExLlamaV2Cache(self.ex_model)
self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=True)

if shared.args.autosplit:
self.ex_model.load_autosplit(self.ex_cache)
else:
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

self.ex_model.load(split)

self.past_seq = None
if shared.args.cfg_cache:
2 changes: 2 additions & 0 deletions modules/loaders.py
@@ -78,6 +78,7 @@
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'autosplit',
'alpha_value',
'compress_pos_emb',
'trust_remote_code',
@@ -89,6 +90,7 @@
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'autosplit',
'alpha_value',
'compress_pos_emb',
'exllamav2_info',
2 changes: 1 addition & 1 deletion modules/models.py
@@ -257,7 +257,7 @@ def llamacpp_HF_loader(model_name):
path = Path(f'{shared.args.model_dir}/{model_name}')

# Check if a HF tokenizer is available for the model
if all((path / file).exists() for file in ['tokenizer.model', 'tokenizer_config.json']):
if all((path / file).exists() for file in ['tokenizer_config.json']):
logger.info(f'Using tokenizer from: \"{path}\"')
else:
logger.error("Could not load the model because a tokenizer in Transformers format was not found.")
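In modules/models.py, the llamacpp_HF tokenizer check previously required both `tokenizer.model` and `tokenizer_config.json`; it now only requires `tokenizer_config.json`, so a repository that ships only a fast `tokenizer.json` without a sentencepiece `tokenizer.model` can still qualify. Since the list has a single element, the condition reduces to a plain `exists()` check; a sketch of the equivalent form:

```python
from pathlib import Path

def has_hf_tokenizer(path: Path) -> bool:
    # Equivalent to the new one-element all(...) check in llamacpp_HF_loader.
    return (path / 'tokenizer_config.json').exists()
```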
65 changes: 47 additions & 18 deletions modules/models_settings.py
@@ -153,6 +153,8 @@ def infer_loader(model_name, model_settings):
loader = 'ExLlamav2_HF'
elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
loader = 'AutoAWQ'
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
loader = 'llamacpp_HF'
elif len(list(path_to_model.glob('*.gguf'))) > 0:
loader = 'llama.cpp'
elif re.match(r'.*\.gguf', model_name.lower()):
@@ -225,7 +227,7 @@ def apply_model_settings_to_state(model, state):
loader = model_settings.pop('loader')

# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']):
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['ctransformers']):
state['loader'] = loader

for k in model_settings:
@@ -243,27 +245,54 @@ def save_model_settings(model, state):
Save the settings for this model to models/config-user.yaml
'''
if model == 'None':
yield ("Not saving the settings because no model is loaded.")
yield ("Not saving the settings because no model is selected in the menu.")
return

with Path(f'{shared.args.model_dir}/config-user.yaml') as p:
if p.exists():
user_config = yaml.safe_load(open(p, 'r').read())
else:
user_config = {}
user_config = shared.load_user_config()
model_regex = model + '$' # For exact matches
if model_regex not in user_config:
user_config[model_regex] = {}

for k in ui.list_model_elements():
if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
user_config[model_regex][k] = state[k]

model_regex = model + '$' # For exact matches
if model_regex not in user_config:
user_config[model_regex] = {}
shared.user_config = user_config

for k in ui.list_model_elements():
if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
user_config[model_regex][k] = state[k]
output = yaml.dump(user_config, sort_keys=False)
p = Path(f'{shared.args.model_dir}/config-user.yaml')
with open(p, 'w') as f:
f.write(output)

shared.user_config = user_config
yield (f"Settings for `{model}` saved to `{p}`.")

output = yaml.dump(user_config, sort_keys=False)
with open(p, 'w') as f:
f.write(output)

yield (f"Settings for `{model}` saved to `{p}`.")
def save_instruction_template(model, template):
'''
Similar to the function above, but it saves only the instruction template.
'''
if model == 'None':
yield ("Not saving the template because no model is selected in the menu.")
return

user_config = shared.load_user_config()
model_regex = model + '$' # For exact matches
if model_regex not in user_config:
user_config[model_regex] = {}

if template == 'None':
user_config[model_regex].pop('instruction_template', None)
else:
user_config[model_regex]['instruction_template'] = template

shared.user_config = user_config

output = yaml.dump(user_config, sort_keys=False)
p = Path(f'{shared.args.model_dir}/config-user.yaml')
with open(p, 'w') as f:
f.write(output)

if template == 'None':
yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
else:
yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")