1 change: 1 addition & 0 deletions README.md
@@ -285,6 +285,7 @@ List of command-line flags
 | `--no_use_cuda_fp16` | This can make models faster on some systems. |
 | `--desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. |
 | `--disable_exllama` | Disable ExLlama kernel, which can improve inference speed on some systems. |
+| `--disable_exllamav2` | Disable ExLlamav2 kernel. |

 #### GPTQ-for-LLaMa

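The flag's argparse definition (in modules/shared.py) is not part of this excerpt. Below is a minimal sketch of how the new flag is presumably registered, mirroring the existing `--disable_exllama` switch; the action and help text are assumptions, not lines from the PR:

```python
# Hypothetical sketch of the flag definition; the real one lives in
# modules/shared.py, which this diff does not show.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel.')
parser.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')

args = parser.parse_args(['--disable_exllamav2'])
assert args.disable_exllamav2 and not args.disable_exllama
```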
1 change: 0 additions & 1 deletion docs/07 - Extensions.md
@@ -18,7 +18,6 @@ If you create an extension, you are welcome to host it in a GitHub repository an
 |[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For a detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. |
 |[google_translate](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/google_translate)| Automatically translates inputs and outputs using Google Translate.|
 |[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, responses are replaced with an audio widget. |
-|[elevenlabs_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/elevenlabs_tts)| Text-to-speech extension using the [ElevenLabs](https://beta.elevenlabs.io/) API. You need an API key to use it. |
 |[whisper_stt](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/whisper_stt)| Allows you to enter your inputs in chat mode using your microphone. |
 |[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). |
 |[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that adds a hidden string at the beginning of the bot's reply in chat mode. |
1 change: 0 additions & 1 deletion extensions/elevenlabs_tts/requirements.txt

This file was deleted.

197 changes: 0 additions & 197 deletions extensions/elevenlabs_tts/script.py

This file was deleted.

2 changes: 1 addition & 1 deletion extensions/openai/logits.py
@@ -8,4 +8,4 @@ def _get_next_logits(body):
     state = process_parameters(body) if use_samplers else {}
     state['stream'] = True

-    return get_next_logits(body['prompt'], state, use_samplers, "", return_dict=True)
+    return get_next_logits(body['prompt'], state, use_samplers, "", top_logits=body['top_logits'], return_dict=True)
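For context, the new `top_logits` field controls how many token/score pairs the logits endpoint returns. A hedged usage sketch, assuming the server is running with the API enabled and that the route is POST /v1/internal/logits (the path and port are assumptions, not shown in this diff):

```python
# Hedged sketch: query the logits endpoint with the new top_logits field.
import json
import urllib.request

payload = {
    "prompt": "The capital of France is",
    "use_samplers": False,
    "top_logits": 10,  # new in this PR; LogitsRequestParams defaults to 50
}
req = urllib.request.Request(
    "http://127.0.0.1:5000/v1/internal/logits",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    logits = json.load(resp)["logits"]  # Dict[str, float], per LogitsResponse

for token, score in logits.items():  # ten entries instead of the default 50
    print(f"{token!r}: {score:.4f}")
```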
7 changes: 4 additions & 3 deletions extensions/openai/typing.py
@@ -1,6 +1,6 @@
 import json
 import time
-from typing import List
+from typing import Dict, List

 from pydantic import BaseModel, Field

@@ -120,7 +120,7 @@ class ChatCompletionResponse(BaseModel):


 class EmbeddingsRequest(BaseModel):
-    input: str | List[str]
+    input: str | List[str] | List[int] | List[List[int]]
     model: str | None = Field(default=None, description="Unused parameter. To change the model, set the OPENEDAI_EMBEDDING_MODEL and OPENEDAI_EMBEDDING_DEVICE environment variables before starting the server.")
     encoding_format: str = Field(default="float", description="Can be float or base64.")
     user: str | None = Field(default=None, description="Unused parameter.")

@@ -156,6 +156,7 @@ class TokenCountResponse(BaseModel):
 class LogitsRequestParams(BaseModel):
     prompt: str
     use_samplers: bool = False
+    top_logits: int | None = 50
     frequency_penalty: float | None = 0
     max_tokens: int | None = 16
     presence_penalty: float | None = 0

@@ -168,7 +169,7 @@ class LogitsRequest(GenerationOptions, LogitsRequestParams):


 class LogitsResponse(BaseModel):
-    logits: dict
+    logits: Dict[str, float]


 class ModelInfoResponse(BaseModel):
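The widened `input` union on EmbeddingsRequest is worth a quick illustration: it now also accepts pre-tokenized input (a list of token IDs, or a batch of such lists), mirroring what the OpenAI embeddings API allows. A self-contained pydantic sketch of just that field; the token IDs are arbitrary illustrative values:

```python
# Minimal reproduction of the widened union; not the full request model.
from typing import List
from pydantic import BaseModel

class EmbeddingsRequest(BaseModel):
    input: str | List[str] | List[int] | List[List[int]]

# All four shapes now validate:
EmbeddingsRequest(input="hello world")         # single string
EmbeddingsRequest(input=["hello", "world"])    # batch of strings
EmbeddingsRequest(input=[3834, 995])           # one pre-tokenized prompt
EmbeddingsRequest(input=[[3834], [995, 13]])   # batch of pre-tokenized prompts
```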
2 changes: 1 addition & 1 deletion extensions/whisper_stt/requirements.txt
@@ -1,4 +1,4 @@
 SpeechRecognition==3.10.0
-git+https://github.com/oobabooga/whisper.git
+openai-whisper
 soundfile
 ffmpeg
11 changes: 1 addition & 10 deletions instruction-templates/Mistral.yaml
@@ -1,16 +1,7 @@
 instruction_template: |-
-  {%- set found_item = false -%}
-  {%- for message in messages -%}
-    {%- if message['role'] == 'system' -%}
-      {%- set found_item = true -%}
-    {%- endif -%}
-  {%- endfor -%}
-  {%- if not found_item -%}
-    {{- '' + '' + '' -}}
-  {%- endif %}
   {%- for message in messages %}
     {%- if message['role'] == 'system' -%}
-      {{- '' + message['content'] + '' -}}
+      {{- message['content'] -}}
     {%- else -%}
       {%- if message['role'] == 'user' -%}
         {{-'[INST] ' + message['content'] + ' [/INST]'-}}
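Rendering the simplified template makes the change concrete: the system message is now inserted verbatim instead of being concatenated with empty strings, and the dead `found_item` scan is gone. A small jinja2 sketch; the closing endif/endfor tags are supplied for runnability, since the diff truncates before the end of the file:

```python
from jinja2 import Template

# Template body mirrors the diff above; closing tags are assumed from context.
template = Template(
    "{%- for message in messages %}"
    "{%- if message['role'] == 'system' -%}"
    "{{- message['content'] -}}"
    "{%- else -%}"
    "{%- if message['role'] == 'user' -%}"
    "{{-'[INST] ' + message['content'] + ' [/INST]'-}}"
    "{%- endif -%}"
    "{%- endif -%}"
    "{%- endfor -%}"
)

messages = [
    {"role": "system", "content": "You are concise. "},
    {"role": "user", "content": "Hello"},
]
print(template.render(messages=messages))
# You are concise. [INST] Hello [/INST]
```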
2 changes: 1 addition & 1 deletion models/config.yaml
@@ -174,7 +174,7 @@
   instruction_template: 'OpenChat'
 .*codellama.*instruct:
   instruction_template: 'Llama-v2'
-.*mistral.*instruct:
+.*(mistral|mixtral).*instruct:
   instruction_template: 'Mistral'
 .*mistral.*openorca:
   instruction_template: 'ChatML'
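A quick check that the broadened pattern picks up Mixtral instruct models while leaving others alone (the webui matches these keys against the lowercased model name, an assumption here):

```python
import re

pattern = r".*(mistral|mixtral).*instruct"
for name in ("Mistral-7B-Instruct-v0.1",
             "Mixtral-8x7B-Instruct-v0.1",
             "CodeLlama-7B-Instruct"):
    print(name, bool(re.match(pattern, name.lower())))
# Mistral-7B-Instruct-v0.1 True
# Mixtral-8x7B-Instruct-v0.1 True
# CodeLlama-7B-Instruct False
```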
1 change: 1 addition & 0 deletions modules/AutoGPTQ_loader.py
@@ -52,6 +52,7 @@ def load_quantized(model_name):
         'quantize_config': quantize_config,
         'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
         'disable_exllama': shared.args.disable_exllama,
+        'disable_exllamav2': shared.args.disable_exllamav2,
     }

     logger.info(f"The AutoGPTQ params are: {params}")
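For orientation, this params dict is ultimately unpacked into AutoGPTQ's `from_quantized` loader. A hedged sketch: kwarg support is version-dependent (`disable_exllamav2` needs a recent auto-gptq release, 0.5+ to the best of my knowledge), so treat this as an assumption rather than a pinned API:

```python
# Hedged sketch: how a params dict like the one above is typically consumed.
# Requires an auto-gptq release that understands the disable_exllamav2 kwarg.
from pathlib import Path
from auto_gptq import AutoGPTQForCausalLM

params = {
    'model_basename': 'model',
    'device': 'cuda:0',
    'use_triton': False,
    'disable_exllama': False,
    'disable_exllamav2': True,  # the flag plumbed through by this PR
}
model = AutoGPTQForCausalLM.from_quantized(Path('models/some-gptq-model'), **params)
```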
33 changes: 17 additions & 16 deletions modules/chat.py
@@ -215,40 +215,47 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
         yield output
         return

-    just_started = True
-    visible_text = None
     stopping_strings = get_stopping_strings(state)
     is_stream = state['stream']

     # Prepare the input
-    if not any((regenerate, _continue)):
+    if not (regenerate or _continue):
         visible_text = html.escape(text)

         # Apply extensions
         text, visible_text = apply_extensions('chat_input', text, visible_text, state)
         text = apply_extensions('input', text, state, is_chat=True)

+        output['internal'].append([text, ''])
+        output['visible'].append([visible_text, ''])
+
         # *Is typing...*
         if loading_message:
-            yield {'visible': output['visible'] + [[visible_text, shared.processing_message]], 'internal': output['internal']}
+            yield {
+                'visible': output['visible'][:-1] + [[output['visible'][-1][0], shared.processing_message]],
+                'internal': output['internal']
+            }
     else:
         text, visible_text = output['internal'][-1][0], output['visible'][-1][0]
         if regenerate:
-            output['visible'].pop()
-            output['internal'].pop()
-
             # *Is typing...*
             if loading_message:
-                yield {'visible': output['visible'] + [[visible_text, shared.processing_message]], 'internal': output['internal']}
+                yield {
+                    'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]],
+                    'internal': output['internal'][:-1] + [[text, '']]
+                }
         elif _continue:
             last_reply = [output['internal'][-1][1], output['visible'][-1][1]]
             if loading_message:
-                yield {'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']], 'internal': output['internal']}
+                yield {
+                    'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']],
+                    'internal': output['internal']
+                }

     # Generate the prompt
     kwargs = {
         '_continue': _continue,
-        'history': output,
+        'history': output if _continue else {k: v[:-1] for k, v in output.items()}
     }
     prompt = apply_extensions('custom_generate_chat_prompt', text, state, **kwargs)
     if prompt is None:

@@ -270,12 +277,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             yield output
             return

-    if just_started:
-        just_started = False
-        if not _continue:
-            output['internal'].append(['', ''])
-            output['visible'].append(['', ''])
-
     if _continue:
         output['internal'][-1] = [text, last_reply[0] + reply]
         output['visible'][-1] = [visible_text, last_reply[1] + visible_reply]
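The net effect of the chat.py refactor: the new user row is appended to the history up front, so the *Is typing...* yields can slice with `[:-1]` instead of concatenating, and the prompt builder must receive a history without that still-empty row, except when continuing, where the last row is exactly what generation extends. A standalone illustration of that slicing (not repo code):

```python
# Illustrative only: mimic the new 'history' kwarg construction.
_continue = False
output = {
    'internal': [['hi', 'hello!'], ['how are you?', '']],  # new row already appended
    'visible':  [['hi', 'hello!'], ['how are you?', '']],
}

history_for_prompt = output if _continue else {k: v[:-1] for k, v in output.items()}
print(history_for_prompt['internal'])  # [['hi', 'hello!']], in-progress row excluded
```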
2 changes: 2 additions & 0 deletions modules/loaders.py
@@ -25,6 +25,7 @@
         'rope_freq_base',
         'compress_pos_emb',
         'disable_exllama',
+        'disable_exllamav2',
         'transformers_info'
     ],
     'llama.cpp': [

@@ -94,6 +95,7 @@
         'groupsize',
         'desc_act',
         'disable_exllama',
+        'disable_exllamav2',
         'gpu_memory',
         'cpu_memory',
         'cpu',