Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
fcb4187
wip
Jun 30, 2023
86a7c25
update shared.settings with model settings on load in UI
Jun 30, 2023
b9067fa
simplify model load on startup
Jun 30, 2023
e4ca89a
code cleanup, simplify
Jun 30, 2023
95d44f1
reorg.
Jun 30, 2023
4d10b74
more robust API
Jun 30, 2023
0cb4f83
shared.lora_names = [] on reload
Jun 30, 2023
d87fdcb
minor
Jun 30, 2023
514d166
reuse
Jun 30, 2023
9f7fde5
apply settings after defaults.
Jun 30, 2023
45ef210
updated api example, some fixups.
Jun 30, 2023
80f4e73
remove test case.
Jun 30, 2023
be84c7f
fix loading with --model
Jun 30, 2023
2184e19
stop reason for streaming, explain dropped msgs
Jun 30, 2023
75d63a7
overhaul parameters, errors and length truncation
Jul 1, 2023
0126b8c
re #2951: lora_names = [] to unload_model()
Jul 1, 2023
8a8f9b3
only error if logprobs or logit_bias is not usable
Jul 2, 2023
d1c8f66
don't error with empty unsupported params
Jul 2, 2023
6937611
lora_names = [] in model_unload() now.
Jul 3, 2023
8c8d9c8
update docs about guidance
Jul 4, 2023
30c10bd
include alpha_value in api-example-model.py
Jul 4, 2023
f865edf
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 4, 2023
85738a8
remove obsolete stopping_strings implementation
Jul 4, 2023
91a6a1e
reorder system prompts, set better top_k for chat
Jul 4, 2023
630ac3f
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 5, 2023
290c5a1
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 5, 2023
eedd9eb
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions api-examples/api-example-model.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def model_load(model_name):


# complex loader
def complex_model_load(model):
def complex_model_load(model, lora = []):

def guess_groupsize(model_name):
if '1024g' in model_name:
Expand All @@ -50,17 +50,25 @@ def guess_groupsize(model_name):
else:
return -1

loader_names = ['llama.cpp', 'Transformers', 'AutoGPTQ', 'GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF', 'RWKV', 'flexgen']

req = {
'action': 'load',
'model_name': model,
'args': {
'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama
'lora': lora,

'bf16': False,
'load_in_8bit': False,
'groupsize': 0,
'wbits': 0,

# Exllama
'gpu_split': None,
'max_seq_len': 2048,
'compress_pos_emb': 1,
'alpha_value': 1,

# llama.cpp
'threads': 0,
'n_batch': 512,
Expand Down Expand Up @@ -89,6 +97,18 @@ def guess_groupsize(model_name):
},
}

# Example of a more complex load
# CalderaAI_30B-Lazarus-GPTQ4bit in 24GB with superhot-8k lora and embedding compression
# Also set truncation_length = 3072 because 8k is not detected from the model.
if model == 'CalderaAI_30B-Lazarus-GPTQ4bit':
req['args']['loader'] = 'ExLlama'
req['args']['compress_pos_emb'] = 2
req['args']['max_seq_len'] = 3072
req['args']['lora'] = ['kaiokendev_superhot-30b-8k-no-rlhf-test']
req['settings'] = { 'truncation_length': req['args']['max_seq_len'] }
return model_api(req)


model = model.lower()

if '4bit' in model or 'gptq' in model or 'int4' in model:
Expand Down
66 changes: 48 additions & 18 deletions extensions/api/blocking_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from modules.chat import generate_chat_reply
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model
from modules.models_settings import (get_model_settings_from_yamls,
update_model_parameters)
from modules.models_settings import set_shared_model_settings
from modules.text_generation import (encode, generate_reply,
stop_everything_event)
from modules.utils import get_available_models
from modules.utils import (get_available_models,
get_available_loras,
get_available_settings)


def get_model_info():
Expand Down Expand Up @@ -107,58 +108,87 @@ def do_POST(self):
self.wfile.write(response.encode('utf-8'))

elif self.path == '/api/v1/model':

self.send_response(200)
self.send_header('Content-Type', 'application/json')
self.end_headers()

# by default return the same as the GET interface
result = shared.model_name

# Actions: info, load, list, unload
# Actions: info, load, list, unload, add_lora, list_lora, list_settings, settings
action = body.get('action', '')

if action == 'load':
model_name = body['model_name']
model_name = body.get('model_name', shared.model_name)
args = body.get('args', {})
print('args', args)
for k in args:
setattr(shared.args, k, args[k])
extra_settings = body.get('settings', {})

shared.model_name = model_name
unload_model()
print('Model load args:', args)

model_settings = get_model_settings_from_yamls(shared.model_name)
shared.settings.update(model_settings)
update_model_parameters(model_settings, initial=True)
unload_model()
shared.model_name = model_name
shared.args.model = model_name

for k in args:
setattr(shared.args, k, args[k])

if not shared.args.lora:
shared.args.lora = []

if shared.settings['mode'] != 'instruct':
shared.settings['instruction_template'] = None
set_shared_model_settings(extra_settings)
# TODO: fetch lora settings too, ie. +get_model_settings_from_yamls(lora)

try:
shared.model, shared.tokenizer = load_model(shared.model_name)
shared.model, shared.tokenizer = load_model(model_name)
if shared.args.lora:
add_lora_to_model(shared.args.lora) # list

except Exception as e:
response = json.dumps({'error': {'message': repr(e)}})

self.wfile.write(response.encode('utf-8'))
raise e

shared.args.model = shared.model_name

result = get_model_info()

elif action == 'add_lora':
lora = body.get('lora', [])
if not isinstance(lora, list):
lora = [lora]

try:
shared.args.lora = lora
add_lora_to_model(lora)
except Exception as e:
response = json.dumps({'error': {'message': repr(e)}})

self.wfile.write(response.encode('utf-8'))
raise e

result = get_model_info()

elif action == 'unload':
unload_model()
shared.model_name = None
shared.args.model = None
shared.args.lora = []
result = get_model_info()

elif action == 'list':
result = get_available_models()

elif action == 'info':
elif action == 'list_lora':
result = get_available_loras()

elif action == 'list_settings':
result = get_available_settings()

elif action == 'info' or action == 'settings':
extra_settings = body.get('settings', {})
if extra_settings:
set_shared_model_settings(extra_settings) # Settings are applied after the default settings for the model
result = get_model_info()

response = json.dumps({
Expand Down
3 changes: 1 addition & 2 deletions extensions/openai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,12 +218,11 @@ but there are some exceptions.
| ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero-shot Python/JSON coding. Some model-tailored prompt formatting improves results greatly. |
| ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context |
| ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 |
| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported |

## Future plans
* better error handling
* model changing, esp. something for swapping loras or embedding models
* consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard)
* do something about rate limiting or locking requests for completions, most systems will only be able to handle a single request at a time before OOM

## Bugs? Feedback? Comments? Pull requests?

Expand Down
Loading