diff --git a/api-examples/api-example-model.py b/api-examples/api-example-model.py index 8e1e3002b9..f2341fee31 100644 --- a/api-examples/api-example-model.py +++ b/api-examples/api-example-model.py @@ -38,7 +38,7 @@ def model_load(model_name): # complex loader -def complex_model_load(model): +def complex_model_load(model, lora = []): def guess_groupsize(model_name): if '1024g' in model_name: @@ -50,17 +50,25 @@ def guess_groupsize(model_name): else: return -1 + loader_names = ['llama.cpp', 'Transformers', 'AutoGPTQ', 'GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF', 'RWKV', 'flexgen'] + req = { 'action': 'load', 'model_name': model, 'args': { - 'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama + 'lora': lora, 'bf16': False, 'load_in_8bit': False, 'groupsize': 0, 'wbits': 0, + # Exllama + 'gpu_split': None, + 'max_seq_len': 2048, + 'compress_pos_emb': 1, + 'alpha_value': 1, + # llama.cpp 'threads': 0, 'n_batch': 512, @@ -89,6 +97,18 @@ def guess_groupsize(model_name): }, } + # Example of a more complex load + # CalderaAI_30B-Lazarus-GPTQ4bit in 24GB with superhot-8k lora and embedding compression + # Also set truncation_length = 3072 because 8k is not detected from the model. 
+ if model == 'CalderaAI_30B-Lazarus-GPTQ4bit': + req['args']['loader'] = 'ExLlama' + req['args']['compress_pos_emb'] = 2 + req['args']['max_seq_len'] = 3072 + req['args']['lora'] = ['kaiokendev_superhot-30b-8k-no-rlhf-test'] + req['settings'] = { 'truncation_length': req['args']['max_seq_len'] } + return model_api(req) + + model = model.lower() if '4bit' in model or 'gptq' in model or 'int4' in model: diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index edc6d8f41f..8ee0eaced3 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -7,11 +7,12 @@ from modules.chat import generate_chat_reply from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model -from modules.models_settings import (get_model_settings_from_yamls, - update_model_parameters) +from modules.models_settings import set_shared_model_settings from modules.text_generation import (encode, generate_reply, stop_everything_event) -from modules.utils import get_available_models +from modules.utils import (get_available_models, + get_available_loras, + get_available_settings) def get_model_info(): @@ -107,6 +108,7 @@ def do_POST(self): self.wfile.write(response.encode('utf-8')) elif self.path == '/api/v1/model': + self.send_response(200) self.send_header('Content-Type', 'application/json') self.end_headers() @@ -114,34 +116,36 @@ def do_POST(self): # by default return the same as the GET interface result = shared.model_name - # Actions: info, load, list, unload + # Actions: info, load, list, unload, add_lora, list_lora, list_settings, settings action = body.get('action', '') if action == 'load': - model_name = body['model_name'] + model_name = body.get('model_name', shared.model_name) args = body.get('args', {}) - print('args', args) - for k in args: - setattr(shared.args, k, args[k]) + extra_settings = body.get('settings', {}) - shared.model_name = model_name - unload_model() + print('Model load args:', args) - 
model_settings = get_model_settings_from_yamls(shared.model_name) - shared.settings.update(model_settings) - update_model_parameters(model_settings, initial=True) + unload_model() + shared.model_name = model_name + shared.args.model = model_name + + for k in args: + setattr(shared.args, k, args[k]) + + if not shared.args.lora: + shared.args.lora = [] - if shared.settings['mode'] != 'instruct': - shared.settings['instruction_template'] = None + set_shared_model_settings(extra_settings) + # TODO: fetch lora settings too, ie. +get_model_settings_from_yamls(lora) try: - shared.model, shared.tokenizer = load_model(shared.model_name) + shared.model, shared.tokenizer = load_model(model_name) if shared.args.lora: add_lora_to_model(shared.args.lora) # list except Exception as e: response = json.dumps({'error': {'message': repr(e)}}) - self.wfile.write(response.encode('utf-8')) raise e @@ -149,16 +153,42 @@ def do_POST(self): result = get_model_info() + elif action == 'add_lora': + lora = body.get('lora', []) + if not isinstance(lora, list): + lora = [lora] + + try: + shared.args.lora = lora + add_lora_to_model(lora) + except Exception as e: + response = json.dumps({'error': {'message': repr(e)}}) + + self.wfile.write(response.encode('utf-8')) + raise e + + result = get_model_info() + elif action == 'unload': unload_model() shared.model_name = None shared.args.model = None + shared.args.lora = [] result = get_model_info() elif action == 'list': result = get_available_models() - elif action == 'info': + elif action == 'list_lora': + result = get_available_loras() + + elif action == 'list_settings': + result = get_available_settings() + + elif action == 'info' or action == 'settings': + extra_settings = body.get('settings', {}) + if extra_settings: + set_shared_model_settings(extra_settings) # Settings are applied after the default settings for the model result = get_model_info() response = json.dumps({ diff --git a/extensions/openai/README.md b/extensions/openai/README.md 
index 0f775bbfb6..7bbc1e8311 100644 --- a/extensions/openai/README.md +++ b/extensions/openai/README.md @@ -218,12 +218,11 @@ but there are some exceptions. | ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero shot python/json coding. Some model tailored prompt formatting improves results greatly. | | ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context | | ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | +| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported | ## Future plans -* better error handling * model changing, esp. something for swapping loras or embedding models * consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard) -* do something about rate limiting or locking requests for completions, most systems will only be able handle a single request at a time before OOM ## Bugs? Feedback? Comments? Pull requests? diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 323d68236b..57bdf33b2b 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -24,7 +24,7 @@ # Slightly different defaults for OpenAI's API # Data type is important, Ex. 
use 0.0 for a float 0 default_req_params = { - 'max_new_tokens': 200, + 'max_new_tokens': 16, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 1, @@ -36,7 +36,7 @@ 'echo': False, 'seed': -1, # 'n' : default(body, 'n', 1), # 'n' doesn't have a direct map - 'truncation_length': 2048, + 'truncation_length': 2048, # first use shared.settings value 'add_bos_token': True, 'do_sample': True, 'typical_p': 1.0, @@ -258,66 +258,84 @@ def do_POST(self): is_chat_request = 'chat' in self.path resp_list = 'data' if is_legacy else 'choices' - # XXX model is ignored for now - # model = body.get('model', shared.model_name) # ignored, use existing for now - model = shared.model_name - created_time = int(time.time()) - - cmpl_id = "chatcmpl-%d" % (created_time) if is_chat_request else "conv-%d" % (created_time) + created_time = int(time.time()*1000) # Request Parameters # Try to use openai defaults or map them to something with the same intent req_params = default_req_params.copy() stopping_strings = [] - if 'stop' in body: - if isinstance(body['stop'], str): - stopping_strings.extend([body['stop']]) - elif isinstance(body['stop'], list): - stopping_strings.extend(body['stop']) - + # Common request parameters truncation_length = default(shared.settings, 'truncation_length', 2048) - truncation_length = clamp(default(body, 'truncation_length', truncation_length), 1, truncation_length) - - default_max_tokens = truncation_length if is_chat_request else 16 # completions default, chat default is 'inf' so we need to cap it. 
- - max_tokens_str = 'length' if is_legacy else 'max_tokens' - max_tokens = default(body, max_tokens_str, default(shared.settings, 'max_new_tokens', default_max_tokens)) - # if the user assumes OpenAI, the max_tokens is way too large - try to ignore it unless it's small enough - - req_params['max_new_tokens'] = max_tokens req_params['truncation_length'] = truncation_length - req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0 - req_params['top_p'] = clamp(default(body, 'top_p', default_req_params['top_p']), 0.001, 1.0) - req_params['top_k'] = default(body, 'best_of', default_req_params['top_k']) - req_params['suffix'] = default(body, 'suffix', default_req_params['suffix']) - req_params['stream'] = default(body, 'stream', default_req_params['stream']) - req_params['echo'] = default(body, 'echo', default_req_params['echo']) - req_params['seed'] = shared.settings.get('seed', default_req_params['seed']) req_params['add_bos_token'] = shared.settings.get('add_bos_token', default_req_params['add_bos_token']) + req_params['seed'] = shared.settings.get('seed', default_req_params['seed']) - is_streaming = req_params['stream'] - - self.send_response(200) - self.send_access_control_headers() - if is_streaming: - self.send_header('Content-Type', 'text/event-stream') - self.send_header('Cache-Control', 'no-cache') - # self.send_header('Connection', 'keep-alive') + # OpenAI API Parameters + # model - ignored for now, TODO: When we can reliably load a model or lora from a name only change this + # model = body.get('model', shared.model_name) + model = shared.model_name # return the real model name + req_params['suffix'] = default(body, 'suffix', default_req_params['suffix']) + max_tokens = 0 + max_tokens_str = 'length' if is_legacy else 'max_tokens' + if is_chat_request: + # chat default max_tokens is 'inf', but also flexible + if max_tokens_str in body: + max_tokens = default(body, max_tokens_str, 
truncation_length) + req_params['max_new_tokens'] = max_tokens + else: + max_tokens = 0 + req_params['max_new_tokens'] = truncation_length else: - self.send_header('Content-Type', 'application/json') - self.end_headers() + max_tokens = default(body, max_tokens_str, default_req_params['max_new_tokens']) + req_params['max_new_tokens'] = max_tokens + req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0/2.0 + req_params['top_p'] = clamp(default(body, 'top_p', default_req_params['top_p']), 0.001, 1.0) + n = default(body, 'n', 1) + if n != 1: + self.openai_error(message="Only n = 1 is supported.", code=400, error_type='InvalidRequestError') + return + is_streaming = default(body, 'stream', default_req_params['stream']) + req_params['stream'] = is_streaming + if 'stop' in body: + if isinstance(body['stop'], str): + stopping_strings.extend([body['stop']]) + elif isinstance(body['stop'], list): + stopping_strings.extend(body['stop']) + # presence_penalty - ignored + # frequency_penalty - ignored + if body.get('logit_bias', None): + self.openai_error(message="logit_bias is not supported.", code=400, error_type='InvalidRequestError') + return + # user - ignored token_count = 0 completion_token_count = 0 prompt = '' stream_object_type = '' object_type = '' + cmpl_id = '' if is_chat_request: # Chat Completions stream_object_type = 'chat.completions.chunk' object_type = 'chat.completions' + cmpl_id = "chatcmpl-%d" % (created_time) + + + if body.get('functions', []): # chat only + self.openai_error(message="functions is not supported.", code=400, error_type='InvalidRequestError') + return + if body.get('function_call', ''): # chat only, 'none', 'auto', {'name': 'func'} + self.openai_error(message="function_call is not supported.", code=400, error_type='InvalidRequestError') + return + + # messages - chat only + if not 'messages' in body: + self.openai_error(message="messages is required", code=400, 
error_type='InvalidRequestError') + return + + req_params['top_k'] = 20 # There is no best_of/top_k param for chat, but it is much improved with a higher top_k. messages = body['messages'] @@ -371,15 +389,18 @@ def do_POST(self): system_msgs = [] chat_msgs = [] + def end_line(s): + if s and s[-1] != '\n': + s = s + '\n' + return s + # You are ChatGPT, a large language model trained by OpenAI. Answer as concisely as possible. Knowledge cutoff: {knowledge_cutoff} Current date: {current_date} context_msg = role_formats['system'].format(message=role_formats['context']) if role_formats['context'] else '' - if context_msg: - system_msgs.extend([context_msg]) + context_msg = end_line(context_msg) # Maybe they sent both? This is not documented in the API, but some clients seem to do this. if 'prompt' in body: - prompt_msg = role_formats['system'].format(message=body['prompt']) - system_msgs.extend([prompt_msg]) + context_msg = end_line(role_formats['system'].format(message=body['prompt'])) + context_msg for m in messages: role = m['role'] @@ -390,33 +411,29 @@ def do_POST(self): else: chat_msgs.extend([msg]) - # can't really truncate the system messages system_msg = '\n'.join(system_msgs) - if system_msg and system_msg[-1] != '\n': - system_msg = system_msg + '\n' - - system_token_count = len(encode(system_msg)[0]) - remaining_tokens = truncation_length - system_token_count - chat_msg = '' - - while chat_msgs: - new_msg = chat_msgs.pop() - new_size = len(encode(new_msg)[0]) - if new_size <= remaining_tokens: - chat_msg = new_msg + chat_msg - remaining_tokens -= new_size - else: - print(f"Warning: too many messages for context size, dropping {len(chat_msgs) + 1} oldest message(s).") - break + system_msg = end_line(system_msg) - prompt = system_msg + chat_msg + role_formats['prompt'] + prompt = system_msg + context_msg + ''.join(chat_msgs) + role_formats['prompt'] token_count = len(encode(prompt)[0]) + if token_count >= truncation_length: + err_msg = f"This model maximum 
context length is {truncation_length} tokens. However, your messages resulted in over {token_count} tokens." + self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + return + + if max_tokens > 0 and token_count + max_tokens > truncation_length: + err_msg = f"This model maximum context length is {truncation_length} tokens. However, your messages resulted in over {token_count} tokens and max_tokens is {max_tokens}." + print(f"Warning: {err_msg}") + #self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + #return + else: # Text Completions stream_object_type = 'text_completion.chunk' object_type = 'text_completion' + cmpl_id = "conv-%d" % (created_time) # ... encoded as a string, array of strings, array of tokens, or array of token arrays. if is_legacy: @@ -425,21 +442,34 @@ def do_POST(self): prompt = body['prompt'] # XXX this can be different types if isinstance(prompt, list): - self.openai_error("API Batched generation not yet supported.") + self.openai_error(message="API Batched generation not yet supported.", code=400, error_type='InvalidRequestError') return token_count = len(encode(prompt)[0]) - if token_count >= truncation_length: - new_len = int(len(prompt) * shared.settings['truncation_length'] / token_count) - prompt = prompt[-new_len:] - new_token_count = len(encode(prompt)[0]) - print(f"Warning: truncating prompt to {new_len} characters, was {token_count} tokens. Now: {new_token_count} tokens.") - token_count = new_token_count - if truncation_length - token_count < req_params['max_new_tokens']: - print(f"Warning: Ignoring max_new_tokens ({req_params['max_new_tokens']}), too large for the remaining context. 
Remaining tokens: {truncation_length - token_count}") - req_params['max_new_tokens'] = truncation_length - token_count - print(f"Warning: Set max_new_tokens = {req_params['max_new_tokens']}") + if token_count + max_tokens > truncation_length: + err_msg = f"The token count of your prompt ({token_count}) plus max_tokens ({max_tokens}) cannot exceed the model's context length ({truncation_length})." + #print(f"Warning: ${err_msg}") + self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + return + + if body.get('logprobs', None): + self.openai_error(message="logprobs is not supported.", code=400, error_type='InvalidRequestError') + return + req_params['echo'] = default(body, 'echo', default_req_params['echo']) + req_params['top_k'] = default(body, 'best_of', default_req_params['top_k']) + + + # Send HTTP headers + self.send_response(200) + self.send_access_control_headers() + if is_streaming: + self.send_header('Content-Type', 'text/event-stream') + self.send_header('Cache-Control', 'no-cache') + # self.send_header('Connection', 'keep-alive') + else: + self.send_header('Content-Type', 'application/json') + self.end_headers() if is_streaming: # begin streaming @@ -471,42 +501,13 @@ def do_POST(self): answer = '' seen_content = '' - longest_stop_len = max([len(x) for x in stopping_strings] + [0]) for a in generator: answer = a - stop_string_found = False - len_seen = len(seen_content) - search_start = max(len_seen - longest_stop_len, 0) - - for string in stopping_strings: - idx = answer.find(string, search_start) - if idx != -1: - answer = answer[:idx] # clip it. 
- stop_string_found = True - - if stop_string_found: - break - - # If something like "\nYo" is generated just before "\nYou:" - # is completed, buffer and generate more, don't send it - buffer_and_continue = False - - for string in stopping_strings: - for j in range(len(string) - 1, 0, -1): - if answer[-j:] == string[:j]: - buffer_and_continue = True - break - else: - continue - break - - if buffer_and_continue: - continue - if is_streaming: # Streaming + len_seen = len(seen_content) new_content = answer[len_seen:] if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. @@ -539,6 +540,9 @@ def do_POST(self): completion_token_count += len(encode(new_content)[0]) if is_streaming: + stop_reason = "stop" + if token_count + completion_token_count >= truncation_length or completion_token_count >= max_tokens: + stop_reason = "length" chunk = { "id": cmpl_id, "object": stream_object_type, @@ -546,7 +550,7 @@ def do_POST(self): "model": model, # TODO: add Lora info? resp_list: [{ "index": 0, - "finish_reason": "stop", + "finish_reason": stop_reason, }], "usage": { "prompt_tokens": token_count, @@ -579,7 +583,7 @@ def do_POST(self): completion_token_count = len(encode(answer)[0]) stop_reason = "stop" - if token_count + completion_token_count >= truncation_length: + if token_count + completion_token_count >= truncation_length or completion_token_count >= max_tokens: stop_reason = "length" resp = { @@ -611,14 +615,8 @@ def do_POST(self): self.openai_error("No model loaded.") return - self.send_response(200) - self.send_access_control_headers() - self.send_header('Content-Type', 'application/json') - self.end_headers() - created_time = int(time.time()) - # Using Alpaca format, this may work with other models too. 
instruction = body['instruction'] input = body.get('input', '') @@ -668,6 +666,16 @@ def do_POST(self): token_count = len(encode(edit_task)[0]) max_tokens = truncation_length - token_count + if max_tokens < 2: + err_msg = f"This model maximum context length is {truncation_length} tokens. However, your messages resulted in over {truncation_length - max_tokens} tokens." + self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + return + + self.send_response(200) + self.send_access_control_headers() + self.send_header('Content-Type', 'application/json') + self.end_headers() + req_params['max_new_tokens'] = max_tokens req_params['truncation_length'] = truncation_length req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0 @@ -682,24 +690,9 @@ def do_POST(self): longest_stop_len = max([len(x) for x in stopping_strings] + [0]) answer = '' - seen_content = '' for a in generator: answer = a - stop_string_found = False - len_seen = len(seen_content) - search_start = max(len_seen - longest_stop_len, 0) - - for string in stopping_strings: - idx = answer.find(string, search_start) - if idx != -1: - answer = answer[:idx] # clip it. - stop_string_found = True - - if stop_string_found: - break - - # some reply's have an extra leading space to fit the instruction template, just clip it off from the reply. 
if edit_task[-1] != '\n' and answer and answer[0] == ' ': answer = answer[1:] diff --git a/modules/models_settings.py b/modules/models_settings.py index 0207e7de76..b62f844350 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -2,6 +2,7 @@ from pathlib import Path import yaml +import copy from modules import shared, ui @@ -132,3 +133,27 @@ def save_model_settings(model, state): f.write(yaml.dump(user_config, sort_keys=False)) yield (f"Settings for {model} saved to {p}") + + +# Update the shared.settings with new settings +# the extra_settings dict is applied after the default settings for the model +def set_shared_model_settings(extra_settings = {}): + model_settings = get_model_settings_from_yamls(shared.model_name) # get current model settings + new_shared_settings = copy.deepcopy(shared.settings) + new_shared_settings.update(model_settings) + + # set each setting and ensure the correct type + for k in extra_settings: + if k in new_shared_settings: + new_shared_settings[k] = type(new_shared_settings[k])(extra_settings[k]) + else: + print(f"Warning: Setting unknown model setting: {k} = {extra_settings[k]}") + new_shared_settings[k] = extra_settings[k] + + update_model_parameters(new_shared_settings, initial=True) + + if new_shared_settings['mode'] != 'instruct': + new_shared_settings['instruction_template'] = None + + shared.settings = new_shared_settings + diff --git a/modules/utils.py b/modules/utils.py index 72a0dfa126..57c9a5ec9d 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -124,3 +124,7 @@ def get_available_chat_styles(): def get_available_sessions(): items = sorted(set(k.stem for k in Path('logs').glob(f'session_{shared.get_mode()}*')), key=natural_keys, reverse=True) return [item for item in items if 'autosave' in item] + [item for item in items if 'autosave' not in item] + + +def get_available_settings(): + return sorted(shared.settings.keys()) diff --git a/server.py b/server.py index 5c224e362e..d1b090a3a0 100644 --- 
a/server.py +++ b/server.py @@ -40,7 +40,7 @@ from modules.models import load_model, unload_model from modules.models_settings import ( apply_model_settings_to_state, - get_model_settings_from_yamls, + set_shared_model_settings, save_model_settings, update_model_parameters ) @@ -63,6 +63,7 @@ def load_model_wrapper(selected_model, loader, autoload=False): try: yield f"Loading {selected_model}..." shared.model_name = selected_model + set_shared_model_settings() unload_model() if selected_model != '': shared.model, shared.tokenizer = load_model(shared.model_name, loader) @@ -1146,12 +1147,10 @@ def create_interface(): # If any model has been selected, load it if shared.model_name != 'None': - model_settings = get_model_settings_from_yamls(shared.model_name) - shared.settings.update(model_settings) # hijacking the interface defaults - update_model_parameters(model_settings, initial=True) # hijacking the command-line arguments - # Load the model - shared.model, shared.tokenizer = load_model(shared.model_name) + for resp in load_model_wrapper(shared.model_name, shared.args.loader, autoload=True): + print(resp) + if shared.args.lora: add_lora_to_model(shared.args.lora)