diff --git a/api-examples/api-example-model.py b/api-examples/api-example-model.py index 8e1e3002b9..f2341fee31 100644 --- a/api-examples/api-example-model.py +++ b/api-examples/api-example-model.py @@ -38,7 +38,7 @@ def model_load(model_name): # complex loader -def complex_model_load(model): +def complex_model_load(model, lora = []): def guess_groupsize(model_name): if '1024g' in model_name: @@ -50,17 +50,25 @@ def guess_groupsize(model_name): else: return -1 + loader_names = ['llama.cpp', 'Transformers', 'AutoGPTQ', 'GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF', 'RWKV', 'flexgen'] + req = { 'action': 'load', 'model_name': model, 'args': { - 'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama + 'lora': lora, 'bf16': False, 'load_in_8bit': False, 'groupsize': 0, 'wbits': 0, + # Exllama + 'gpu_split': None, + 'max_seq_len': 2048, + 'compress_pos_emb': 1, + 'alpha_value': 1, + # llama.cpp 'threads': 0, 'n_batch': 512, @@ -89,6 +97,18 @@ def guess_groupsize(model_name): }, } + # Example of a more complex load + # CalderaAI_30B-Lazarus-GPTQ4bit in 24GB with superhot-8k lora and embedding compression + # Also set truncation_length = 3072 because 8k is not detected from the model. 
+ if model == 'CalderaAI_30B-Lazarus-GPTQ4bit': + req['args']['loader'] = 'ExLlama' + req['args']['compress_pos_emb'] = 2 + req['args']['max_seq_len'] = 3072 + req['args']['lora'] = ['kaiokendev_superhot-30b-8k-no-rlhf-test'] + req['settings'] = { 'truncation_length': req['args']['max_seq_len'] } + return model_api(req) + + model = model.lower() if '4bit' in model or 'gptq' in model or 'int4' in model: diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index edc6d8f41f..8ee0eaced3 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -7,11 +7,12 @@ from modules.chat import generate_chat_reply from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model -from modules.models_settings import (get_model_settings_from_yamls, - update_model_parameters) +from modules.models_settings import set_shared_model_settings from modules.text_generation import (encode, generate_reply, stop_everything_event) -from modules.utils import get_available_models +from modules.utils import (get_available_models, + get_available_loras, + get_available_settings) def get_model_info(): @@ -107,6 +108,7 @@ def do_POST(self): self.wfile.write(response.encode('utf-8')) elif self.path == '/api/v1/model': + self.send_response(200) self.send_header('Content-Type', 'application/json') self.end_headers() @@ -114,34 +116,36 @@ def do_POST(self): # by default return the same as the GET interface result = shared.model_name - # Actions: info, load, list, unload + # Actions: info, load, list, unload, add_lora, list_lora, list_settings, settings action = body.get('action', '') if action == 'load': - model_name = body['model_name'] + model_name = body.get('model_name', shared.model_name) args = body.get('args', {}) - print('args', args) - for k in args: - setattr(shared.args, k, args[k]) + extra_settings = body.get('settings', {}) - shared.model_name = model_name - unload_model() + print('Model load args:', args) - 
model_settings = get_model_settings_from_yamls(shared.model_name) - shared.settings.update(model_settings) - update_model_parameters(model_settings, initial=True) + unload_model() + shared.model_name = model_name + shared.args.model = model_name + + for k in args: + setattr(shared.args, k, args[k]) + + if not shared.args.lora: + shared.args.lora = [] - if shared.settings['mode'] != 'instruct': - shared.settings['instruction_template'] = None + set_shared_model_settings(extra_settings) + # TODO: fetch lora settings too, ie. +get_model_settings_from_yamls(lora) try: - shared.model, shared.tokenizer = load_model(shared.model_name) + shared.model, shared.tokenizer = load_model(model_name) if shared.args.lora: add_lora_to_model(shared.args.lora) # list except Exception as e: response = json.dumps({'error': {'message': repr(e)}}) - self.wfile.write(response.encode('utf-8')) raise e @@ -149,16 +153,42 @@ def do_POST(self): result = get_model_info() + elif action == 'add_lora': + lora = body.get('lora', []) + if not isinstance(lora, list): + lora = [lora] + + try: + shared.args.lora = lora + add_lora_to_model(lora) + except Exception as e: + response = json.dumps({'error': {'message': repr(e)}}) + + self.wfile.write(response.encode('utf-8')) + raise e + + result = get_model_info() + elif action == 'unload': unload_model() shared.model_name = None shared.args.model = None + shared.args.lora = [] result = get_model_info() elif action == 'list': result = get_available_models() - elif action == 'info': + elif action == 'list_lora': + result = get_available_loras() + + elif action == 'list_settings': + result = get_available_settings() + + elif action == 'info' or action == 'settings': + extra_settings = body.get('settings', {}) + if extra_settings: + set_shared_model_settings(extra_settings) # Settings are applied after the default settings for the model result = get_model_info() response = json.dumps({ diff --git a/extensions/openai/README.md b/extensions/openai/README.md 
index 0f775bbfb6..7bbc1e8311 100644 --- a/extensions/openai/README.md +++ b/extensions/openai/README.md @@ -218,12 +218,11 @@ but there are some exceptions. | ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero shot python/json coding. Some model tailored prompt formatting improves results greatly. | | ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context | | ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | +| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported | ## Future plans -* better error handling * model changing, esp. something for swapping loras or embedding models * consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard) -* do something about rate limiting or locking requests for completions, most systems will only be able handle a single request at a time before OOM ## Bugs? Feedback? Comments? Pull requests? diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 323d68236b..57bdf33b2b 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -24,7 +24,7 @@ # Slightly different defaults for OpenAI's API # Data type is important, Ex. 
use 0.0 for a float 0 default_req_params = { - 'max_new_tokens': 200, + 'max_new_tokens': 16, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 1, @@ -36,7 +36,7 @@ 'echo': False, 'seed': -1, # 'n' : default(body, 'n', 1), # 'n' doesn't have a direct map - 'truncation_length': 2048, + 'truncation_length': 2048, # first use shared.settings value 'add_bos_token': True, 'do_sample': True, 'typical_p': 1.0, @@ -258,66 +258,84 @@ def do_POST(self): is_chat_request = 'chat' in self.path resp_list = 'data' if is_legacy else 'choices' - # XXX model is ignored for now - # model = body.get('model', shared.model_name) # ignored, use existing for now - model = shared.model_name - created_time = int(time.time()) - - cmpl_id = "chatcmpl-%d" % (created_time) if is_chat_request else "conv-%d" % (created_time) + created_time = int(time.time()*1000) # Request Parameters # Try to use openai defaults or map them to something with the same intent req_params = default_req_params.copy() stopping_strings = [] - if 'stop' in body: - if isinstance(body['stop'], str): - stopping_strings.extend([body['stop']]) - elif isinstance(body['stop'], list): - stopping_strings.extend(body['stop']) - + # Common request parameters truncation_length = default(shared.settings, 'truncation_length', 2048) - truncation_length = clamp(default(body, 'truncation_length', truncation_length), 1, truncation_length) - - default_max_tokens = truncation_length if is_chat_request else 16 # completions default, chat default is 'inf' so we need to cap it. 
- - max_tokens_str = 'length' if is_legacy else 'max_tokens' - max_tokens = default(body, max_tokens_str, default(shared.settings, 'max_new_tokens', default_max_tokens)) - # if the user assumes OpenAI, the max_tokens is way too large - try to ignore it unless it's small enough - - req_params['max_new_tokens'] = max_tokens req_params['truncation_length'] = truncation_length - req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0 - req_params['top_p'] = clamp(default(body, 'top_p', default_req_params['top_p']), 0.001, 1.0) - req_params['top_k'] = default(body, 'best_of', default_req_params['top_k']) - req_params['suffix'] = default(body, 'suffix', default_req_params['suffix']) - req_params['stream'] = default(body, 'stream', default_req_params['stream']) - req_params['echo'] = default(body, 'echo', default_req_params['echo']) - req_params['seed'] = shared.settings.get('seed', default_req_params['seed']) req_params['add_bos_token'] = shared.settings.get('add_bos_token', default_req_params['add_bos_token']) + req_params['seed'] = shared.settings.get('seed', default_req_params['seed']) - is_streaming = req_params['stream'] - - self.send_response(200) - self.send_access_control_headers() - if is_streaming: - self.send_header('Content-Type', 'text/event-stream') - self.send_header('Cache-Control', 'no-cache') - # self.send_header('Connection', 'keep-alive') + # OpenAI API Parameters + # model - ignored for now, TODO: When we can reliably load a model or lora from a name only change this + # model = body.get('model', shared.model_name) + model = shared.model_name # return the real model name + req_params['suffix'] = default(body, 'suffix', default_req_params['suffix']) + max_tokens = 0 + max_tokens_str = 'length' if is_legacy else 'max_tokens' + if is_chat_request: + # chat default max_tokens is 'inf', but also flexible + if max_tokens_str in body: + max_tokens = default(body, max_tokens_str, 
truncation_length) + req_params['max_new_tokens'] = max_tokens + else: + max_tokens = 0 + req_params['max_new_tokens'] = truncation_length else: - self.send_header('Content-Type', 'application/json') - self.end_headers() + max_tokens = default(body, max_tokens_str, default_req_params['max_new_tokens']) + req_params['max_new_tokens'] = max_tokens + req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0/2.0 + req_params['top_p'] = clamp(default(body, 'top_p', default_req_params['top_p']), 0.001, 1.0) + n = default(body, 'n', 1) + if n != 1: + self.openai_error(message="Only n = 1 is supported.", code=400, error_type='InvalidRequestError') + return + is_streaming = default(body, 'stream', default_req_params['stream']) + req_params['stream'] = is_streaming + if 'stop' in body: + if isinstance(body['stop'], str): + stopping_strings.extend([body['stop']]) + elif isinstance(body['stop'], list): + stopping_strings.extend(body['stop']) + # presence_penalty - ignored + # frequency_penalty - ignored + if body.get('logit_bias', None): + self.openai_error(message="logit_bias is not supported.", code=400, error_type='InvalidRequestError') + return + # user - ignored token_count = 0 completion_token_count = 0 prompt = '' stream_object_type = '' object_type = '' + cmpl_id = '' if is_chat_request: # Chat Completions stream_object_type = 'chat.completions.chunk' object_type = 'chat.completions' + cmpl_id = "chatcmpl-%d" % (created_time) + + + if body.get('functions', []): # chat only + self.openai_error(message="functions is not supported.", code=400, error_type='InvalidRequestError') + return + if body.get('function_call', ''): # chat only, 'none', 'auto', {'name': 'func'} + self.openai_error(message="function_call is not supported.", code=400, error_type='InvalidRequestError') + return + + # messages - chat only + if not 'messages' in body: + self.openai_error(message="messages is required", code=400, 
error_type='InvalidRequestError') + return + + req_params['top_k'] = 20 # There is no best_of/top_k param for chat, but it is much improved with a higher top_k. messages = body['messages'] @@ -371,15 +389,18 @@ def do_POST(self): system_msgs = [] chat_msgs = [] + def end_line(s): + if s and s[-1] != '\n': + s = s + '\n' + return s + # You are ChatGPT, a large language model trained by OpenAI. Answer as concisely as possible. Knowledge cutoff: {knowledge_cutoff} Current date: {current_date} context_msg = role_formats['system'].format(message=role_formats['context']) if role_formats['context'] else '' - if context_msg: - system_msgs.extend([context_msg]) + context_msg = end_line(context_msg) # Maybe they sent both? This is not documented in the API, but some clients seem to do this. if 'prompt' in body: - prompt_msg = role_formats['system'].format(message=body['prompt']) - system_msgs.extend([prompt_msg]) + context_msg = end_line(role_formats['system'].format(message=body['prompt'])) + context_msg for m in messages: role = m['role'] @@ -390,33 +411,29 @@ def do_POST(self): else: chat_msgs.extend([msg]) - # can't really truncate the system messages system_msg = '\n'.join(system_msgs) - if system_msg and system_msg[-1] != '\n': - system_msg = system_msg + '\n' - - system_token_count = len(encode(system_msg)[0]) - remaining_tokens = truncation_length - system_token_count - chat_msg = '' - - while chat_msgs: - new_msg = chat_msgs.pop() - new_size = len(encode(new_msg)[0]) - if new_size <= remaining_tokens: - chat_msg = new_msg + chat_msg - remaining_tokens -= new_size - else: - print(f"Warning: too many messages for context size, dropping {len(chat_msgs) + 1} oldest message(s).") - break + system_msg = end_line(system_msg) - prompt = system_msg + chat_msg + role_formats['prompt'] + prompt = system_msg + context_msg + ''.join(chat_msgs) + role_formats['prompt'] token_count = len(encode(prompt)[0]) + if token_count >= truncation_length: + err_msg = f"This model maximum 
context length is {truncation_length} tokens. However, your messages resulted in over {token_count} tokens." + self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + return + + if max_tokens > 0 and token_count + max_tokens > truncation_length: + err_msg = f"This model maximum context length is {truncation_length} tokens. However, your messages resulted in over {token_count} tokens and max_tokens is {max_tokens}." + print(f"Warning: {err_msg}") + #self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + #return + else: # Text Completions stream_object_type = 'text_completion.chunk' object_type = 'text_completion' + cmpl_id = "conv-%d" % (created_time) # ... encoded as a string, array of strings, array of tokens, or array of token arrays. if is_legacy: @@ -425,21 +442,34 @@ def do_POST(self): prompt = body['prompt'] # XXX this can be different types if isinstance(prompt, list): - self.openai_error("API Batched generation not yet supported.") + self.openai_error(message="API Batched generation not yet supported.", code=400, error_type='InvalidRequestError') return token_count = len(encode(prompt)[0]) - if token_count >= truncation_length: - new_len = int(len(prompt) * shared.settings['truncation_length'] / token_count) - prompt = prompt[-new_len:] - new_token_count = len(encode(prompt)[0]) - print(f"Warning: truncating prompt to {new_len} characters, was {token_count} tokens. Now: {new_token_count} tokens.") - token_count = new_token_count - if truncation_length - token_count < req_params['max_new_tokens']: - print(f"Warning: Ignoring max_new_tokens ({req_params['max_new_tokens']}), too large for the remaining context. 
Remaining tokens: {truncation_length - token_count}") - req_params['max_new_tokens'] = truncation_length - token_count - print(f"Warning: Set max_new_tokens = {req_params['max_new_tokens']}") + if token_count + max_tokens > truncation_length: + err_msg = f"The token count of your prompt ({token_count}) plus max_tokens ({max_tokens}) cannot exceed the model's context length ({truncation_length})." + #print(f"Warning: ${err_msg}") + self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + return + + if body.get('logprobs', None): + self.openai_error(message="logprobs is not supported.", code=400, error_type='InvalidRequestError') + return + req_params['echo'] = default(body, 'echo', default_req_params['echo']) + req_params['top_k'] = default(body, 'best_of', default_req_params['top_k']) + + + # Send HTTP headers + self.send_response(200) + self.send_access_control_headers() + if is_streaming: + self.send_header('Content-Type', 'text/event-stream') + self.send_header('Cache-Control', 'no-cache') + # self.send_header('Connection', 'keep-alive') + else: + self.send_header('Content-Type', 'application/json') + self.end_headers() if is_streaming: # begin streaming @@ -471,42 +501,13 @@ def do_POST(self): answer = '' seen_content = '' - longest_stop_len = max([len(x) for x in stopping_strings] + [0]) for a in generator: answer = a - stop_string_found = False - len_seen = len(seen_content) - search_start = max(len_seen - longest_stop_len, 0) - - for string in stopping_strings: - idx = answer.find(string, search_start) - if idx != -1: - answer = answer[:idx] # clip it. 
- stop_string_found = True - - if stop_string_found: - break - - # If something like "\nYo" is generated just before "\nYou:" - # is completed, buffer and generate more, don't send it - buffer_and_continue = False - - for string in stopping_strings: - for j in range(len(string) - 1, 0, -1): - if answer[-j:] == string[:j]: - buffer_and_continue = True - break - else: - continue - break - - if buffer_and_continue: - continue - if is_streaming: # Streaming + len_seen = len(seen_content) new_content = answer[len_seen:] if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. @@ -539,6 +540,9 @@ def do_POST(self): completion_token_count += len(encode(new_content)[0]) if is_streaming: + stop_reason = "stop" + if token_count + completion_token_count >= truncation_length or completion_token_count >= max_tokens: + stop_reason = "length" chunk = { "id": cmpl_id, "object": stream_object_type, @@ -546,7 +550,7 @@ def do_POST(self): "model": model, # TODO: add Lora info? resp_list: [{ "index": 0, - "finish_reason": "stop", + "finish_reason": stop_reason, }], "usage": { "prompt_tokens": token_count, @@ -579,7 +583,7 @@ def do_POST(self): completion_token_count = len(encode(answer)[0]) stop_reason = "stop" - if token_count + completion_token_count >= truncation_length: + if token_count + completion_token_count >= truncation_length or completion_token_count >= max_tokens: stop_reason = "length" resp = { @@ -611,14 +615,8 @@ def do_POST(self): self.openai_error("No model loaded.") return - self.send_response(200) - self.send_access_control_headers() - self.send_header('Content-Type', 'application/json') - self.end_headers() - created_time = int(time.time()) - # Using Alpaca format, this may work with other models too. 
instruction = body['instruction'] input = body.get('input', '') @@ -668,6 +666,16 @@ def do_POST(self): token_count = len(encode(edit_task)[0]) max_tokens = truncation_length - token_count + if max_tokens < 2: + err_msg = f"This model maximum context length is {truncation_length} tokens. However, your messages resulted in over {truncation_length - max_tokens} tokens." + self.openai_error(message=err_msg, code=400, error_type='InvalidRequestError') + return + + self.send_response(200) + self.send_access_control_headers() + self.send_header('Content-Type', 'application/json') + self.end_headers() + req_params['max_new_tokens'] = max_tokens req_params['truncation_length'] = truncation_length req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0 @@ -682,24 +690,9 @@ def do_POST(self): longest_stop_len = max([len(x) for x in stopping_strings] + [0]) answer = '' - seen_content = '' for a in generator: answer = a - stop_string_found = False - len_seen = len(seen_content) - search_start = max(len_seen - longest_stop_len, 0) - - for string in stopping_strings: - idx = answer.find(string, search_start) - if idx != -1: - answer = answer[:idx] # clip it. - stop_string_found = True - - if stop_string_found: - break - - # some reply's have an extra leading space to fit the instruction template, just clip it off from the reply. 
if edit_task[-1] != '\n' and answer and answer[0] == ' ': answer = answer[1:] diff --git a/modules/models_settings.py b/modules/models_settings.py index 0207e7de76..b62f844350 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -2,6 +2,7 @@ from pathlib import Path import yaml +import copy from modules import shared, ui @@ -132,3 +133,27 @@ def save_model_settings(model, state): f.write(yaml.dump(user_config, sort_keys=False)) yield (f"Settings for {model} saved to {p}") + + +# Update the shared.settings with new settings +# the extra_settings dict is applied after the default settings for the model +def set_shared_model_settings(extra_settings = {}): + model_settings = get_model_settings_from_yamls(shared.model_name) # get current model settings + new_shared_settings = copy.deepcopy(shared.settings) + new_shared_settings.update(model_settings) + + # set each setting and ensure the correct type + for k in extra_settings: + if k in new_shared_settings: + new_shared_settings[k] = type(new_shared_settings[k])(extra_settings[k]) + else: + print(f"Warning: Setting unknown model setting: {k} = {extra_settings[k]}") + new_shared_settings[k] = extra_settings[k] + + update_model_parameters(new_shared_settings, initial=True) + + if new_shared_settings['mode'] != 'instruct': + new_shared_settings['instruction_template'] = None + + shared.settings = new_shared_settings + diff --git a/modules/utils.py b/modules/utils.py index 72a0dfa126..57c9a5ec9d 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -124,3 +124,7 @@ def get_available_chat_styles(): def get_available_sessions(): items = sorted(set(k.stem for k in Path('logs').glob(f'session_{shared.get_mode()}*')), key=natural_keys, reverse=True) return [item for item in items if 'autosave' in item] + [item for item in items if 'autosave' not in item] + + +def get_available_settings(): + return sorted(shared.settings.keys()) diff --git a/server.py b/server.py index 5c224e362e..d1b090a3a0 100644 --- 
a/server.py +++ b/server.py @@ -40,7 +40,7 @@ from modules.models import load_model, unload_model from modules.models_settings import ( apply_model_settings_to_state, - get_model_settings_from_yamls, + set_shared_model_settings, save_model_settings, update_model_parameters ) @@ -63,6 +63,7 @@ def load_model_wrapper(selected_model, loader, autoload=False): try: yield f"Loading {selected_model}..." shared.model_name = selected_model + set_shared_model_settings() unload_model() if selected_model != '': shared.model, shared.tokenizer = load_model(shared.model_name, loader) @@ -1146,12 +1147,10 @@ def create_interface(): # If any model has been selected, load it if shared.model_name != 'None': - model_settings = get_model_settings_from_yamls(shared.model_name) - shared.settings.update(model_settings) # hijacking the interface defaults - update_model_parameters(model_settings, initial=True) # hijacking the command-line arguments - # Load the model - shared.model, shared.tokenizer = load_model(shared.model_name) + for resp in load_model_wrapper(shared.model_name, shared.args.loader, autoload=True): + print(resp) + if shared.args.lora: add_lora_to_model(shared.args.lora)