Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
fcb4187
wip
Jun 30, 2023
86a7c25
update shared.settings with model settings on load in UI
Jun 30, 2023
b9067fa
simplify model load on startup
Jun 30, 2023
e4ca89a
code cleanup, simplify
Jun 30, 2023
95d44f1
reorg.
Jun 30, 2023
4d10b74
more robust API
Jun 30, 2023
0cb4f83
shared.lora_names = [] on reload
Jun 30, 2023
d87fdcb
minor
Jun 30, 2023
514d166
reuse
Jun 30, 2023
9f7fde5
apply settings after defaults.
Jun 30, 2023
45ef210
updated api example, some fixups.
Jun 30, 2023
80f4e73
remove test case.
Jun 30, 2023
be84c7f
fix loading with --model
Jun 30, 2023
2184e19
stop reason for streaming, explain dropped msgs
Jun 30, 2023
75d63a7
overhaul parameters, errors and length truncation
Jul 1, 2023
0126b8c
re #2951: lora_names = [] to unload_model()
Jul 1, 2023
8a8f9b3
only error if logprobs or logit_bias is not usable
Jul 2, 2023
d1c8f66
don't error with empty unsupported params
Jul 2, 2023
6937611
lora_names = [] in model_unload() now.
Jul 3, 2023
8c8d9c8
update docs about guidance
Jul 4, 2023
30c10bd
include alpha_value in api-example-model.py
Jul 4, 2023
f865edf
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 4, 2023
85738a8
remove obsolete stopping_strings implementation
Jul 4, 2023
91a6a1e
reorder system prompts, set better top_k for chat
Jul 4, 2023
630ac3f
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 5, 2023
290c5a1
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 5, 2023
eedd9eb
Merge branch 'oobabooga:main' into 8k_loras_fixes
matatonic Jul 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions api-examples/api-example-model.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def model_load(model_name):


# complex loader
def complex_model_load(model):
def complex_model_load(model, lora = []):

def guess_groupsize(model_name):
if '1024g' in model_name:
Expand All @@ -50,17 +50,25 @@ def guess_groupsize(model_name):
else:
return -1

loader_names = ['llama.cpp', 'Transformers', 'AutoGPTQ', 'GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF', 'RWKV', 'flexgen']

req = {
'action': 'load',
'model_name': model,
'args': {
'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama
'lora': lora,

'bf16': False,
'load_in_8bit': False,
'groupsize': 0,
'wbits': 0,

# Exllama
'gpu_split': None,
'max_seq_len': 2048,
'compress_pos_emb': 1,
'alpha_value': 1,

# llama.cpp
'threads': 0,
'n_batch': 512,
Expand Down Expand Up @@ -89,6 +97,18 @@ def guess_groupsize(model_name):
},
}

# Example of a more complex load
# CalderaAI_30B-Lazarus-GPTQ4bit in 24GB with superhot-8k lora and embedding compression
# Also set truncation_length = 3072 because 8k is not detected from the model.
if model == 'CalderaAI_30B-Lazarus-GPTQ4bit':
req['args']['loader'] = 'ExLlama'
req['args']['compress_pos_emb'] = 2
req['args']['max_seq_len'] = 3072
req['args']['lora'] = ['kaiokendev_superhot-30b-8k-no-rlhf-test']
req['settings'] = { 'truncation_length': req['args']['max_seq_len'] }
return model_api(req)


model = model.lower()

if '4bit' in model or 'gptq' in model or 'int4' in model:
Expand Down
66 changes: 48 additions & 18 deletions extensions/api/blocking_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from modules.chat import generate_chat_reply
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model
from modules.models_settings import (get_model_settings_from_yamls,
update_model_parameters)
from modules.models_settings import set_shared_model_settings
from modules.text_generation import (encode, generate_reply,
stop_everything_event)
from modules.utils import get_available_models
from modules.utils import (get_available_models,
get_available_loras,
get_available_settings)


def get_model_info():
Expand Down Expand Up @@ -107,58 +108,87 @@ def do_POST(self):
self.wfile.write(response.encode('utf-8'))

elif self.path == '/api/v1/model':

self.send_response(200)
self.send_header('Content-Type', 'application/json')
self.end_headers()

# by default return the same as the GET interface
result = shared.model_name

# Actions: info, load, list, unload
# Actions: info, load, list, unload, add_lora, list_lora, list_settings, settings
action = body.get('action', '')

if action == 'load':
model_name = body['model_name']
model_name = body.get('model_name', shared.model_name)
args = body.get('args', {})
print('args', args)
for k in args:
setattr(shared.args, k, args[k])
extra_settings = body.get('settings', {})

shared.model_name = model_name
unload_model()
print('Model load args:', args)

model_settings = get_model_settings_from_yamls(shared.model_name)
shared.settings.update(model_settings)
update_model_parameters(model_settings, initial=True)
unload_model()
shared.model_name = model_name
shared.args.model = model_name

for k in args:
setattr(shared.args, k, args[k])

if not shared.args.lora:
shared.args.lora = []

if shared.settings['mode'] != 'instruct':
shared.settings['instruction_template'] = None
set_shared_model_settings(extra_settings)
# TODO: fetch lora settings too, ie. +get_model_settings_from_yamls(lora)

try:
shared.model, shared.tokenizer = load_model(shared.model_name)
shared.model, shared.tokenizer = load_model(model_name)
if shared.args.lora:
add_lora_to_model(shared.args.lora) # list

except Exception as e:
response = json.dumps({'error': {'message': repr(e)}})

self.wfile.write(response.encode('utf-8'))
raise e

shared.args.model = shared.model_name

result = get_model_info()

elif action == 'add_lora':
lora = body.get('lora', [])
if not isinstance(lora, list):
lora = [lora]

try:
shared.args.lora = lora
add_lora_to_model(lora)
except Exception as e:
response = json.dumps({'error': {'message': repr(e)}})

self.wfile.write(response.encode('utf-8'))
raise e

result = get_model_info()

elif action == 'unload':
unload_model()
shared.model_name = None
shared.args.model = None
shared.args.lora = []
result = get_model_info()

elif action == 'list':
result = get_available_models()

elif action == 'info':
elif action == 'list_lora':
result = get_available_loras()

elif action == 'list_settings':
result = get_available_settings()

elif action == 'info' or action == 'settings':
extra_settings = body.get('settings', {})
if extra_settings:
set_shared_model_settings(extra_settings) # Settings are applied after the default settings for the model
result = get_model_info()

response = json.dumps({
Expand Down
3 changes: 1 addition & 2 deletions extensions/openai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,12 +218,11 @@ but there are some exceptions.
| ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero-shot Python/JSON coding. Some model-tailored prompt formatting improves results greatly. |
| ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context |
| ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 |
| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported |

## Future plans
* better error handling
* model changing, esp. something for swapping loras or embedding models
* consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard)
* do something about rate limiting or locking requests for completions, most systems will only be able to handle a single request at a time before OOM

## Bugs? Feedback? Comments? Pull requests?

Expand Down
Loading