Merged
35 changes: 22 additions & 13 deletions download-model.py
@@ -26,13 +26,16 @@

class ModelDownloader:
def __init__(self, max_retries=5):
self.session = requests.Session()
if max_retries:
self.session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries))
self.session.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries))
self.max_retries = max_retries

def get_session(self):
session = requests.Session()
if self.max_retries:
session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=self.max_retries))
session.mount('https://huggingface.co', HTTPAdapter(max_retries=self.max_retries))

if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
self.session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))

try:
from huggingface_hub import get_token
@@ -41,7 +44,9 @@ def __init__(self, max_retries=5):
token = os.getenv("HF_TOKEN")

if token is not None:
self.session.headers = {'authorization': f'Bearer {token}'}
session.headers = {'authorization': f'Bearer {token}'}

return session

def sanitize_model_and_branch_names(self, model, branch):
if model[-1] == '/':
@@ -65,6 +70,7 @@ def sanitize_model_and_branch_names(self, model, branch):
return model, branch

def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
session = self.get_session()
page = f"/api/models/{model}/tree/{branch}"
cursor = b""

@@ -78,7 +84,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
is_lora = False
while True:
url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "")
r = self.session.get(url, timeout=10)
r = session.get(url, timeout=10)
r.raise_for_status()
content = r.content

@@ -156,9 +162,8 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
is_llamacpp = has_gguf and specific_file is not None
return links, sha256, is_lora, is_llamacpp

def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_folder=None):
if base_folder is None:
base_folder = 'models' if not is_lora else 'loras'
def get_output_folder(self, model, branch, is_lora, is_llamacpp=False):
base_folder = 'models' if not is_lora else 'loras'

# If the model is of type GGUF, save directly in the base_folder
if is_llamacpp:
@@ -172,14 +177,15 @@ def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_fold
return output_folder

def get_single_file(self, url, output_folder, start_from_scratch=False):
session = self.get_session()
filename = Path(url.rsplit('/', 1)[1])
output_path = output_folder / filename
headers = {}
mode = 'wb'
if output_path.exists() and not start_from_scratch:

# Check if the file has already been downloaded completely
r = self.session.get(url, stream=True, timeout=10)
r = session.get(url, stream=True, timeout=10)
total_size = int(r.headers.get('content-length', 0))
if output_path.stat().st_size >= total_size:
return
@@ -188,7 +194,7 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
headers = {'Range': f'bytes={output_path.stat().st_size}-'}
mode = 'ab'

with self.session.get(url, stream=True, headers=headers, timeout=10) as r:
with session.get(url, stream=True, headers=headers, timeout=10) as r:
r.raise_for_status() # Do not continue the download if the request was unsuccessful
total_size = int(r.headers.get('content-length', 0))
block_size = 1024 * 1024 # 1MB
@@ -303,7 +309,10 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file)

# Get the output folder
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, base_folder=args.output)
if args.output:
output_folder = Path(args.output)
else:
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)

if args.check:
# Check previously downloaded files
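A note on the download-model.py change: the shared `self.session` built in `__init__` is replaced by a `get_session()` factory, so the retry adapter, `HF_USER`/`HF_PASS` basic auth, and the Hugging Face token are applied to a fresh `requests.Session` on each call, which keeps concurrent downloads from sharing one session object. A minimal sketch of how the class might be driven from several worker threads; the model name, the thread pool, and the `mkdir` call are illustrative assumptions, not part of this diff:

```python
from concurrent.futures import ThreadPoolExecutor

downloader = ModelDownloader(max_retries=5)

# Hypothetical model/branch, used only for illustration.
model, branch = downloader.sanitize_model_and_branch_names('TheBloke/Llama-2-7B-GGUF', 'main')
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch)

output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)
output_folder.mkdir(parents=True, exist_ok=True)

# get_single_file() now calls self.get_session() internally, so every worker
# thread below ends up with its own requests.Session (adapter, auth, token).
with ThreadPoolExecutor(max_workers=4) as pool:
    for url in links:
        pool.submit(downloader.get_single_file, url, output_folder)
```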
2 changes: 1 addition & 1 deletion instruction-templates/Mistral.yaml
@@ -4,7 +4,7 @@ instruction_template: |-
{{- message['content'] -}}
{%- else -%}
{%- if message['role'] == 'user' -%}
{{-' [INST] ' + message['content'].rstrip() + ' [/INST] '-}}
{{-'[INST] ' + message['content'].rstrip() + ' [/INST]'-}}
{%- else -%}
{{-'' + message['content'] + '</s>' -}}
{%- endif -%}
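The Mistral.yaml change only removes the stray spaces around the instruction tags. A quick before/after of the rendered user turn, written out as plain Python strings for illustration:

```python
# Rendered user turn before and after the template change (illustration only).
old = ' [INST] ' + 'Hello'.rstrip() + ' [/INST] '
new = '[INST] ' + 'Hello'.rstrip() + ' [/INST]'

print(repr(old))  # ' [INST] Hello [/INST] '
print(repr(new))  # '[INST] Hello [/INST]'
```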
68 changes: 36 additions & 32 deletions modules/chat.py
@@ -166,53 +166,54 @@ def make_prompt(messages):
prompt = remove_extra_bos(prompt)
return prompt

# Handle truncation
max_length = get_max_prompt_length(state)
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

while len(messages) > 0 and encoded_length > max_length:
# Handle truncation
if shared.tokenizer is not None:
max_length = get_max_prompt_length(state)
encoded_length = get_encoded_length(prompt)
while len(messages) > 0 and encoded_length > max_length:

# Remove old message, save system message
if len(messages) > 2 and messages[0]['role'] == 'system':
messages.pop(1)
# Remove old message, save system message
if len(messages) > 2 and messages[0]['role'] == 'system':
messages.pop(1)

# Remove old message when no system message is present
elif len(messages) > 1 and messages[0]['role'] != 'system':
messages.pop(0)
# Remove old message when no system message is present
elif len(messages) > 1 and messages[0]['role'] != 'system':
messages.pop(0)

# Resort to truncating the user input
else:
# Resort to truncating the user input
else:

user_message = messages[-1]['content']

# Bisect the truncation point
left, right = 0, len(user_message) - 1

user_message = messages[-1]['content']
while right - left > 1:
mid = (left + right) // 2

# Bisect the truncation point
left, right = 0, len(user_message) - 1
messages[-1]['content'] = user_message[mid:]
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

while right - left > 1:
mid = (left + right) // 2
if encoded_length <= max_length:
right = mid
else:
left = mid

messages[-1]['content'] = user_message[mid:]
messages[-1]['content'] = user_message[right:]
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

if encoded_length <= max_length:
right = mid
if encoded_length > max_length:
logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
raise ValueError
else:
left = mid
logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
break

messages[-1]['content'] = user_message[right:]
prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)
if encoded_length > max_length:
logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
raise ValueError
else:
logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
break

prompt = make_prompt(messages)
encoded_length = get_encoded_length(prompt)

if also_return_rows:
return prompt, [message['content'] for message in messages]
@@ -690,6 +691,9 @@ def load_character(character, name1, name2):


def load_instruction_template(template):
if template == 'None':
return ''

for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]:
if filepath.exists():
break
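On the modules/chat.py truncation hunk above: truncation now only runs when `shared.tokenizer is not None`, and when dropping old messages is not enough, a cut point inside the last user message is found by bisection so the rendered prompt just fits. A self-contained sketch of that bisection; `make_prompt` and `get_encoded_length` are passed in as parameters purely to keep the example standalone (in the module they are the chat-template renderer and the tokenizer length helper):

```python
def truncate_last_message(messages, max_length, make_prompt, get_encoded_length):
    """Bisect a cut point in the last user message so the prompt fits.

    Sketch of the logic in modules/chat.py, not a drop-in replacement.
    """
    user_message = messages[-1]['content']
    left, right = 0, len(user_message) - 1

    while right - left > 1:
        mid = (left + right) // 2
        messages[-1]['content'] = user_message[mid:]
        if get_encoded_length(make_prompt(messages)) <= max_length:
            right = mid   # cutting at mid is enough; try keeping more text
        else:
            left = mid    # still too long; cut deeper

    # right is the shallowest tested cut that fits (if any cut fits at all)
    messages[-1]['content'] = user_message[right:]
    return make_prompt(messages)
```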
21 changes: 12 additions & 9 deletions modules/exllamav2.py
@@ -51,18 +51,21 @@ def from_pretrained(self, path_to_model):

model = ExLlamaV2(config)

split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

model.load(split)

tokenizer = ExLlamaV2Tokenizer(config)
if shared.args.cache_8bit:
cache = ExLlamaV2Cache_8bit(model)
cache = ExLlamaV2Cache_8bit(model, lazy=True)
else:
cache = ExLlamaV2Cache(model)
cache = ExLlamaV2Cache(model, lazy=True)

if shared.args.autosplit:
model.load_autosplit(cache)
else:
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

model.load(split)

tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

result = self()
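The exllamav2.py change (mirrored in exllamav2_hf.py below) creates the cache with `lazy=True` before loading and, when `--autosplit` is enabled, lets `model.load_autosplit(cache)` place the weights across the available GPUs instead of requiring a manual `--gpu-split`. A reduced sketch of that decision path, using the same exllamav2 classes as the diff; the wrapper function itself is illustrative:

```python
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Config

def load_exl2(config: ExLlamaV2Config, gpu_split, autosplit, cache_8bit):
    # Reduced sketch of the loading path introduced by this PR.
    model = ExLlamaV2(config)

    # The cache is created lazily, before the weights are placed on devices.
    cache_cls = ExLlamaV2Cache_8bit if cache_8bit else ExLlamaV2Cache
    cache = cache_cls(model, lazy=True)

    if autosplit:
        # Let exllamav2 distribute the layers (and cache) across GPUs.
        model.load_autosplit(cache)
    else:
        # Fall back to an explicit per-GPU allocation, e.g. "20,24".
        split = [float(alloc) for alloc in gpu_split.split(",")] if gpu_split else None
        model.load(split)

    return model, cache
```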
20 changes: 12 additions & 8 deletions modules/exllamav2_hf.py
@@ -37,18 +37,22 @@ def __init__(self, config: ExLlamaV2Config):
super().__init__(PretrainedConfig())
self.ex_config = config
self.ex_model = ExLlamaV2(config)
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

self.ex_model.load(split)
self.generation_config = GenerationConfig()
self.loras = None
self.generation_config = GenerationConfig()

if shared.args.cache_8bit:
self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=True)
else:
self.ex_cache = ExLlamaV2Cache(self.ex_model)
self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=True)

if shared.args.autosplit:
self.ex_model.load_autosplit(self.ex_cache)
else:
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

self.ex_model.load(split)

self.past_seq = None
if shared.args.cfg_cache:
2 changes: 2 additions & 0 deletions modules/loaders.py
@@ -78,6 +78,7 @@
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'autosplit',
'alpha_value',
'compress_pos_emb',
'trust_remote_code',
@@ -89,6 +90,7 @@
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'autosplit',
'alpha_value',
'compress_pos_emb',
'exllamav2_info',
2 changes: 1 addition & 1 deletion modules/models.py
@@ -257,7 +257,7 @@ def llamacpp_HF_loader(model_name):
path = Path(f'{shared.args.model_dir}/{model_name}')

# Check if a HF tokenizer is available for the model
if all((path / file).exists() for file in ['tokenizer.model', 'tokenizer_config.json']):
if all((path / file).exists() for file in ['tokenizer_config.json']):
logger.info(f'Using tokenizer from: \"{path}\"')
else:
logger.error("Could not load the model because a tokenizer in Transformers format was not found.")
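In modules/models.py, the llamacpp_HF tokenizer check previously required both `tokenizer.model` and `tokenizer_config.json`; it now only requires `tokenizer_config.json`, so a repository that ships only a fast `tokenizer.json` without a sentencepiece `tokenizer.model` can still qualify. Since the list has a single element, the condition reduces to a plain `exists()` check; a sketch of the equivalent form:

```python
from pathlib import Path

def has_hf_tokenizer(path: Path) -> bool:
    # Equivalent to the new one-element all(...) check in llamacpp_HF_loader.
    return (path / 'tokenizer_config.json').exists()
```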
65 changes: 47 additions & 18 deletions modules/models_settings.py
@@ -153,6 +153,8 @@ def infer_loader(model_name, model_settings):
loader = 'ExLlamav2_HF'
elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
loader = 'AutoAWQ'
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
loader = 'llamacpp_HF'
elif len(list(path_to_model.glob('*.gguf'))) > 0:
loader = 'llama.cpp'
elif re.match(r'.*\.gguf', model_name.lower()):
@@ -225,7 +227,7 @@ def apply_model_settings_to_state(model, state):
loader = model_settings.pop('loader')

# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']):
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['ctransformers']):
state['loader'] = loader

for k in model_settings:
@@ -243,27 +245,54 @@ def save_model_settings(model, state):
Save the settings for this model to models/config-user.yaml
'''
if model == 'None':
yield ("Not saving the settings because no model is loaded.")
yield ("Not saving the settings because no model is selected in the menu.")
return

with Path(f'{shared.args.model_dir}/config-user.yaml') as p:
if p.exists():
user_config = yaml.safe_load(open(p, 'r').read())
else:
user_config = {}
user_config = shared.load_user_config()
model_regex = model + '$' # For exact matches
if model_regex not in user_config:
user_config[model_regex] = {}

for k in ui.list_model_elements():
if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
user_config[model_regex][k] = state[k]

model_regex = model + '$' # For exact matches
if model_regex not in user_config:
user_config[model_regex] = {}
shared.user_config = user_config

for k in ui.list_model_elements():
if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
user_config[model_regex][k] = state[k]
output = yaml.dump(user_config, sort_keys=False)
p = Path(f'{shared.args.model_dir}/config-user.yaml')
with open(p, 'w') as f:
f.write(output)

shared.user_config = user_config
yield (f"Settings for `{model}` saved to `{p}`.")

output = yaml.dump(user_config, sort_keys=False)
with open(p, 'w') as f:
f.write(output)

yield (f"Settings for `{model}` saved to `{p}`.")
def save_instruction_template(model, template):
'''
Similar to the function above, but it saves only the instruction template.
'''
if model == 'None':
yield ("Not saving the template because no model is selected in the menu.")
return

user_config = shared.load_user_config()
model_regex = model + '$' # For exact matches
if model_regex not in user_config:
user_config[model_regex] = {}

if template == 'None':
user_config[model_regex].pop('instruction_template', None)
else:
user_config[model_regex]['instruction_template'] = template

shared.user_config = user_config

output = yaml.dump(user_config, sort_keys=False)
p = Path(f'{shared.args.model_dir}/config-user.yaml')
with open(p, 'w') as f:
f.write(output)

if template == 'None':
yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
else:
yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")