1 change: 1 addition & 0 deletions README.md
@@ -274,6 +274,7 @@ List of command-line flags
|`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. |
|`--no_flash_attn` | Force flash-attention to not be used. |
|`--cache_8bit` | Use 8-bit cache to save VRAM. |
|`--num_experts_per_token NUM_EXPERTS_PER_TOKEN` | Number of experts to use for generation. Applies to MoE models like Mixtral. |

#### AutoGPTQ

1 change: 1 addition & 0 deletions modules/exllamav2.py
@@ -48,6 +48,7 @@ def from_pretrained(self, path_to_model):
config.scale_pos_emb = shared.args.compress_pos_emb
config.scale_alpha_value = shared.args.alpha_value
config.no_flash_attn = shared.args.no_flash_attn
config.num_experts_per_token = int(shared.args.num_experts_per_token)

model = ExLlamaV2(config)

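For orientation, a minimal standalone sketch of the loading path this change touches, assuming an exllamav2 release whose `ExLlamaV2Config` exposes `num_experts_per_token`; the model path and values below are placeholders, not taken from this PR:

```python
# Minimal sketch, not the webui's actual loader code.
from exllamav2 import ExLlamaV2, ExLlamaV2Config

config = ExLlamaV2Config()
config.model_dir = '/path/to/mixtral-exl2'  # placeholder model directory
config.prepare()                            # reads the model's own config.json
config.max_seq_len = 4096
config.num_experts_per_token = 2            # experts evaluated per token in MoE layers

model = ExLlamaV2(config)
model.load()
```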
1 change: 1 addition & 0 deletions modules/exllamav2_hf.py
@@ -165,5 +165,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
config.scale_pos_emb = shared.args.compress_pos_emb
config.scale_alpha_value = shared.args.alpha_value
config.no_flash_attn = shared.args.no_flash_attn
config.num_experts_per_token = int(shared.args.num_experts_per_token)

return Exllamav2HF(config)
30 changes: 16 additions & 14 deletions modules/loaders.py
@@ -65,24 +65,25 @@
'logits_all',
'llamacpp_HF_info',
],
'ExLlama_HF': [
'ExLlamav2_HF': [
'gpu_split',
'max_seq_len',
'cfg_cache',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'cfg_cache',
'trust_remote_code',
'no_use_fast',
],
'ExLlamav2_HF': [
'ExLlama_HF': [
'gpu_split',
'max_seq_len',
'cfg_cache',
'no_flash_attn',
'cache_8bit',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'cfg_cache',
'trust_remote_code',
'no_use_fast',
],
@@ -123,22 +124,23 @@
'no_use_fast',
'gptq_for_llama_info',
],
'ExLlama': [
'ExLlamav2': [
'gpu_split',
'max_seq_len',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'exllama_info',
'exllamav2_info',
],
'ExLlamav2': [
'ExLlama': [
'gpu_split',
'max_seq_len',
'no_flash_attn',
'cache_8bit',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
'exllamav2_info',
'exllama_info',
],
'ctransformers': [
'n_ctx',
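The dictionary edited above maps each loader name to the UI elements shown when that loader is selected, so listing `'num_experts_per_token'` under the two ExLlamav2 entries is what surfaces the new field in the model menu. A simplified sketch of that lookup pattern (names here are illustrative, not copied from the module):

```python
# Illustrative loader -> visible-parameters mapping; the real dictionary is larger.
LOADER_PARAMS = {
    'ExLlamav2_HF': ['gpu_split', 'max_seq_len', 'num_experts_per_token', 'cache_8bit'],
    'ExLlamav2': ['gpu_split', 'max_seq_len', 'num_experts_per_token', 'cache_8bit'],
    'ExLlama_HF': ['gpu_split', 'max_seq_len', 'cfg_cache'],
}

def visible_params(loader):
    # UI elements not listed for the selected loader stay hidden.
    return LOADER_PARAMS.get(loader, [])

print(visible_params('ExLlamav2'))  # includes 'num_experts_per_token'
```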
1 change: 1 addition & 0 deletions modules/shared.py
@@ -125,6 +125,7 @@
parser.add_argument('--cfg-cache', action='store_true', help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.')
parser.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
parser.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
parser.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')

# AutoGPTQ
parser.add_argument('--triton', action='store_true', help='Use triton.')
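A self-contained sketch of how the new flag parses; this is not the project's parser, and the default of 2 mirrors Mixtral's top-2 expert routing:

```python
# Standalone argparse sketch, separate from the webui's shared.py parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--num_experts_per_token', type=int, default=2,
                    help='Number of experts to use for generation. Applies to MoE models like Mixtral.')

args = parser.parse_args(['--num_experts_per_token', '4'])
print(args.num_experts_per_token)  # 4, already an int thanks to type=int
```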
1 change: 1 addition & 0 deletions modules/ui.py
@@ -73,6 +73,7 @@ def list_model_elements():
'disable_exllamav2',
'cfg_cache',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'threads',
'threads_batch',
1 change: 1 addition & 0 deletions modules/ui_model_menu.py
@@ -129,6 +129,7 @@ def create_ui():
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlama_HF or AutoGPTQ are preferred for GPTQ models when supported.')
shared.gradio['exllama_info'] = gr.Markdown("ExLlama_HF is recommended over ExLlama for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
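One detail worth noting: a Gradio `gr.Number` field may hand back a float, which is presumably why the loader modules above cast the value with `int(...)`. A hedged sketch of the widget; the `precision=0` option shown is a standard Gradio argument, not something this PR uses:

```python
# Sketch only, not the webui's layout code.
import gradio as gr

num_experts = gr.Number(
    label="Number of experts per token",
    value=2,
    precision=0,  # rounds the input and returns it as an int
    info='Only applies to MoE models like Mixtral.',
)
```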