From f8630e369506f79e67117421585432fd08977f92 Mon Sep 17 00:00:00 2001 From: fenglui Date: Tue, 24 Oct 2023 05:46:37 +0800 Subject: [PATCH 1/4] add use_flash_attention_2 to param for Model loader Transformers --- modules/loaders.py | 2 +- modules/models.py | 2 ++ modules/shared.py | 3 ++- modules/ui.py | 1 + modules/ui_model_menu.py | 3 ++- 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index c7e5d80031..d4b92664a7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -9,7 +9,6 @@ 'Transformers': [ 'cpu_memory', 'gpu_memory', - 'trust_remote_code', 'load_in_8bit', 'bf16', 'cpu', @@ -21,6 +20,7 @@ 'compute_dtype', 'trust_remote_code', 'use_fast', + 'use_flash_attention_2', 'alpha_value', 'rope_freq_base', 'compress_pos_emb', diff --git a/modules/models.py b/modules/models.py index 087adadab2..4a045c4aaa 100644 --- a/modules/models.py +++ b/modules/models.py @@ -120,6 +120,8 @@ def huggingface_loader(model_name): 'trust_remote_code': shared.args.trust_remote_code, 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16 } + if shared.args.use_flash_attention_2: + params['use_flash_attention_2'] = True config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code']) if 'chatglm' in model_name.lower(): diff --git a/modules/shared.py b/modules/shared.py index 626c2bf877..4dac838554 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -92,6 +92,7 @@ parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.') parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. 
Necessary for some models.') parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.') +parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') # Accelerate 4-bit parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).') @@ -191,7 +192,7 @@ # Security warnings if args.trust_remote_code: logger.warning('trust_remote_code is enabled. This is dangerous.') -if 'COLAB_GPU' not in os.environ: +if not 'COLAB_GPU' in os.environ: if args.share: logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.") if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)): diff --git a/modules/ui.py b/modules/ui.py index df9906835d..404c740e49 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -53,6 +53,7 @@ def list_model_elements(): 'load_in_8bit', 'trust_remote_code', 'use_fast', + 'use_flash_attention_2', 'load_in_4bit', 'compute_dtype', 'quant_type', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 5d9b6cb609..bbb1c03f4d 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -117,8 +117,9 @@ def create_ui(): shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17') shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. 
It is necessary for some models.', interactive=shared.args.trust_remote_code) + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.') + shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. 
Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') From 8eb3999660d95c4861e60536cb9bf8035d5f483b Mon Sep 17 00:00:00 2001 From: fenglui Date: Tue, 24 Oct 2023 05:52:36 +0800 Subject: [PATCH 2/4] [fix] model amazon/MistralLite can't be loaded, failing with the error "TypeError: not a string" --- modules/models.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/models.py b/modules/models.py index 4a045c4aaa..4a85014c9e 100644 --- a/modules/models.py +++ b/modules/models.py @@ -103,11 +103,17 @@ def load_tokenizer(model_name, model): if shared.args.use_fast: logger.info('Loading the tokenizer with use_fast=True.') - tokenizer = AutoTokenizer.from_pretrained( - path_to_model, - trust_remote_code=shared.args.trust_remote_code, - use_fast=shared.args.use_fast - ) + try: + tokenizer = AutoTokenizer.from_pretrained( + path_to_model, + trust_remote_code=shared.args.trust_remote_code, + use_fast=shared.args.use_fast + ) + except: + tokenizer = AutoTokenizer.from_pretrained( + path_to_model, + trust_remote_code=shared.args.trust_remote_code + ) return tokenizer From a810f5a0f28b7606988ef3db437246f85bb6a68b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Nov 2023 09:48:17 -0700 Subject: [PATCH 3/4] Regressions --- modules/models.py | 16 +++++----------- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/modules/models.py b/modules/models.py index b377776738..e39b2b3663 100644 --- a/modules/models.py +++ b/modules/models.py @@ -108,17 +108,11 @@ def load_tokenizer(model_name, model): if shared.args.use_fast: 
logger.info('Loading the tokenizer with use_fast=True.') - try: - tokenizer = AutoTokenizer.from_pretrained( - path_to_model, - trust_remote_code=shared.args.trust_remote_code, - use_fast=shared.args.use_fast - ) - except: - tokenizer = AutoTokenizer.from_pretrained( - path_to_model, - trust_remote_code=shared.args.trust_remote_code - ) + tokenizer = AutoTokenizer.from_pretrained( + path_to_model, + trust_remote_code=shared.args.trust_remote_code, + use_fast=shared.args.use_fast + ) return tokenizer diff --git a/modules/shared.py b/modules/shared.py index 388edcb65f..1dd6841d6b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -195,7 +195,7 @@ # Security warnings if args.trust_remote_code: logger.warning('trust_remote_code is enabled. This is dangerous.') -if not 'COLAB_GPU' in os.environ: +if 'COLAB_GPU' not in os.environ: if args.share: logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.") if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index a8be0b11cd..0d82ee8f3c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -122,7 +122,7 @@ def create_ui(): shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 
18,17') shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code) shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.') From 66b4057e3dff7972153aea9749b43e05c496f5bc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Nov 2023 09:58:34 -0700 Subject: [PATCH 4/4] Update README --- README.md | 1 + modules/models.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 17c9add7c1..79d86a158f 100644 --- a/README.md +++ b/README.md @@ -300,6 +300,7 @@ Optionally, you can use the following command-line flags: | `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. | | `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. | | `--use_fast` | Set `use_fast=True` while loading the tokenizer. | +| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. 
| #### Accelerate 4-bit diff --git a/modules/models.py b/modules/models.py index e39b2b3663..e9005fee95 100644 --- a/modules/models.py +++ b/modules/models.py @@ -126,8 +126,10 @@ def huggingface_loader(model_name): 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, 'use_safetensors': True if shared.args.force_safetensors else None } + if shared.args.use_flash_attention_2: params['use_flash_attention_2'] = True + config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code']) if 'chatglm' in model_name.lower():