From f8630e369506f79e67117421585432fd08977f92 Mon Sep 17 00:00:00 2001
From: fenglui <feng.lui1975@gmail.com>
Date: Tue, 24 Oct 2023 05:46:37 +0800
Subject: [PATCH 1/4] Add use_flash_attention_2 param to the Transformers
 model loader

---
 modules/loaders.py       | 2 +-
 modules/models.py        | 2 ++
 modules/shared.py        | 3 ++-
 modules/ui.py            | 1 +
 modules/ui_model_menu.py | 3 ++-
 5 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index c7e5d80031..d4b92664a7 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -9,7 +9,6 @@
     'Transformers': [
         'cpu_memory',
         'gpu_memory',
-        'trust_remote_code',
         'load_in_8bit',
         'bf16',
         'cpu',
@@ -21,6 +20,7 @@
         'compute_dtype',
         'trust_remote_code',
         'use_fast',
+        'use_flash_attention_2',
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
diff --git a/modules/models.py b/modules/models.py
index 087adadab2..4a045c4aaa 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -120,6 +120,8 @@ def huggingface_loader(model_name):
         'trust_remote_code': shared.args.trust_remote_code,
         'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16
     }
+    if shared.args.use_flash_attention_2:
+        params['use_flash_attention_2'] = True
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])
 
     if 'chatglm' in model_name.lower():
diff --git a/modules/shared.py b/modules/shared.py
index 626c2bf877..4dac838554 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -92,6 +92,7 @@
 parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
 parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.')
+parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
 
 # Accelerate 4-bit
 parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
@@ -191,7 +192,7 @@
 # Security warnings
 if args.trust_remote_code:
     logger.warning('trust_remote_code is enabled. This is dangerous.')
-if 'COLAB_GPU' not in os.environ:
+if not 'COLAB_GPU' in os.environ:
     if args.share:
         logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
     if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
diff --git a/modules/ui.py b/modules/ui.py
index df9906835d..404c740e49 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -53,6 +53,7 @@ def list_model_elements():
         'load_in_8bit',
         'trust_remote_code',
         'use_fast',
+        'use_flash_attention_2',
         'load_in_4bit',
         'compute_dtype',
         'quant_type',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5d9b6cb609..bbb1c03f4d 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -117,8 +117,9 @@ def create_ui():
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                             shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
-                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
+                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
                             shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
+                            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                             shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
                             shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                             shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')

From 8eb3999660d95c4861e60536cb9bf8035d5f483b Mon Sep 17 00:00:00 2001
From: fenglui <feng.lui1975@gmail.com>
Date: Tue, 24 Oct 2023 05:52:36 +0800
Subject: [PATCH 2/4] [fix] model amazon/MistralLite can't be loaded, failing
 with "TypeError: not a string"

---
 modules/models.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index 4a045c4aaa..4a85014c9e 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -103,11 +103,17 @@ def load_tokenizer(model_name, model):
         if shared.args.use_fast:
             logger.info('Loading the tokenizer with use_fast=True.')
 
-        tokenizer = AutoTokenizer.from_pretrained(
-            path_to_model,
-            trust_remote_code=shared.args.trust_remote_code,
-            use_fast=shared.args.use_fast
-        )
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                path_to_model,
+                trust_remote_code=shared.args.trust_remote_code,
+                use_fast=shared.args.use_fast
+            )
+        except:
+            tokenizer = AutoTokenizer.from_pretrained(
+                path_to_model,
+                trust_remote_code=shared.args.trust_remote_code
+            )
 
     return tokenizer
 

From a810f5a0f28b7606988ef3db437246f85bb6a68b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Nov 2023 09:48:17 -0700
Subject: [PATCH 3/4] Regressions

---
 modules/models.py        | 16 +++++-----------
 modules/shared.py        |  2 +-
 modules/ui_model_menu.py |  2 +-
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index b377776738..e39b2b3663 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -108,17 +108,11 @@ def load_tokenizer(model_name, model):
         if shared.args.use_fast:
             logger.info('Loading the tokenizer with use_fast=True.')
 
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(
-                path_to_model,
-                trust_remote_code=shared.args.trust_remote_code,
-                use_fast=shared.args.use_fast
-            )
-        except:
-            tokenizer = AutoTokenizer.from_pretrained(
-                path_to_model,
-                trust_remote_code=shared.args.trust_remote_code
-            )
+        tokenizer = AutoTokenizer.from_pretrained(
+            path_to_model,
+            trust_remote_code=shared.args.trust_remote_code,
+            use_fast=shared.args.use_fast
+        )
 
     return tokenizer
 
diff --git a/modules/shared.py b/modules/shared.py
index 388edcb65f..1dd6841d6b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -195,7 +195,7 @@
 # Security warnings
 if args.trust_remote_code:
     logger.warning('trust_remote_code is enabled. This is dangerous.')
-if not 'COLAB_GPU' in os.environ:
+if 'COLAB_GPU' not in os.environ:
     if args.share:
         logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
     if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index a8be0b11cd..0d82ee8f3c 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -122,7 +122,7 @@ def create_ui():
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                             shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
-                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
+                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
                             shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                             shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')

From 66b4057e3dff7972153aea9749b43e05c496f5bc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Nov 2023 09:58:34 -0700
Subject: [PATCH 4/4] Update README

---
 README.md         | 1 +
 modules/models.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 17c9add7c1..79d86a158f 100644
--- a/README.md
+++ b/README.md
@@ -300,6 +300,7 @@ Optionally, you can use the following command-line flags:
 | `--sdp-attention`                           | Use PyTorch 2.0's SDP attention. Same as above. |
 | `--trust-remote-code`                       | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
 | `--use_fast`                                | Set `use_fast=True` while loading the tokenizer. |
+| `--use_flash_attention_2`                   | Set `use_flash_attention_2=True` while loading the model. |
 
 #### Accelerate 4-bit
 
diff --git a/modules/models.py b/modules/models.py
index e39b2b3663..e9005fee95 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -126,8 +126,10 @@ def huggingface_loader(model_name):
         'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
         'use_safetensors': True if shared.args.force_safetensors else None
     }
+
     if shared.args.use_flash_attention_2:
         params['use_flash_attention_2'] = True
+
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])
 
     if 'chatglm' in model_name.lower():