Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,6 @@ List of command-line flags
| `--load-in-8bit` | Load the model with 8-bit precision (using bitsandbytes). |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--no-cache` | Set `use_cache` to `False` while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. |
| `--xformers` | Use xformers' memory-efficient attention. This is really old and probably doesn't do anything. |
| `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. |
| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
| `--no_use_fast` | Set `use_fast=False` while loading the tokenizer (it's `True` by default). Use this if you have any problems related to `use_fast`. |
| `--use_flash_attention_2` | Set `use_flash_attention_2=True` while loading the model. |
Expand Down
171 changes: 0 additions & 171 deletions modules/llama_attn_hijack.py

This file was deleted.

6 changes: 1 addition & 5 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)

import modules.shared as shared
from modules import RoPE, llama_attn_hijack, sampler_hijack
from modules import RoPE, sampler_hijack
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
from modules.relative_imports import RelativeImport
Expand Down Expand Up @@ -97,10 +97,6 @@ def load_model(model_name, loader=None):
else:
tokenizer = load_tokenizer(model_name, model)

# Hijack attention with xformers
if any((shared.args.xformers, shared.args.sdp_attention)):
llama_attn_hijack.hijack_llama_attention()

shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
if loader.lower().startswith('exllama'):
shared.settings['truncation_length'] = shared.args.max_seq_len
Expand Down
2 changes: 0 additions & 2 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
group.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.')
group.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
Expand Down