Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -312,13 +312,6 @@ List of command-line flags
| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
| `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. |

#### RWKV

| Flag | Description |
|---------------------------------|-------------|
| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |

#### RoPE (for llama.cpp, ExLlamaV2, and transformers)

| Flag | Description |
Expand Down
154 changes: 0 additions & 154 deletions modules/RWKV.py

This file was deleted.

18 changes: 0 additions & 18 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
'GPTQ-for-LLaMa': GPTQ_loader,
'llama.cpp': llamacpp_loader,
'llamacpp_HF': llamacpp_HF_loader,
'RWKV': RWKV_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ctransformers': ctransformers_loader,
Expand Down Expand Up @@ -405,23 +404,6 @@ def HQQ_loader(model_name):
return model


def RWKV_loader(model_name):
'''
This loader is not currently maintained as RWKV can now be loaded
through the transformers library.
'''
from modules.RWKV import RWKVModel, RWKVTokenizer

model = RWKVModel.from_pretrained(
Path(f'{shared.args.model_dir}/{model_name}'),
dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16",
device="cpu" if shared.args.cpu else "xpu" if is_xpu_available() else "cuda"
)

tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir))
return model, tokenizer


def get_max_memory_dict():
max_memory = {}
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
Expand Down
2 changes: 0 additions & 2 deletions modules/models_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,6 @@ def infer_loader(model_name, model_settings):
loader = 'llama.cpp'
elif re.match(r'.*\.gguf', model_name.lower()):
loader = 'llama.cpp'
elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
loader = 'RWKV'
elif re.match(r'.*exl2', model_name.lower()):
loader = 'ExLlamav2_HF'
elif re.match(r'.*-hqq', model_name.lower()):
Expand Down
5 changes: 0 additions & 5 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,6 @@
group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')

# RWKV
group = parser.add_argument_group('RWKV')
group.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
group.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')

# RoPE
group = parser.add_argument_group('RoPE')
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
Expand Down
6 changes: 3 additions & 3 deletions modules/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return

if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
Expand Down Expand Up @@ -118,7 +118,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')

if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
Expand All @@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]

if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
Expand Down