support cuda: llama.cpp & transformer embedding
c0sogi committed Jul 4, 2023
1 parent cde07e2 commit c2bbd1d
Showing 24 changed files with 610 additions and 214 deletions.
4 changes: 2 additions & 2 deletions .vscode/settings.json
@@ -31,6 +31,6 @@
         "scanLimit": 200
     },
     "python.linting.flake8Enabled": false,
-    "python.linting.mypyEnabled": true,
-    "python.linting.enabled": false,
+    "python.linting.mypyEnabled": false,
+    "python.linting.enabled": true,
 }
2 changes: 1 addition & 1 deletion app/common/constants.py
@@ -473,7 +473,7 @@ def split_long_text(long_text: str, chars_per_line: int):

     # Join the lines with newline characters
     long_text = "\n".join(lines)
-    result = split_long_text(long_text, 80)
+    result = split_long_text(long_text, 66)
     print(f"\n\n{ANSI_COLORS['green']}{result}{ANSI_COLORS['end']}\n\n")
     if pyperclip:
         pyperclip.copy(result)
Binary file added app/contents/browsing_demo.png
43 changes: 31 additions & 12 deletions app/models/llm_tokenizers.py
@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod, abstractproperty
+from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional, Union
 from app.utils.chat.text_generations.path import resolve_model_path_to_posix
@@ -15,14 +15,21 @@ class BaseTokenizer(ABC):

     @property
     def fallback_tokenizer(self) -> Encoding:
+        ApiLogger.cwarning("Using fallback tokenizer!!!")
         if self._fallback_tokenizer is None:
             self._fallback_tokenizer = get_encoding("cl100k_base")
         return self._fallback_tokenizer

-    @abstractproperty
+    @property
+    @abstractmethod
     def tokenizer(self) -> Any:
         ...

+    @property
+    @abstractmethod
+    def model_name(self) -> str:
+        ...
+
     @abstractmethod
     def encode(self, message: str) -> list[int]:
         ...
@@ -58,7 +65,7 @@ def get_chunk_of(self, text: str, tokens: int) -> str:

 class OpenAITokenizer(BaseTokenizer):
     def __init__(self, model_name: str):
-        self.model_name = model_name
+        self._model_name = model_name
         self._tokenizer: Encoding | None = None

     def encode(self, message: str, /) -> list[int]:
@@ -70,10 +77,14 @@ def decode(self, tokens: list[int], /) -> str:
     @property
     def tokenizer(self) -> Encoding:
         if self._tokenizer is None:
-            print("Loading tokenizer: ", self.model_name)
-            self._tokenizer = encoding_for_model(self.model_name)
+            print("Loading tokenizer: ", self._model_name)
+            self._tokenizer = encoding_for_model(self._model_name)
         return self._tokenizer

+    @property
+    def model_name(self) -> str:
+        return self._model_name
+

 class LlamaTokenizer(BaseTokenizer):
     def __init__(self, model_name: str):
@@ -84,7 +95,7 @@ def __init__(self, model_name: str):
         except Exception as e:
             ApiLogger.cwarning(str(e))
             self._tokenizer_type = None
-        self.model_name = model_name
+        self._model_name = model_name
         self._tokenizer = None

     def encode(self, message: str, /) -> list[int]:
@@ -99,10 +110,10 @@ def tokenizer(self) -> Union["_LlamaTokenizer", Encoding]:
         try:
             if self._tokenizer_type is None:
                 raise Exception("LlamaTokenizer could not be imported")
-            split_str = self.model_name.split("/")
+            split_str = self._model_name.split("/")

             if len(split_str) == 2:
-                root_path = self.model_name
+                root_path = self._model_name
                 subfolder = None
             elif len(split_str) > 2:
                 root_path = "/".join(split_str[:2])
@@ -114,12 +125,16 @@ def tokenizer(self) -> Union["_LlamaTokenizer", Encoding]:
             self._tokenizer = self._tokenizer_type.from_pretrained(
                 root_path, subfolder=subfolder
             )
-            print("Tokenizer loaded:", self.model_name)
+            print("Tokenizer loaded:", self._model_name)
         except Exception as e:
             ApiLogger.cwarning(str(e))
             self._tokenizer = self.fallback_tokenizer
         return self._tokenizer

+    @property
+    def model_name(self) -> str:
+        return self._model_name
+

 class ExllamaTokenizer(BaseTokenizer):
     def __init__(self, model_name: str):
@@ -131,7 +146,7 @@ def __init__(self, model_name: str):
             self._tokenizer_type = _ExllamaTokenizer
         except Exception:
             self._tokenizer_type = None
-        self.model_name = model_name
+        self._model_name = model_name
         self._tokenizer = None

     def encode(self, message: str, /) -> list[int]:
@@ -154,15 +169,19 @@ def tokenizer(self) -> Union["ExLlamaTokenizer", Encoding]:
                 raise Exception("ExllamaTokenizer could not be imported")
             model_folder_path = Path(
                 resolve_model_path_to_posix(
-                    self.model_name,
+                    self._model_name,
                     default_relative_directory="llama_models/gptq",
                 ),
             )
             self._tokenizer = self._tokenizer_type(
                 (model_folder_path / "tokenizer.model").as_posix(),
             )
-            print("Tokenizer loaded:", self.model_name)
+            print("Tokenizer loaded:", self._model_name)
         except Exception as e:
             ApiLogger.cwarning(str(e))
             self._tokenizer = self.fallback_tokenizer
         return self._tokenizer
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
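
For background: abc.abstractproperty has been deprecated since Python 3.3, and stacking @property over @abstractmethod, as this file now does, is the supported replacement. A minimal standalone sketch of the pattern, with hypothetical class names that are not part of the repository:

from abc import ABC, abstractmethod


class TokenizerBase(ABC):  # hypothetical stand-in for BaseTokenizer
    @property
    @abstractmethod
    def model_name(self) -> str:
        """Concrete tokenizers must expose the model they belong to."""
        ...


class DummyTokenizer(TokenizerBase):  # hypothetical concrete implementation
    def __init__(self, model_name: str):
        self._model_name = model_name

    @property
    def model_name(self) -> str:
        return self._model_name


print(DummyTokenizer("orca_mini_7b").model_name)  # -> orca_mini_7b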
4 changes: 2 additions & 2 deletions app/models/llms.py
@@ -403,7 +403,7 @@ class LLMModels(EnumMixin):
         token_margin=8,
         tokenizer=LlamaTokenizer("Austism/chronos-hermes-13b"),
         model_path="chronos-hermes-13b.ggmlv3.q4_K_M.bin",  # The filename of model. Must end with .bin.
-        chat_turn_prompt=ChatTurnTemplates.ROLE_CONTENT_6,
+        chat_turn_prompt=ChatTurnTemplates.ROLE_CONTENT_1,
         user_chat_roles=UserChatRoles(
             user="User",
             ai="Assistant",
@@ -413,7 +413,7 @@
     orca_mini_7b = ExllamaModel(
         model_path="orca_mini_7b",
         name="orca_mini_7b",
-        max_total_tokens=4096,
+        max_total_tokens=2048,
         max_tokens_per_request=2048,
         token_margin=8,
         tokenizer=ExllamaTokenizer("orca_mini_7b"),
7 changes: 7 additions & 0 deletions app/models/system.py
@@ -52,6 +52,11 @@ def free_memory_from_deque(
     min_free_memory_mb: float = 512,
     logger: Optional["Logger"] = None,
 ) -> None:
+    try:
+        from torch.cuda import empty_cache
+    except Exception:
+        empty_cache = None
+
     # Before creating a new completion generator, check memory usage
     mem_usage_before: Optional[float] = get_total_memory_usage()  # In MB
     if logger is not None and mem_usage_before is not None:
@@ -63,6 +68,8 @@
         # And check memory usage again to see if there is a memory leak
         (deque_object.popleft()).__del__()
         collect()
+        if empty_cache is not None:
+            empty_cache()

     if mem_usage_before is not None:
         mem_usage_after = get_total_memory_usage()
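
For background: the hunks above import torch.cuda.empty_cache behind a try/except so CPU-only installs keep working, and call it only after gc.collect(), so that memory held by garbage-collected tensors is also returned from PyTorch's CUDA caching allocator. A minimal standalone sketch of the same pattern, with hypothetical names that are not part of the repository:

from gc import collect

try:
    # torch may be absent or built without CUDA support; fall back to a no-op.
    from torch.cuda import empty_cache
except Exception:
    empty_cache = None


def release_memory(objects: list) -> None:
    """Drop references, run the GC, then ask CUDA to release cached blocks."""
    objects.clear()            # drop Python references to large tensors/models
    collect()                  # collect now-unreachable objects
    if empty_cache is not None:
        empty_cache()          # return cached CUDA blocks to the driver


if __name__ == "__main__":
    release_memory([bytearray(1024) for _ in range(4)])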
