Commit

[WIP] dependency solution
c0sogi authored and c0sogi committed Jul 29, 2023
1 parent 5f38c79 commit e36569f
Showing 25 changed files with 3,613 additions and 87 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -7,4 +7,5 @@ models/gptq/*
.vscode
*.pyc
PRIVATE_*
repositories
repositories
tests/.venv
9 changes: 9 additions & 0 deletions LICENSE.md
@@ -0,0 +1,9 @@
MIT License

Copyright (c) 2023 Andrei Betlen

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3 changes: 1 addition & 2 deletions llama_api/modules/base.py
@@ -3,7 +3,6 @@
from typing import Any, Iterator, TypeVar

from ..mixins.prompt_utils import PromptUtilsMixin
from ..mixins.waiter import WaiterMixin
from ..schemas.api import (
APIChatMessage,
ChatCompletion,
@@ -24,7 +23,7 @@ class BaseLLMModel:
max_total_tokens: int = 2048


class BaseCompletionGenerator(ABC, PromptUtilsMixin, WaiterMixin):
class BaseCompletionGenerator(ABC, PromptUtilsMixin):
"""Base class for all completion generators."""

@abstractmethod
5 changes: 4 additions & 1 deletion llama_api/modules/exllama.py
@@ -1,7 +1,9 @@
"""Wrapper for exllama to generate text completions."""

# flake8: noqa
from ..utils.dependency import install_torch

install_torch()
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional
@@ -15,8 +17,9 @@
make_completion,
make_completion_chunk,
)
from ..utils.dependency import import_repository
from ..utils.logger import ApiLogger
from ..utils.path import import_repository, resolve_model_path_to_posix
from ..utils.path import resolve_model_path_to_posix
from .base import BaseCompletionGenerator

with import_repository(
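install_torch() is called before any torch-dependent import so the wheel is only pulled in when this backend is actually used. The helper lives in llama_api/utils/dependency.py, which is not part of this diff; a minimal sketch of the idea, reusing the pinned version and wheel index from the new Config added later in this commit and assuming a plain pip invocation, might look like this:

# Sketch only -- the real llama_api/utils/dependency.py is not shown in this commit.
import importlib.util
import subprocess
import sys

from ..shared.config import Config


def install_torch() -> None:
    """Install torch lazily so the torch-backed modules stay cheap to import."""
    if importlib.util.find_spec("torch") is not None:
        return  # already available, nothing to do
    subprocess.check_call(
        [
            sys.executable, "-m", "pip", "install",
            "torch" + Config.torch_version,  # e.g. torch==2.0.1
            "-f", Config.torch_source,       # wheel index taken from Config
        ]
    )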
28 changes: 10 additions & 18 deletions llama_api/modules/llama_cpp.py
@@ -15,29 +15,21 @@
convert_text_completion_chunks_to_chat,
convert_text_completion_to_chat,
)
from ..utils.dependency import import_repository
from ..utils.llama_cpp import build_shared_lib
from ..utils.logger import ApiLogger
from ..utils.path import import_repository, resolve_model_path_to_posix
from ..utils.path import resolve_model_path_to_posix
from .base import BaseCompletionGenerator

logger = ApiLogger(__name__)
logger.info("🦙 llama-cpp-python repository found!")
build_shared_lib(logger=logger)
try:
with import_repository(
git_path="https://github.com/abetlen/llama-cpp-python",
disk_path="repositories/llama_cpp",
):
from repositories.llama_cpp import llama_cpp
from repositories.llama_cpp.llama_cpp.llama_cpp import GGML_USE_CUBLAS
except ImportError:
logger.warning(
"🦙 llama-cpp-python repository not found. "
"Falling back to llama-cpp-python submodule."
)

import llama_cpp
from llama_cpp import GGML_USE_CUBLAS as GGML_USE_CUBLAS
with import_repository(
git_path="https://github.com/abetlen/llama-cpp-python",
disk_path="repositories/llama_cpp",
):
from repositories.llama_cpp import llama_cpp
from repositories.llama_cpp.llama_cpp.llama_cpp import GGML_USE_CUBLAS


def _make_logit_bias_processor(
@@ -96,7 +88,7 @@ def _create_completion(
mirostat_mode=settings.mirostat_mode,
mirostat_tau=settings.mirostat_tau,
mirostat_eta=settings.mirostat_eta,
logits_processor=llama_cpp.LogitsProcessorList( # type: ignore
logits_processor=llama_cpp.LogitsProcessorList(
[
_make_logit_bias_processor(
client,
@@ -134,7 +126,7 @@ def _create_chat_completion(
mirostat_mode=settings.mirostat_mode,
mirostat_tau=settings.mirostat_tau,
mirostat_eta=settings.mirostat_eta,
logits_processor=llama_cpp.LogitsProcessorList( # type: ignore
logits_processor=llama_cpp.LogitsProcessorList(
[
_make_logit_bias_processor(
client,
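import_repository has also moved from utils.path into utils.dependency, and the try/except fallback to a locally installed llama_cpp is gone: the vendored checkout under repositories/llama_cpp is now the only code path. The context manager itself is not included in this commit; a rough sketch of the behaviour its call sites imply (clone on first use, then make the checkout importable), with all git and sys.path handling assumed, could be:

# Sketch only -- the real import_repository in llama_api/utils/dependency.py is not shown here.
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator


@contextmanager
def import_repository(git_path: str, disk_path: str) -> Iterator[None]:
    """Clone git_path into disk_path if needed and make it importable."""
    repo = Path(disk_path)
    if not repo.exists():
        subprocess.check_call(
            ["git", "clone", "--recurse-submodules", git_path, str(repo)]
        )
    root = str(repo.resolve().parent.parent)  # directory that contains repositories/
    sys.path.insert(0, root)
    try:
        yield
    finally:
        if root in sys.path:
            sys.path.remove(root)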
7 changes: 7 additions & 0 deletions llama_api/modules/sentence_encoder.py
@@ -1,3 +1,10 @@
"""Wrapper for sentence_encoder to generate text embeddings."""
# flake8: noqa

from ..utils.dependency import install_tensorflow

install_tensorflow()

from typing import TYPE_CHECKING, Callable, Optional

import numpy as np
7 changes: 6 additions & 1 deletion llama_api/modules/transformer.py
@@ -1,6 +1,11 @@
"""Wrapper for transformer to generate text embeddings."""
# flake8: noqa

from ..utils.dependency import install_torch

install_torch()
from gc import collect
from typing import Optional

from torch import Tensor, cuda
from transformers.modeling_outputs import (
BaseModelOutputWithPoolingAndCrossAttentions,
14 changes: 13 additions & 1 deletion llama_api/server/app_settings.py
@@ -1,3 +1,4 @@
from logging import warn
import platform
from contextlib import asynccontextmanager
from os import environ
@@ -7,6 +8,11 @@


def ensure_packages_installed():
"""Install the packages in the requirements.txt file"""
warn(
"This function is deprecated. Use `install_dependencies` instead.",
DeprecationWarning,
)
subprocess.run(
[
sys.executable,
@@ -60,8 +66,14 @@ def set_priority(pid: Optional[int] = None, priority: str = "high"):
def initialize_before_launch(install_packages: bool = False):
"""Initialize the app"""

from ..utils.dependency import install_poetry, install_dependencies

if install_packages:
ensure_packages_installed()
try:
import poetry # noqa: F401
except ImportError:
install_poetry()
install_dependencies()

if platform.system() == "Windows":
set_priority(priority="high")
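ensure_packages_installed() survives only as a deprecated shim; initialize_before_launch() now bootstraps poetry when it is missing and delegates dependency resolution to it. Neither install_poetry nor install_dependencies is shown in this diff; assuming both simply shell out to pip and poetry, a sketch could be:

# Sketch only -- the real helpers in llama_api/utils/dependency.py are not part of this diff.
import subprocess
import sys


def install_poetry() -> None:
    """Install the poetry package manager into the current interpreter."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", "poetry"])


def install_dependencies() -> None:
    """Resolve and install the project's dependencies with poetry."""
    # Assumes `python -m poetry` is runnable once poetry has been pip-installed.
    subprocess.check_call(
        [sys.executable, "-m", "poetry", "install", "--no-interaction"]
    )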
4 changes: 0 additions & 4 deletions llama_api/server/pools/llama.py
@@ -52,10 +52,7 @@ def completion_generator_manager(
):
"""Context manager for completion generators."""
completion_generator = get_completion_generator(body)
completion_generator.wait_until_available()
completion_generator.set_availability(False)
yield completion_generator
completion_generator.set_availability(True)


def get_model_names() -> list[str]:
@@ -114,7 +111,6 @@ def get_completion_generator(

# Before creating a new completion generator, check memory usage
if completion_generators.maxlen == len(completion_generators):
completion_generators[-1].wait_until_available()
free_memory_of_first_item_from_container(
completion_generators,
min_free_memory_mb=256,
2 changes: 1 addition & 1 deletion llama_api/server/routers/v1.py
@@ -62,7 +62,7 @@
@dataclass
class WixMetadata:
key: Optional[str] = None
semaphore: Semaphore = field(default_factory=lambda: Semaphore(1))
semaphore: Semaphore = field(default_factory=lambda: Semaphore(10))


# Worker index (wix) is used to keep track of which worker is currently
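Raising the per-worker-index semaphore from 1 to 10 lets a single worker index carry up to ten in-flight requests instead of serializing them, alongside the removal of the WaiterMixin availability checks earlier in this commit. The concrete Semaphore class imported by v1.py is not visible in this hunk; the effect matches the standard asyncio pattern sketched below (the handler around it is illustrative):

# Illustration of the concurrency cap only -- not the actual router code.
import asyncio
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class WixMetadata:
    key: Optional[str] = None
    semaphore: asyncio.Semaphore = field(default_factory=lambda: asyncio.Semaphore(10))


async def handle_request(meta: WixMetadata) -> None:
    async with meta.semaphore:    # at most ten requests proceed concurrently per worker
        await asyncio.sleep(0.1)  # stand-in for forwarding work to a worker process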
15 changes: 15 additions & 0 deletions llama_api/shared/config.py
@@ -0,0 +1,15 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple


@dataclass(frozen=True)
class Config:
"""Configuration for the project"""

project_root: Path = Path(__file__).parent.parent.parent
env_for_venv: Tuple[str, ...] = ("SYSTEMROOT", "CUDA_HOME", "CUDA_PATH")
cuda_version: str = "11.8"
torch_version: str = "==2.0.1"
torch_source: str = "https://download.pytorch.org/whl/torch_stable.html"
tensorflow_version: str = "==2.13.0"
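The new frozen Config dataclass centralizes what the dependency tooling needs: the project root, the environment variables worth forwarding into a virtualenv, and the pinned CUDA/torch/tensorflow versions. A small usage sketch (the helper below is hypothetical; only the Config fields come from this commit):

# Hypothetical consumer of Config.env_for_venv.
from os import environ

from llama_api.shared.config import Config


def venv_environment() -> dict[str, str]:
    """Keep only the variables a virtualenv subprocess should inherit."""
    return {key: environ[key] for key in Config.env_for_venv if key in environ}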
7 changes: 7 additions & 0 deletions llama_api/utils/concurrency.py
@@ -32,6 +32,13 @@ def init_process_pool(env_vars: dict[str, str]) -> None:
for key, value in env_vars.items():
environ[key] = value

cuda_home = environ.get("CUDA_HOME", None)
cuda_path = environ.get("CUDA_PATH", None)
if cuda_path is not None and cuda_home is None:
environ["CUDA_HOME"] = cuda_path
elif cuda_home is not None and cuda_path is None:
environ["CUDA_PATH"] = cuda_home


def pool() -> ProcessPool:
"""Get the process pool, and initialize it if it's not initialized yet"""
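The added block makes CUDA_HOME and CUDA_PATH mirror each other inside pool workers, so code that checks either variable can locate the CUDA toolkit. A quick illustration using the init_process_pool signature shown above (the path is invented, and this assumes the module imports cleanly on its own):

# Quick illustration of the CUDA_HOME/CUDA_PATH mirroring.
from os import environ

from llama_api.utils.concurrency import init_process_pool

environ.pop("CUDA_HOME", None)
environ["CUDA_PATH"] = "/usr/local/cuda-11.8"  # invented path for the example
init_process_pool({})                # nothing extra to copy in this example
print(environ["CUDA_HOME"])          # -> /usr/local/cuda-11.8, mirrored from CUDA_PATH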
(Diffs for the remaining 13 changed files are not shown.)
