Update/23.09 #133

Merged · 2 commits · Oct 24, 2023

4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/tensorrt:22.12-py3
+FROM nvcr.io/nvidia/tensorrt:23.08-py3

WORKDIR /app

@@ -16,7 +16,7 @@ COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache pip install -r requirements.txt

# Install auto-gptq
-RUN --mount=type=cache,target=/root/.cache BUILD_CUDA_EXT=0 pip install auto-gptq[triton]==0.2.2
+RUN --mount=type=cache,target=/root/.cache pip install auto-gptq==0.4.2+cu118 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118

COPY . .

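Note on the auto-gptq change: the 0.4.2 wheel pulled from the autogptq-index is prebuilt against CUDA 11.8, so the `BUILD_CUDA_EXT=0` source build and the `[triton]` extra are dropped. A quick smoke test (a sketch, not part of this PR) to run inside the built image:

```python
# Hypothetical smoke test: confirm the prebuilt auto-gptq wheel imports cleanly
# inside the updated image, e.g. via `docker run --rm <image> python3 smoke.py`.
import auto_gptq

print(auto_gptq.__version__)  # expect 0.4.2 (the cu118 build)
```
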
3 changes: 2 additions & 1 deletion docker-compose-cpu.yml
@@ -15,13 +15,14 @@ services:
      - ${LISTEN_IP}:${MEDIA_PORT_RANGE}:${MEDIA_PORT_RANGE}
    volumes:
      - ./:/app
+      - ./cache:/root/.cache
    command: ./entrypoint.sh

  nginx:
    restart: unless-stopped
    depends_on:
      - wis
-    image: nginx:1.25.0
+    image: nginx:1.25.2
    volumes:
      - ./nginx:/nginx
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
3 changes: 2 additions & 1 deletion docker-compose.yml
@@ -21,13 +21,14 @@ services:
              capabilities: [gpu]
    volumes:
      - ./:/app
+      - ./cache:/root/.cache
    command: ./entrypoint.sh

  nginx:
    restart: unless-stopped
    depends_on:
      - wis
-    image: nginx:1.25.0
+    image: nginx:1.25.2
    volumes:
      - ./nginx:/nginx
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
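Reviewer note: the new `./cache:/root/.cache` bind mount, added to both compose files, persists pip and Hugging Face Hub downloads across container recreations; the matching top-level `cache` directory is created by the `mkdir -p` addition in utils.sh at the end of this diff.
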
25 changes: 5 additions & 20 deletions main.py
@@ -216,12 +216,6 @@ async def create_datagram_endpoint(self, protocol_factory, local_addr: Tuple[str
# Path to chatbot model
chatbot_model_path = settings.chatbot_model_path

-# Chatbot model basename
-chatbot_model_basename = settings.chatbot_model_basename
-
-# Chatbot device
-chatbot_device = settings.chatbot_device
-
# Chatbot temperature
chatbot_temperature = settings.chatbot_temperature

@@ -392,19 +386,14 @@ def load_models() -> Models:

    if support_chatbot and device == "cuda":
        logger.info(f'CHATBOT: Using model {chatbot_model_path} and CUDA, attempting load (this takes a while)...')
-        from transformers import AutoTokenizer
-        from auto_gptq import AutoGPTQForCausalLM
+        from transformers import AutoTokenizer, AutoModelForCausalLM

        chatbot_tokenizer = AutoTokenizer.from_pretrained(chatbot_model_path, use_fast=True)

        # load quantized model, currently only support single gpu
-        chatbot_model = AutoGPTQForCausalLM.from_quantized(chatbot_model_path,
-                                                           model_basename=chatbot_model_basename,
-                                                           use_safetensors=True,
-                                                           trust_remote_code=False,
-                                                           device=chatbot_device,
-                                                           use_triton=True,
-                                                           quantize_config=None)
+        chatbot_model = AutoModelForCausalLM.from_pretrained(chatbot_model_path,
+                                                             torch_dtype=torch.float16,
+                                                             device_map="auto")

    else:
        chatbot_tokenizer = None
@@ -437,10 +426,6 @@ def warm_models():
logger.info("Warming TTS... This takes a while on first run.")
do_tts("Hello from Willow")

# Warm chatbot once
if models.chatbot_model is not None:
logger.info("Warming chatbot... This takes a while on first run.")
do_chatbot("Tell me about AI")
else:
logger.info("Skipping warm_models for CPU")
return
@@ -454,7 +439,7 @@ def do_chatbot(text, max_new_tokens=chatbot_max_new_tokens, temperature=chatbot_
    prompt = f'''USER: {text}
ASSISTANT:'''
    logger.debug(f'CHATBOT: Pipeline parameters are max_new_tokens {max_new_tokens} temperature {temperature}'
-                 f'top_p {top_p} repetition_penalty {repetition_penalty}')
+                 f' top_p {top_p} repetition_penalty {repetition_penalty}')
    chatbot_pipeline = transformers.pipeline(
        "text-generation",
        model=models.chatbot_model,
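Context for the load_models() change: with the transformers and optimum versions pinned in requirements.txt, `from_pretrained` detects the GPTQ quantization config shipped in the model repo and loads the quantized weights through auto-gptq on its own, so the explicit `AutoGPTQForCausalLM.from_quantized(...)` call, along with the `chatbot_model_basename` and `chatbot_device` settings that fed it, is no longer needed. A minimal standalone sketch of the new path, assuming a CUDA host and the default model from settings.py:

```python
# Sketch of the simplified chatbot load path; mirrors the diff above but standalone.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_path = "TheBloke/vicuna-13b-v1.3.0-GPTQ"  # default chatbot_model_path

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
# The GPTQ quantization config ships with the repo; device_map="auto"
# places the already-quantized weights across the available GPUs.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             torch_dtype=torch.float16,
                                             device_map="auto")

chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(chatbot("USER: Tell me about AI\nASSISTANT:", max_new_tokens=64)[0]["generated_text"])
```
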
3 changes: 2 additions & 1 deletion nginx/nginx.conf
@@ -49,7 +49,8 @@ http {

    server {
        listen 19001;
-        listen 19000 ssl http2;
+        listen 19000 ssl;
+        http2 on;
        server_name wis;
        ssl_certificate /nginx/cert.pem;
        ssl_certificate_key /nginx/key.pem;
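Reviewer note: nginx 1.25.1 deprecated the `http2` parameter of the `listen` directive in favor of the standalone `http2 on;` directive, so this server-block change and the image bump to nginx:1.25.2 in the compose files belong together.
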
152 changes: 79 additions & 73 deletions requirements.txt
@@ -1,100 +1,104 @@
-accelerate==0.21.0
-aiofiles==23.1.0
+accelerate==0.22.0
+aiofiles==23.2.1
aiohttp==3.8.5
aioice==0.9.0
aiortc==1.5.0
aiosignal==1.3.1
-altair==5.0.1
-anyio==3.7.0
+altair==5.1.1
+anyio==3.7.1
appdirs==1.4.4
-async-timeout==4.0.2
+async-timeout==4.0.3
attrs==23.1.0
audioread==3.0.0
av==10.0.0
-certifi==2023.7.22
+certifi==2022.12.7
cffi==1.15.1
charset-normalizer==2.1.1
-click==8.1.3
+click==8.1.7
cmake==3.25.0
coloredlogs==15.0.1
colorlog==6.7.0
contourpy==1.1.0
-cryptography==41.0.2
-ctranslate2==3.19.0
+cryptography==41.0.3
+ctranslate2==3.20.0
cycler==0.11.0
-datasets==2.13.0
+datasets==2.14.5
decorator==5.1.1
-dill==0.3.6
-dnspython==2.3.0
+dill==0.3.7
+dnspython==2.4.2
docker-pycreds==0.4.0
docopt==0.6.2
email-validator==2.0.0.post2
entrypoints==0.4
-exceptiongroup==1.1.1
-fastapi==0.97.0
-ffmpy==0.3.0
+exceptiongroup==1.1.3
+fastapi==0.103.1
+ffmpy==0.3.1
filelock==3.9.0
-fonttools==4.40.0
-frozenlist==1.3.3
+fonttools==4.42.1
+frozenlist==1.4.0
fsspec==2023.6.0
gitdb==4.0.10
-GitPython==3.1.32
+GitPython==3.1.35
google-crc32c==1.5.0
-gunicorn==20.1.0
+gunicorn==21.2.0
h11==0.14.0
-httpcore==0.17.2
-httptools==0.5.0
+httpcore==0.17.3
+httptools==0.6.0
httpx==0.24.1
-huggingface-hub==0.15.1
+huggingface-hub==0.16.4
humanfriendly==10.0
HyperPyYAML==1.2.1
idna==3.4
ifaddr==0.2.0
-importlib-metadata==6.6.0
-importlib-resources==5.12.0
+importlib-metadata==6.8.0
+importlib-resources==6.0.1
itsdangerous==2.1.2
Jinja2==3.1.2
-joblib==1.2.0
-jsonschema==4.17.3
-kiwisolver==1.4.4
-lazy_loader==0.2
-librosa==0.10.0.post2
+joblib==1.3.2
+jsonschema==4.19.0
+jsonschema-specifications==2023.7.1
+kiwisolver==1.4.5
+lazy_loader==0.3
+librosa==0.10.1
linkify-it-py==2.0.2
lit==15.0.7
-llvmlite==0.40.0
+llvmlite==0.40.1
Mako==1.2.4
-markdown-it-py==2.2.0
-markdown2==2.4.8
-MarkupSafe==2.1.1
-matplotlib==3.7.1
-mdit-py-plugins==0.3.3
+markdown-it-py==3.0.0
+markdown2==2.4.10
+MarkupSafe==2.1.3
+matplotlib==3.7.2
+mdit-py-plugins==0.4.0
mdurl==0.1.2
-mpmath==1.3.0
+mpmath==1.2.1
msgpack==1.0.5
multidict==6.0.4
-multiprocess==0.70.14
+multiprocess==0.70.15
networkx==3.0
-nh3==0.2.13
+nh3==0.2.14
num2words==0.5.12
-numba==0.57.0
+numba==0.57.1
numpy==1.23.5
-orjson==3.9.1
+optimum==1.13.1
+orjson==3.9.7
packaging==23.1
-pandas==2.0.2
+pandas==2.1.0
pathtools==0.1.2
-peft==0.3.0
-Pillow==9.3.0
+peft==0.5.0
+Pillow==10.0.0
pkgutil_resolve_name==1.3.10
-platformdirs==2.6.0
-pooch==1.6.0
-prompt-toolkit==3.0.38
-protobuf==4.21.12
+platformdirs==3.10.0
+pooch==1.7.0
+prompt-toolkit==3.0.39
+protobuf==4.23.4
psutil==5.9.5
-pyarrow==12.0.1
+pyarrow==13.0.0
pycparser==2.21
pycuda==2022.2.2
-pydantic==1.10.9
+pydantic==1.10.12
pydub==0.25.1
-pyee==10.0.1
-Pygments==2.15.1
+pyee==11.0.0
+Pygments==2.16.1
pylibsrtp==0.8.0
pyOpenSSL==23.2.0
pyparsing==3.0.9
@@ -104,54 +108,56 @@ pyston-lite-autoload==2.3.5
python-dateutil==2.8.2
python-dotenv==1.0.0
python-multipart==0.0.6
-pytools==2022.1.13
-pytz==2023.3
-PyYAML==6.0
-regex==2023.6.3
+pytools==2023.1.1
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.30.2
+regex==2023.8.8
requests==2.31.0
-responses==0.23.1
+responses==0.23.3
rfc3986==2.0.0
-rich==13.4.2
+rich==13.5.2
rouge==1.0.1
+rpds-py==0.10.2
ruamel.yaml==0.17.28
ruamel.yaml.clib==0.2.7
-safetensors==0.3.1
-scikit-learn==1.2.2
-scipy==1.10.1
+safetensors==0.3.3
+scikit-learn==1.3.0
+scipy==1.11.2
semantic-version==2.10.0
sentencepiece==0.1.99
-sentry-sdk==1.25.1
+sentry-sdk==1.30.0
setproctitle==1.3.2
shortuuid==1.0.11
six==1.16.0
smmap==5.0.0
sniffio==1.3.0
soundfile==0.12.1
-soxr==0.3.5
-speechbrain==0.5.14
+soxr==0.3.6
+speechbrain==0.5.15
starlette==0.27.0
svgwrite==1.4.3
sympy==1.11.1
-threadpoolctl==3.1.0
+threadpoolctl==3.2.0
tiktoken==0.4.0
tokenizers==0.13.3
toolz==0.12.0
-tqdm==4.65.0
-transformers==4.31.0
+tqdm==4.66.1
+transformers==4.33.1
triton==2.0.0
-types-PyYAML==6.0.12.10
-typing_extensions==4.4.0
+types-PyYAML==6.0.12.11
+typing_extensions==4.7.1
tzdata==2023.3
uc-micro-py==1.0.2
ujson==5.8.0
urllib3==1.26.13
-uvicorn==0.22.0
+uvicorn==0.23.2
uvloop==0.17.0
-wandb==0.15.4
-watchfiles==0.19.0
+wandb==0.15.10
+watchfiles==0.20.0
wavedrom==2.0.3.post3
wcwidth==0.2.6
websockets==11.0.3
-xxhash==3.2.0
+xxhash==3.3.0
yarl==1.9.2
-zipp==3.15.0
+zipp==3.16.2
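A few of these bumps are load-bearing rather than routine: `optimum==1.13.1` is newly added and `transformers` moves to 4.33.1, which together provide the native GPTQ loading that the simplified `from_pretrained` call in main.py relies on, while `jsonschema-specifications`, `referencing`, and `rpds-py` arrive as new dependencies of jsonschema 4.19.
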
8 changes: 1 addition & 7 deletions settings.py
@@ -62,13 +62,7 @@ class APISettings(BaseSettings):
    support_chatbot: bool = False

    # Path to chatbot model - download from HuggingFace at runtime by default (gets cached)
-    chatbot_model_path: str = 'TheBloke/vicuna-13b-v1.3-GPTQ'
-
-    # Chatbot model basename
-    chatbot_model_basename: str = 'vicuna-13b-v1.3-GPTQ-4bit-128g.no-act.order'
-
-    # Chatbot device
-    chatbot_device: str = 'cuda:0'
+    chatbot_model_path: str = 'TheBloke/vicuna-13b-v1.3.0-GPTQ'

    # Chatbot pipeline default temperature
    chatbot_temperature: float = 0.7
2 changes: 1 addition & 1 deletion utils.sh
@@ -153,7 +153,7 @@ dep_check() {
    fi

    # Make sure we have it just in case
-    mkdir -p speakers/custom_tts speakers/voice_auth nginx/cache
+    mkdir -p speakers/custom_tts speakers/voice_auth nginx/cache cache

    # Check for new certs
    if [ ! -r nginx/cert.pem ] || [ ! -r nginx/key.pem ]; then