Skip to content

Commit

Permalink
feat: ktransformers backend service
Browse files Browse the repository at this point in the history
  • Loading branch information
av committed Sep 17, 2024
1 parent 4e84a3d commit d60382b
Show file tree
Hide file tree
Showing 17 changed files with 238 additions and 23 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Harbor is a containerized LLM toolkit that allows you to run LLMs and additional

##### Backends

[Ollama](https://github.com/av/harbor/wiki/2.2.1-Backend:-Ollama) ⦁︎ [llama.cpp](https://github.com/av/harbor/wiki/2.2.2-Backend:-llama.cpp) ⦁︎ [vLLM](https://github.com/av/harbor/wiki/2.2.3-Backend:-vLLM) ⦁︎ [TabbyAPI](https://github.com/av/harbor/wiki/2.2.4-Backend:-TabbyAPI) ⦁︎ [Aphrodite Engine](https://github.com/av/harbor/wiki/2.2.5-Backend:-Aphrodite-Engine) ⦁︎ [mistral.rs](https://github.com/av/harbor/wiki/2.2.6-Backend:-mistral.rs) ⦁︎ [openedai-speech](https://github.com/av/harbor/wiki/2.2.7-Backend:-openedai-speech) ⦁︎ [Parler](https://github.com/av/harbor/wiki/2.2.8-Backend:-Parler) ⦁︎ [text-generation-inference](https://github.com/av/harbor/wiki/2.2.9-Backend:-text-generation-inference) ⦁︎ [LMDeploy](https://github.com/av/harbor/wiki/2.2.10-Backend:-lmdeploy) ⦁︎ [AirLLM](https://github.com/av/harbor/wiki/2.2.11-Backend:-AirLLM) ⦁︎ [SGLang](https://github.com/av/harbor/wiki/2.2.12-Backend:-SGLang)
[Ollama](https://github.com/av/harbor/wiki/2.2.1-Backend:-Ollama) ⦁︎ [llama.cpp](https://github.com/av/harbor/wiki/2.2.2-Backend:-llama.cpp) ⦁︎ [vLLM](https://github.com/av/harbor/wiki/2.2.3-Backend:-vLLM) ⦁︎ [TabbyAPI](https://github.com/av/harbor/wiki/2.2.4-Backend:-TabbyAPI) ⦁︎ [Aphrodite Engine](https://github.com/av/harbor/wiki/2.2.5-Backend:-Aphrodite-Engine) ⦁︎ [mistral.rs](https://github.com/av/harbor/wiki/2.2.6-Backend:-mistral.rs) ⦁︎ [openedai-speech](https://github.com/av/harbor/wiki/2.2.7-Backend:-openedai-speech) ⦁︎ [Parler](https://github.com/av/harbor/wiki/2.2.8-Backend:-Parler) ⦁︎ [text-generation-inference](https://github.com/av/harbor/wiki/2.2.9-Backend:-text-generation-inference) ⦁︎ [LMDeploy](https://github.com/av/harbor/wiki/2.2.10-Backend:-lmdeploy) ⦁︎ [AirLLM](https://github.com/av/harbor/wiki/2.2.11-Backend:-AirLLM) ⦁︎ [SGLang](https://github.com/av/harbor/wiki/2.2.12-Backend:-SGLang) ⦁︎ [KTransformers](https://github.com/av/harbor/wiki/2.2.13-Backend:-KTransformers)

##### Satellites

Expand All @@ -35,7 +35,7 @@ harbor up searxng

# Run additional/alternative LLM Inference backends
# Open Webui is automatically connected to them.
harbor up llamacpp tgi litellm vllm tabbyapi aphrodite sglang
harbor up llamacpp tgi litellm vllm tabbyapi aphrodite sglang ktransformers

# Run different Frontends
harbor up librechat chatui bionicgpt hollama
Expand Down Expand Up @@ -134,7 +134,7 @@ harbor how to ping ollama container from the webui?

- Docker
- _Optional_ [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation)
- Only a few services have ARM builds - beware
- Note that not all services have native ARM builds and hence are unsupported on macOS
- git
- bash-compatible shell

Expand Down
8 changes: 8 additions & 0 deletions aichat/configs/aichat.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# aichat client definition for the Harbor ktransformers backend.
clients:
  - type: openai-compatible
    name: ktransformers
    # Service-internal address on the shared Docker network (port 12456
    # matches the container port in compose.ktransformers.yml)
    api_base: http://ktransformers:12456/v1
    # Placeholder key — presumably not validated by the backend; confirm
    api_key: sk-ktransformers
    models:
      - name: ${HARBOR_AICHAT_MODEL}

4 changes: 4 additions & 0 deletions aider/configs/aider.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# aider configuration for the Harbor ktransformers backend.
openai-api-base: http://ktransformers:12456/v1
# Placeholder key — presumably not validated by the backend; confirm
openai-api-key: sk-ktransformers
# "openai/" prefix routes the model through aider's OpenAI-compatible provider
model: openai/${HARBOR_AIDER_MODEL}
# Service-internal traffic over the Docker network — no TLS
verify-ssl: false
25 changes: 25 additions & 0 deletions compose.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# KTransformers backend service definition.
services:
  ktransformers:
    container_name: ${HARBOR_CONTAINER_PREFIX}.ktransformers
    env_file:
      - ./.env
      - ./ktransformers/override.env
    ipc: host
    build:
      context: ./ktransformers
      dockerfile: Dockerfile
    ports:
      # Quoted: unquoted HOST:CONTAINER mappings can be misparsed by YAML
      # (sexagesimal/number trap) — always quote Compose port mappings.
      - "${HARBOR_KTRANSFORMERS_HOST_PORT}:12456"
    volumes:
      - ${HARBOR_HF_CACHE}:/root/.cache/huggingface
      - ${HARBOR_LLAMACPP_CACHE}:/root/.cache/llama.cpp
      # Monkey-patch to make compatible with Open WebUI
      - ./ktransformers/chat.py:/opt/conda/lib/python3.10/site-packages/ktransformers/server/api/openai/endpoints/chat.py
    environment:
      - HF_TOKEN=${HARBOR_HF_TOKEN}
    networks:
      - harbor-network
    # Args appended to the image ENTRYPOINT ("ktransformers")
    command: >
      --model_path ${HARBOR_KTRANSFORMERS_MODEL}
      --gguf_path ${HARBOR_KTRANSFORMERS_GGUF}
      ${HARBOR_KTRANSFORMERS_EXTRA_ARGS}
4 changes: 4 additions & 0 deletions compose.x.aichat.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Cross-service overlay: mounts the aichat config for the ktransformers
# backend into the aichat container.
services:
  aichat:
    volumes:
      - ./aichat/configs/aichat.ktransformers.yml:/app/configs/ktransformers.yml
4 changes: 4 additions & 0 deletions compose.x.aider.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Cross-service overlay: mounts the aider config for the ktransformers
# backend into the aider container.
services:
  aider:
    volumes:
      - ./aider/configs/aider.ktransformers.yml:/root/.aider/ktransformers.yml
9 changes: 9 additions & 0 deletions compose.x.ktransformers.nvidia.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# NVIDIA GPU overlay for the ktransformers service.
services:
  ktransformers:
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all visible NVIDIA GPUs for this container
            - driver: nvidia
              count: all
              capabilities: [gpu]
4 changes: 4 additions & 0 deletions compose.x.webui.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Cross-service overlay: mounts the Open WebUI config for the
# ktransformers backend into the webui container.
services:
  webui:
    volumes:
      - ./open-webui/configs/config.ktransformers.json:/app/configs/config.ktransformers.json
87 changes: 69 additions & 18 deletions harbor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,26 @@ show_help() {
echo " cmd <handle> - Print the docker compose command"
echo
echo "Setup Management Commands:"
echo " webui - Configure Open WebUI Service"
echo " llamacpp - Configure llamacpp service"
echo " tgi - Configure text-generation-inference service"
echo " litellm - Configure LiteLLM service"
echo " openai - Configure OpenAI API keys and URLs"
echo " vllm - Configure VLLM service"
echo " aphrodite - Configure Aphrodite service"
echo " tabbyapi - Configure TabbyAPI service"
echo " mistralrs - Configure mistral.rs service"
echo " cfd - Run cloudflared CLI"
echo " airllm - Configure AirLLM service"
echo " txtai - Configure txtai service"
echo " chatui - Configure HuggingFace ChatUI service"
echo " comfyui - Configure ComfyUI service"
echo " parler - Configure Parler service"
echo " sglang - Configure SGLang CLI"
echo " omnichain - Work with Omnichain service"
echo " webui - Configure Open WebUI Service"
echo " llamacpp - Configure llamacpp service"
echo " tgi - Configure text-generation-inference service"
echo " litellm - Configure LiteLLM service"
echo " openai - Configure OpenAI API keys and URLs"
echo " vllm - Configure VLLM service"
echo " aphrodite - Configure Aphrodite service"
echo " tabbyapi - Configure TabbyAPI service"
echo " mistralrs - Configure mistral.rs service"
echo " cfd - Run cloudflared CLI"
echo " airllm - Configure AirLLM service"
echo " txtai - Configure txtai service"
echo " chatui - Configure HuggingFace ChatUI service"
echo " comfyui - Configure ComfyUI service"
echo " parler - Configure Parler service"
echo " sglang - Configure SGLang CLI"
echo " omnichain - Work with Omnichain service"
echo " jupyter - Configure Jupyter service"
echo " ol1 - Configure ol1 service"
echo " ktransformers - Configure ktransformers service"
echo
echo "Service CLIs:"
echo " ollama - Run Ollama CLI (docker). Service should be running."
Expand Down Expand Up @@ -1589,6 +1592,8 @@ fix_fs_acl() {
docker_fsacl ./bionicgpt
docker_fsacl ./omnichain
docker_fsacl ./bench
docker_fsacl ./jupyter
docker_fsacl ./ktransformers

docker_fsacl $(eval echo "$(env_manager get hf.cache)")
docker_fsacl $(eval echo "$(env_manager get vllm.cache)")
Expand Down Expand Up @@ -3023,12 +3028,54 @@ run_ol1_command() {
esac
}

# Harbor CLI handler for the ktransformers service configuration.
# Manages Harbor env settings only — this is not the KTransformers CLI.
run_ktransformers_command() {
    case "$1" in
        model)
            shift
            # Value passed as --model_path in the compose command
            env_manager_alias ktransformers.model "$@"
            return 0
            ;;
        gguf)
            shift
            # Value passed as --gguf_path; NOTE(review): uses env_manager_dict
            # while the other settings use env_manager_alias — confirm intended
            env_manager_dict ktransformers.gguf "$@"
            return 0
            ;;
        version)
            shift
            env_manager_alias ktransformers.version "$@"
            return 0
            ;;
        image)
            shift
            env_manager_alias ktransformers.image "$@"
            return 0
            ;;
        args)
            shift
            env_manager_alias ktransformers.args "$@"
            return 0
            ;;
        -h|--help|help)
            echo "Please note that this is not KTransformers CLI, but a Harbor CLI to manage KTransformers service."
            echo
            echo "Usage: harbor ktransformers <command>"
            echo
            echo "Commands:"
            echo "  harbor ktransformers model [user/repo] - Get or set --model_path for KTransformers"
            echo "  harbor ktransformers gguf [args]       - Get or set --gguf_path for KTransformers"
            echo "  harbor ktransformers version [version] - Get or set KTransformers version"
            echo "  harbor ktransformers image [image]     - Get or set KTransformers image"
            echo "  harbor ktransformers args [args]       - Get or set extra args to pass to KTransformers"
            ;;
        *)
            # Previously an unknown subcommand fell through silently with
            # exit code 0; report it and fail instead.
            echo "Unknown command: $1"
            echo "Run 'harbor ktransformers --help' for usage."
            return 1
            ;;
    esac
}

# ========================================================================
# == Main script
# ========================================================================

# Globals
version="0.1.24"
version="0.1.25"
harbor_repo_url="https://github.com/av/harbor.git"
delimiter="|"
scramble_exit_code=42
Expand Down Expand Up @@ -3285,6 +3332,10 @@ main_entrypoint() {
shift
run_ol1_command "$@"
;;
ktransformers)
shift
run_ktransformers_command "$@"
;;
tunnel|t)
shift
establish_tunnel "$@"
Expand Down
18 changes: 18 additions & 0 deletions http-catalog/ktransformers.http
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Request catalog for the ktransformers service.
# Port must match HARBOR_KTRANSFORMERS_HOST_PORT (34121 by default).
@host = http://localhost:34121

###

# List available models (served by the Harbor monkey-patch in
# ktransformers/chat.py)
curl {{host}}/v1/models

###

# Basic chat completion; the key is a dummy bearer token
curl {{host}}/v1/chat/completions -H 'Content-Type: application/json' -H "Authorization: Bearer sk-fake" -d '{
  "model": "anything",
  "messages": [
    {
      "role": "user",
      "content": "Bobby was born in Paris. How old is Bobby?"
    }
  ],
  "max_tokens": 30
}'
16 changes: 16 additions & 0 deletions ktransformers/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Base image for some other Harbor services, reusing
ARG HARBOR_KTRANSFORMERS_IMAGE=pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel

FROM ${HARBOR_KTRANSFORMERS_IMAGE}

# Re-declared after FROM so it is visible in this build stage
ARG HARBOR_KTRANSFORMERS_VERSION="0.1.4"
# key=value is the non-deprecated ENV form
ENV CUDA_HOME=/usr/local/cuda

WORKDIR /app

# Install and clean up in one layer so the apt lists don't bloat the image
RUN apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install numpy cpufeature
RUN pip install flash_attn
# Prebuilt wheel pinned to cu121 / torch 2.3 / AVX2 / Python 3.10 —
# must match the base image above
RUN pip install https://github.com/kvcache-ai/ktransformers/releases/download/v${HARBOR_KTRANSFORMERS_VERSION}/ktransformers-${HARBOR_KTRANSFORMERS_VERSION}+cu121torch23avx2-cp310-cp310-linux_x86_64.whl --no-build-isolation

# Make the CUDA compat libraries visible to the dynamic linker
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

ENTRYPOINT [ "ktransformers" ]
51 changes: 51 additions & 0 deletions ktransformers/chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Monkey-patch to include a "/v1/models" endpoint to make it compatible
# with Open WebUI

import json
from time import time
from uuid import uuid4
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import chat_stream_response
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate,ChatCompletionChunk,ChatCompletionObject
from ktransformers.server.backend.base import BackendInterfaceBase

router = APIRouter()

@router.get('/models', tags=['openai'])
async def models():
    """Minimal OpenAI-compatible model listing.

    Open WebUI probes ``/v1/models`` when connecting; the upstream server
    does not expose this route, so a static single-entry list is returned.
    The ``created`` timestamp is a fixed placeholder.
    """
    model_entry = {
        "id": "ktransformers",
        "object": "model",
        "created": 1234567890,
        "owned_by": "organization",
        "permission": [],
    }
    return {"object": "list", "data": [model_entry]}

@router.post('/chat/completions', tags=['openai'])
async def chat_completion(request: Request, create: ChatCompletionCreate):
    """OpenAI-compatible chat completion endpoint.

    Streams ``ChatCompletionChunk`` objects when ``create.stream`` is set;
    otherwise accumulates all generated tokens into one
    ``ChatCompletionObject`` and returns it.
    """
    completion_id = str(uuid4())

    interface: BackendInterfaceBase = get_interface()
    # input_ids = interface.format_and_tokenize_input_ids(completion_id, messages=create.get_tokenizer_messages())

    # Round-trip through JSON to hand the backend plain dicts
    input_message = [json.loads(m.model_dump_json()) for m in create.messages]

    if create.stream:
        async def inner():
            chunk = ChatCompletionChunk(
                id=completion_id,
                object='chat.completion.chunk',
                created=int(time()),
            )
            async for token in interface.inference(input_message, completion_id):
                chunk.set_token(token)
                yield chunk
        return chat_stream_response(request, inner())
    else:
        # Fix: non-streamed completions use object type "chat.completion";
        # the original passed "chat.completion.chunk" here, which OpenAI
        # clients do not expect for a non-streaming response.
        comp = ChatCompletionObject(
            id=completion_id,
            object='chat.completion',
            created=int(time()),
        )
        async for token in interface.inference(input_message, completion_id):
            comp.append_token(token)
        return comp
2 changes: 2 additions & 0 deletions ktransformers/override.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# You can provide additional
# environment variables here
2 changes: 1 addition & 1 deletion open-webui/configs/config.airllm.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
open-webui/configs/config.airllm.json{
{
"openai": {
"api_base_urls": [
"http://airllm:5000/v1"
Expand Down
11 changes: 11 additions & 0 deletions open-webui/configs/config.ktransformers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"openai": {
"api_base_urls": [
"http://ktransformers:12456/v1"
],
"api_keys": [
"sk-ktransformers"
],
"enabled": true
}
}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@av/harbor",
"version": "0.1.24",
"version": "0.1.25",
"bin": {
"harbor": "./bin/harbor"
}
Expand Down
8 changes: 8 additions & 0 deletions profiles/default.env
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,14 @@ HARBOR_OL1_HOST_PORT=34111
HARBOR_OL1_MODEL="llama3.1:8b"
HARBOR_OL1_ARGS="temperature=0.2"

# ktransformers
# Host port mapped to the service's internal port 12456
HARBOR_KTRANSFORMERS_HOST_PORT=34121
# KTransformers wheel version installed by the Dockerfile
HARBOR_KTRANSFORMERS_VERSION="0.1.4"
# Base image for the ktransformers build
HARBOR_KTRANSFORMERS_IMAGE="pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel"
# Passed as --model_path to KTransformers
HARBOR_KTRANSFORMERS_MODEL=""
# Passed as --gguf_path to KTransformers
HARBOR_KTRANSFORMERS_GGUF=""
# Extra CLI args appended to the service command
HARBOR_KTRANSFORMERS_EXTRA_ARGS=""

# ============================================
# Service Configuration.
# You can specify any of the service's own environment variables here.
Expand Down

0 comments on commit d60382b

Please sign in to comment.