Skip to content

Commit

Permalink
feat: ktransformers backend service
Browse files Browse the repository at this point in the history
  • Loading branch information
av committed Sep 17, 2024
1 parent 4e84a3d commit d60382b
Show file tree
Hide file tree
Showing 17 changed files with 238 additions and 23 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Harbor is a containerized LLM toolkit that allows you to run LLMs and additional

##### Backends

[Ollama](https://github.com/av/harbor/wiki/2.2.1-Backend:-Ollama) ⦁︎ [llama.cpp](https://github.com/av/harbor/wiki/2.2.2-Backend:-llama.cpp) ⦁︎ [vLLM](https://github.com/av/harbor/wiki/2.2.3-Backend:-vLLM) ⦁︎ [TabbyAPI](https://github.com/av/harbor/wiki/2.2.4-Backend:-TabbyAPI) ⦁︎ [Aphrodite Engine](https://github.com/av/harbor/wiki/2.2.5-Backend:-Aphrodite-Engine) ⦁︎ [mistral.rs](https://github.com/av/harbor/wiki/2.2.6-Backend:-mistral.rs) ⦁︎ [openedai-speech](https://github.com/av/harbor/wiki/2.2.7-Backend:-openedai-speech) ⦁︎ [Parler](https://github.com/av/harbor/wiki/2.2.8-Backend:-Parler) ⦁︎ [text-generation-inference](https://github.com/av/harbor/wiki/2.2.9-Backend:-text-generation-inference) ⦁︎ [LMDeploy](https://github.com/av/harbor/wiki/2.2.10-Backend:-lmdeploy) ⦁︎ [AirLLM](https://github.com/av/harbor/wiki/2.2.11-Backend:-AirLLM) ⦁︎ [SGLang](https://github.com/av/harbor/wiki/2.2.12-Backend:-SGLang)
[Ollama](https://github.com/av/harbor/wiki/2.2.1-Backend:-Ollama) ⦁︎ [llama.cpp](https://github.com/av/harbor/wiki/2.2.2-Backend:-llama.cpp) ⦁︎ [vLLM](https://github.com/av/harbor/wiki/2.2.3-Backend:-vLLM) ⦁︎ [TabbyAPI](https://github.com/av/harbor/wiki/2.2.4-Backend:-TabbyAPI) ⦁︎ [Aphrodite Engine](https://github.com/av/harbor/wiki/2.2.5-Backend:-Aphrodite-Engine) ⦁︎ [mistral.rs](https://github.com/av/harbor/wiki/2.2.6-Backend:-mistral.rs) ⦁︎ [openedai-speech](https://github.com/av/harbor/wiki/2.2.7-Backend:-openedai-speech) ⦁︎ [Parler](https://github.com/av/harbor/wiki/2.2.8-Backend:-Parler) ⦁︎ [text-generation-inference](https://github.com/av/harbor/wiki/2.2.9-Backend:-text-generation-inference) ⦁︎ [LMDeploy](https://github.com/av/harbor/wiki/2.2.10-Backend:-lmdeploy) ⦁︎ [AirLLM](https://github.com/av/harbor/wiki/2.2.11-Backend:-AirLLM) ⦁︎ [SGLang](https://github.com/av/harbor/wiki/2.2.12-Backend:-SGLang) ⦁︎ [KTransformers](https://github.com/av/harbor/wiki/2.2.13-Backend:-KTransformers)

##### Satellites

Expand All @@ -35,7 +35,7 @@ harbor up searxng

# Run additional/alternative LLM Inference backends
# Open Webui is automatically connected to them.
harbor up llamacpp tgi litellm vllm tabbyapi aphrodite sglang
harbor up llamacpp tgi litellm vllm tabbyapi aphrodite sglang ktransformers

# Run different Frontends
harbor up librechat chatui bionicgpt hollama
Expand Down Expand Up @@ -134,7 +134,7 @@ harbor how to ping ollama container from the webui?

- Docker
- _Optional_ [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation)
- Only a few services have ARM builds - beware
- Note that not all services have native ARM builds and hence are unsupported on macOS
- git
- bash-compatible shell

Expand Down
8 changes: 8 additions & 0 deletions aichat/configs/aichat.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# aichat client definition for the Harbor ktransformers backend.
clients:
  - type: openai-compatible
    name: ktransformers
    # Service-internal address on the shared Docker network (port 12456
    # matches the container port in compose.ktransformers.yml)
    api_base: http://ktransformers:12456/v1
    # Placeholder key — presumably not validated by the backend; confirm
    api_key: sk-ktransformers
    models:
      - name: ${HARBOR_AICHAT_MODEL}

4 changes: 4 additions & 0 deletions aider/configs/aider.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# aider configuration for the Harbor ktransformers backend.
openai-api-base: http://ktransformers:12456/v1
# Placeholder key — presumably not validated by the backend; confirm
openai-api-key: sk-ktransformers
# "openai/" prefix routes the model through aider's OpenAI-compatible provider
model: openai/${HARBOR_AIDER_MODEL}
# Service-internal traffic over the Docker network — no TLS
verify-ssl: false
25 changes: 25 additions & 0 deletions compose.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# KTransformers backend service definition.
services:
  ktransformers:
    container_name: ${HARBOR_CONTAINER_PREFIX}.ktransformers
    env_file:
      - ./.env
      - ./ktransformers/override.env
    ipc: host
    build:
      context: ./ktransformers
      dockerfile: Dockerfile
    ports:
      # Quoted: unquoted HOST:CONTAINER mappings can be misparsed by YAML
      # (sexagesimal/number trap) — always quote Compose port mappings.
      - "${HARBOR_KTRANSFORMERS_HOST_PORT}:12456"
    volumes:
      - ${HARBOR_HF_CACHE}:/root/.cache/huggingface
      - ${HARBOR_LLAMACPP_CACHE}:/root/.cache/llama.cpp
      # Monkey-patch to make compatible with Open WebUI
      - ./ktransformers/chat.py:/opt/conda/lib/python3.10/site-packages/ktransformers/server/api/openai/endpoints/chat.py
    environment:
      - HF_TOKEN=${HARBOR_HF_TOKEN}
    networks:
      - harbor-network
    # Args appended to the image ENTRYPOINT ("ktransformers")
    command: >
      --model_path ${HARBOR_KTRANSFORMERS_MODEL}
      --gguf_path ${HARBOR_KTRANSFORMERS_GGUF}
      ${HARBOR_KTRANSFORMERS_EXTRA_ARGS}
4 changes: 4 additions & 0 deletions compose.x.aichat.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Cross-service overlay: mounts the aichat config for the ktransformers
# backend into the aichat container.
services:
  aichat:
    volumes:
      - ./aichat/configs/aichat.ktransformers.yml:/app/configs/ktransformers.yml
4 changes: 4 additions & 0 deletions compose.x.aider.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Cross-service overlay: mounts the aider config for the ktransformers
# backend into the aider container.
services:
  aider:
    volumes:
      - ./aider/configs/aider.ktransformers.yml:/root/.aider/ktransformers.yml
9 changes: 9 additions & 0 deletions compose.x.ktransformers.nvidia.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# NVIDIA GPU overlay for the ktransformers service.
services:
  ktransformers:
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all visible NVIDIA GPUs for this container
            - driver: nvidia
              count: all
              capabilities: [gpu]
4 changes: 4 additions & 0 deletions compose.x.webui.ktransformers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Cross-service overlay: mounts the Open WebUI config for the
# ktransformers backend into the webui container.
services:
  webui:
    volumes:
      - ./open-webui/configs/config.ktransformers.json:/app/configs/config.ktransformers.json
87 changes: 69 additions & 18 deletions harbor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,26 @@ show_help() {
echo " cmd <handle> - Print the docker compose command"
echo
echo "Setup Management Commands:"
echo " webui - Configure Open WebUI Service"
echo " llamacpp - Configure llamacpp service"
echo " tgi - Configure text-generation-inference service"
echo " litellm - Configure LiteLLM service"
echo " openai - Configure OpenAI API keys and URLs"
echo " vllm - Configure VLLM service"
echo " aphrodite - Configure Aphrodite service"
echo " tabbyapi - Configure TabbyAPI service"
echo " mistralrs - Configure mistral.rs service"
echo " cfd - Run cloudflared CLI"
echo " airllm - Configure AirLLM service"
echo " txtai - Configure txtai service"
echo " chatui - Configure HuggingFace ChatUI service"
echo " comfyui - Configure ComfyUI service"
echo " parler - Configure Parler service"
echo " sglang - Configure SGLang CLI"
echo " omnichain - Work with Omnichain service"
echo " webui - Configure Open WebUI Service"
echo " llamacpp - Configure llamacpp service"
echo " tgi - Configure text-generation-inference service"
echo " litellm - Configure LiteLLM service"
echo " openai - Configure OpenAI API keys and URLs"
echo " vllm - Configure VLLM service"
echo " aphrodite - Configure Aphrodite service"
echo " tabbyapi - Configure TabbyAPI service"
echo " mistralrs - Configure mistral.rs service"
echo " cfd - Run cloudflared CLI"
echo " airllm - Configure AirLLM service"
echo " txtai - Configure txtai service"
echo " chatui - Configure HuggingFace ChatUI service"
echo " comfyui - Configure ComfyUI service"
echo " parler - Configure Parler service"
echo " sglang - Configure SGLang CLI"
echo " omnichain - Work with Omnichain service"
echo " jupyter - Configure Jupyter service"
echo " ol1 - Configure ol1 service"
echo " ktransformers - Configure ktransformers service"
echo
echo "Service CLIs:"
echo " ollama - Run Ollama CLI (docker). Service should be running."
Expand Down Expand Up @@ -1589,6 +1592,8 @@ fix_fs_acl() {
docker_fsacl ./bionicgpt
docker_fsacl ./omnichain
docker_fsacl ./bench
docker_fsacl ./jupyter
docker_fsacl ./ktransformers

docker_fsacl $(eval echo "$(env_manager get hf.cache)")
docker_fsacl $(eval echo "$(env_manager get vllm.cache)")
Expand Down Expand Up @@ -3023,12 +3028,54 @@ run_ol1_command() {
esac
}

# Harbor CLI handler for the ktransformers service configuration.
# Manages Harbor env settings only — this is not the KTransformers CLI.
run_ktransformers_command() {
    case "$1" in
        model)
            shift
            # Value passed as --model_path in the compose command
            env_manager_alias ktransformers.model "$@"
            return 0
            ;;
        gguf)
            shift
            # Value passed as --gguf_path; NOTE(review): uses env_manager_dict
            # while the other settings use env_manager_alias — confirm intended
            env_manager_dict ktransformers.gguf "$@"
            return 0
            ;;
        version)
            shift
            env_manager_alias ktransformers.version "$@"
            return 0
            ;;
        image)
            shift
            env_manager_alias ktransformers.image "$@"
            return 0
            ;;
        args)
            shift
            env_manager_alias ktransformers.args "$@"
            return 0
            ;;
        -h|--help|help)
            echo "Please note that this is not KTransformers CLI, but a Harbor CLI to manage KTransformers service."
            echo
            echo "Usage: harbor ktransformers <command>"
            echo
            echo "Commands:"
            echo "  harbor ktransformers model [user/repo] - Get or set --model_path for KTransformers"
            echo "  harbor ktransformers gguf [args]       - Get or set --gguf_path for KTransformers"
            echo "  harbor ktransformers version [version] - Get or set KTransformers version"
            echo "  harbor ktransformers image [image]     - Get or set KTransformers image"
            echo "  harbor ktransformers args [args]       - Get or set extra args to pass to KTransformers"
            ;;
        *)
            # Previously an unknown subcommand fell through silently with
            # exit code 0; report it and fail instead.
            echo "Unknown command: $1"
            echo "Run 'harbor ktransformers --help' for usage."
            return 1
            ;;
    esac
}

# ========================================================================
# == Main script
# ========================================================================

# Globals
version="0.1.24"
version="0.1.25"
harbor_repo_url="https://github.com/av/harbor.git"
delimiter="|"
scramble_exit_code=42
Expand Down Expand Up @@ -3285,6 +3332,10 @@ main_entrypoint() {
shift
run_ol1_command "$@"
;;
ktransformers)
shift
run_ktransformers_command "$@"
;;
tunnel|t)
shift
establish_tunnel "$@"
Expand Down
18 changes: 18 additions & 0 deletions http-catalog/ktransformers.http
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Request catalog for the ktransformers service.
# Port must match HARBOR_KTRANSFORMERS_HOST_PORT (34121 by default).
@host = http://localhost:34121

###

# List available models (served by the Harbor monkey-patch in
# ktransformers/chat.py)
curl {{host}}/v1/models

###

# Basic chat completion; the key is a dummy bearer token
curl {{host}}/v1/chat/completions -H 'Content-Type: application/json' -H "Authorization: Bearer sk-fake" -d '{
  "model": "anything",
  "messages": [
    {
      "role": "user",
      "content": "Bobby was born in Paris. How old is Bobby?"
    }
  ],
  "max_tokens": 30
}'
16 changes: 16 additions & 0 deletions ktransformers/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Base image for some other Harbor services, reusing
ARG HARBOR_KTRANSFORMERS_IMAGE=pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel

FROM ${HARBOR_KTRANSFORMERS_IMAGE}

# Re-declared after FROM so it is visible in this build stage
ARG HARBOR_KTRANSFORMERS_VERSION="0.1.4"
# key=value is the non-deprecated ENV form
ENV CUDA_HOME=/usr/local/cuda

WORKDIR /app

# Install and clean up in one layer so the apt lists don't bloat the image
RUN apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install numpy cpufeature
RUN pip install flash_attn
# Prebuilt wheel pinned to cu121 / torch 2.3 / AVX2 / Python 3.10 —
# must match the base image above
RUN pip install https://github.com/kvcache-ai/ktransformers/releases/download/v${HARBOR_KTRANSFORMERS_VERSION}/ktransformers-${HARBOR_KTRANSFORMERS_VERSION}+cu121torch23avx2-cp310-cp310-linux_x86_64.whl --no-build-isolation

# Make the CUDA compat libraries visible to the dynamic linker
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

ENTRYPOINT [ "ktransformers" ]
51 changes: 51 additions & 0 deletions ktransformers/chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Monkey-patch to include a "/v1/models" endpoint to make it compatible
# with Open WebUI

import json
from time import time
from uuid import uuid4
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import chat_stream_response
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate,ChatCompletionChunk,ChatCompletionObject
from ktransformers.server.backend.base import BackendInterfaceBase

router = APIRouter()

@router.get('/models', tags=['openai'])
async def models():
    """Minimal OpenAI-compatible model listing.

    Open WebUI probes ``/v1/models`` when connecting; the upstream server
    does not expose this route, so a static single-entry list is returned.
    The ``created`` timestamp is a fixed placeholder.
    """
    model_entry = {
        "id": "ktransformers",
        "object": "model",
        "created": 1234567890,
        "owned_by": "organization",
        "permission": [],
    }
    return {"object": "list", "data": [model_entry]}

@router.post('/chat/completions', tags=['openai'])
async def chat_completion(request: Request, create: ChatCompletionCreate):
    """OpenAI-compatible chat completion endpoint.

    Streams ``ChatCompletionChunk`` objects when ``create.stream`` is set;
    otherwise accumulates all generated tokens into one
    ``ChatCompletionObject`` and returns it.
    """
    completion_id = str(uuid4())

    interface: BackendInterfaceBase = get_interface()
    # input_ids = interface.format_and_tokenize_input_ids(completion_id, messages=create.get_tokenizer_messages())

    # Round-trip through JSON to hand the backend plain dicts
    input_message = [json.loads(m.model_dump_json()) for m in create.messages]

    if create.stream:
        async def inner():
            chunk = ChatCompletionChunk(
                id=completion_id,
                object='chat.completion.chunk',
                created=int(time()),
            )
            async for token in interface.inference(input_message, completion_id):
                chunk.set_token(token)
                yield chunk
        return chat_stream_response(request, inner())
    else:
        # Fix: non-streamed completions use object type "chat.completion";
        # the original passed "chat.completion.chunk" here, which OpenAI
        # clients do not expect for a non-streaming response.
        comp = ChatCompletionObject(
            id=completion_id,
            object='chat.completion',
            created=int(time()),
        )
        async for token in interface.inference(input_message, completion_id):
            comp.append_token(token)
        return comp
2 changes: 2 additions & 0 deletions ktransformers/override.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# You can provide additional
# environment variables here
2 changes: 1 addition & 1 deletion open-webui/configs/config.airllm.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
open-webui/configs/config.airllm.json{
{
"openai": {
"api_base_urls": [
"http://airllm:5000/v1"
Expand Down
11 changes: 11 additions & 0 deletions open-webui/configs/config.ktransformers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"openai": {
"api_base_urls": [
"http://ktransformers:12456/v1"
],
"api_keys": [
"sk-ktransformers"
],
"enabled": true
}
}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@av/harbor",
"version": "0.1.24",
"version": "0.1.25",
"bin": {
"harbor": "./bin/harbor"
}
Expand Down
8 changes: 8 additions & 0 deletions profiles/default.env
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,14 @@ HARBOR_OL1_HOST_PORT=34111
HARBOR_OL1_MODEL="llama3.1:8b"
HARBOR_OL1_ARGS="temperature=0.2"

# ktransformers
# Host port mapped to the service's internal port 12456
HARBOR_KTRANSFORMERS_HOST_PORT=34121
# KTransformers wheel version installed by the Dockerfile
HARBOR_KTRANSFORMERS_VERSION="0.1.4"
# Base image for the ktransformers build
HARBOR_KTRANSFORMERS_IMAGE="pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel"
# Passed as --model_path to KTransformers
HARBOR_KTRANSFORMERS_MODEL=""
# Passed as --gguf_path to KTransformers
HARBOR_KTRANSFORMERS_GGUF=""
# Extra CLI args appended to the service command
HARBOR_KTRANSFORMERS_EXTRA_ARGS=""

# ============================================
# Service Configuration.
# You can specify any of the service's own environment variables here.
Expand Down

0 comments on commit d60382b

Please sign in to comment.