diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..4eb2972cba --- /dev/null +++ b/.dockerignore @@ -0,0 +1,21 @@ +# Git +.git +.gitignore + +# Build artifacts +bin +build + +# IDE and OS files +.idea +.vscode +*.DS_Store + +# Local virtual environments +venv + +# Python cache files +__pycache__ + +# Docker files +Dockerfile \ No newline at end of file diff --git a/.github/workflows/ci-pr-checks.yaml b/.github/workflows/ci-pr-checks.yaml index dbf6e3dd98..0bd12a87a3 100644 --- a/.github/workflows/ci-pr-checks.yaml +++ b/.github/workflows/ci-pr-checks.yaml @@ -12,7 +12,7 @@ jobs: check-changes: runs-on: ubuntu-latest outputs: - docs: ${{ steps.filter.outputs.docs }} + src: ${{ steps.filter.outputs.src }} steps: - name: Checkout source uses: actions/checkout@v6 @@ -20,14 +20,19 @@ jobs: id: filter with: filters: | - docs: - - 'README.md' - - 'docs/**' + src: + - '**/*.go' + - '**/*.py' lint-and-test: needs: check-changes - if: ${{ needs.check-changes.outputs.docs == 'false' }} + if: ${{ needs.check-changes.outputs.src == 'true' }} runs-on: ubuntu-latest steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + - name: Checkout source uses: actions/checkout@v6 @@ -43,9 +48,6 @@ jobs: go-version: "${{ env.GO_VERSION }}" cache-dependency-path: ./go.sum - - name: Install dependencies - run: sudo make install-dependencies - - name: Configure CGO for Python run: | PYTHON_INCLUDE=$(python3 -c "import sysconfig; print(sysconfig.get_path('include'))") @@ -57,13 +59,16 @@ jobs: - name: Set PKG_CONFIG_PATH run: echo "PKG_CONFIG_PATH=/usr/lib/pkgconfig" >> $GITHUB_ENV - - name: go mod tidy - run: go mod tidy + - name: Install dependencies + run: | + go mod tidy + sudo -E env "PATH=$PATH" make install-dependencies + sudo -E env "PATH=$PATH" make install-python-deps - name: Run lint checks uses: golangci/golangci-lint-action@v9 with: - version: 'v2.1.6' + version: "v2.1.6" args: 
"--config=./.golangci.yml" env: CGO_ENABLED: ${{ env.CGO_ENABLED }} @@ -74,10 +79,8 @@ jobs: - name: Run make build shell: bash - run: | - make build + run: make build - name: Run make test shell: bash - run: | - make test + run: make test diff --git a/.gitignore b/.gitignore index f94c6c6ce6..0af36bc0c6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ main bin/ +*debug_bin* + # Test binary, built with `go test -c` *.test diff --git a/Dockerfile.epp b/Dockerfile.epp index 915a34a0af..f344914dd8 100644 --- a/Dockerfile.epp +++ b/Dockerfile.epp @@ -1,13 +1,42 @@ ## Minimal runtime Dockerfile (microdnf-only, no torch, wrapper in site-packages) -# Build Stage: using Go 1.24 image -FROM quay.io/projectquay/golang:1.24 AS builder +# Go dependencies stage: download go modules and extract kv-cache +FROM quay.io/projectquay/golang:1.24 AS go-deps + +WORKDIR /workspace + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY cmd/ cmd/ +COPY pkg/ pkg/ + +RUN go mod download + +# Copy the downloaded llm-d-kv-cache Go module tree (including the Python wrapper and setup.sh) +# Resolve the module directory dynamically and copy it to a known location +RUN KV_CACHE_PKG=$(go list -m -f '{{.Dir}}' github.com/llm-d/llm-d-kv-cache) && \ + mkdir -p /workspace/kv-cache && \ + cp -r $KV_CACHE_PKG/* /workspace/kv-cache && \ + chmod +x /workspace/kv-cache/pkg/preprocessing/chat_completions/setup.sh + +FROM python:3.12-slim AS python-builder + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential + +COPY --from=go-deps /workspace/kv-cache /workspace/kv-cache +WORKDIR /workspace/kv-cache +# This runs llm-d-kv-cache's Makefile,
not llm-d-inference-scheduler's +RUN KV_CACHE_PKG=/workspace/kv-cache make install-python-deps + +# Go build stage +FROM quay.io/projectquay/golang:1.24 AS go-builder ARG TARGETOS ARG TARGETARCH ARG PYTHON_VERSION=3.12 - ENV PYTHON=python${PYTHON_VERSION} -ENV PYTHONPATH=/usr/lib64/${PYTHON}/site-packages:/usr/lib/${PYTHON}/site-packages # Install build tools # The builder is based on UBI8, so we need epel-release-8. @@ -16,52 +45,22 @@ RUN dnf install -y 'https://dl.fedoraproject.org/pub/epel/epel-release-latest-8. dnf install -y gcc-c++ libstdc++ libstdc++-devel clang zeromq-devel pkgconfig ${PYTHON}-devel ${PYTHON}-pip git && \ dnf clean all +COPY --from=go-deps /workspace /workspace +COPY --from=go-deps /go/pkg/mod /go/pkg/mod WORKDIR /workspace -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum +COPY Makefile* ./ -# Copy the go source -COPY cmd/ cmd/ -COPY pkg/ pkg/ +COPY --from=python-builder /workspace/kv-cache/pkg/preprocessing/chat_completions /workspace/kv-cache/pkg/preprocessing/chat_completions +RUN make setup-venv +COPY --from=python-builder /workspace/kv-cache/build/venv/lib/python3.12/site-packages /workspace/build/venv/lib/python3.12/site-packages -RUN go mod download +ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace/build/venv/lib/python3.12/site-packages +RUN python3.12 -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed -# Copy Python wrapper and requirements from llm-d-kv-cache-manager dependency -# Extract version dynamically and copy to a known location -# We need to keep llm-d-kv-cache-manager as go module path is kept the old name -RUN KVCACHE_MANAGER_VERSION=$(go list -m -f '{{.Version}}' github.com/llm-d/llm-d-kv-cache-manager) && \ - mkdir -p /workspace/kv-cache && \ - cp /go/pkg/mod/github.com/llm-d/llm-d-kv-cache-manager@${KVCACHE_MANAGER_VERSION}/pkg/preprocessing/chat_completions/render_jinja_template_wrapper.py \ - 
/workspace/kv-cache/render_jinja_template_wrapper.py && \ - cp /go/pkg/mod/github.com/llm-d/llm-d-kv-cache-manager@${KVCACHE_MANAGER_VERSION}/pkg/preprocessing/chat_completions/requirements.txt \ - /workspace/kv-cache/requirements.txt - -# HuggingFace tokenizer bindings (static lib) -RUN mkdir -p lib -# Ensure that the RELEASE_VERSION matches the one used in the imported llm-d-kv-cache-manager version ARG RELEASE_VERSION=v1.22.1 -RUN curl -L https://github.com/daulet/tokenizers/releases/download/${RELEASE_VERSION}/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib -RUN ranlib lib/*.a - -# Build -# the GOARCH has not a default value to allow the binary be built according to the host where the command -# was called. For example, if we call make image-build in a local env which has the Apple Silicon M1 SO -# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, -# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 
-ENV CGO_ENABLED=1 -ENV GOOS=${TARGETOS:-linux} -ENV GOARCH=${TARGETARCH} - - -ARG COMMIT_SHA=unknown -ARG BUILD_REF -RUN CGO_CFLAGS="$(${PYTHON}-config --cflags) -I/workspace/lib" && \ - CGO_LDFLAGS="$(${PYTHON}-config --ldflags --embed) -L/workspace/lib -ltokenizers -ldl -lm" && \ - export CGO_CFLAGS CGO_LDFLAGS && \ - go build -a -o bin/epp -ldflags="-extldflags '-L$(pwd)/lib' -X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" cmd/epp/main.go +RUN TOKENIZER_VERSION=${RELEASE_VERSION} make build-epp # Runtime stage # Use ubi9 as a minimal base image to package the manager binary @@ -69,7 +68,7 @@ RUN CGO_CFLAGS="$(${PYTHON}-config --cflags) -I/workspace/lib" && \ FROM registry.access.redhat.com/ubi9/ubi-minimal:9.7 ARG PYTHON_VERSION=3.12 WORKDIR / -COPY --from=builder /workspace/bin/epp /app/epp +COPY --from=go-builder /workspace/bin/epp /app/epp USER root @@ -87,24 +86,11 @@ RUN curl -L -o /tmp/epel-release.rpm https://dl.fedoraproject.org/pub/epel/epel- ln -sf /usr/bin/${PYTHON} /usr/bin/python3 && \ ln -sf /usr/bin/${PYTHON} /usr/bin/python +# Copy Python kv-cache package and site-packages from the python-builder stage +COPY --from=python-builder /workspace/kv-cache /workspace/kv-cache +ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace/kv-cache/build/venv/lib/python3.12/site-packages +RUN ${PYTHON} -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed -# Install wrapper as a module in site-packages -RUN mkdir -p /usr/local/lib/${PYTHON}/site-packages/ -COPY --from=builder /workspace/kv-cache/render_jinja_template_wrapper.py /usr/local/lib/${PYTHON}/site-packages/ - -# Python deps (no cache, single target) – filter out torch -ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 -COPY --from=builder /workspace/kv-cache/requirements.txt /tmp/requirements.txt -RUN sed '/^torch\b/d' 
/tmp/requirements.txt > /tmp/requirements.notorch.txt && \ - ${PYTHON} -m pip install --no-cache-dir --upgrade pip setuptools wheel && \ - ${PYTHON} -m pip install --no-cache-dir --target /usr/local/lib/${PYTHON}/site-packages -r /tmp/requirements.notorch.txt && \ - ${PYTHON} -m pip install --no-cache-dir --target /usr/local/lib/${PYTHON}/site-packages PyYAML && \ - rm /tmp/requirements.txt /tmp/requirements.notorch.txt && \ - rm -rf /root/.cache/pip - -# Python env -ENV PYTHONPATH="/usr/local/lib/${PYTHON}/site-packages:/usr/lib/${PYTHON}/site-packages" -ENV PATH=/usr/bin:/usr/local/bin:$PATH ENV HF_HOME="/tmp/.cache" USER 65532:65532 diff --git a/Makefile b/Makefile index 19e722ee22..27736606fe 100644 --- a/Makefile +++ b/Makefile @@ -157,7 +157,7 @@ test-unit: test-unit-epp test-unit-sidecar ## Run unit tests .PHONY: test-unit-% test-unit-%: download-tokenizer install-python-deps check-dependencies ## Run unit tests @printf "\033[33;1m==== Running Unit Tests ====\033[0m\n" - @KV_CACHE_PKG=$$(go list -m -f '{{.Dir}}/pkg/preprocessing/chat_completions' github.com/llm-d/llm-d-kv-cache-manager 2>/dev/null || echo ""); \ + @KV_CACHE_PKG=$$(go list -m -f '{{.Dir}}/pkg/preprocessing/chat_completions' github.com/llm-d/llm-d-kv-cache 2>/dev/null || echo ""); \ PYTHONPATH="$$KV_CACHE_PKG:$(VENV_DIR)/lib/python$(PYTHON_VERSION)/site-packages" \ CGO_CFLAGS=${$*_CGO_CFLAGS} CGO_LDFLAGS=${$*_CGO_LDFLAGS} go test $($*_LDFLAGS) -v $$($($*_TEST_FILES) | tr '\n' ' ') @@ -169,7 +169,7 @@ test-filter: download-tokenizer install-python-deps check-dependencies ## Run fi fi @TEST_TYPE="$(if $(TYPE),$(TYPE),epp)"; \ printf "\033[33;1m==== Running Filtered Tests (pattern: $(PATTERN), type: $$TEST_TYPE) ====\033[0m\n"; \ - KV_CACHE_PKG=$$(go list -m -f '{{.Dir}}/pkg/preprocessing/chat_completions' github.com/llm-d/llm-d-kv-cache-manager 2>/dev/null || echo ""); \ + KV_CACHE_PKG=$$(go list -m -f '{{.Dir}}/pkg/preprocessing/chat_completions' github.com/llm-d/llm-d-kv-cache 2>/dev/null || 
echo ""); \ if [ "$$TEST_TYPE" = "epp" ]; then \ PYTHONPATH="$$KV_CACHE_PKG:$(VENV_DIR)/lib/python$(PYTHON_VERSION)/site-packages" \ CGO_CFLAGS=$(epp_CGO_CFLAGS) CGO_LDFLAGS=$(epp_CGO_LDFLAGS) \ diff --git a/Makefile.tools.mk b/Makefile.tools.mk index e750cd41b1..54eb912e9b 100644 --- a/Makefile.tools.mk +++ b/Makefile.tools.mk @@ -22,7 +22,7 @@ TYPOS_VERSION ?= v1.34.0 ## Python Configuration PYTHON_VERSION ?= 3.12 # Extract RELEASE_VERSION from Dockerfile -TOKENIZER_VERSION := $(shell grep '^ARG RELEASE_VERSION=' Dockerfile.epp | cut -d'=' -f2) +TOKENIZER_VERSION ?= $(shell grep '^ARG RELEASE_VERSION=' Dockerfile.epp | cut -d'=' -f2) # Python executable for creating venv PYTHON_EXE := $(shell command -v python$(PYTHON_VERSION) || command -v python3) @@ -151,33 +151,63 @@ $(TOKENIZER_LIB): | $(LOCALLIB) @ranlib $(LOCALLIB)/*.a @echo "Tokenizer bindings downloaded successfully." - -.PHONY: install-python-deps -install-python-deps: ## Sets up Python virtual environment and installs dependencies - @printf "\033[33;1m==== Setting up Python virtual environment in $(VENV_DIR) ====\033[0m\n" +.PHONY: detect-python +detect-python: ## Detects Python and prints the configuration. + @printf "\033[33;1m==== Python Configuration ====\033[0m\n" @if [ -z "$(PYTHON_EXE)" ]; then \ echo "ERROR: Python 3 not found in PATH."; \ exit 1; \ fi + @# Verify the version of the found python executable using its exit code + @if ! $(PYTHON_EXE) -c "import sys; sys.exit(0 if sys.version_info[:2] == ($(shell echo $(PYTHON_VERSION) | cut -d. -f1), $(shell echo $(PYTHON_VERSION) | cut -d. 
-f2)) else 1)"; then \ + echo "ERROR: Found Python at '$(PYTHON_EXE)' but it is not version $(PYTHON_VERSION)."; \ + echo "Please ensure 'python$(PYTHON_VERSION)' or a compatible 'python3' is in your PATH."; \ + exit 1; \ + fi + @echo "Python executable: $(PYTHON_EXE) ($$($(PYTHON_EXE) --version))" + @echo "Python CFLAGS: $(PYTHON_CFLAGS)" + @echo "Python LDFLAGS: $(PYTHON_LDFLAGS)" + @if [ -z "$(PYTHON_CFLAGS)" ]; then \ + echo "ERROR: Python development headers not found. See installation instructions above."; \ + exit 1; \ + fi + @printf "\033[33;1m==============================\033[0m\n" + +.PHONY: setup-venv +setup-venv: detect-python ## Sets up the Python virtual environment. + @printf "\033[33;1m==== Setting up Python virtual environment in $(VENV_DIR) ====\033[0m\n" @if [ ! -f "$(VENV_BIN)/pip" ]; then \ echo "Creating virtual environment..."; \ $(PYTHON_EXE) -m venv $(VENV_DIR) || { \ echo "ERROR: Failed to create virtual environment."; \ echo "Your Python installation may be missing the 'venv' module."; \ + echo "Try: 'sudo apt install python$(PYTHON_VERSION)-venv' or 'sudo dnf install python$(PYTHON_VERSION)-devel'"; \ exit 1; \ }; \ fi - @echo "Upgrading pip and installing dependencies..." - @$(VENV_BIN)/pip install --upgrade pip --quiet - @KV_CACHE_PKG=$$(go list -m -f '{{.Dir}}' github.com/llm-d/llm-d-kv-cache-manager 2>/dev/null); \ - if [ -n "$$KV_CACHE_PKG" ] && [ -f "$$KV_CACHE_PKG/pkg/preprocessing/chat_completions/requirements.txt" ]; then \ - echo "Installing Python dependencies from kv-cache-manager..."; \ - $(VENV_BIN)/pip install --quiet -r "$$KV_CACHE_PKG/pkg/preprocessing/chat_completions/requirements.txt"; \ + @echo "Upgrading pip..." + @$(VENV_BIN)/pip install --upgrade pip + @echo "Python virtual environment setup complete." + +.PHONY: install-python-deps +install-python-deps: setup-venv ## Installs Python dependencies (vllm) into the virtual environment. + @printf "\033[33;1m==== Setting up Python virtual environment in $(VENV_DIR) ====\033[0m\n" + @echo "install vllm..."
+ @KV_CACHE_PKG=$${KV_CACHE_PKG:-$$(go list -m -f '{{.Dir}}' github.com/llm-d/llm-d-kv-cache 2>/dev/null)}; \ + if [ -n "$$KV_CACHE_PKG" ] && [ -f "$$KV_CACHE_PKG/pkg/preprocessing/chat_completions/setup.sh" ]; then \ + echo "Running kv-cache setup script..."; \ + cp "$$KV_CACHE_PKG/pkg/preprocessing/chat_completions/setup.sh" build/kv-cache-setup.sh; \ + chmod +x build/kv-cache-setup.sh; \ + cd build && PATH=$(VENV_BIN):$$PATH ./kv-cache-setup.sh && cd ..; \ else \ - echo "WARNING: Could not find kv-cache-manager requirements.txt, installing minimal deps..."; \ - $(VENV_BIN)/pip install --quiet 'transformers>=4.53.0' 'jinja2>=2.11'; \ + echo "ERROR: kv-cache package not found or setup script missing."; \ + exit 1; \ fi - @echo "✅ Python dependencies installed in venv" + @echo "Verifying vllm installation..." + @$(VENV_BIN)/python -c "import vllm; print('✅ vllm version ' + vllm.__version__ + ' installed.')" || { \ + echo "ERROR: vllm library not properly installed in venv."; \ + exit 1; \ + } .PHONY: check-tools check-tools: check-go check-ginkgo check-golangci-lint check-kustomize check-envsubst check-container-tool check-kubectl check-buildah check-typos ## Check that all required tools are installed diff --git a/deploy/config/epp-precise-prefix-cache-config.yaml b/deploy/config/epp-precise-prefix-cache-config.yaml index 1575052781..39b0bb2850 100644 --- a/deploy/config/epp-precise-prefix-cache-config.yaml +++ b/deploy/config/epp-precise-prefix-cache-config.yaml @@ -7,10 +7,10 @@ plugins: - type: decode-filter - type: precise-prefix-cache-scorer parameters: + tokenProcessorConfig: + blockSize: 64 # must match vLLM block size + hashSeed: "42" # must match vLLM PYTHONHASHSEED env var indexerConfig: - tokenProcessorConfig: - blockSize: 64 # must match vLLM block size - hashSeed: "42" # must match vLLM PYTHONHASHSEED env var kvBlockIndexConfig: enableMetrics: true # enable kv-block index metrics (prometheus) - type: kv-cache-utilization-scorer diff --git 
a/deploy/config/sim-epp-kvcache-config.yaml b/deploy/config/sim-epp-kvcache-config.yaml index 566e92437a..7850950ef7 100644 --- a/deploy/config/sim-epp-kvcache-config.yaml +++ b/deploy/config/sim-epp-kvcache-config.yaml @@ -6,15 +6,16 @@ plugins: - type: prefix-cache-scorer parameters: mode: cache_tracking + tokenProcessorConfig: + blockSize: 16 # must match vLLM block size if not default (16) + hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods kvEventsConfig: zmqEndpoint: tcp://0.0.0.0:5557 indexerConfig: prefixStoreConfig: blockSize: 16 - tokenProcessorConfig: - blockSize: 16 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods tokenizersPoolConfig: + modelName: # specify the model name to use for tokenizer loading hf: tokenizersCacheDir: "/cache/tokenizers" kvBlockIndexConfig: diff --git a/docs/architecture.md b/docs/architecture.md index b3215815c5..c51a67293b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -308,12 +308,14 @@ Configuration: - **Type**: `precise-prefix-cache-scorer` - **Parameters**: + - `tokenProcessorConfig`: Configuration for the `kvblock.TokenProcessor`. - `indexerConfig`: Configuration for the `kvcache.Indexer`. - `kvEventsConfig`: Configuration for the `kvevents.Pool`. See list of parameters at [llm-d-kv-cache/docs/configuration.md](https://github.com/llm-d/llm-d-kv-cache/blob/fa85b60207ba0a09daf23071e10ccb62d7977b40/docs/configuration.md). Note that in most cases you will only need to set: +- Model name in the `tokenizersPoolConfig` to match the model used in the vLLM deployment. - HuggingFace token for the `tokenizersPoolConfig` or the `tokenizersCacheDir` to a mounted directory containing the tokenizers. - For the HuggingFace token, the inference-scheduler also accepts the environment variable `HF_TOKEN` - this is the practical option for security. - **IMPORTANT**: Token processor's block-size and hash-seed to match those used in the vLLM deployment. 
@@ -325,15 +327,16 @@ Example configuration with the above parameters set: plugins: - type: precise-prefix-cache-scorer parameters: + tokenProcessorConfig: + blockSize: 64 # must match vLLM block size + hashSeed: "12345" # must match vLLM PYTHONHASHSEED env var indexerConfig: - tokenProcessorConfig: - blockSize: 64 - hashSeed: "12345" - tokenizersPoolConfig: - hf: - huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable - kvBlockIndexConfig: - enableMetrics: true + kvBlockIndexConfig: + enableMetrics: true + tokenizersPoolConfig: + modelName: hf-repo/model-name + hf: + huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable ``` Example configuration with all parameters set: @@ -342,23 +345,24 @@ Example configuration with all parameters set: plugins: - type: precise-prefix-cache-scorer parameters: + tokenProcessorConfig: + blockSize: 16 + hashSeed: "12345" kvEventsConfig: zmqEndpoint: tcp://*:5557 topicFilter: kv@ concurrency: 8 - kvCacheIndexerConfig: + indexerConfig: prefixStoreConfig: cacheSize: 500000 blockSize: 256 - tokenProcessorConfig: - blockSize: 16 - hashSeed: "12345" kvBlockIndexConfig: inMemoryConfig: size: 100000000 podCacheSize: 10 enableMetrics: true tokenizersPoolConfig: + modelName: hf-repo/model-name workersCount: 8 hf: huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable diff --git a/go.mod b/go.mod index 24f0e3d063..2d709788fa 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/jellydator/ttlcache/v3 v3.4.0 - github.com/llm-d/llm-d-kv-cache-manager v0.4.0 + github.com/llm-d/llm-d-kv-cache v0.4.1-0.20260120091923-2d261e30d383 github.com/onsi/ginkgo/v2 v2.27.5 github.com/onsi/gomega v1.39.0 github.com/openai/openai-go v1.12.0 diff --git a/go.sum b/go.sum index 814a9e50d3..9e559e42e8 100644 --- a/go.sum +++ b/go.sum @@ -183,8 +183,8 @@ 
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/llm-d/llm-d-kv-cache-manager v0.4.0 h1:MBWVpDW0PWsqNJEEAW1esrJW+Xavb0a7w14tCJWWyRY= -github.com/llm-d/llm-d-kv-cache-manager v0.4.0/go.mod h1:ZlK7MCuz5D/weLeHyNKEmVF/eJZDyYn3XyRowTihq9o= +github.com/llm-d/llm-d-kv-cache v0.4.1-0.20260120091923-2d261e30d383 h1:cHA1+Qe27oYDsWTmNToiidMBty6bxfuCUXvyhGPJ2FM= +github.com/llm-d/llm-d-kv-cache v0.4.1-0.20260120091923-2d261e30d383/go.mod h1:XyhzHBYeOWamBMPkuRySB5nJ0zzQpK/mbuXKqJRFT6A= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= diff --git a/pkg/plugins/scorer/precise_prefix_cache.go b/pkg/plugins/scorer/precise_prefix_cache.go index 2ce6551c0f..000d9845d1 100644 --- a/pkg/plugins/scorer/precise_prefix_cache.go +++ b/pkg/plugins/scorer/precise_prefix_cache.go @@ -7,9 +7,10 @@ import ( "fmt" "os" - "github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache" - "github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvevents" - preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions" + "github.com/llm-d/llm-d-kv-cache/pkg/kvcache" + "github.com/llm-d/llm-d-kv-cache/pkg/kvcache/kvblock" + "github.com/llm-d/llm-d-kv-cache/pkg/kvevents" + preprocessing "github.com/llm-d/llm-d-kv-cache/pkg/preprocessing/chat_completions" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" @@ -26,6 +27,9 @@ const ( // PrecisePrefixCachePluginConfig holds the 
configuration for the // PrecisePrefixCacheScorer plugin. type PrecisePrefixCachePluginConfig struct { + // TokenProcessorConfig holds the configuration for the `kvblock.TokenProcessor` which is + // used to process tokens into KV-block keys. + TokenProcessorConfig *kvblock.TokenProcessorConfig `json:"tokenProcessorConfig"` // IndexerConfig holds the configuration for the `kvcache.Indexer` which is // used to score pods based on the KV-cache index state. IndexerConfig *kvcache.Config `json:"indexerConfig"` @@ -53,18 +57,24 @@ func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage, KVEventsConfig: kvevents.DefaultConfig(), } - // read hugging face token from environment variable if set + if rawParameters != nil { + if err := json.Unmarshal(rawParameters, &parameters); err != nil { + return nil, fmt.Errorf("failed to parse %s plugin config: %w", PrecisePrefixCachePluginType, err) + } + } + + // Apply HF token from environment if not already set if token := os.Getenv("HF_TOKEN"); token != "" && parameters.IndexerConfig != nil && parameters.IndexerConfig.TokenizersPoolConfig != nil && - parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig != nil { + parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig != nil && + parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig.HuggingFaceToken == "" { parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig.HuggingFaceToken = token } - if rawParameters != nil { - if err := json.Unmarshal(rawParameters, &parameters); err != nil { - return nil, fmt.Errorf("failed to parse %s plugin config: %w", PrecisePrefixCachePluginType, err) - } + // Validate model name is set + if parameters.IndexerConfig == nil || parameters.IndexerConfig.TokenizersPoolConfig == nil || parameters.IndexerConfig.TokenizersPoolConfig.ModelName == "" { + return nil, errors.New("modelName is required in indexerConfig.tokenizersPoolConfig") } scorer, err := New(handle.Context(), parameters) @@ -85,8 +95,14 @@
func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage, // If the configuration is invalid or if the indexer fails to initialize, // an error is returned. func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePrefixCacheScorer, error) { + if config.TokenProcessorConfig == nil { + config.TokenProcessorConfig = kvblock.DefaultTokenProcessorConfig() + } + + tokenProcessor := kvblock.NewChunkedTokenDatabase(config.TokenProcessorConfig) + // initialize the indexer - kvCacheIndexer, err := kvcache.NewKVCacheIndexer(ctx, config.IndexerConfig) + kvCacheIndexer, err := kvcache.NewKVCacheIndexer(ctx, config.IndexerConfig, tokenProcessor) if err != nil { return nil, fmt.Errorf("failed to create `kvcache.Indexer`: %w", err) } @@ -94,7 +110,7 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr go kvCacheIndexer.Run(ctx) // initialize the KV-events pool - pool := kvevents.NewPool(config.KVEventsConfig, kvCacheIndexer.KVBlockIndex()) + pool := kvevents.NewPool(config.KVEventsConfig, kvCacheIndexer.KVBlockIndex(), tokenProcessor) pool.Start(ctx) return &PrecisePrefixCacheScorer{ @@ -186,8 +202,17 @@ func (s *PrecisePrefixCacheScorer) getScores(ctx context.Context, request *types traceLogger.Info("Both chat/completions and completions present; defaulting to chat/completions") } - renderReq := &preprocessing.RenderJinjaTemplateRequest{ - Conversations: make([]preprocessing.ChatMessage, 0), + // Convert messages to conversation format + conversations := make([]preprocessing.Conversation, len(request.Body.ChatCompletions.Messages)) + for i, msg := range request.Body.ChatCompletions.Messages { + conversations[i] = preprocessing.Conversation{ + Role: msg.Role, + Content: msg.Content.Raw, + } + } + + renderReq := &preprocessing.ApplyChatTemplateRequest{ + Conversation: [][]preprocessing.Conversation{conversations}, Tools: request.Body.ChatCompletions.Tools, Documents: request.Body.ChatCompletions.Documents, 
ChatTemplate: request.Body.ChatCompletions.ChatTemplate, @@ -197,16 +222,8 @@ func (s *PrecisePrefixCacheScorer) getScores(ctx context.Context, request *types ChatTemplateKWArgs: request.Body.ChatCompletions.ChatTemplateKWArgs, } - // Convert messages to the format expected by the renderer - for _, msg := range request.Body.ChatCompletions.Messages { - renderReq.Conversations = append(renderReq.Conversations, preprocessing.ChatMessage{ - Role: msg.Role, - Content: msg.Content.Raw, - }) - } - traceLogger.Info("Processing chat completion request", - "messagesCount", len(renderReq.Conversations), + "messagesCount", len(conversations), "toolsCount", len(renderReq.Tools), "documentsCount", len(renderReq.Documents)) diff --git a/pkg/plugins/scorer/precise_prefix_cache_test.go b/pkg/plugins/scorer/precise_prefix_cache_test.go index eb7284b95c..1a8bf9eec1 100644 --- a/pkg/plugins/scorer/precise_prefix_cache_test.go +++ b/pkg/plugins/scorer/precise_prefix_cache_test.go @@ -6,11 +6,11 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache" - "github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock" - "github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvevents" - preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions" - "github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization" + "github.com/llm-d/llm-d-kv-cache/pkg/kvcache" + "github.com/llm-d/llm-d-kv-cache/pkg/kvcache/kvblock" + "github.com/llm-d/llm-d-kv-cache/pkg/kvevents" + preprocessing "github.com/llm-d/llm-d-kv-cache/pkg/preprocessing/chat_completions" + "github.com/llm-d/llm-d-kv-cache/pkg/tokenization" "github.com/stretchr/testify/require" k8stypes "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" @@ -39,7 +39,7 @@ func TestPrefixCacheTracking_Score(t *testing.T) { name string pods []types.Pod request *types.LLMRequest - kvBlockData func(req *types.LLMRequestBody, model string) 
map[kvblock.Key][]kvblock.PodEntry + kvBlockData func(req *types.LLMRequestBody, model string) map[kvblock.BlockHash][]kvblock.PodEntry wantScoresByAddress map[string]float64 }{ { @@ -111,20 +111,20 @@ func TestPrefixCacheTracking_Score(t *testing.T) { }, }, }, - kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.Key][]kvblock.PodEntry { + kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.BlockHash][]kvblock.PodEntry { require.NotNil(t, req.Completions, "req expected to use Completions API") prompt := req.Completions.Prompt - testTokenizer, err := tokenization.NewCachedLocalTokenizer(localTokenizerConfig) + testTokenizer, err := tokenization.NewCachedLocalTokenizer(t.Context(), model, localTokenizerConfig) require.NoError(t, err) // use the actual tokenizer on the test prompt - tokens, _, err := testTokenizer.Encode(prompt, model) + tokens, _, err := testTokenizer.Encode(prompt, model, true) require.NoError(t, err) // compute chunk hashes using the default block size tokenProcessor := kvblock.NewChunkedTokenDatabase(kvblock.DefaultTokenProcessorConfig()) - chunkKeys := tokenProcessor.TokensToKVBlockKeys(tokens, model) + chunkKeys := tokenProcessor.TokensToKVBlockKeys(kvblock.EmptyBlockHash, tokens, model) require.GreaterOrEqual(t, len(chunkKeys), 3, "Need at least 3 chunks for test") @@ -138,17 +138,17 @@ func TestPrefixCacheTracking_Score(t *testing.T) { // pod-c: 1 chunk (0) -> score 1 // Normalized: (3-1)/(3-1) = 1.0, (2-1)/(3-1) = 0.5, (1-1)/(3-1) = 0.0 - return map[kvblock.Key][]kvblock.PodEntry{ - {ModelName: model, ChunkHash: chunkKeys[0].ChunkHash}: { + return map[kvblock.BlockHash][]kvblock.PodEntry{ + chunkKeys[0]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, {PodIdentifier: "10.0.0.3:8080"}, }, - {ModelName: model, ChunkHash: chunkKeys[1].ChunkHash}: { + chunkKeys[1]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, }, - {ModelName: model, ChunkHash: 
chunkKeys[2].ChunkHash}: { + chunkKeys[2]: { {PodIdentifier: "10.0.0.1:8080"}, }, } @@ -187,7 +187,7 @@ func TestPrefixCacheTracking_Score(t *testing.T) { Body: &types.LLMRequestBody{ ChatCompletions: &types.ChatCompletionsRequest{ ChatTemplate: `{% for message in messages %}{{ message.role }}: {{ message.content }} -{% endfor %}`, + {% endfor %}`, Messages: []types.Message{ { Role: "user", @@ -205,46 +205,53 @@ func TestPrefixCacheTracking_Score(t *testing.T) { }, }, }, - kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.Key][]kvblock.PodEntry { + kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.BlockHash][]kvblock.PodEntry { require.NotNil(t, req.ChatCompletions, "req expected to use ChatCompletions API") // convert to preprocessing format - var chatMessages []preprocessing.ChatMessage + var conversations []preprocessing.Conversation for _, msg := range req.ChatCompletions.Messages { - chatMessages = append(chatMessages, preprocessing.ChatMessage{ + conversations = append(conversations, preprocessing.Conversation{ Role: msg.Role, Content: msg.Content.Raw, }) } + processor := preprocessing.NewChatTemplatingProcessor() + tokenizerCacheKey, err := processor.GetOrCreateTokenizerKey(t.Context(), &preprocessing.GetOrCreateTokenizerKeyRequest{ + IsLocal: true, + Model: "testdata/" + model, + }) + require.NoError(t, err) + // render the chat template - renderReq := &preprocessing.RenderJinjaTemplateRequest{ - Conversations: chatMessages, - ChatTemplate: req.ChatCompletions.ChatTemplate, + renderReq := &preprocessing.ApplyChatTemplateRequest{ + Key: tokenizerCacheKey, + Conversation: [][]preprocessing.Conversation{conversations}, + ChatTemplate: req.ChatCompletions.ChatTemplate, } - processor := preprocessing.NewChatTemplatingProcessor() - rendered, err := processor.RenderChatTemplate(t.Context(), renderReq) + rendered, err := processor.ApplyChatTemplate(t.Context(), renderReq) require.NoError(t, err) // tokenize rendered prompt 
- testTokenizer, err := tokenization.NewCachedLocalTokenizer(localTokenizerConfig) + testTokenizer, err := tokenization.NewCachedLocalTokenizer(t.Context(), model, localTokenizerConfig) require.NoError(t, err) - tokens, _, err := testTokenizer.Encode(rendered.RenderedChats[0], model) + tokens, _, err := testTokenizer.Encode(rendered, model, false) require.NoError(t, err) tokenProcessor := kvblock.NewChunkedTokenDatabase(kvblock.DefaultTokenProcessorConfig()) - chunkKeys := tokenProcessor.TokensToKVBlockKeys(tokens, model) + chunkKeys := tokenProcessor.TokensToKVBlockKeys(kvblock.EmptyBlockHash, tokens, model) require.GreaterOrEqual(t, len(chunkKeys), 2, "Need at least 2 chunks for test") // pod-a has both chunks, pod-b has only the first - return map[kvblock.Key][]kvblock.PodEntry{ - {ModelName: model, ChunkHash: chunkKeys[0].ChunkHash}: { + return map[kvblock.BlockHash][]kvblock.PodEntry{ + chunkKeys[0]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, }, - {ModelName: model, ChunkHash: chunkKeys[1].ChunkHash}: { + chunkKeys[1]: { {PodIdentifier: "10.0.0.1:8080"}, }, } @@ -294,17 +301,17 @@ func TestPrefixCacheTracking_Score(t *testing.T) { }, }, }, - kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.Key][]kvblock.PodEntry { + kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.BlockHash][]kvblock.PodEntry { require.NotNil(t, req.Completions, "req expected to use Completions API") - testTokenizer, err := tokenization.NewCachedLocalTokenizer(localTokenizerConfig) + testTokenizer, err := tokenization.NewCachedLocalTokenizer(t.Context(), model, localTokenizerConfig) require.NoError(t, err) - tokens, _, err := testTokenizer.Encode(req.Completions.Prompt, model) + tokens, _, err := testTokenizer.Encode(req.Completions.Prompt, model, true) require.NoError(t, err) tokenProcessor := kvblock.NewChunkedTokenDatabase(kvblock.DefaultTokenProcessorConfig()) - chunkKeys := tokenProcessor.TokensToKVBlockKeys(tokens, 
model) + chunkKeys := tokenProcessor.TokensToKVBlockKeys(kvblock.EmptyBlockHash, tokens, model) require.GreaterOrEqual(t, len(chunkKeys), 3, "Need at least 3 chunks for test") @@ -317,16 +324,16 @@ func TestPrefixCacheTracking_Score(t *testing.T) { // pod-a: has chunks 0,1,2 contiguously -> score 3 // pod-b: has chunks 0,2 (missing 1) -> prefix stops at chunk0 -> score 1 // pod-c: has only chunk 0 -> score 1 - return map[kvblock.Key][]kvblock.PodEntry{ - {ModelName: model, ChunkHash: chunkKeys[0].ChunkHash}: { + return map[kvblock.BlockHash][]kvblock.PodEntry{ + chunkKeys[0]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, {PodIdentifier: "10.0.0.3:8080"}, }, - {ModelName: model, ChunkHash: chunkKeys[1].ChunkHash}: { + chunkKeys[1]: { {PodIdentifier: "10.0.0.1:8080"}, // only pod-a has chunk1 }, - {ModelName: model, ChunkHash: chunkKeys[2].ChunkHash}: { + chunkKeys[2]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, // pod-b has chunk2 but missing chunk1 }, @@ -341,62 +348,6 @@ func TestPrefixCacheTracking_Score(t *testing.T) { "10.0.0.3:8080": 0.0, }, }, - { - name: "different model names", - pods: []types.Pod{ - &types.PodMetrics{ - Pod: &backend.Pod{ - NamespacedName: k8stypes.NamespacedName{Name: "pod-a"}, - Address: "10.0.0.1:8080", - }, - }, - &types.PodMetrics{ - Pod: &backend.Pod{ - NamespacedName: k8stypes.NamespacedName{Name: "pod-b"}, - Address: "10.0.0.2:8080", - }, - }, - }, - request: &types.LLMRequest{ - RequestId: "test-request", - TargetModel: "test-model", - Body: &types.LLMRequestBody{ - Completions: &types.CompletionsRequest{ - Prompt: prompt, - }, - }, - }, - kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.Key][]kvblock.PodEntry { - require.NotNil(t, req.Completions, "req expected to use Completions API") - - testTokenizer, err := tokenization.NewCachedLocalTokenizer(localTokenizerConfig) - require.NoError(t, err) - - tokens, _, err := 
testTokenizer.Encode(req.Completions.Prompt, model) - require.NoError(t, err) - - tokenProcessor := kvblock.NewChunkedTokenDatabase(kvblock.DefaultTokenProcessorConfig()) - chunkKeys := tokenProcessor.TokensToKVBlockKeys(tokens, model) - - require.GreaterOrEqual(t, len(chunkKeys), 1, "Need at least 1 chunk for test") - - // Populate the index with blocks for model `different-model` - // The request will ask for "test-model" but the cache only has "different-model" - // This should result in no cache hits since models don't share cache - return map[kvblock.Key][]kvblock.PodEntry{ - {ModelName: "different-model", ChunkHash: chunkKeys[0].ChunkHash}: { - {PodIdentifier: "10.0.0.1:8080"}, - {PodIdentifier: "10.0.0.2:8080"}, - }, - } - }, - wantScoresByAddress: map[string]float64{ - // Even though both pods have the chunk cached, it's for a different model - // so there should be no cache hits for the requested model - "10.0.0.1:8080": 0.0, - "10.0.0.2:8080": 0.0, - }, - }, { name: "single pod", pods: []types.Pod{ @@ -419,26 +370,26 @@ func TestPrefixCacheTracking_Score(t *testing.T) { }, }, }, - kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.Key][]kvblock.PodEntry { + kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.BlockHash][]kvblock.PodEntry { require.NotNil(t, req.Completions, "req expected to use Completions API") - testTokenizer, err := tokenization.NewCachedLocalTokenizer(localTokenizerConfig) + testTokenizer, err := tokenization.NewCachedLocalTokenizer(t.Context(), model, localTokenizerConfig) require.NoError(t, err) - tokens, _, err := testTokenizer.Encode(req.Completions.Prompt, model) + tokens, _, err := testTokenizer.Encode(req.Completions.Prompt, model, true) require.NoError(t, err) tokenProcessor := kvblock.NewChunkedTokenDatabase(kvblock.DefaultTokenProcessorConfig()) - chunkKeys := tokenProcessor.TokensToKVBlockKeys(tokens, model) + chunkKeys := tokenProcessor.TokensToKVBlockKeys(kvblock.EmptyBlockHash, 
tokens, model) require.GreaterOrEqual(t, len(chunkKeys), 2, "Need at least 2 chunks for test") // Single pod has 2 chunks cached - return map[kvblock.Key][]kvblock.PodEntry{ - {ModelName: model, ChunkHash: chunkKeys[0].ChunkHash}: { + return map[kvblock.BlockHash][]kvblock.PodEntry{ + chunkKeys[0]: { {PodIdentifier: "10.0.0.1:8080"}, }, - {ModelName: model, ChunkHash: chunkKeys[1].ChunkHash}: { + chunkKeys[1]: { {PodIdentifier: "10.0.0.1:8080"}, }, } @@ -518,28 +469,28 @@ func TestPrefixCacheTracking_Score(t *testing.T) { }, }, }, - kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.Key][]kvblock.PodEntry { + kvBlockData: func(req *types.LLMRequestBody, model string) map[kvblock.BlockHash][]kvblock.PodEntry { require.NotNil(t, req.Completions, "req expected to use Completions API") - testTokenizer, err := tokenization.NewCachedLocalTokenizer(localTokenizerConfig) + testTokenizer, err := tokenization.NewCachedLocalTokenizer(t.Context(), model, localTokenizerConfig) require.NoError(t, err) - tokens, _, err := testTokenizer.Encode(req.Completions.Prompt, model) + tokens, _, err := testTokenizer.Encode(req.Completions.Prompt, model, true) require.NoError(t, err) tokenProcessor := kvblock.NewChunkedTokenDatabase(kvblock.DefaultTokenProcessorConfig()) - chunkKeys := tokenProcessor.TokensToKVBlockKeys(tokens, model) + chunkKeys := tokenProcessor.TokensToKVBlockKeys(kvblock.EmptyBlockHash, tokens, model) require.GreaterOrEqual(t, len(chunkKeys), 2, "Need at least 2 chunks for test") // all pods have the same 2 chunks cached - return map[kvblock.Key][]kvblock.PodEntry{ - {ModelName: model, ChunkHash: chunkKeys[0].ChunkHash}: { + return map[kvblock.BlockHash][]kvblock.PodEntry{ + chunkKeys[0]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, {PodIdentifier: "10.0.0.3:8080"}, }, - {ModelName: model, ChunkHash: chunkKeys[1].ChunkHash}: { + chunkKeys[1]: { {PodIdentifier: "10.0.0.1:8080"}, {PodIdentifier: "10.0.0.2:8080"}, 
{PodIdentifier: "10.0.0.3:8080"}, @@ -562,6 +513,7 @@ func TestPrefixCacheTracking_Score(t *testing.T) { kvcacheConfig, err := kvcache.NewDefaultConfig() kvcacheConfig.TokenizersPoolConfig = &tokenization.Config{ + ModelName: "test-model", WorkersCount: 1, MinPrefixOverlapRatio: 0.8, LocalTokenizerConfig: &localTokenizerConfig, @@ -580,7 +532,7 @@ func TestPrefixCacheTracking_Score(t *testing.T) { kvBlockIndex := prefixCacheScorer.kvCacheIndexer.KVBlockIndex() blockData := tt.kvBlockData(tt.request.Body, tt.request.TargetModel) for key, entries := range blockData { - err := kvBlockIndex.Add(ctx, []kvblock.Key{key}, entries) + err := kvBlockIndex.Add(ctx, []kvblock.BlockHash{kvblock.EmptyBlockHash}, []kvblock.BlockHash{key}, entries) require.NoError(t, err) } } diff --git a/scripts/fetch-python-wrapper.sh b/scripts/fetch-python-wrapper.sh deleted file mode 100755 index 0c8bc1d681..0000000000 --- a/scripts/fetch-python-wrapper.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash -# fetch-python-wrapper.sh -# Fetches the Python wrapper file (render_jinja_template_wrapper.py) from llm-d-kv-cache-manager -# for use in Docker builds and local development. -# Version can be provided as CLI arg or via KVCACHE_MANAGER_VERSION env var (default v0.3.2). -# -# This script replicates the original Dockerfile logic: -# 1. Creates a temporary directory -# 2. Clones the repo into that directory -# 3. Creates the output directory structure -# 4. Copies the wrapper file to the output location -# 5. 
Cleans up the temporary directory - -set -euo pipefail - -VERSION="${1:-${KVCACHE_MANAGER_VERSION:-v0.3.2}}" -OUTPUT_DIR="${2:-llm-d-kv-cache-manager/pkg/preprocessing/chat_completions}" - -REPO_URL="https://github.com/llm-d/llm-d-kv-cache-manager.git" -WRAPPER_FILE="pkg/preprocessing/chat_completions/render_jinja_template_wrapper.py" - -# Create temporary directory (equivalent to: mkdir -p /tmp/kv-cache-manager) -# TEMP_DIR will be an absolute path like /tmp/tmp.XXXXXX -TEMP_DIR=$(mktemp -d) -trap "rm -rf ${TEMP_DIR}" EXIT - -echo "Fetching Python wrapper from llm-d-kv-cache-manager@${VERSION}..." - -# Equivalent to: cd /tmp/kv-cache-manager && git clone ... . -# (clones repo contents directly into TEMP_DIR - using absolute path, no need to cd) -git clone --depth 1 --branch "${VERSION}" "${REPO_URL}" "${TEMP_DIR}" - -# Create output directory if it doesn't exist -# (equivalent to: mkdir -p /workspace/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions) -# OUTPUT_DIR is relative to current working directory (relative path, same as original) -mkdir -p "${OUTPUT_DIR}" - -# Copy wrapper file -# Source: absolute path ${TEMP_DIR}/${WRAPPER_FILE} (e.g., /tmp/tmp.XXXXXX/pkg/.../wrapper.py) -# Destination: relative path ${OUTPUT_DIR}/ (e.g., llm-d-kv-cache-manager/pkg/.../) -# (equivalent to original: cp pkg/.../wrapper.py /workspace/... 
from within temp dir) -cp "${TEMP_DIR}/${WRAPPER_FILE}" "${OUTPUT_DIR}/" - -# Cleanup happens automatically via trap (equivalent to: rm -rf /tmp/kv-cache-manager) - -echo "Successfully fetched render_jinja_template_wrapper.py to ${OUTPUT_DIR}/" - diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 6cd8c4456a..215fe86a4a 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,7 +24,7 @@ if [[ -z "${HF_TOKEN:-}" ]]; then exit 1 fi -export VLLM_CHART_DIR="${VLLM_CHART_DIR:-../llm-d-kv-cache-manager/vllm-setup-helm}" +export VLLM_CHART_DIR="${VLLM_CHART_DIR:-../llm-d-kv-cache/vllm-setup-helm}" # Check that Chart.yaml exists if [[ ! -f "$VLLM_CHART_DIR/Chart.yaml" ]]; then echo "Chart.yaml not found in $VLLM_CHART_DIR" diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index d23b20a58d..027a496867 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -487,15 +487,16 @@ kind: EndpointPickerConfig plugins: - type: precise-prefix-cache-scorer parameters: + tokenProcessorConfig: + blockSize: 16 + hashSeed: "42" kvEventsConfig: zmqEndpoint: tcp://0.0.0.0:5557 indexerConfig: prefixStoreConfig: blockSize: 16 - tokenProcessorConfig: - blockSize: 16 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods tokenizersPoolConfig: + modelName: Qwen/Qwen2.5-1.5B-Instruct hf: tokenizersCacheDir: "/cache/tokenizers" kvBlockIndexConfig: