From 1d55530558bd23866cd65deeb079856992755220 Mon Sep 17 00:00:00 2001 From: Wen Zhou Date: Wed, 21 Jan 2026 14:59:11 +0100 Subject: [PATCH 1/2] feat: use TinyLlama as the "model" for kind test - in order to test precise-prefix-cache-scorer we cannot use food-review since it needs to call kv-cache-manager to get the tokenizer by fetching a real model from HF - the change is to switch the "default model" to TinyLlama - also, to make the tokenizer folder writable, we need to change its ownership to the USER in the Dockerfile - rename dp-epp-config.yaml to sim-dp-epp-config.yaml as it is used for local test Signed-off-by: Wen Zhou --- Dockerfile.epp | 5 ++++- deploy/config/epp-config.yaml | 4 ++-- .../{dp-epp-config.yaml => sim-dp-epp-config.yaml} | 10 ++++++++-- deploy/config/sim-epp-config.yaml | 12 +++++++----- deploy/config/sim-epp-kvcache-config.yaml | 6 +++--- deploy/config/sim-epp-no-hit-lru.yaml | 12 +++++++----- deploy/config/sim-pd-epp-config.yaml | 14 ++++++++------ scripts/kind-dev-env.sh | 4 ++-- 8 files changed, 41 insertions(+), 26 deletions(-) rename deploy/config/{dp-epp-config.yaml => sim-dp-epp-config.yaml} (63%) diff --git a/Dockerfile.epp b/Dockerfile.epp index 9d1e89fdb5..4996c6fbec 100644 --- a/Dockerfile.epp +++ b/Dockerfile.epp @@ -102,6 +102,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace RUN ${PYTHON} -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed ENV HF_HOME="/tmp/.cache" +# used by kv-cache-manager +ENV LOCAL_TOKENIZER_DIR="/tmp/.cache" +# Create cache directory and set permissions for non-root user +RUN mkdir -p /tmp/.cache && chown -R 65532:65532 ${HF_HOME} USER 65532:65532 @@ -113,4 +117,3 @@ EXPOSE 9090 EXPOSE 5557 ENTRYPOINT ["/app/epp"] - diff --git a/deploy/config/epp-config.yaml b/deploy/config/epp-config.yaml index afb1f701fc..d8bf748dd4 100644 --- a/deploy/config/epp-config.yaml +++ b/deploy/config/epp-config.yaml @@ -3,7 +3,7 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: 
EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer - type: decode-filter - type: max-score-picker - type: single-profile-handler @@ -12,5 +12,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 diff --git a/deploy/config/dp-epp-config.yaml b/deploy/config/sim-dp-epp-config.yaml similarity index 63% rename from deploy/config/dp-epp-config.yaml rename to deploy/config/sim-dp-epp-config.yaml index 703a44f676..6e84188661 100644 --- a/deploy/config/dp-epp-config.yaml +++ b/deploy/config/sim-dp-epp-config.yaml @@ -3,7 +3,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer + parameters: + indexerConfig: + tokenProcessorConfig: + blockSize: 5 + kvBlockIndexConfig: + maxPrefixBlocksToMatch: 256 - type: decode-filter - type: max-score-picker - type: data-parallel-profile-handler @@ -14,5 +20,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 diff --git a/deploy/config/sim-epp-config.yaml b/deploy/config/sim-epp-config.yaml index 18e2a25410..8540c53960 100644 --- a/deploy/config/sim-epp-config.yaml +++ b/deploy/config/sim-epp-config.yaml @@ -3,11 +3,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 + indexerConfig: + tokenProcessorConfig: + blockSize: 5 + kvBlockIndexConfig: + maxPrefixBlocksToMatch: 256 - type: decode-filter - type: max-score-picker - type: single-profile-handler @@ -16,5 +18,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: 
max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 diff --git a/deploy/config/sim-epp-kvcache-config.yaml b/deploy/config/sim-epp-kvcache-config.yaml index 7850950ef7..76aab070f7 100644 --- a/deploy/config/sim-epp-kvcache-config.yaml +++ b/deploy/config/sim-epp-kvcache-config.yaml @@ -3,7 +3,7 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer parameters: mode: cache_tracking tokenProcessorConfig: @@ -15,7 +15,7 @@ plugins: prefixStoreConfig: blockSize: 16 tokenizersPoolConfig: - modelName: # specify the model name to use for tokenizer loading + modelName: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # replace value to use different model for tokenizer loading hf: tokenizersCacheDir: "/cache/tokenizers" kvBlockIndexConfig: @@ -29,5 +29,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 10 diff --git a/deploy/config/sim-epp-no-hit-lru.yaml b/deploy/config/sim-epp-no-hit-lru.yaml index 8d02244114..e10ec50629 100644 --- a/deploy/config/sim-epp-no-hit-lru.yaml +++ b/deploy/config/sim-epp-no-hit-lru.yaml @@ -3,11 +3,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 + indexerConfig: + tokenProcessorConfig: + blockSize: 5 + kvBlockIndexConfig: + maxPrefixBlocksToMatch: 256 - type: no-hit-lru-scorer parameters: lruSize: 2048 @@ -19,7 +21,7 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 - pluginRef: no-hit-lru-scorer weight: 1 diff --git a/deploy/config/sim-pd-epp-config.yaml 
b/deploy/config/sim-pd-epp-config.yaml index 2d6a85dd9e..da5ad08808 100644 --- a/deploy/config/sim-pd-epp-config.yaml +++ b/deploy/config/sim-pd-epp-config.yaml @@ -4,11 +4,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: - type: prefill-header-handler -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 + indexerConfig: + tokenProcessorConfig: + blockSize: 5 + kvBlockIndexConfig: + maxPrefixBlocksToMatch: 256 - type: prefill-filter - type: decode-filter - type: max-score-picker @@ -22,11 +24,11 @@ schedulingProfiles: plugins: - pluginRef: prefill-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 - name: decode plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index 5ef91726d2..61e7d79ffa 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}" export EPP_IMAGE # Set the model name to deploy -export MODEL_NAME="${MODEL_NAME:-food-review}" +export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}" # Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct") export MODEL_FAMILY="${MODEL_NAME%%/*}" # Extract model ID (e.g., "Llama-3.1-8B-Instruct") @@ -89,7 +89,7 @@ else PRIMARY_PORT="8000" fi else - DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml" + DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml" fi else echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported" From fb2bd6cc9e379bc78b2b19f4e90e59a49fa5af27 Mon Sep 17 00:00:00 2001 From: Wen Zhou Date: Thu, 22 Jan 2026 11:07:19 +0100 Subject: [PATCH 
2/2] update: revert back some config to keep using prefix-cache-scorer - revert file renaming Signed-off-by: Wen Zhou --- ...-dp-epp-config.yaml => dp-epp-config.yaml} | 0 deploy/config/epp-config.yaml | 4 +- deploy/config/sim-epp-config.yaml | 12 ++--- deploy/config/sim-pd-epp-config.yaml | 14 +++--- scripts/kind-dev-env.sh | 47 +++++++++++-------- 5 files changed, 41 insertions(+), 36 deletions(-) rename deploy/config/{sim-dp-epp-config.yaml => dp-epp-config.yaml} (100%) diff --git a/deploy/config/sim-dp-epp-config.yaml b/deploy/config/dp-epp-config.yaml similarity index 100% rename from deploy/config/sim-dp-epp-config.yaml rename to deploy/config/dp-epp-config.yaml diff --git a/deploy/config/epp-config.yaml b/deploy/config/epp-config.yaml index d8bf748dd4..afb1f701fc 100644 --- a/deploy/config/epp-config.yaml +++ b/deploy/config/epp-config.yaml @@ -3,7 +3,7 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: precise-prefix-cache-scorer +- type: prefix-cache-scorer - type: decode-filter - type: max-score-picker - type: single-profile-handler @@ -12,5 +12,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: precise-prefix-cache-scorer + - pluginRef: prefix-cache-scorer weight: 2 diff --git a/deploy/config/sim-epp-config.yaml b/deploy/config/sim-epp-config.yaml index 8540c53960..18e2a25410 100644 --- a/deploy/config/sim-epp-config.yaml +++ b/deploy/config/sim-epp-config.yaml @@ -3,13 +3,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: precise-prefix-cache-scorer +- type: prefix-cache-scorer parameters: - indexerConfig: - tokenProcessorConfig: - blockSize: 5 - kvBlockIndexConfig: - maxPrefixBlocksToMatch: 256 + hashBlockSize: 5 + maxPrefixBlocksToMatch: 256 + lruCapacityPerServer: 31250 - type: decode-filter - type: max-score-picker - type: single-profile-handler @@ -18,5 +16,5 @@ schedulingProfiles: plugins: - 
pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: precise-prefix-cache-scorer + - pluginRef: prefix-cache-scorer weight: 2 diff --git a/deploy/config/sim-pd-epp-config.yaml b/deploy/config/sim-pd-epp-config.yaml index da5ad08808..2d6a85dd9e 100644 --- a/deploy/config/sim-pd-epp-config.yaml +++ b/deploy/config/sim-pd-epp-config.yaml @@ -4,13 +4,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: - type: prefill-header-handler -- type: precise-prefix-cache-scorer +- type: prefix-cache-scorer parameters: - indexerConfig: - tokenProcessorConfig: - blockSize: 5 - kvBlockIndexConfig: - maxPrefixBlocksToMatch: 256 + hashBlockSize: 5 + maxPrefixBlocksToMatch: 256 + lruCapacityPerServer: 31250 - type: prefill-filter - type: decode-filter - type: max-score-picker @@ -24,11 +22,11 @@ schedulingProfiles: plugins: - pluginRef: prefill-filter - pluginRef: max-score-picker - - pluginRef: precise-prefix-cache-scorer + - pluginRef: prefix-cache-scorer weight: 2 - name: decode plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: precise-prefix-cache-scorer + - pluginRef: prefix-cache-scorer weight: 2 diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index 61e7d79ffa..6b0d22a049 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -74,32 +74,41 @@ export VLLM_REPLICA_COUNT_D="${VLLM_REPLICA_COUNT_D:-2}" # Data Parallel size export VLLM_DATA_PARALLEL_SIZE="${VLLM_DATA_PARALLEL_SIZE:-1}" -PRIMARY_PORT="0" -if [ "${PD_ENABLED}" != "\"true\"" ] && [ ${VLLM_DATA_PARALLEL_SIZE} -eq 1 ]; then - if [ "${KV_CACHE_ENABLED}" != "true" ]; then - DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml" - else - DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml" - fi -else - if [ "${KV_CACHE_ENABLED}" != "true" ]; then - if [ "${PD_ENABLED}" == "\"true\"" ]; then - DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml" - if [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then - 
PRIMARY_PORT="8000" - fi - else - DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml" - fi - else +# Validate configuration constraints +if [ "${KV_CACHE_ENABLED}" == "true" ]; then + # KV cache requires simple mode: no PD and DP size must be 1 + if [ "${PD_ENABLED}" == "\"true\"" ] || [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then - echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported" + echo "Invalid configuration: KV_CACHE_ENABLED=true is not supported with PD_ENABLED=true or VLLM_DATA_PARALLEL_SIZE != 1" >&2 exit 1 fi fi -export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}" +# PRIMARY_PORT is 8000 only when PD is enabled together with data parallelism (DP size != 1); otherwise 0 +if [ "${PD_ENABLED}" == "\"true\"" ] && [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then + PRIMARY_PORT="8000" +else + PRIMARY_PORT="0" +fi export PRIMARY_PORT +# Determine EPP config file based on feature flags +if [ "${KV_CACHE_ENABLED}" == "true" ]; then + # KV cache mode (simple mode only) + DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml" +elif [ "${PD_ENABLED}" == "\"true\"" ]; then + # Prefill-Decode mode + DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml" +elif [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then + # Data Parallel mode (only needed for Istio pre-1.28.1) + # Not exercised by "make env-dev-kind" in kind, which uses docker.io/istio/pilot:1.28.1 + DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml" +else + # Simple mode + DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml" +fi + +export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}" + # ------------------------------------------------------------------------------ # Setup & Requirement Checks # ------------------------------------------------------------------------------