From 1d55530558bd23866cd65deeb079856992755220 Mon Sep 17 00:00:00 2001
From: Wen Zhou <wenzhou@redhat.com>
Date: Wed, 21 Jan 2026 14:59:11 +0100
Subject: [PATCH 1/2] feat: use TinyLlama as the "model" for kind test

- in order to test precise-prefix-cache-scorer we cannot use
  food-review, since the scorer needs to call kv-cache-manager to get a
  tokenizer by fetching a real model from HF
- the change is to switch the "default model" to TinyLlama
- also, to make the tokenizer folder writable, change its ownership to
  the non-root USER in the Dockerfile
- rename dp-epp-config.yaml to sim-dp-epp-config.yaml as it is used for
  local tests

Signed-off-by: Wen Zhou <wenzhou@redhat.com>
---
 Dockerfile.epp                                     |  5 ++++-
 deploy/config/epp-config.yaml                      |  4 ++--
 .../{dp-epp-config.yaml => sim-dp-epp-config.yaml} | 10 ++++++++--
 deploy/config/sim-epp-config.yaml                  | 12 +++++++-----
 deploy/config/sim-epp-kvcache-config.yaml          |  6 +++---
 deploy/config/sim-epp-no-hit-lru.yaml              | 12 +++++++-----
 deploy/config/sim-pd-epp-config.yaml               | 14 ++++++++------
 scripts/kind-dev-env.sh                            |  4 ++--
 8 files changed, 41 insertions(+), 26 deletions(-)
 rename deploy/config/{dp-epp-config.yaml => sim-dp-epp-config.yaml} (63%)

diff --git a/Dockerfile.epp b/Dockerfile.epp
index 9d1e89fdb5..4996c6fbec 100644
--- a/Dockerfile.epp
+++ b/Dockerfile.epp
@@ -102,6 +102,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace
 RUN ${PYTHON} -c "import tokenizer_wrapper"  # verify tokenizer_wrapper is correctly installed
 
 ENV HF_HOME="/tmp/.cache"
+# used by kv-cache-manager
+ENV LOCAL_TOKENIZER_DIR="/tmp/.cache"
+# Create cache directory and set permissions for non-root user
+RUN mkdir -p /tmp/.cache && chown -R 65532:65532 ${HF_HOME}
 
 USER 65532:65532
 
@@ -113,4 +117,3 @@ EXPOSE 9090
 EXPOSE 5557
 
 ENTRYPOINT ["/app/epp"]
-
diff --git a/deploy/config/epp-config.yaml b/deploy/config/epp-config.yaml
index afb1f701fc..d8bf748dd4 100644
--- a/deploy/config/epp-config.yaml
+++ b/deploy/config/epp-config.yaml
@@ -3,7 +3,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -12,5 +12,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/dp-epp-config.yaml b/deploy/config/sim-dp-epp-config.yaml
similarity index 63%
rename from deploy/config/dp-epp-config.yaml
rename to deploy/config/sim-dp-epp-config.yaml
index 703a44f676..6e84188661 100644
--- a/deploy/config/dp-epp-config.yaml
+++ b/deploy/config/sim-dp-epp-config.yaml
@@ -3,7 +3,13 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
+  parameters:
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: decode-filter
 - type: max-score-picker
 - type: data-parallel-profile-handler
@@ -14,5 +20,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-epp-config.yaml b/deploy/config/sim-epp-config.yaml
index 18e2a25410..8540c53960 100644
--- a/deploy/config/sim-epp-config.yaml
+++ b/deploy/config/sim-epp-config.yaml
@@ -3,11 +3,13 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
-    maxPrefixBlocksToMatch: 256
-    lruCapacityPerServer: 31250
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -16,5 +18,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-epp-kvcache-config.yaml b/deploy/config/sim-epp-kvcache-config.yaml
index 7850950ef7..76aab070f7 100644
--- a/deploy/config/sim-epp-kvcache-config.yaml
+++ b/deploy/config/sim-epp-kvcache-config.yaml
@@ -3,7 +3,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
     mode: cache_tracking
     tokenProcessorConfig:
@@ -15,7 +15,7 @@ plugins:
       prefixStoreConfig:
         blockSize: 16 
       tokenizersPoolConfig:
-        modelName: <model-name>            # specify the model name to use for tokenizer loading
+        modelName: TinyLlama/TinyLlama-1.1B-Chat-v1.0  # replace value to use different model for tokenizer loading
         hf:
           tokenizersCacheDir: "/cache/tokenizers"
       kvBlockIndexConfig:
@@ -29,5 +29,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 10
diff --git a/deploy/config/sim-epp-no-hit-lru.yaml b/deploy/config/sim-epp-no-hit-lru.yaml
index 8d02244114..e10ec50629 100644
--- a/deploy/config/sim-epp-no-hit-lru.yaml
+++ b/deploy/config/sim-epp-no-hit-lru.yaml
@@ -3,11 +3,13 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
-    maxPrefixBlocksToMatch: 256
-    lruCapacityPerServer: 31250
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: no-hit-lru-scorer
   parameters:
     lruSize: 2048
@@ -19,7 +21,7 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
   - pluginRef: no-hit-lru-scorer
     weight: 1
diff --git a/deploy/config/sim-pd-epp-config.yaml b/deploy/config/sim-pd-epp-config.yaml
index 2d6a85dd9e..da5ad08808 100644
--- a/deploy/config/sim-pd-epp-config.yaml
+++ b/deploy/config/sim-pd-epp-config.yaml
@@ -4,11 +4,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
 - type: prefill-header-handler
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
-    maxPrefixBlocksToMatch: 256
-    lruCapacityPerServer: 31250
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: prefill-filter
 - type: decode-filter
 - type: max-score-picker
@@ -22,11 +24,11 @@ schedulingProfiles:
   plugins:
   - pluginRef: prefill-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
 - name: decode
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh
index 5ef91726d2..61e7d79ffa 100755
--- a/scripts/kind-dev-env.sh
+++ b/scripts/kind-dev-env.sh
@@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}"
 export EPP_IMAGE
 
 # Set the model name to deploy
-export MODEL_NAME="${MODEL_NAME:-food-review}"
+export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}"
 # Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct")
 export MODEL_FAMILY="${MODEL_NAME%%/*}"
 # Extract model ID (e.g., "Llama-3.1-8B-Instruct")
@@ -89,7 +89,7 @@ else
         PRIMARY_PORT="8000"
       fi
     else
-      DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
+      DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml"
     fi
   else
     echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported"

From fb2bd6cc9e379bc78b2b19f4e90e59a49fa5af27 Mon Sep 17 00:00:00 2001
From: Wen Zhou <wenzhou@redhat.com>
Date: Thu, 22 Jan 2026 11:07:19 +0100
Subject: [PATCH 2/2] update: revert back some config to keep using
 prefix-cache-scorer

- revert file renaming

Signed-off-by: Wen Zhou <wenzhou@redhat.com>
---
 ...-dp-epp-config.yaml => dp-epp-config.yaml} |  0
 deploy/config/epp-config.yaml                 |  4 +-
 deploy/config/sim-epp-config.yaml             | 12 ++---
 deploy/config/sim-pd-epp-config.yaml          | 14 +++---
 scripts/kind-dev-env.sh                       | 47 +++++++++++--------
 5 files changed, 41 insertions(+), 36 deletions(-)
 rename deploy/config/{sim-dp-epp-config.yaml => dp-epp-config.yaml} (100%)

diff --git a/deploy/config/sim-dp-epp-config.yaml b/deploy/config/dp-epp-config.yaml
similarity index 100%
rename from deploy/config/sim-dp-epp-config.yaml
rename to deploy/config/dp-epp-config.yaml
diff --git a/deploy/config/epp-config.yaml b/deploy/config/epp-config.yaml
index d8bf748dd4..afb1f701fc 100644
--- a/deploy/config/epp-config.yaml
+++ b/deploy/config/epp-config.yaml
@@ -3,7 +3,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: precise-prefix-cache-scorer
+- type: prefix-cache-scorer
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -12,5 +12,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: precise-prefix-cache-scorer
+  - pluginRef: prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-epp-config.yaml b/deploy/config/sim-epp-config.yaml
index 8540c53960..18e2a25410 100644
--- a/deploy/config/sim-epp-config.yaml
+++ b/deploy/config/sim-epp-config.yaml
@@ -3,13 +3,11 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: precise-prefix-cache-scorer
+- type: prefix-cache-scorer
   parameters:
-    indexerConfig:
-      tokenProcessorConfig:
-        blockSize: 5
-      kvBlockIndexConfig:
-        maxPrefixBlocksToMatch: 256
+    hashBlockSize: 5
+    maxPrefixBlocksToMatch: 256
+    lruCapacityPerServer: 31250
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -18,5 +16,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: precise-prefix-cache-scorer
+  - pluginRef: prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-pd-epp-config.yaml b/deploy/config/sim-pd-epp-config.yaml
index da5ad08808..2d6a85dd9e 100644
--- a/deploy/config/sim-pd-epp-config.yaml
+++ b/deploy/config/sim-pd-epp-config.yaml
@@ -4,13 +4,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
 - type: prefill-header-handler
-- type: precise-prefix-cache-scorer
+- type: prefix-cache-scorer
   parameters:
-    indexerConfig:
-      tokenProcessorConfig:
-        blockSize: 5
-      kvBlockIndexConfig:
-        maxPrefixBlocksToMatch: 256
+    hashBlockSize: 5
+    maxPrefixBlocksToMatch: 256
+    lruCapacityPerServer: 31250
 - type: prefill-filter
 - type: decode-filter
 - type: max-score-picker
@@ -24,11 +22,11 @@ schedulingProfiles:
   plugins:
   - pluginRef: prefill-filter
   - pluginRef: max-score-picker
-  - pluginRef: precise-prefix-cache-scorer
+  - pluginRef: prefix-cache-scorer
     weight: 2
 - name: decode
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: precise-prefix-cache-scorer
+  - pluginRef: prefix-cache-scorer
     weight: 2
diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh
index 61e7d79ffa..6b0d22a049 100755
--- a/scripts/kind-dev-env.sh
+++ b/scripts/kind-dev-env.sh
@@ -74,32 +74,41 @@ export VLLM_REPLICA_COUNT_D="${VLLM_REPLICA_COUNT_D:-2}"
 # Data Parallel size
 export VLLM_DATA_PARALLEL_SIZE="${VLLM_DATA_PARALLEL_SIZE:-1}"
 
-PRIMARY_PORT="0"
-if [ "${PD_ENABLED}" != "\"true\"" ] && [ ${VLLM_DATA_PARALLEL_SIZE} -eq 1 ]; then
-  if [ "${KV_CACHE_ENABLED}" != "true" ]; then
-    DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml"
-  else
-    DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml"
-  fi
-else
-  if [ "${KV_CACHE_ENABLED}" != "true" ]; then
-    if [ "${PD_ENABLED}" == "\"true\"" ]; then
-      DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml"
-      if [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
-        PRIMARY_PORT="8000"
-      fi
-    else
-      DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml"
-    fi
-  else
+# Validate configuration constraints
+if [ "${KV_CACHE_ENABLED}" == "true" ]; then
+  # KV cache requires simple mode: no PD and DP size must be 1
+  if [ "${PD_ENABLED}" == "\"true\"" ] || [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
     echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported"
     exit 1
   fi
 fi
 
-export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}"
+# Set PRIMARY_PORT based on PD mode with data parallelism
+if [ "${PD_ENABLED}" == "\"true\"" ] && [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
+  PRIMARY_PORT="8000"
+else
+  PRIMARY_PORT="0"
+fi
 export PRIMARY_PORT
 
+# Determine EPP config file based on feature flags
+if [ "${KV_CACHE_ENABLED}" == "true" ]; then
+  # KV cache mode (simple mode only)
+  DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml"
+elif [ "${PD_ENABLED}" == "\"true\"" ]; then
+  # Prefill-Decode mode
+  DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml"
+elif [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
+  # Data Parallel mode (only needed for Istio pre-1.28.1)
+  # Not really called in kind(docker.io/istio/pilot:1.28.1) by "make env-dev-kind"
+  DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
+else
+  # Simple mode
+  DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml"
+fi
+
+export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}"
+
 # ------------------------------------------------------------------------------
 # Setup & Requirement Checks
 # ------------------------------------------------------------------------------