From 7d9873787958dfd51c8b0f3b5d005903613d4c17 Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Mon, 29 Dec 2025 09:11:22 +0100 Subject: [PATCH 1/4] feat: make no-hit-lru P/D-aware Signed-off-by: Edoardo Vacchi --- pkg/plugins/scorer/no_hit_lru.go | 31 ++++-- pkg/plugins/scorer/no_hit_lru_test.go | 148 ++++++++++++++++++++++++++ 2 files changed, 173 insertions(+), 6 deletions(-) diff --git a/pkg/plugins/scorer/no_hit_lru.go b/pkg/plugins/scorer/no_hit_lru.go index 417cf05a56..0be25192dc 100644 --- a/pkg/plugins/scorer/no_hit_lru.go +++ b/pkg/plugins/scorer/no_hit_lru.go @@ -21,6 +21,9 @@ const ( // defaultLRUSize is the maximum number of pods we'll consider in the cache defaultLRUSize = 1024 + + // defaultPrefillProfile is the name of the prefill profile when not explicitly declared + defaultPrefillProfile = "prefill" ) // compile-time type assertions @@ -36,6 +39,10 @@ type NoHitLRUParameters struct { // Defaults to "prefix-cache-scorer". PrefixPluginName string `json:"prefixPluginName"` + // PrefillProfile defines the name of the prefill profile to track in LRU. + // Defaults to "prefill". + PrefillProfile string `json:"prefillProfile"` + // LRUSize defines the maximum number of pods to track in the LRU cache. LRUSize int `json:"lruSize"` } @@ -75,6 +82,7 @@ func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU { prefixPluginType := prefix.PrefixCachePluginType prefixPluginName := prefix.PrefixCachePluginType lruSize := defaultLRUSize + prefillProfile := defaultPrefillProfile if params != nil { if params.PrefixPluginType != "" { @@ -86,6 +94,9 @@ func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU { if params.LRUSize > 0 { lruSize = params.LRUSize } + if params.PrefillProfile != "" { + prefillProfile = params.PrefillProfile + } } lruCache, err := lru.New[string, struct{}](lruSize) @@ -98,6 +109,7 @@ func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU { typedName: plugins.TypedName{Type: NoHitLRUType}, lruCache: lruCache, prefixPluginTypedName: plugins.TypedName{Type: prefixPluginType, Name: prefixPluginName}, + prefillProfile: prefillProfile, pluginState: plugins.NewPluginState(ctx), } } @@ -109,6 +121,7 @@ type NoHitLRU struct { typedName plugins.TypedName lruCache *lru.Cache[string, struct{}] // pod name -> dummy value (we only care about order) prefixPluginTypedName plugins.TypedName + prefillProfile string pluginState *plugins.PluginState } @@ -286,19 +299,25 @@ func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, sc return } - // Get the primary profile's target pod - primaryProfile := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName] - if primaryProfile == nil || len(primaryProfile.TargetPods) == 0 { - logger.Info("No target pod in primary profile") + s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName], schedulingResult.PrimaryProfileName) + s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[s.prefillProfile], s.prefillProfile) +} + +func (s *NoHitLRU) moveTargetPodToFront(ctx context.Context, request *types.LLMRequest, targetProfile *types.ProfileRunResult, profileName string) { + logger := log.FromContext(ctx).V(logutil.DEBUG) + + // Get the target profile's target pod + if targetProfile == nil || len(targetProfile.TargetPods) == 0 { + logger.Info("No target pod in profile", "profile", profileName) return } - targetPod := primaryProfile.TargetPods[0] + targetPod := targetProfile.TargetPods[0] podName := targetPod.GetPod().NamespacedName.String() // Move the pod to the front of the LRU. var present struct{} // dummy value s.lruCache.Add(podName, present) - logger.Info("Updated LRU cache for cold request", "pod", podName, "requestId", request.RequestId) + logger.Info("Updated LRU cache for cold request", "profile", profileName, "pod", podName, "requestId", request.RequestId) } diff --git a/pkg/plugins/scorer/no_hit_lru_test.go b/pkg/plugins/scorer/no_hit_lru_test.go index 6890c998cb..e4eaf9848c 100644 --- a/pkg/plugins/scorer/no_hit_lru_test.go +++ b/pkg/plugins/scorer/no_hit_lru_test.go @@ -455,3 +455,151 @@ func TestNoHitLRUEdgeCases(t *testing.T) { } }) } + +func TestNoHitLRUPrefillDecodeTracking(t *testing.T) { + // Prefill worker pods + prefillPodA := &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "prefill-a", Namespace: "default"}}, + MetricsState: &backendmetrics.MetricsState{}, + } + prefillPodB := &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "prefill-b", Namespace: "default"}}, + MetricsState: &backendmetrics.MetricsState{}, + } + + // Decode worker pods + decodePodA := &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "decode-a", Namespace: "default"}}, + MetricsState: &backendmetrics.MetricsState{}, + } + decodePodB := &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "decode-b", Namespace: "default"}}, + MetricsState: &backendmetrics.MetricsState{}, + } + + prefillPods := []types.Pod{prefillPodA, prefillPodB} + decodePods := []types.Pod{decodePodA, decodePodB} + + coldPrefixState := &types.CycleState{} + coldPrefixState.Write(plugins.StateKey(prefix.PrefixCachePluginType), &prefix.SchedulingContextState{ + PrefixCacheServers: make(map[prefix.ServerID]int), // empty = cold request + }) + + ctx := context.Background() + + t.Run("P/D scenario - both profiles tracked separately", func(t *testing.T) { + scorer := scorer.NewNoHitLRU(ctx, nil) + + // First cold request with P/D + req1 := &types.LLMRequest{RequestId: "pd-request-1"} + scorer.Score(ctx, coldPrefixState, req1, append(prefillPods, decodePods...)) + + // Simulate scheduling result with both prefill and decode profiles + pdResult := &types.SchedulingResult{ + PrimaryProfileName: "decode", + ProfileResults: map[string]*types.ProfileRunResult{ + "prefill": { + TargetPods: []types.Pod{prefillPodA}, + }, + "decode": { + TargetPods: []types.Pod{decodePodA}, + }, + }, + } + scorer.PreRequest(ctx, req1, pdResult) + + // Second cold request - both prefillPodB and decodePodB should score higher + // since prefillPodA and decodePodA were just used + req2 := &types.LLMRequest{RequestId: "pd-request-2"} + prefillScores := scorer.Score(ctx, coldPrefixState, req2, prefillPods) + decodeScores := scorer.Score(ctx, coldPrefixState, req2, decodePods) + + if prefillScores[prefillPodB] <= prefillScores[prefillPodA] { + t.Errorf("Expected prefill-b to score higher than prefill-a after prefill-a was used: %+v", prefillScores) + } + + if decodeScores[decodePodB] <= decodeScores[decodePodA] { + t.Errorf("Expected decode-b to score higher than decode-a after decode-a was used: %+v", decodeScores) + } + }) + + t.Run("non-P/D scenario - only primary profile exists", func(t *testing.T) { + req := &types.LLMRequest{RequestId: "non-pd-request"} + scorer := scorer.NewNoHitLRU(ctx, nil) + scorer.Score(ctx, coldPrefixState, req, decodePods) + + // Scheduling result with only decode profile (no prefill) + result := &types.SchedulingResult{ + PrimaryProfileName: "decode", + ProfileResults: map[string]*types.ProfileRunResult{ + "decode": { + TargetPods: []types.Pod{decodePodA}, + }, + // No "prefill" profile in results + }, + } + // Should not panic when prefill profile doesn't exist + scorer.PreRequest(ctx, req, result) + + // Verify decodePodA was tracked + req2 := &types.LLMRequest{RequestId: "non-pd-request-2"} + scores := scorer.Score(ctx, coldPrefixState, req2, decodePods) + + if scores[decodePodB] <= scores[decodePodA] { + t.Errorf("Expected decode-b to score higher than decode-a: %+v", scores) + } + }) + + t.Run("custom prefill profile name", func(t *testing.T) { + // Create scorer with custom prefill profile name + scorer := scorer.NewNoHitLRU(ctx, &scorer.NoHitLRUParameters{ + PrefillProfile: "custom-prefill", + }) + + req := &types.LLMRequest{RequestId: "custom-prefill-request"} + scorer.Score(ctx, coldPrefixState, req, prefillPods) + + result := &types.SchedulingResult{ + PrimaryProfileName: "decode", + ProfileResults: map[string]*types.ProfileRunResult{ + "custom-prefill": { + TargetPods: []types.Pod{prefillPodA}, + }, + "decode": { + TargetPods: []types.Pod{decodePodA}, + }, + }, + } + scorer.PreRequest(ctx, req, result) + + // Verify custom prefill profile was tracked + req2 := &types.LLMRequest{RequestId: "custom-prefill-request-2"} + scores := scorer.Score(ctx, coldPrefixState, req2, prefillPods) + + if scores[prefillPodB] <= scores[prefillPodA] { + t.Errorf("Expected prefill-b to score higher with custom profile name: %+v", scores) + } + }) + + t.Run("nil scheduling result - graceful handling", func(_ *testing.T) { + req := &types.LLMRequest{RequestId: "nil-result"} + scorer := scorer.NewNoHitLRU(ctx, nil) + scorer.Score(ctx, coldPrefixState, req, decodePods) + + // Should not panic with nil result + scorer.PreRequest(ctx, req, nil) + }) + + t.Run("empty profile results - graceful handling", func(_ *testing.T) { + req := &types.LLMRequest{RequestId: "empty-results"} + scorer := scorer.NewNoHitLRU(ctx, nil) + scorer.Score(ctx, coldPrefixState, req, decodePods) + + result := &types.SchedulingResult{ + PrimaryProfileName: "decode", + ProfileResults: map[string]*types.ProfileRunResult{}, + } + // Should not panic with empty profile results + scorer.PreRequest(ctx, req, result) + }) +} From a5ff4fc5d1d40218ca2a1213c67e628a90097578 Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Thu, 15 Jan 2026 14:38:27 +0100 Subject: [PATCH 2/4] hardcode prefill profile Signed-off-by: Edoardo Vacchi --- pkg/plugins/scorer/no_hit_lru.go | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pkg/plugins/scorer/no_hit_lru.go b/pkg/plugins/scorer/no_hit_lru.go index 0be25192dc..c5d0d5e92a 100644 --- a/pkg/plugins/scorer/no_hit_lru.go +++ b/pkg/plugins/scorer/no_hit_lru.go @@ -22,7 +22,10 @@ const ( // defaultLRUSize is the maximum number of pods we'll consider in the cache defaultLRUSize = 1024 - // defaultPrefillProfile is the name of the prefill profile when not explicitly declared + // defaultPrefillProfile is the name of the prefill profile + // + // This is currently hardcoded until we have a defined proper config interface. + // (See also https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/2104/ ) defaultPrefillProfile = "prefill" ) @@ -39,10 +42,6 @@ type NoHitLRUParameters struct { // Defaults to "prefix-cache-scorer". PrefixPluginName string `json:"prefixPluginName"` - // PrefillProfile defines the name of the prefill profile to track in LRU. - // Defaults to "prefill". - PrefillProfile string `json:"prefillProfile"` - // LRUSize defines the maximum number of pods to track in the LRU cache. LRUSize int `json:"lruSize"` } @@ -82,7 +81,6 @@ func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU { prefixPluginType := prefix.PrefixCachePluginType prefixPluginName := prefix.PrefixCachePluginType lruSize := defaultLRUSize - prefillProfile := defaultPrefillProfile if params != nil { if params.PrefixPluginType != "" { @@ -94,9 +92,6 @@ func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU { if params.LRUSize > 0 { lruSize = params.LRUSize } - if params.PrefillProfile != "" { - prefillProfile = params.PrefillProfile - } } lruCache, err := lru.New[string, struct{}](lruSize) @@ -109,7 +104,6 @@ func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU { typedName: plugins.TypedName{Type: NoHitLRUType}, lruCache: lruCache, prefixPluginTypedName: plugins.TypedName{Type: prefixPluginType, Name: prefixPluginName}, - prefillProfile: prefillProfile, pluginState: plugins.NewPluginState(ctx), } } @@ -121,7 +115,6 @@ type NoHitLRU struct { typedName plugins.TypedName lruCache *lru.Cache[string, struct{}] // pod name -> dummy value (we only care about order) prefixPluginTypedName plugins.TypedName - prefillProfile string pluginState *plugins.PluginState } @@ -300,7 +293,7 @@ func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, sc } s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName], schedulingResult.PrimaryProfileName) - s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[s.prefillProfile], s.prefillProfile) + s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[defaultPrefillProfile], defaultPrefillProfile) } func (s *NoHitLRU) moveTargetPodToFront(ctx context.Context, request *types.LLMRequest, targetProfile *types.ProfileRunResult, profileName string) { From 6d11ece27ebc959b6b092fea9f8d787cd900d4ea Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Thu, 15 Jan 2026 15:20:06 +0100 Subject: [PATCH 3/4] remove spammy log Signed-off-by: Edoardo Vacchi --- pkg/plugins/scorer/no_hit_lru.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/plugins/scorer/no_hit_lru.go b/pkg/plugins/scorer/no_hit_lru.go index c5d0d5e92a..7eceb75322 100644 --- a/pkg/plugins/scorer/no_hit_lru.go +++ b/pkg/plugins/scorer/no_hit_lru.go @@ -301,7 +301,6 @@ func (s *NoHitLRU) moveTargetPodToFront(ctx context.Context, request *types.LLMR // Get the target profile's target pod if targetProfile == nil || len(targetProfile.TargetPods) == 0 { - logger.Info("No target pod in profile", "profile", profileName) return } From 782ee794cd3b7342e5ebfadb80fa4f3ebeb7036c Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Sun, 18 Jan 2026 15:44:59 +0100 Subject: [PATCH 4/4] apply suggestions Signed-off-by: Edoardo Vacchi --- pkg/plugins/scorer/no_hit_lru.go | 13 ++++++----- pkg/plugins/scorer/no_hit_lru_test.go | 31 --------------------------- 2 files changed, 6 insertions(+), 38 deletions(-) diff --git a/pkg/plugins/scorer/no_hit_lru.go b/pkg/plugins/scorer/no_hit_lru.go index 7eceb75322..a367caa349 100644 --- a/pkg/plugins/scorer/no_hit_lru.go +++ b/pkg/plugins/scorer/no_hit_lru.go @@ -292,18 +292,17 @@ func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, sc return } - s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName], schedulingResult.PrimaryProfileName) - s.moveTargetPodToFront(ctx, request, schedulingResult.ProfileResults[defaultPrefillProfile], defaultPrefillProfile) + if targetProfile, ok := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName]; ok && targetProfile != nil && len(targetProfile.TargetPods) != 0 { + s.moveTargetPodToFront(ctx, request, targetProfile, schedulingResult.PrimaryProfileName) + } + if targetProfile, ok := schedulingResult.ProfileResults[defaultPrefillProfile]; ok && targetProfile != nil && len(targetProfile.TargetPods) != 0 { + s.moveTargetPodToFront(ctx, request, targetProfile, defaultPrefillProfile) + } } func (s *NoHitLRU) moveTargetPodToFront(ctx context.Context, request *types.LLMRequest, targetProfile *types.ProfileRunResult, profileName string) { logger := log.FromContext(ctx).V(logutil.DEBUG) - // Get the target profile's target pod - if targetProfile == nil || len(targetProfile.TargetPods) == 0 { - return - } - targetPod := targetProfile.TargetPods[0] podName := targetPod.GetPod().NamespacedName.String() diff --git a/pkg/plugins/scorer/no_hit_lru_test.go b/pkg/plugins/scorer/no_hit_lru_test.go index e4eaf9848c..a03dd3aae2 100644 --- a/pkg/plugins/scorer/no_hit_lru_test.go +++ b/pkg/plugins/scorer/no_hit_lru_test.go @@ -550,37 +550,6 @@ func TestNoHitLRUPrefillDecodeTracking(t *testing.T) { } }) - t.Run("custom prefill profile name", func(t *testing.T) { - // Create scorer with custom prefill profile name - scorer := scorer.NewNoHitLRU(ctx, &scorer.NoHitLRUParameters{ - PrefillProfile: "custom-prefill", - }) - - req := &types.LLMRequest{RequestId: "custom-prefill-request"} - scorer.Score(ctx, coldPrefixState, req, prefillPods) - - result := &types.SchedulingResult{ - PrimaryProfileName: "decode", - ProfileResults: map[string]*types.ProfileRunResult{ - "custom-prefill": { - TargetPods: []types.Pod{prefillPodA}, - }, - "decode": { - TargetPods: []types.Pod{decodePodA}, - }, - }, - } - scorer.PreRequest(ctx, req, result) - - // Verify custom prefill profile was tracked - req2 := &types.LLMRequest{RequestId: "custom-prefill-request-2"} - scores := scorer.Score(ctx, coldPrefixState, req2, prefillPods) - - if scores[prefillPodB] <= scores[prefillPodA] { - t.Errorf("Expected prefill-b to score higher with custom profile name: %+v", scores) - } - }) - t.Run("nil scheduling result - graceful handling", func(_ *testing.T) { req := &types.LLMRequest{RequestId: "nil-result"} scorer := scorer.NewNoHitLRU(ctx, nil)