refactor: Replace prefix cache structure with golang-lru

kfirtoledo · vMaroon · kfirtoledo · commit f3d636ba8129 · 2025-06-05T16:50:29.000+03:00
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
Co-authored-by: Maroon Ayoub <maroon.ayoub@ibm.com>
diff --git a/cmd/epp/main.go b/cmd/epp/main.go
@@ -120,6 +120,7 @@ func loadPrefixCacheConfig() prefix.Config {
 
 	return prefix.Config{
 		HashBlockSize:          envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, baseLogger),
+		MaxNumServersToMatch:   envutil.GetEnvInt("PREFIX_CACHE_MAX_SERVER_TO_MATCH", prefix.DefaultNumServersToMatch, baseLogger),
 		MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, baseLogger),
 		LRUIndexerCapacity:     envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, baseLogger),
 	}
diff --git a/go.mod b/go.mod
@@ -9,6 +9,7 @@ require (
 	github.com/go-logr/logr v1.4.3
 	github.com/google/go-cmp v0.7.0
 	github.com/google/uuid v1.6.0
+	github.com/hashicorp/golang-lru/v2 v2.0.7
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
 	github.com/prometheus/client_golang v1.22.0
diff --git a/go.sum b/go.sum
@@ -95,6 +95,8 @@ github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5T
 github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
 github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE=
 github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
 github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4=
 github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
 github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
@@ -20,154 +20,92 @@ import (
 	"context"
 	"sync"
 	"time"
-	"unsafe"
-
-	"container/list"
 
+	lru "github.com/hashicorp/golang-lru/v2"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-func newIndexer(maxCacheSize int) *indexer {
-	t := &indexer{
-		maxCacheSize: maxCacheSize,
-		table:        make(map[BlockHash]map[ServerID]*list.Element),
-		ll:           list.New(),
-	}
-	go t.ReportCacheSize(time.Second)
-	return t
+// block holds an LRU cache of servers that may have a specific prefix hash.
+type block struct {
+	Pods *lru.Cache[ServerID, struct{}] // Can be extended with metadata (e.g., timestamp).
 }
 
 // An indexer maintains an LRU cache of prompt prefix hashes and the server(s) that might have that
 // prefix cached .
 type indexer struct {
-	mu           sync.RWMutex
-	maxCacheSize int
-	table        map[BlockHash]map[ServerID]*list.Element // from any prefix cache to the cache entry to find the server
-	ll           *list.List                               // LinkedList to keep track of the order of entries
+	mu                sync.RWMutex
+	cache             *lru.Cache[BlockHash, *block]
+	maxCacheSize      int
+	maxServersToMatch int
 }
 
-// value is the value stored in the linked list.
-type value struct {
-	server ServerID
-	hash   BlockHash
+// newIndexer initializes an indexer with size limits and starts cache size reporting.
+func newIndexer(maxCacheSize, maxServersToMatch int) *indexer {
+	c, err := lru.New[BlockHash, *block](maxCacheSize)
+	if err != nil {
+		panic(err)
+	}
+	ix := &indexer{
+		cache:             c,
+		maxCacheSize:      maxCacheSize,
+		maxServersToMatch: maxServersToMatch,
+	}
+	go ix.ReportCacheSize(time.Second)
+	return ix
 }
 
-// Get returns the set of servers that have the given prefix hash cached.
-func (i *indexer) Get(hash BlockHash) map[ServerID]bool {
-	i.mu.RLock()
-	defer i.mu.RUnlock()
-	res := map[ServerID]bool{}
-	for server := range i.table[hash] {
-		res[server] = true
+// Add adds a list of prefix hashes to the cache, tied to the server.
+func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
+	if len(hashes) == 0 || pod.Name == "" {
+		return
 	}
-	return res
-}
 
-// Add adds a list of prefix hashes of a single request to the server the request was sent to.
-// The intuition is that this server is likely to have the prefix cached, so next time a request
-// sharing the longest prefix should be sent to the same server to take advantage of the cache hit.
-func (i *indexer) Add(hashes []BlockHash, server ServerID) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	for _, hash := range hashes {
-		i.add(hash, server)
-	}
-}
-
-func (i *indexer) check(hash BlockHash, server ServerID) (*list.Element, bool) {
-	servers, ok := i.table[hash]
-	if !ok {
-		return nil, false
-	}
-	e, ok := servers[server]
-	return e, ok
-}
 
-func (i *indexer) add(hash BlockHash, server ServerID) {
-	e, exists := i.check(hash, server)
-	if exists {
-		i.ll.MoveToBack(e)
-	} else {
-		i.create(hash, server)
+	for _, hash := range hashes {
+		b, ok := i.cache.Get(hash)
+		if !ok {
+			// Create block with new LRU
+			podLRU, _ := lru.New[ServerID, struct{}](i.maxServersToMatch)
+			b = &block{Pods: podLRU}
+			i.cache.Add(hash, b)
+		}
+
+		b.Pods.Add(pod, struct{}{})
 	}
 }
 
-func (i *indexer) create(hash BlockHash, server ServerID) {
-	for i.ll.Len() >= i.maxCacheSize {
-		// Evict the least recently used entry if we've exceeded the max cache size
-		i.evict()
-	}
-
-	if _, ok := i.table[hash]; !ok {
-		i.table[hash] = make(map[ServerID]*list.Element)
-	}
-	v := &value{
-		server: server,
-		hash:   hash,
-	}
-	e := i.ll.PushBack(v)
-	i.table[hash][server] = e
-}
+// Get returns a set of servers that have the given prefix hash cached.
+func (i *indexer) Get(hash BlockHash) map[ServerID]bool {
+	i.mu.RLock()
+	defer i.mu.RUnlock()
 
-// evict removes the least recently used entry from the cache
-func (i *indexer) evict() {
-	oldestNode := i.ll.Front()
-	if oldestNode == nil {
-		return
+	res := map[ServerID]bool{}
+	block, ok := i.cache.Get(hash)
+	if !ok {
+		return res
 	}
-	i.ll.Remove(oldestNode)
-
-	v := oldestNode.Value.(*value)
-	hash := v.hash
-	server := v.server
-	// Remove from the hash map
-	serverMap := i.table[hash]
-	delete(serverMap, server)
-
-	// If this was the last server for this hash, remove the hash entry entirely
-	if len(serverMap) == 0 {
-		delete(i.table, hash)
+	for _, pod := range block.Pods.Keys() {
+		res[pod] = true
 	}
-
-	log.FromContext(context.TODO()).V(logutil.TRACE).Info("Evicted LRU entry", "hash", hash, "server", server)
+	return res
 }
 
-// ReportCacheSize starts a goroutine that periodically reports the cache size metric
+// ReportCacheSize starts a goroutine that periodically reports the cache size metric.
 func (i *indexer) ReportCacheSize(interval time.Duration) {
 	ticker := time.NewTicker(interval)
 	defer ticker.Stop()
 	for range ticker.C {
 		i.mu.RLock()
-		metrics.RecordPrefixCacheSize(int64(i.ll.Len()))
-		log.FromContext(context.TODO()).V(logutil.TRACE).Info("LRU", "# entries", i.ll.Len(), "estimated size MB", i.ll.Len()*i.estimateEntrySize()/1000000)
+		size := i.cache.Len()
+		metrics.RecordPrefixCacheSize(int64(size))
+		log.FromContext(context.TODO()).V(logutil.TRACE).Info("LRU",
+			"# entries", size,
+			"prefix cache utilization [%]", float64(size)*100/float64(i.maxCacheSize),
+		)
 		i.mu.RUnlock()
 	}
 }
-
-// estimateEntrySize estimates the memory size of a cache entry in bytes.
-func (i *indexer) estimateEntrySize() int {
-	size := 0
-
-	// Estimate the size of a node in the linked list.
-	// First get the size of the node struct via unsafe.Sizeof.
-	// The prev and next pointers are 8 bytes each on a 64-bit system.
-	// The BlockHash is a uint64, which is 8 bytes.
-	// The ServerID is a NamespacedName, which contains two strings (Name and Namespace).
-	// The headers for the strings are 16 bytes each (8 bytes for the pointer and 8 bytes for the length).
-	// So unsafe.Sizeof(node{}) should return 2*8 + 8 + 2*16 = 48 bytes.
-	size += int(unsafe.Sizeof(value{}))
-	// Size of the Name and Namespace strings in ServerID, assuming 63 bytes each (max length for Kubernetes NamespacedName).
-	size += 2 * 63
-
-	// Estimate the size of an entry in the hash map. Note the overhead of the map headers and buckets are ignored.
-	size += 8      // Size of the BlockHash (uint64).
-	size += 2 * 16 // Size of the ServerID string headers (NamespacedName).
-	size += 2 * 63 // Size of the Name and Namespace strings in ServerID.
-	size += 8      // Size of the pointer to the node in the hash map.
-
-	// Based on the above estimates, the estimated size of an entry is:
-	// (48 + 2*63) + (8 + 2*16 + 2*63 + 8) = 348 bytes.
-	return size
-}
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
@@ -22,24 +22,24 @@ import (
 )
 
 func TestIndexer_AddAndGet(t *testing.T) {
-	cache := newIndexer(2)
+	i := newIndexer(2, 2)
 
 	hash1 := BlockHash(1)
 	server := ServerID{Namespace: "default", Name: "server1"}
 
 	// Add an entry to the cache
-	cache.Add([]BlockHash{hash1}, server)
+	i.Add([]BlockHash{hash1}, server)
 
 	// Retrieve the entry
-	assert.Equal(t, 1, cache.ll.Len(), "Cache size should be 1 after adding an entry")
-	servers := cache.Get(hash1)
+	assert.Equal(t, 1, i.cache.Len(), "Cache size should be 1 after adding an entry")
+	servers := i.Get(hash1)
 	assert.Contains(t, servers, server, "Cache should contain the added server")
 
 	// Add another entry to the cache, the cache size should be incremented to 2.
-	cache.Add([]BlockHash{BlockHash(2)}, server)
-	assert.Equal(t, 2, cache.ll.Len(), "Cache size should  be 2 after adding an entry")
+	i.Add([]BlockHash{BlockHash(2)}, server)
+	assert.Equal(t, 2, i.cache.Len(), "Cache size should  be 2 after adding an entry")
 
 	// Add another entry to the cache, which should evict the first one due to max size.
-	cache.Add([]BlockHash{BlockHash(3)}, server)
-	assert.Equal(t, 2, cache.ll.Len(), "Cache size should still be 2 after adding an entry")
+	i.Add([]BlockHash{BlockHash(3)}, server)
+	assert.Equal(t, 2, i.cache.Len(), "Cache size should still be 2 after adding an entry")
 }
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
@@ -36,28 +36,25 @@ const (
 	// Why not just return the server with longest prefix match?
 	// It may not be the optimal choice, e.g., it may have a high queue depth.
 	// We optimistically search more than one to give more candidates for the scheduler to choose.
-	DefaultNumServersToMatch = 2
+	DefaultNumServersToMatch = 16
 	// vLLM default token block size is 16, and a good guess of average characters per token is 4.
 	DefaultHashBlockSize = 64
 	// The maximum number of blocks to match. Two long requests with the same prefix up to this
 	// limit will be indistinguishable.
 	// This parameter provides a trade-off between cache size, prefix matching speed and matching
 	// accuracy. Use a small value if most requests are short to reduce cache size and speed up the
 	// matching process. Use a large value if most requests are long to increase the matching accuracy.
-	DefaultMaxPrefixBlocks = 128
+	DefaultMaxPrefixBlocks = 256
 	// The indexer is an approximation to the actual prefix cache state on the model servers.
 	// A small capacity ensures a high accuracy of cache hit on the model server, but it will
 	// increase the chance of false negatives. A high capacity does the opposite.
 	// To properly size this, consider the sum of the total number of cache entries on all model
-	// servers. Consider the llama3 8B model on 3 H100 80GB GPUs. The size of the model weight is
+	// servers. Consider the llama3 8B model on 8 H100 80GB GPUs. The size of the model weight is
 	// about 16GB. Assume 50% of the remaining HBM is used for caching prefixes, we have 32GB. Each
 	// token is about 128KB in size, so we can cache 250K tokens. Using the default block size of 16
-	// in vLLM, we will have 250K / 16 = 15.6K blocks. In total we have 15.6K * 3 = 46.8K blocks, or
-	// roughly 50K.
-	// How much memory space does it require to hold the 50K block hashes?
-	// According to the estimates in indexer.estimateEntrySize(), the size of each entry is
-	// approximately 348 bytes. So in total we have 50K * 348 = 17.4MB.
-	DefaultLRUIndexerCapacity = 50000
+	// in vLLM, we will have 250K / 16 = 15.6K blocks. In total we have 15.6K * 8 = 124.8K blocks, or
+	// roughly 130K.
+	DefaultLRUIndexerCapacity = 130000
 )
 
 type Config struct {
@@ -67,6 +64,8 @@ type Config struct {
 	// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
 	// be ignored.
 	MaxPrefixBlocksToMatch int
+	// MaxNumServersToMatch is the maximum number of servers tracked (and therefore matched) per BlockHash.
+	MaxNumServersToMatch int
 	// Max (approximate) size of the LRU indexer in number of entries.
 	LRUIndexerCapacity int
 }
@@ -123,7 +122,7 @@ var _ framework.PostCycle = &Plugin{}
 func New(config Config) *Plugin {
 	m := &Plugin{
 		Config:  config,
-		indexer: newIndexer(config.LRUIndexerCapacity),
+		indexer: newIndexer(config.LRUIndexerCapacity, config.MaxNumServersToMatch),
 	}
 	return m
 }
@@ -138,14 +137,11 @@ func (m *Plugin) Score(ctx context.Context, request *types.LLMRequest, cycleStat
 	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
 	// pre score step, hashing prompt and find longest prefix match.
 	hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
-	numServers := DefaultNumServersToMatch
-	if numServers > len(pods) {
-		numServers = len(pods)
-	}
 	state := &schedulingContextState{
 		PrefixHashes:       hashes,
-		PrefixCacheServers: m.matchLongestPrefix(ctx, hashes, numServers),
+		PrefixCacheServers: m.matchLongestPrefix(ctx, hashes),
 	}
+
 	cycleState.Write(types.StateKey(m.Name()), state)
 	loggerTrace.Info(fmt.Sprintf("cached servers: %+v", state.PrefixCacheServers), "hashes", state.PrefixHashes)
 	// calculate the scores of pods
@@ -181,22 +177,22 @@ func (m *Plugin) PostCycle(ctx context.Context, cycleState *types.CycleState, re
 }
 
 // matchLongestPrefix returns a map of servers and length of prefix that each server caches.
-func (m *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash, numServers int) map[ServerID]int {
+func (m *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map[ServerID]int {
 	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
 	res := make(map[ServerID]int)
 	// Use a greedy strategy to search from the longest prefix.
 	// NOTE: It's possible to further optimize this with a binary search.
-	for i := len(hashes) - 1; i >= 0 && len(res) < numServers; i-- {
+	for i := 0; i < len(hashes); i++ {
 		hash := hashes[i]
 		cachedServers := m.indexer.Get(hash)
-		if len(cachedServers) > 0 {
+		if len(cachedServers) == 0 {
+			break
+		} else {
 			loggerTrace.Info("Found cached servers", "cachedServers", cachedServers, "total # blocks", len(hashes), "longest prefix", i)
 			for server := range cachedServers {
 				// Update servers with their longest prefix match.
-				// If we already found this server with longer prefix match, don't update it.
-				if _, ok := res[server]; !ok {
-					res[server] = i + 1
-				}
+				res[server]++
+
 			}
 		}
 	}
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Original file line number	Diff line number	Diff line change
`@@ -120,6 +120,7 @@ func loadPrefixCacheConfig() prefix.Config {`
`120`	`120`
`121`	`121`	`return prefix.Config{`
`122`	`122`	`HashBlockSize: envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, baseLogger),`
	`123`	`+ MaxNumServersToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_SERVER_TO_MATCH", prefix.DefaultNumServersToMatch, baseLogger),`
`123`	`124`	`MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, baseLogger),`
`124`	`125`	`LRUIndexerCapacity: envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, baseLogger),`
`125`	`126`	`}`