
Commit 27ad10a

nirrozenbaum authored and rlakhtakia committed
remove Model field from LLMRequest (kubernetes-sigs#782)
* remove Model field from LLMRequest

Signed-off-by: Nir Rozenbaum <[email protected]>

* rebase handling

Signed-off-by: Nir Rozenbaum <[email protected]>

---------

Signed-off-by: Nir Rozenbaum <[email protected]>
1 parent 792c3b4 commit 27ad10a

File tree

7 files changed: 32 additions & 50 deletions


pkg/epp/requestcontrol/director.go

Lines changed: 6 additions & 11 deletions
@@ -79,14 +79,14 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 		if reqCtx.ResolvedTargetModel == "" {
 			return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)}
 		}
+		reqCtx.Request.Body["model"] = reqCtx.ResolvedTargetModel // Update target model in the body.
 	}

 	llmReq := &schedulingtypes.LLMRequest{
-		Model:               reqCtx.Model,
-		ResolvedTargetModel: reqCtx.ResolvedTargetModel,
-		Critical:            modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical,
-		Prompt:              prompt,
-		Headers:             reqCtx.Request.Headers,
+		TargetModel: reqCtx.ResolvedTargetModel,
+		Critical:    modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical,
+		Prompt:      prompt,
+		Headers:     reqCtx.Request.Headers,
 	}
 	logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq)
 	results, err := d.Dispatch(ctx, llmReq)
@@ -129,13 +129,8 @@ func (d *Director) PostDispatch(ctx context.Context, reqCtx *handlers.RequestCon
 	}

 	endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber))
-	logger.V(logutil.DEFAULT).Info("Request handled",
-		"model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", targetPod)
+	logger.V(logutil.DEFAULT).Info("Request handled", "model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", targetPod)

-	// Update target models in the body.
-	if reqCtx.Model != reqCtx.ResolvedTargetModel {
-		reqCtx.Request.Body["model"] = reqCtx.ResolvedTargetModel
-	}
 	reqCtx.TargetPod = targetPod.NamespacedName.String()
 	reqCtx.TargetEndpoint = endpoint

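The net effect of these two hunks is that the request body's "model" field is now rewritten in HandleRequest, as soon as the target model is resolved, rather than conditionally in PostDispatch. A minimal sketch of the new flow, using a hypothetical body map and hypothetical model names (the real reqCtx types are not shown in this diff):

package main

import "fmt"

func main() {
	// Hypothetical request body; the real code keeps it in reqCtx.Request.Body.
	body := map[string]interface{}{"model": "food-review", "prompt": "..."}

	// Hypothetical name produced by traffic splitting.
	resolvedTargetModel := "food-review-v1"

	// After this commit, HandleRequest rewrites the body as soon as the target
	// model is resolved; previously PostDispatch did it, and only when the
	// resolved name differed from the requested one.
	body["model"] = resolvedTargetModel

	fmt.Println(body) // map[model:food-review-v1 prompt:...]
}
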
pkg/epp/scheduling/plugins/filter/filter_test.go

Lines changed: 1 addition & 2 deletions
@@ -204,8 +204,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {

 	// Create a test request and pods
 	req := &types.LLMRequest{
-		Model:               testAffinityModel,
-		ResolvedTargetModel: testAffinityModel,
+		TargetModel: testAffinityModel,
 	}

 	// Test setup: One affinity pod and one available pod

pkg/epp/scheduling/plugins/filter/lora_affinity_filter.go

Lines changed: 2 additions & 2 deletions
@@ -59,8 +59,8 @@ func (f *LoraAffinityFilter) Filter(ctx *types.SchedulingContext, pods []types.P

 	// Categorize pods based on affinity and availability
 	for _, pod := range pods {
-		_, active := pod.GetMetrics().ActiveModels[ctx.Req.ResolvedTargetModel]
-		_, waiting := pod.GetMetrics().WaitingModels[ctx.Req.ResolvedTargetModel]
+		_, active := pod.GetMetrics().ActiveModels[ctx.Req.TargetModel]
+		_, waiting := pod.GetMetrics().WaitingModels[ctx.Req.TargetModel]

 		if active || waiting {
 			filtered_affinity = append(filtered_affinity, pod)

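Only the field name changes here; the categorization logic is untouched. A self-contained sketch of that check, with hypothetical metric types standing in for the real pod.GetMetrics() result:

package main

import "fmt"

// Hypothetical stand-ins for the pod metrics the filter consults; the real
// types live in the backend metrics package.
type metrics struct {
	ActiveModels  map[string]int
	WaitingModels map[string]int
}

// hasAffinity mirrors the filter's check: a pod has affinity for the target
// model if that model is already active or waiting on the pod.
func hasAffinity(m metrics, targetModel string) bool {
	_, active := m.ActiveModels[targetModel]
	_, waiting := m.WaitingModels[targetModel]
	return active || waiting
}

func main() {
	m := metrics{
		ActiveModels:  map[string]int{"test-model1": 1},
		WaitingModels: map[string]int{},
	}
	fmt.Println(hasAffinity(m, "test-model1")) // true
	fmt.Println(hasAffinity(m, "test-model2")) // false
}
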
pkg/epp/scheduling/plugins/prefix/plugin.go

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ func hashPrompt(ctx *types.SchedulingContext, cacheBlockSize int, maxPrefixBlock
 	// If the last block is smaller than cacheBlockSize, it will be ignored.
 	res := make([]BlockHash, 0, 1+len(prompt)/cacheBlockSize)
 	// Add the model to the first block hash so that different models have different hashes even with the same body.
-	res = append(res, BlockHash(xxhash.Sum64String(ctx.Req.ResolvedTargetModel)))
+	res = append(res, BlockHash(xxhash.Sum64String(ctx.Req.TargetModel)))
 	for i := 0; i+cacheBlockSize <= len(prompt); i += cacheBlockSize {
 		block := prompt[i : i+cacheBlockSize]
 		prevBlockHash := res[len(res)-1]

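The renamed line is the anchor of the whole prefix-cache scheme: seeding the first block hash with the target model guarantees two models never share cached prefixes, even for identical prompts. A minimal sketch of that idea; the chaining of each block to the previous hash is an assumption for illustration, since the combine step falls outside this hunk:

package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2"
)

// hashPromptSketch seeds the hash chain with the target model, then hashes
// fixed-size blocks, chaining each to the previous hash. Only the model
// seeding is shown in the diff; the chaining detail here is assumed.
func hashPromptSketch(targetModel, prompt string, blockSize int) []uint64 {
	res := make([]uint64, 0, 1+len(prompt)/blockSize)
	res = append(res, xxhash.Sum64String(targetModel))
	for i := 0; i+blockSize <= len(prompt); i += blockSize {
		prev := make([]byte, 8)
		binary.LittleEndian.PutUint64(prev, res[len(res)-1])
		res = append(res, xxhash.Sum64(append(prev, prompt[i:i+blockSize]...)))
	}
	return res
}

func main() {
	// Same prompt, different models: the chains diverge from the seed onward.
	fmt.Println(hashPromptSketch("test-model1", "aaaabbbb", 4))
	fmt.Println(hashPromptSketch("test-model-new", "aaaabbbb", 4))
}
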
pkg/epp/scheduling/plugins/prefix/plugin_test.go

Lines changed: 10 additions & 15 deletions
@@ -24,9 +24,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// First request.
 	req1 := &types.LLMRequest{
-		Model:               "test-model1",
-		ResolvedTargetModel: "test-model1",
-		Prompt:              "aaaaaa",
+		TargetModel: "test-model1",
+		Prompt:      "aaaaaa",
 	}
 	ctx := types.NewSchedulingContext(context.Background(), req1, pods)
 	plugin.PreSchedule(ctx)
@@ -49,9 +48,8 @@ func TestPrefixPlugin(t *testing.T) {
 	// Second request doesn't share any prefix with first one. It should be added to the cache but
 	// the pod score should be 0.
 	req2 := &types.LLMRequest{
-		Model:               "test-model2",
-		ResolvedTargetModel: "test-model2",
-		Prompt:              "bbbbbb",
+		TargetModel: "test-model2",
+		Prompt:      "bbbbbb",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req2, pods)
 	plugin.PreSchedule(ctx)
@@ -73,9 +71,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// Third request shares partial prefix with first one.
 	req3 := &types.LLMRequest{
-		Model:               "test-model1",
-		ResolvedTargetModel: "test-model1",
-		Prompt:              "aaaabbbb",
+		TargetModel: "test-model1",
+		Prompt:      "aaaabbbb",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req3, pods)
 	plugin.PreSchedule(ctx)
@@ -96,9 +93,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// 4th request is same as req3 except the model is different, still no match.
 	req4 := &types.LLMRequest{
-		Model:               "test-model-new",
-		ResolvedTargetModel: "test-model-new",
-		Prompt:              "aaaabbbb",
+		TargetModel: "test-model-new",
+		Prompt:      "aaaabbbb",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req4, pods)
 	plugin.PreSchedule(ctx)
@@ -119,9 +115,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// 5th request shares partial prefix with 3rd one.
 	req5 := &types.LLMRequest{
-		Model:               "test-model1",
-		ResolvedTargetModel: "test-model1",
-		Prompt:              "aaaabbbbcccc",
+		TargetModel: "test-model1",
+		Prompt:      "aaaabbbbcccc",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req5, pods)
 	plugin.PreSchedule(ctx)

pkg/epp/scheduling/scheduler_test.go

Lines changed: 9 additions & 13 deletions
@@ -40,19 +40,17 @@ func TestSchedule(t *testing.T) {
 		{
 			name: "no pods in datastore",
 			req: &types.LLMRequest{
-				Model:               "any-model",
-				ResolvedTargetModel: "any-model",
-				Critical:            true,
+				TargetModel: "any-model",
+				Critical:    true,
 			},
 			input: []*backendmetrics.FakePodMetrics{},
 			err:   true,
 		},
 		{
 			name: "critical request",
 			req: &types.LLMRequest{
-				Model:               "critical",
-				ResolvedTargetModel: "critical",
-				Critical:            true,
+				TargetModel: "critical",
+				Critical:    true,
 			},
 			// pod2 will be picked because it has relatively low queue size, with the requested
 			// model being active, and has low KV cache.
@@ -114,9 +112,8 @@ func TestSchedule(t *testing.T) {
 		{
 			name: "sheddable request, accepted",
 			req: &types.LLMRequest{
-				Model:               "sheddable",
-				ResolvedTargetModel: "sheddable",
-				Critical:            false,
+				TargetModel: "sheddable",
+				Critical:    false,
 			},
 			// pod1 will be picked because it has capacity for the sheddable request.
 			input: []*backendmetrics.FakePodMetrics{
@@ -177,9 +174,8 @@ func TestSchedule(t *testing.T) {
 		{
 			name: "sheddable request, dropped",
 			req: &types.LLMRequest{
-				Model:               "sheddable",
-				ResolvedTargetModel: "sheddable",
-				Critical:            false,
+				TargetModel: "sheddable",
+				Critical:    false,
 			},
 			// All pods have higher KV cache thant the threshold, so the sheddable request will be
 			// dropped.
@@ -356,7 +352,7 @@ func TestSchedulePlugins(t *testing.T) {
 	// Initialize the scheduler
 	scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, &test.config)

-	req := &types.LLMRequest{Model: "test-model"}
+	req := &types.LLMRequest{TargetModel: "test-model"}
 	got, err := scheduler.Schedule(context.Background(), req)

 	// Validate error state

pkg/epp/scheduling/types/types.go

Lines changed: 3 additions & 6 deletions
@@ -25,10 +25,8 @@ import (

 // LLMRequest is a structured representation of the fields we parse out of the LLMRequest body.
 type LLMRequest struct {
-	// Model is the name of the model that the user specified in the request body.
-	Model string
-	// ResolvedTargetModel is the final target model after traffic split.
-	ResolvedTargetModel string
+	// TargetModel is the final target model after traffic split.
+	TargetModel string
 	// Critical is a boolean that specifies if a request is critical or not.
 	Critical bool
 	// Prompt is the prompt that was sent in the request body.
@@ -38,8 +36,7 @@ type LLMRequest struct {
 }

 func (r *LLMRequest) String() string {
-	return fmt.Sprintf("Model: %s, ResolvedTargetModel: %s, Critical: %t, PromptLength: %d, Headers: %v",
-		r.Model, r.ResolvedTargetModel, r.Critical, len(r.Prompt), r.Headers)
+	return fmt.Sprintf("TargetModel: %s, Critical: %t, PromptLength: %d, Headers: %v", r.TargetModel, r.Critical, len(r.Prompt), r.Headers)
 }

 type Pod interface {

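With the Model field gone, callers construct the request from the resolved name alone. A minimal usage sketch, assuming the module path sigs.k8s.io/gateway-api-inference-extension and that Headers is a map[string]string (both taken from context, not shown in this diff):

package main

import (
	"fmt"

	// Assumed import path for kubernetes-sigs/gateway-api-inference-extension.
	types "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

func main() {
	req := &types.LLMRequest{
		TargetModel: "test-model1", // the resolved target, post traffic split
		Critical:    true,
		Prompt:      "aaaabbbb",
		Headers:     map[string]string{"x-request-id": "abc"}, // hypothetical header
	}
	// String() no longer reports a separate Model field:
	// TargetModel: test-model1, Critical: true, PromptLength: 8, Headers: ...
	fmt.Println(req)
}
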