
Commit 27ad10a

nirrozenbaum authored and rlakhtakia committed
remove Model field from LLMRequest (kubernetes-sigs#782)
* remove Model field from LLMRequest

Signed-off-by: Nir Rozenbaum <[email protected]>

* rebase handling

Signed-off-by: Nir Rozenbaum <[email protected]>

---------

Signed-off-by: Nir Rozenbaum <[email protected]>
1 parent 792c3b4 commit 27ad10a

File tree

7 files changed: 32 additions & 50 deletions


pkg/epp/requestcontrol/director.go

Lines changed: 6 additions & 11 deletions
@@ -79,14 +79,14 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 		if reqCtx.ResolvedTargetModel == "" {
 			return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)}
 		}
+		reqCtx.Request.Body["model"] = reqCtx.ResolvedTargetModel // Update target model in the body.
 	}

 	llmReq := &schedulingtypes.LLMRequest{
-		Model:               reqCtx.Model,
-		ResolvedTargetModel: reqCtx.ResolvedTargetModel,
-		Critical:            modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical,
-		Prompt:              prompt,
-		Headers:             reqCtx.Request.Headers,
+		TargetModel: reqCtx.ResolvedTargetModel,
+		Critical:    modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical,
+		Prompt:      prompt,
+		Headers:     reqCtx.Request.Headers,
 	}
 	logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq)
 	results, err := d.Dispatch(ctx, llmReq)
@@ -129,13 +129,8 @@ func (d *Director) PostDispatch(ctx context.Context, reqCtx *handlers.RequestCon
 	}

 	endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber))
-	logger.V(logutil.DEFAULT).Info("Request handled",
-		"model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", targetPod)
+	logger.V(logutil.DEFAULT).Info("Request handled", "model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", targetPod)

-	// Update target models in the body.
-	if reqCtx.Model != reqCtx.ResolvedTargetModel {
-		reqCtx.Request.Body["model"] = reqCtx.ResolvedTargetModel
-	}
 	reqCtx.TargetPod = targetPod.NamespacedName.String()
 	reqCtx.TargetEndpoint = endpoint

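The net effect of these two hunks is that the request body's "model" field is now rewritten in HandleRequest, as soon as the target model is resolved, rather than conditionally in PostDispatch. A minimal sketch of the new flow, using a hypothetical body map and hypothetical model names (the real reqCtx types are not shown in this diff):

package main

import "fmt"

func main() {
	// Hypothetical request body; the real code keeps it in reqCtx.Request.Body.
	body := map[string]interface{}{"model": "food-review", "prompt": "..."}

	// Hypothetical name produced by traffic splitting.
	resolvedTargetModel := "food-review-v1"

	// After this commit, HandleRequest rewrites the body as soon as the target
	// model is resolved; previously PostDispatch did it, and only when the
	// resolved name differed from the requested one.
	body["model"] = resolvedTargetModel

	fmt.Println(body) // map[model:food-review-v1 prompt:...]
}
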
pkg/epp/scheduling/plugins/filter/filter_test.go

Lines changed: 1 addition & 2 deletions
@@ -204,8 +204,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {

 	// Create a test request and pods
 	req := &types.LLMRequest{
-		Model:               testAffinityModel,
-		ResolvedTargetModel: testAffinityModel,
+		TargetModel: testAffinityModel,
 	}

 	// Test setup: One affinity pod and one available pod

pkg/epp/scheduling/plugins/filter/lora_affinity_filter.go

Lines changed: 2 additions & 2 deletions
@@ -59,8 +59,8 @@ func (f *LoraAffinityFilter) Filter(ctx *types.SchedulingContext, pods []types.P

 	// Categorize pods based on affinity and availability
 	for _, pod := range pods {
-		_, active := pod.GetMetrics().ActiveModels[ctx.Req.ResolvedTargetModel]
-		_, waiting := pod.GetMetrics().WaitingModels[ctx.Req.ResolvedTargetModel]
+		_, active := pod.GetMetrics().ActiveModels[ctx.Req.TargetModel]
+		_, waiting := pod.GetMetrics().WaitingModels[ctx.Req.TargetModel]

 		if active || waiting {
 			filtered_affinity = append(filtered_affinity, pod)

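Only the field name changes here; the categorization logic is untouched. A self-contained sketch of that check, with hypothetical metric types standing in for the real pod.GetMetrics() result:

package main

import "fmt"

// Hypothetical stand-ins for the pod metrics the filter consults; the real
// types live in the backend metrics package.
type metrics struct {
	ActiveModels  map[string]int
	WaitingModels map[string]int
}

// hasAffinity mirrors the filter's check: a pod has affinity for the target
// model if that model is already active or waiting on the pod.
func hasAffinity(m metrics, targetModel string) bool {
	_, active := m.ActiveModels[targetModel]
	_, waiting := m.WaitingModels[targetModel]
	return active || waiting
}

func main() {
	m := metrics{
		ActiveModels:  map[string]int{"test-model1": 1},
		WaitingModels: map[string]int{},
	}
	fmt.Println(hasAffinity(m, "test-model1")) // true
	fmt.Println(hasAffinity(m, "test-model2")) // false
}
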
pkg/epp/scheduling/plugins/prefix/plugin.go

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ func hashPrompt(ctx *types.SchedulingContext, cacheBlockSize int, maxPrefixBlock
 	// If the last block is smaller than cacheBlockSize, it will be ignored.
 	res := make([]BlockHash, 0, 1+len(prompt)/cacheBlockSize)
 	// Add the model to the first block hash so that different models have different hashes even with the same body.
-	res = append(res, BlockHash(xxhash.Sum64String(ctx.Req.ResolvedTargetModel)))
+	res = append(res, BlockHash(xxhash.Sum64String(ctx.Req.TargetModel)))
 	for i := 0; i+cacheBlockSize <= len(prompt); i += cacheBlockSize {
 		block := prompt[i : i+cacheBlockSize]
 		prevBlockHash := res[len(res)-1]

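The renamed line is the anchor of the whole prefix-cache scheme: seeding the first block hash with the target model guarantees two models never share cached prefixes, even for identical prompts. A minimal sketch of that idea; the chaining of each block to the previous hash is an assumption for illustration, since the combine step falls outside this hunk:

package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2"
)

// hashPromptSketch seeds the hash chain with the target model, then hashes
// fixed-size blocks, chaining each to the previous hash. Only the model
// seeding is shown in the diff; the chaining detail here is assumed.
func hashPromptSketch(targetModel, prompt string, blockSize int) []uint64 {
	res := make([]uint64, 0, 1+len(prompt)/blockSize)
	res = append(res, xxhash.Sum64String(targetModel))
	for i := 0; i+blockSize <= len(prompt); i += blockSize {
		prev := make([]byte, 8)
		binary.LittleEndian.PutUint64(prev, res[len(res)-1])
		res = append(res, xxhash.Sum64(append(prev, prompt[i:i+blockSize]...)))
	}
	return res
}

func main() {
	// Same prompt, different models: the chains diverge from the seed onward.
	fmt.Println(hashPromptSketch("test-model1", "aaaabbbb", 4))
	fmt.Println(hashPromptSketch("test-model-new", "aaaabbbb", 4))
}
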
pkg/epp/scheduling/plugins/prefix/plugin_test.go

Lines changed: 10 additions & 15 deletions
@@ -24,9 +24,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// First request.
 	req1 := &types.LLMRequest{
-		Model:               "test-model1",
-		ResolvedTargetModel: "test-model1",
-		Prompt:              "aaaaaa",
+		TargetModel: "test-model1",
+		Prompt:      "aaaaaa",
 	}
 	ctx := types.NewSchedulingContext(context.Background(), req1, pods)
 	plugin.PreSchedule(ctx)
@@ -49,9 +48,8 @@ func TestPrefixPlugin(t *testing.T) {
 	// Second request doesn't share any prefix with first one. It should be added to the cache but
 	// the pod score should be 0.
 	req2 := &types.LLMRequest{
-		Model:               "test-model2",
-		ResolvedTargetModel: "test-model2",
-		Prompt:              "bbbbbb",
+		TargetModel: "test-model2",
+		Prompt:      "bbbbbb",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req2, pods)
 	plugin.PreSchedule(ctx)
@@ -73,9 +71,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// Third request shares partial prefix with first one.
 	req3 := &types.LLMRequest{
-		Model:               "test-model1",
-		ResolvedTargetModel: "test-model1",
-		Prompt:              "aaaabbbb",
+		TargetModel: "test-model1",
+		Prompt:      "aaaabbbb",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req3, pods)
 	plugin.PreSchedule(ctx)
@@ -96,9 +93,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// 4th request is same as req3 except the model is different, still no match.
 	req4 := &types.LLMRequest{
-		Model:               "test-model-new",
-		ResolvedTargetModel: "test-model-new",
-		Prompt:              "aaaabbbb",
+		TargetModel: "test-model-new",
+		Prompt:      "aaaabbbb",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req4, pods)
 	plugin.PreSchedule(ctx)
@@ -119,9 +115,8 @@ func TestPrefixPlugin(t *testing.T) {

 	// 5th request shares partial prefix with 3rd one.
 	req5 := &types.LLMRequest{
-		Model:               "test-model1",
-		ResolvedTargetModel: "test-model1",
-		Prompt:              "aaaabbbbcccc",
+		TargetModel: "test-model1",
+		Prompt:      "aaaabbbbcccc",
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req5, pods)
 	plugin.PreSchedule(ctx)

pkg/epp/scheduling/scheduler_test.go

Lines changed: 9 additions & 13 deletions
@@ -40,19 +40,17 @@ func TestSchedule(t *testing.T) {
 		{
 			name: "no pods in datastore",
 			req: &types.LLMRequest{
-				Model:               "any-model",
-				ResolvedTargetModel: "any-model",
-				Critical:            true,
+				TargetModel: "any-model",
+				Critical:    true,
 			},
 			input: []*backendmetrics.FakePodMetrics{},
 			err:   true,
 		},
 		{
 			name: "critical request",
 			req: &types.LLMRequest{
-				Model:               "critical",
-				ResolvedTargetModel: "critical",
-				Critical:            true,
+				TargetModel: "critical",
+				Critical:    true,
 			},
 			// pod2 will be picked because it has relatively low queue size, with the requested
 			// model being active, and has low KV cache.
@@ -114,9 +112,8 @@ func TestSchedule(t *testing.T) {
 		{
 			name: "sheddable request, accepted",
 			req: &types.LLMRequest{
-				Model:               "sheddable",
-				ResolvedTargetModel: "sheddable",
-				Critical:            false,
+				TargetModel: "sheddable",
+				Critical:    false,
 			},
 			// pod1 will be picked because it has capacity for the sheddable request.
 			input: []*backendmetrics.FakePodMetrics{
@@ -177,9 +174,8 @@ func TestSchedule(t *testing.T) {
 		{
 			name: "sheddable request, dropped",
 			req: &types.LLMRequest{
-				Model:               "sheddable",
-				ResolvedTargetModel: "sheddable",
-				Critical:            false,
+				TargetModel: "sheddable",
+				Critical:    false,
 			},
 			// All pods have higher KV cache thant the threshold, so the sheddable request will be
 			// dropped.
@@ -356,7 +352,7 @@ func TestSchedulePlugins(t *testing.T) {
 	// Initialize the scheduler
 	scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, &test.config)

-	req := &types.LLMRequest{Model: "test-model"}
+	req := &types.LLMRequest{TargetModel: "test-model"}
 	got, err := scheduler.Schedule(context.Background(), req)

 	// Validate error state

pkg/epp/scheduling/types/types.go

Lines changed: 3 additions & 6 deletions
@@ -25,10 +25,8 @@ import (

 // LLMRequest is a structured representation of the fields we parse out of the LLMRequest body.
 type LLMRequest struct {
-	// Model is the name of the model that the user specified in the request body.
-	Model string
-	// ResolvedTargetModel is the final target model after traffic split.
-	ResolvedTargetModel string
+	// TargetModel is the final target model after traffic split.
+	TargetModel string
 	// Critical is a boolean that specifies if a request is critical or not.
 	Critical bool
 	// Prompt is the prompt that was sent in the request body.
@@ -38,8 +36,7 @@ type LLMRequest struct {
 }

 func (r *LLMRequest) String() string {
-	return fmt.Sprintf("Model: %s, ResolvedTargetModel: %s, Critical: %t, PromptLength: %d, Headers: %v",
-		r.Model, r.ResolvedTargetModel, r.Critical, len(r.Prompt), r.Headers)
+	return fmt.Sprintf("TargetModel: %s, Critical: %t, PromptLength: %d, Headers: %v", r.TargetModel, r.Critical, len(r.Prompt), r.Headers)
 }

 type Pod interface {

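With the Model field gone, callers construct the request from the resolved name alone. A minimal usage sketch, assuming the module path sigs.k8s.io/gateway-api-inference-extension and that Headers is a map[string]string (both taken from context, not shown in this diff):

package main

import (
	"fmt"

	// Assumed import path for kubernetes-sigs/gateway-api-inference-extension.
	types "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

func main() {
	req := &types.LLMRequest{
		TargetModel: "test-model1", // the resolved target, post traffic split
		Critical:    true,
		Prompt:      "aaaabbbb",
		Headers:     map[string]string{"x-request-id": "abc"}, // hypothetical header
	}
	// String() no longer reports a separate Model field:
	// TargetModel: test-model1, Critical: true, PromptLength: 8, Headers: ...
	fmt.Println(req)
}
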