@@ -122,10 +122,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
122
122
s .loadedMu .Unlock ()
123
123
var runner * runnerRef = nil
124
124
if len (runners ) > 0 {
125
+ var minRef = runners [0 ].refCount
125
126
for _ , r := range runners {
126
- if ! r . isAtCapacity () {
127
+ if runner . refCount <= minRef {
127
128
runner = r
128
- break
129
+ minRef = r . refCount
129
130
}
130
131
}
131
132
}
@@ -315,7 +316,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
315
316
// Complete the pending request and send the runner back to the requester
316
317
// Wires up a finished event after the request context is completed
317
318
// Updates session duration, and resets expiration timer
318
- func (pending * LlmRequest ) useLoadedRunner (runner * runnerRef , finished chan * LlmRequest ) {
319
+ func (pending * LlmRequest ) useLoadedRunner (runner * runnerRef , finished chan * runnerRef ) {
319
320
runner .refMu .Lock ()
320
321
defer runner .refMu .Unlock ()
321
322
runner .refCount ++
@@ -387,7 +388,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
387
388
go func () {
388
389
<- req .ctx .Done ()
389
390
slog .Debug ("context for request finished" )
390
- s .finishedReqCh <- req
391
+ s .finishedReqCh <- runner
391
392
}()
392
393
req .successCh <- runner
393
394
}()
@@ -465,6 +466,11 @@ type runnerRef struct {
465
466
* api.Options
466
467
}
467
468
469
+ func (r * runnerRef ) isAtCapacity () bool {
470
+ // Implement your capacity check logic here
471
+ // Return true if the runner is at capacity, false otherwise
472
+ }
473
+
468
474
// The refMu must already be held when calling unload
469
475
func (runner * runnerRef ) unload () {
470
476
if runner .expireTimer != nil {
0 commit comments