Skip to content

Commit 0c7e303

Browse files
committed
refactor(api): change runner assignment per availability score
Signed-off-by: Luka Brecic <[email protected]>
1 parent 77c7ade commit 0c7e303

File tree

18 files changed

+49
-229
lines changed

18 files changed

+49
-229
lines changed

apps/api/src/main.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import { RunnerService } from './sandbox/services/runner.service'
2121

2222
import { SandboxClass } from './sandbox/enums/sandbox-class.enum'
2323
import { getOpenApiConfig } from './openapi.config'
24-
import { SchedulerRegistry } from '@nestjs/schedule'
2524
import { EventEmitter2 } from '@nestjs/event-emitter'
2625
import { AuditInterceptor } from './audit/interceptors/audit.interceptor'
2726

@@ -114,7 +113,6 @@ async function bootstrap() {
114113
diskGiB: 50,
115114
gpu: 0,
116115
gpuType: 'none',
117-
capacity: 100,
118116
region: 'us',
119117
class: SandboxClass.SMALL,
120118
domain: 'localtest.me:3003',
apps/api/src/sandbox/constants/runner.constants.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/*
2+
* Copyright 2025 Daytona Platforms Inc.
3+
* SPDX-License-Identifier: AGPL-3.0
4+
*/
5+
6+
// Minimum (inclusive) runner availabilityScore compared with `>=` in the
// snapshot-runner loop of SandboxStartAction and used as a MoreThanOrEqual
// filter in SnapshotManager's runner query.
export const BUILD_RUNNER_SCORE_THRESHOLD = 50
7+
8+
// Runner availabilityScore cut-off used by SandboxStartAction: a sandbox with a
// completed backup looks for other runners when its current runner scores below
// this value, and it is passed as `availabilityScoreThreshold` to
// RunnerService.findAvailableRunners when a base snapshot ref is present.
export const BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD = 50

apps/api/src/sandbox/controllers/runner.controller.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ export class RunnerController {
4848
gpu: req.body?.gpu,
4949
gpuType: req.body?.gpuType,
5050
class: req.body?.class,
51-
capacity: req.body?.capacity,
5251
region: req.body?.region,
5352
}),
5453
},

apps/api/src/sandbox/dto/create-runner.dto.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,6 @@ export class CreateRunnerDto {
5252
})
5353
class: SandboxClass
5454

55-
@IsNumber()
56-
@ApiProperty()
57-
capacity: number
58-
5955
@IsString()
6056
@ApiProperty({
6157
example: 'us',

apps/api/src/sandbox/dto/runner.dto.ts

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -79,18 +79,6 @@ export class RunnerDto {
7979
@IsEnum(SandboxClass)
8080
class: SandboxClass
8181

82-
@ApiProperty({
83-
description: 'The current usage of the runner',
84-
example: 2,
85-
})
86-
used: number
87-
88-
@ApiProperty({
89-
description: 'The capacity of the runner',
90-
example: 10,
91-
})
92-
capacity: number
93-
9482
@ApiPropertyOptional({
9583
description: 'Current CPU usage percentage',
9684
example: 45.6,
@@ -199,8 +187,6 @@ export class RunnerDto {
199187
gpu: runner.gpu,
200188
gpuType: runner.gpuType,
201189
class: runner.class,
202-
used: runner.used,
203-
capacity: runner.capacity,
204190
currentCpuUsagePercentage: runner.currentCpuUsagePercentage,
205191
currentMemoryUsagePercentage: runner.currentMemoryUsagePercentage,
206192
currentDiskUsagePercentage: runner.currentDiskUsagePercentage,

apps/api/src/sandbox/entities/runner.entity.ts

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,6 @@ export class Runner {
4646
})
4747
class: SandboxClass
4848

49-
@Column({
50-
default: 0,
51-
})
52-
used: number
53-
54-
@Column()
55-
capacity: number
56-
5749
@Column({
5850
type: 'float',
5951
default: 0,

apps/api/src/sandbox/managers/sandbox-actions/sandbox-start.action.ts

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { RunnerAdapterFactory } from '../../runner-adapter/runnerAdapter'
2121
import { ToolboxService } from '../../services/toolbox.service'
2222
import { InjectRepository } from '@nestjs/typeorm'
2323
import { Snapshot } from '../../entities/snapshot.entity'
24+
import { BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD, BUILD_RUNNER_SCORE_THRESHOLD } from '../../constants/runner.constants'
2425

2526
@Injectable()
2627
export class SandboxStartAction extends SandboxAction {
@@ -113,7 +114,7 @@ export class SandboxStartAction extends SandboxAction {
113114

114115
for (const snapshotRunner of snapshotRunners) {
115116
const runner = await this.runnerService.findOne(snapshotRunner.runnerId)
116-
if (runner.used < runner.capacity) {
117+
if (runner.availabilityScore >= BUILD_RUNNER_SCORE_THRESHOLD) {
117118
if (snapshotRunner.state === SnapshotRunnerState.BUILDING_SNAPSHOT) {
118119
await this.updateSandboxState(sandbox.id, SandboxState.BUILDING_SNAPSHOT, runner.id)
119120
return SYNC_AGAIN
@@ -137,7 +138,7 @@ export class SandboxStartAction extends SandboxAction {
137138
this.buildOnRunner(sandbox.buildInfo, runnerId, sandbox.organizationId)
138139

139140
await this.updateSandboxState(sandbox.id, SandboxState.BUILDING_SNAPSHOT, runnerId)
140-
await this.runnerService.recalculateRunnerUsage(runner)
141+
141142
return SYNC_AGAIN
142143
}
143144

@@ -229,16 +230,7 @@ export class SandboxStartAction extends SandboxAction {
229230
// If the sandbox is on a runner and its backupState is COMPLETED
230231
// but there are too many running sandboxes on that runner, move it to a less used runner
231232
if (sandbox.backupState === BackupState.COMPLETED) {
232-
const usageThreshold = 35
233-
const runningSandboxsCount = await this.sandboxRepository.count({
234-
where: {
235-
runnerId: originalRunnerId,
236-
state: SandboxState.STARTED,
237-
},
238-
})
239-
if (runningSandboxsCount > usageThreshold) {
240-
// TODO: usage should be based on compute usage
241-
233+
if (runner.availabilityScore < BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD) {
242234
const availableRunners = await this.runnerService.findAvailableRunners({
243235
region: sandbox.region,
244236
sandboxClass: sandbox.class,
@@ -341,17 +333,23 @@ export class SandboxStartAction extends SandboxAction {
341333
const snapshotRef = baseSnapshot ? baseSnapshot.internalName : null
342334

343335
let availableRunners = []
344-
const runnersWithBaseSnapshot = snapshotRef
345-
? await this.runnerService.findAvailableRunners({
336+
337+
if (snapshotRef) {
338+
availableRunners = await this.runnerService.findAvailableRunners({
339+
region: sandbox.region,
340+
sandboxClass: sandbox.class,
341+
snapshotRef,
342+
availabilityScoreThreshold: BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD,
343+
})
344+
345+
if (availableRunners.length === 0) {
346+
availableRunners = await this.runnerService.findAvailableRunners({
346347
region: sandbox.region,
347348
sandboxClass: sandbox.class,
348349
snapshotRef,
349350
})
350-
: []
351-
if (runnersWithBaseSnapshot.length > 0) {
352-
availableRunners = runnersWithBaseSnapshot
351+
}
353352
} else {
354-
// if no runner has the base snapshot, get all available runners
355353
availableRunners = await this.runnerService.findAvailableRunners({
356354
region: sandbox.region,
357355
sandboxClass: sandbox.class,
@@ -364,21 +362,18 @@ export class SandboxStartAction extends SandboxAction {
364362
return DONT_SYNC_AGAIN
365363
}
366364

367-
// get random runner from available runners
368-
const randomRunnerIndex = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1) + min)
369-
const runnerId = availableRunners[randomRunnerIndex(0, availableRunners.length - 1)].id
370-
371-
const runner = await this.runnerService.findOne(runnerId)
365+
// get runner with highest availability score from available runners
366+
const runner = availableRunners[0]
372367

373368
// verify the runner is still available and ready
374-
if (!runner || runner.state !== RunnerState.READY || runner.unschedulable || runner.used >= runner.capacity) {
375-
this.logger.warn(`Selected runner ${runnerId} is no longer available, retrying sandbox assignment`)
369+
if (!runner || runner.state !== RunnerState.READY || runner.unschedulable) {
370+
this.logger.warn(`Selected runner ${runner?.id} is no longer available, retrying sandbox assignment`)
376371
return SYNC_AGAIN
377372
}
378373

379374
const runnerAdapter = await this.runnerAdapterFactory.create(runner)
380375

381-
await this.updateSandboxState(sandbox.id, SandboxState.RESTORING, runnerId)
376+
await this.updateSandboxState(sandbox.id, SandboxState.RESTORING, runner.id)
382377

383378
sandbox.snapshot = validBackup
384379
await runnerAdapter.createSandbox(sandbox, registry)

apps/api/src/sandbox/managers/snapshot.manager.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import { Injectable, Logger, NotFoundException, OnApplicationShutdown } from '@nestjs/common'
77
import { InjectRepository } from '@nestjs/typeorm'
88
import { Cron, CronExpression } from '@nestjs/schedule'
9-
import { In, IsNull, LessThan, Not, Or, Raw, Repository } from 'typeorm'
9+
import { In, LessThan, MoreThanOrEqual, Not, Repository } from 'typeorm'
1010
import { DockerRegistryService } from '../../docker-registry/services/docker-registry.service'
1111
import { Snapshot } from '../entities/snapshot.entity'
1212
import { SnapshotState } from '../enums/snapshot-state.enum'
@@ -31,6 +31,7 @@ import { RunnerAdapterFactory } from '../runner-adapter/runnerAdapter'
3131
import { TrackableJobExecutions } from '../../common/interfaces/trackable-job-executions'
3232
import { TrackJobExecution } from '../../common/decorators/track-job-execution.decorator'
3333
import { setTimeout as sleep } from 'timers/promises'
34+
import { BUILD_RUNNER_SCORE_THRESHOLD } from '../constants/runner.constants'
3435

3536
@Injectable()
3637
export class SnapshotManager implements TrackableJobExecutions, OnApplicationShutdown {
@@ -702,7 +703,7 @@ export class SnapshotManager implements TrackableJobExecutions, OnApplicationShu
702703
where: {
703704
state: RunnerState.READY,
704705
unschedulable: Not(true),
705-
used: Raw((alias) => `${alias} < capacity`),
706+
availabilityScore: MoreThanOrEqual(BUILD_RUNNER_SCORE_THRESHOLD),
706707
},
707708
})
708709
// Propagate snapshot to one runner so it can be used immediately

apps/api/src/sandbox/services/runner.service.ts

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import { Injectable, Logger, NotFoundException } from '@nestjs/common'
77
import { InjectRepository } from '@nestjs/typeorm'
88
import { Cron, CronExpression } from '@nestjs/schedule'
9-
import { FindOptionsWhere, In, Not, Raw, Repository } from 'typeorm'
9+
import { FindOptionsWhere, In, MoreThanOrEqual, Not, Repository } from 'typeorm'
1010
import { Runner } from '../entities/runner.entity'
1111
import { CreateRunnerDto } from '../dto/create-runner.dto'
1212
import { SandboxClass } from '../enums/sandbox-class.enum'
@@ -60,8 +60,6 @@ export class RunnerService {
6060
runner.diskGiB = createRunnerDto.diskGiB
6161
runner.gpu = createRunnerDto.gpu
6262
runner.gpuType = createRunnerDto.gpuType
63-
runner.used = 0
64-
runner.capacity = createRunnerDto.capacity
6563
runner.region = createRunnerDto.region
6664
runner.class = createRunnerDto.class
6765
runner.version = createRunnerDto.version
@@ -111,7 +109,6 @@ export class RunnerService {
111109
const runnerFilter: FindOptionsWhere<Runner> = {
112110
state: RunnerState.READY,
113111
unschedulable: Not(true),
114-
used: Raw((alias) => `${alias} < capacity`),
115112
}
116113

117114
if (params.snapshotRef !== undefined) {
@@ -145,11 +142,15 @@ export class RunnerService {
145142
runnerFilter.class = params.sandboxClass
146143
}
147144

145+
if (params.availabilityScoreThreshold !== undefined) {
146+
runnerFilter.availabilityScore = MoreThanOrEqual(params.availabilityScoreThreshold)
147+
}
148+
148149
const runners = await this.runnerRepository.find({
149150
where: runnerFilter,
150151
})
151152

152-
return runners.sort((a, b) => a.used / a.capacity - b.used / b.capacity).slice(0, 10)
153+
return runners.sort((a, b) => b.availabilityScore - a.availabilityScore).slice(0, 10)
153154
}
154155

155156
async remove(id: string): Promise<void> {
@@ -161,13 +162,6 @@ export class RunnerService {
161162
if (![SandboxState.DESTROYED, SandboxState.CREATING, SandboxState.ARCHIVED].includes(event.newState)) {
162163
return
163164
}
164-
165-
const runner = await this.runnerRepository.findOne({ where: { id: event.sandbox.runnerId } })
166-
if (!runner) {
167-
throw new Error('Runner not found, cannot recalculate usage')
168-
}
169-
170-
await this.recalculateRunnerUsage(runner)
171165
}
172166

173167
private async updateRunnerState(runnerId: string, newState: RunnerState): Promise<void> {
@@ -230,7 +224,6 @@ export class RunnerService {
230224
}
231225

232226
await this.updateRunnerStatus(runner.id, runnerInfo)
233-
await this.recalculateRunnerUsage(runner)
234227
})(),
235228
new Promise((_, reject) => {
236229
timeoutId = setTimeout(() => {
@@ -305,7 +298,6 @@ export class RunnerService {
305298
allocatedCpu: updateData.currentAllocatedCpu,
306299
allocatedMemoryGiB: updateData.currentAllocatedMemoryGiB,
307300
allocatedDiskGiB: updateData.currentAllocatedDiskGiB,
308-
capacity: runner.capacity,
309301
runnerCpu: runner.cpu,
310302
runnerMemoryGiB: runner.memoryGiB,
311303
runnerDiskGiB: runner.diskGiB,
@@ -317,19 +309,6 @@ export class RunnerService {
317309
await this.runnerRepository.update(runnerId, updateData)
318310
}
319311

320-
async recalculateRunnerUsage(runner: Runner) {
321-
const sandboxes = await this.sandboxRepository.find({
322-
where: {
323-
runnerId: runner.id,
324-
state: Not(SandboxState.DESTROYED),
325-
},
326-
})
327-
328-
await this.runnerRepository.update(runner.id, {
329-
used: sandboxes.length,
330-
})
331-
}
332-
333312
private isValidClass(sandboxClass: SandboxClass): boolean {
334313
return Object.values(SandboxClass).includes(sandboxClass)
335314
}
@@ -347,8 +326,6 @@ export class RunnerService {
347326
async getRandomAvailableRunner(params: GetRunnerParams): Promise<Runner> {
348327
const availableRunners = await this.findAvailableRunners(params)
349328

350-
// TODO: implement a better algorithm to get a random available runner based on the runner's usage
351-
352329
if (availableRunners.length === 0) {
353330
throw new BadRequestError('No available runners')
354331
}
@@ -500,6 +477,7 @@ export class GetRunnerParams {
500477
sandboxClass?: SandboxClass
501478
snapshotRef?: string
502479
excludedRunnerIds?: string[]
480+
availabilityScoreThreshold?: number
503481
}
504482

505483
interface AvailabilityScoreParams {
@@ -509,7 +487,6 @@ interface AvailabilityScoreParams {
509487
allocatedCpu: number
510488
allocatedMemoryGiB: number
511489
allocatedDiskGiB: number
512-
capacity: number
513490
runnerCpu: number
514491
runnerMemoryGiB: number
515492
runnerDiskGiB: number

0 commit comments

Comments
 (0)