Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions apps/api/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import { RunnerService } from './sandbox/services/runner.service'

import { SandboxClass } from './sandbox/enums/sandbox-class.enum'
import { getOpenApiConfig } from './openapi.config'
import { SchedulerRegistry } from '@nestjs/schedule'
import { EventEmitter2 } from '@nestjs/event-emitter'
import { AuditInterceptor } from './audit/interceptors/audit.interceptor'

Expand Down Expand Up @@ -114,7 +113,6 @@ async function bootstrap() {
diskGiB: 50,
gpu: 0,
gpuType: 'none',
capacity: 100,
region: 'us',
class: SandboxClass.SMALL,
domain: 'localtest.me:3003',
Expand Down
8 changes: 8 additions & 0 deletions apps/api/src/sandbox/constants/runner.constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/*
* Copyright 2025 Daytona Platforms Inc.
* SPDX-License-Identifier: AGPL-3.0
*/

// Minimum runner availabilityScore (inclusive) used when picking runners for
// snapshot builds: callers compare `runner.availabilityScore >= BUILD_RUNNER_SCORE_THRESHOLD`
// or filter with MoreThanOrEqual(...).
// NOTE(review): the score scale is not visible here — presumably 0-100; confirm.
export const BUILD_RUNNER_SCORE_THRESHOLD = 50

// Minimum runner availabilityScore (inclusive) used for sandbox placement:
// a runner scoring below this is treated as a candidate to move the sandbox away from,
// and it is passed as `availabilityScoreThreshold` when searching for runners
// that already hold the base snapshot.
export const BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD = 50
1 change: 0 additions & 1 deletion apps/api/src/sandbox/controllers/runner.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ export class RunnerController {
gpu: req.body?.gpu,
gpuType: req.body?.gpuType,
class: req.body?.class,
capacity: req.body?.capacity,
region: req.body?.region,
}),
},
Expand Down
4 changes: 0 additions & 4 deletions apps/api/src/sandbox/dto/create-runner.dto.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,6 @@ export class CreateRunnerDto {
})
class: SandboxClass

@IsNumber()
@ApiProperty()
capacity: number

@IsString()
@ApiProperty({
example: 'us',
Expand Down
14 changes: 0 additions & 14 deletions apps/api/src/sandbox/dto/runner.dto.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,6 @@ export class RunnerDto {
@IsEnum(SandboxClass)
class: SandboxClass

@ApiProperty({
description: 'The current usage of the runner',
example: 2,
})
used: number

@ApiProperty({
description: 'The capacity of the runner',
example: 10,
})
capacity: number

@ApiPropertyOptional({
description: 'Current CPU usage percentage',
example: 45.6,
Expand Down Expand Up @@ -199,8 +187,6 @@ export class RunnerDto {
gpu: runner.gpu,
gpuType: runner.gpuType,
class: runner.class,
used: runner.used,
capacity: runner.capacity,
currentCpuUsagePercentage: runner.currentCpuUsagePercentage,
currentMemoryUsagePercentage: runner.currentMemoryUsagePercentage,
currentDiskUsagePercentage: runner.currentDiskUsagePercentage,
Expand Down
8 changes: 0 additions & 8 deletions apps/api/src/sandbox/entities/runner.entity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,6 @@ export class Runner {
})
class: SandboxClass

@Column({
default: 0,
})
used: number

@Column()
capacity: number

@Column({
type: 'float',
default: 0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import { RunnerAdapterFactory } from '../../runner-adapter/runnerAdapter'
import { ToolboxService } from '../../services/toolbox.service'
import { InjectRepository } from '@nestjs/typeorm'
import { Snapshot } from '../../entities/snapshot.entity'
import { BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD, BUILD_RUNNER_SCORE_THRESHOLD } from '../../constants/runner.constants'

@Injectable()
export class SandboxStartAction extends SandboxAction {
Expand Down Expand Up @@ -113,7 +114,7 @@ export class SandboxStartAction extends SandboxAction {

for (const snapshotRunner of snapshotRunners) {
const runner = await this.runnerService.findOne(snapshotRunner.runnerId)
if (runner.used < runner.capacity) {
if (runner.availabilityScore >= BUILD_RUNNER_SCORE_THRESHOLD) {
if (snapshotRunner.state === SnapshotRunnerState.BUILDING_SNAPSHOT) {
await this.updateSandboxState(sandbox.id, SandboxState.BUILDING_SNAPSHOT, runner.id)
return SYNC_AGAIN
Expand All @@ -137,7 +138,7 @@ export class SandboxStartAction extends SandboxAction {
this.buildOnRunner(sandbox.buildInfo, runnerId, sandbox.organizationId)

await this.updateSandboxState(sandbox.id, SandboxState.BUILDING_SNAPSHOT, runnerId)
await this.runnerService.recalculateRunnerUsage(runner)

return SYNC_AGAIN
}

Expand Down Expand Up @@ -229,16 +230,7 @@ export class SandboxStartAction extends SandboxAction {
// If the sandbox is on a runner and its backupState is COMPLETED
// but there are too many running sandboxes on that runner, move it to a less used runner
if (sandbox.backupState === BackupState.COMPLETED) {
const usageThreshold = 35
const runningSandboxsCount = await this.sandboxRepository.count({
where: {
runnerId: originalRunnerId,
state: SandboxState.STARTED,
},
})
if (runningSandboxsCount > usageThreshold) {
// TODO: usage should be based on compute usage

if (runner.availabilityScore < BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD) {
const availableRunners = await this.runnerService.findAvailableRunners({
region: sandbox.region,
sandboxClass: sandbox.class,
Expand Down Expand Up @@ -341,17 +333,23 @@ export class SandboxStartAction extends SandboxAction {
const snapshotRef = baseSnapshot ? baseSnapshot.internalName : null

let availableRunners = []
const runnersWithBaseSnapshot = snapshotRef
? await this.runnerService.findAvailableRunners({

if (snapshotRef) {
availableRunners = await this.runnerService.findAvailableRunners({
region: sandbox.region,
sandboxClass: sandbox.class,
snapshotRef,
availabilityScoreThreshold: BASE_SNAPSHOT_RUNNER_SCORE_THRESHOLD,
})

if (availableRunners.length === 0) {
availableRunners = await this.runnerService.findAvailableRunners({
region: sandbox.region,
sandboxClass: sandbox.class,
snapshotRef,
})
: []
if (runnersWithBaseSnapshot.length > 0) {
availableRunners = runnersWithBaseSnapshot
}
} else {
// if no runner has the base snapshot, get all available runners
availableRunners = await this.runnerService.findAvailableRunners({
region: sandbox.region,
sandboxClass: sandbox.class,
Expand All @@ -364,21 +362,18 @@ export class SandboxStartAction extends SandboxAction {
return DONT_SYNC_AGAIN
}

// get random runner from available runners
const randomRunnerIndex = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1) + min)
const runnerId = availableRunners[randomRunnerIndex(0, availableRunners.length - 1)].id

const runner = await this.runnerService.findOne(runnerId)
// get runner with highest availability score from available runners
const runner = availableRunners[0]

// verify the runner is still available and ready
if (!runner || runner.state !== RunnerState.READY || runner.unschedulable || runner.used >= runner.capacity) {
this.logger.warn(`Selected runner ${runnerId} is no longer available, retrying sandbox assignment`)
// Re-validate the selected runner before using it: it must exist, be READY and be schedulable.
if (!runner || runner.state !== RunnerState.READY || runner.unschedulable) {
  // The entity property is `id` (not `Id`); optional chaining keeps the log call
  // from throwing when this branch is entered because `runner` itself is missing.
  this.logger.warn(`Selected runner ${runner?.id} is no longer available, retrying sandbox assignment`)
  return SYNC_AGAIN
}

const runnerAdapter = await this.runnerAdapterFactory.create(runner)

await this.updateSandboxState(sandbox.id, SandboxState.RESTORING, runnerId)
// Use the runner's `id` property (`runner.Id` is undefined) so the sandbox is bound to the chosen runner.
await this.updateSandboxState(sandbox.id, SandboxState.RESTORING, runner.id)

sandbox.snapshot = validBackup
await runnerAdapter.createSandbox(sandbox, registry)
Expand Down
5 changes: 3 additions & 2 deletions apps/api/src/sandbox/managers/snapshot.manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import { Injectable, Logger, NotFoundException, OnApplicationShutdown } from '@nestjs/common'
import { InjectRepository } from '@nestjs/typeorm'
import { Cron, CronExpression } from '@nestjs/schedule'
import { In, IsNull, LessThan, Not, Or, Raw, Repository } from 'typeorm'
import { In, LessThan, MoreThanOrEqual, Not, Repository } from 'typeorm'
import { DockerRegistryService } from '../../docker-registry/services/docker-registry.service'
import { Snapshot } from '../entities/snapshot.entity'
import { SnapshotState } from '../enums/snapshot-state.enum'
Expand All @@ -31,6 +31,7 @@ import { RunnerAdapterFactory } from '../runner-adapter/runnerAdapter'
import { TrackableJobExecutions } from '../../common/interfaces/trackable-job-executions'
import { TrackJobExecution } from '../../common/decorators/track-job-execution.decorator'
import { setTimeout as sleep } from 'timers/promises'
import { BUILD_RUNNER_SCORE_THRESHOLD } from '../constants/runner.constants'

@Injectable()
export class SnapshotManager implements TrackableJobExecutions, OnApplicationShutdown {
Expand Down Expand Up @@ -702,7 +703,7 @@ export class SnapshotManager implements TrackableJobExecutions, OnApplicationShu
where: {
state: RunnerState.READY,
unschedulable: Not(true),
used: Raw((alias) => `${alias} < capacity`),
availabilityScore: MoreThanOrEqual(BUILD_RUNNER_SCORE_THRESHOLD),
},
})
// Propagate snapshot to one runner so it can be used immediately
Expand Down
37 changes: 7 additions & 30 deletions apps/api/src/sandbox/services/runner.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import { Injectable, Logger, NotFoundException } from '@nestjs/common'
import { InjectRepository } from '@nestjs/typeorm'
import { Cron, CronExpression } from '@nestjs/schedule'
import { FindOptionsWhere, In, Not, Raw, Repository } from 'typeorm'
import { FindOptionsWhere, In, MoreThanOrEqual, Not, Repository } from 'typeorm'
import { Runner } from '../entities/runner.entity'
import { CreateRunnerDto } from '../dto/create-runner.dto'
import { SandboxClass } from '../enums/sandbox-class.enum'
Expand Down Expand Up @@ -60,8 +60,6 @@ export class RunnerService {
runner.diskGiB = createRunnerDto.diskGiB
runner.gpu = createRunnerDto.gpu
runner.gpuType = createRunnerDto.gpuType
runner.used = 0
runner.capacity = createRunnerDto.capacity
runner.region = createRunnerDto.region
runner.class = createRunnerDto.class
runner.version = createRunnerDto.version
Expand Down Expand Up @@ -111,7 +109,6 @@ export class RunnerService {
const runnerFilter: FindOptionsWhere<Runner> = {
state: RunnerState.READY,
unschedulable: Not(true),
used: Raw((alias) => `${alias} < capacity`),
}

if (params.snapshotRef !== undefined) {
Expand Down Expand Up @@ -145,11 +142,15 @@ export class RunnerService {
runnerFilter.class = params.sandboxClass
}

if (params.availabilityScoreThreshold !== undefined) {
runnerFilter.availabilityScore = MoreThanOrEqual(params.availabilityScoreThreshold)
}

const runners = await this.runnerRepository.find({
where: runnerFilter,
})

return runners.sort((a, b) => a.used / a.capacity - b.used / b.capacity).slice(0, 10)
return runners.sort((a, b) => b.availabilityScore - a.availabilityScore).slice(0, 10)
}

async remove(id: string): Promise<void> {
Expand All @@ -161,13 +162,6 @@ export class RunnerService {
if (![SandboxState.DESTROYED, SandboxState.CREATING, SandboxState.ARCHIVED].includes(event.newState)) {
return
}

const runner = await this.runnerRepository.findOne({ where: { id: event.sandbox.runnerId } })
if (!runner) {
throw new Error('Runner not found, cannot recalculate usage')
}

await this.recalculateRunnerUsage(runner)
}

private async updateRunnerState(runnerId: string, newState: RunnerState): Promise<void> {
Expand Down Expand Up @@ -230,7 +224,6 @@ export class RunnerService {
}

await this.updateRunnerStatus(runner.id, runnerInfo)
await this.recalculateRunnerUsage(runner)
})(),
new Promise((_, reject) => {
timeoutId = setTimeout(() => {
Expand Down Expand Up @@ -305,7 +298,6 @@ export class RunnerService {
allocatedCpu: updateData.currentAllocatedCpu,
allocatedMemoryGiB: updateData.currentAllocatedMemoryGiB,
allocatedDiskGiB: updateData.currentAllocatedDiskGiB,
capacity: runner.capacity,
runnerCpu: runner.cpu,
runnerMemoryGiB: runner.memoryGiB,
runnerDiskGiB: runner.diskGiB,
Expand All @@ -317,19 +309,6 @@ export class RunnerService {
await this.runnerRepository.update(runnerId, updateData)
}

async recalculateRunnerUsage(runner: Runner) {
const sandboxes = await this.sandboxRepository.find({
where: {
runnerId: runner.id,
state: Not(SandboxState.DESTROYED),
},
})

await this.runnerRepository.update(runner.id, {
used: sandboxes.length,
})
}

private isValidClass(sandboxClass: SandboxClass): boolean {
return Object.values(SandboxClass).includes(sandboxClass)
}
Expand All @@ -347,8 +326,6 @@ export class RunnerService {
async getRandomAvailableRunner(params: GetRunnerParams): Promise<Runner> {
const availableRunners = await this.findAvailableRunners(params)

// TODO: implement a better algorithm to get a random available runner based on the runner's usage

if (availableRunners.length === 0) {
throw new BadRequestError('No available runners')
}
Expand Down Expand Up @@ -500,6 +477,7 @@ export class GetRunnerParams {
sandboxClass?: SandboxClass
snapshotRef?: string
excludedRunnerIds?: string[]
availabilityScoreThreshold?: number
}

interface AvailabilityScoreParams {
Expand All @@ -509,7 +487,6 @@ interface AvailabilityScoreParams {
allocatedCpu: number
allocatedMemoryGiB: number
allocatedDiskGiB: number
capacity: number
runnerCpu: number
runnerMemoryGiB: number
runnerDiskGiB: number
Expand Down
Loading
Loading