Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 35 additions & 49 deletions clients/macos/vellum-assistant/App/AppDelegate+Bootstrap.swift
Original file line number Diff line number Diff line change
Expand Up @@ -232,56 +232,39 @@ extension AppDelegate {
/// to re-provision from scratch. Intended as a recovery primitive for
/// stale/invalid credentials (see `GatewayConnectionManager.attemptRePair()`).
///
/// In Docker/cloud hatches, the CLI-persisted `guardian-token.json` on
/// disk can still contain the same revoked token that produced the auth
/// failures. `performInitialBootstrap()` imports that file ahead of any
/// HTTP path, so without deleting it we would silently re-arm the bad
/// credential and the re-pair would "succeed" only to fall right back
/// into 401s. Delete the file for remote hatches (Docker/cloud/managed),
/// where the CLI or launcher will re-provision and rewrite the file,
/// so the bootstrap is forced down a genuine reprovision path.
/// Recovery must invalidate every guardian-token file the bootstrap path
/// might re-import — including copies in sibling-env config dirs that the
/// CLI's `seedGuardianTokenFromSiblingEnv` would otherwise restore on the
/// next `vellum wake`. Refresh-window expiry is a clock check, not a
/// server-validity check, so a server-revoked token whose refresh hasn't
/// elapsed will silently re-arm itself if any sibling-env copy survives.
///
/// For local/bare-metal assistants the on-disk token is the only
/// recovery artifact — nothing is guaranteed to rewrite it — so preserve
/// it. Bootstrap will still skip the (now-wiped) in-memory credentials
/// and re-import from the file, which is the intended local fallback.
/// For local/bare-metal hatches we additionally clear the guardian-init
/// lockfile so `/v1/guardian/init` can be called again — the lockfile is
/// one-time-use after first successful hatch on bare-metal.
///
/// If the lockfile entry can't be resolved (malformed/legacy lockfile,
/// missing entry), the hatch classification is ambiguous. Default to
/// treating it as remote — i.e. delete the file — so we match pre-PR
/// behavior and never silently re-arm a stale token on a misclassified
/// remote hatch.
///
/// If the delete is attempted but fails (e.g. filesystem permissions),
/// the stale file would otherwise be re-imported by
/// `performInitialBootstrap()`, defeating the fix. In that case pass
/// `skipFileImport: true` so the bootstrap ignores the file entirely
/// and drives the HTTP reprovision path.
/// We always pass `skipFileImport: true` so recovery is driven exclusively
/// by the HTTP path; `bootstrapActorToken` will surface its own errors if
/// the gateway is unreachable.
func forceReBootstrap() async {
log.info("forceReBootstrap: clearing stored credentials and re-running bootstrap")
ActorTokenManager.deleteAllCredentials()

var skipFileImport = false
if let assistantId = LockfileAssistant.loadActiveAssistantId() {
let removed = GuardianTokenFileReader.deleteTokenFileAcrossAllEnvs(
assistantId: assistantId
)
log.info("forceReBootstrap: removed \(removed, privacy: .public) stale guardian-token file(s) across env dirs")

// When the lockfile entry can't be resolved, default to treating
// the hatch as remote (skip lock reset) — matches the prior
// ambiguous-state convention.
let assistant = LockfileAssistant.loadByName(assistantId)
// When the lockfile entry can't be resolved (malformed/legacy
// lockfile, missing entry, read failure) default to treating the
// hatch as remote. Pre-PR behavior unconditionally deleted the
// token file; defaulting ambiguous state to "remote" preserves
// that safety so we never silently re-arm a stale token on a
// remote hatch we failed to classify.
let isRemoteHatch = assistant?.isRemote ?? true
if assistant == nil {
log.warning("forceReBootstrap: could not resolve lockfile entry for active assistant — treating as remote hatch to preserve delete-on-rebootstrap safety")
log.warning("forceReBootstrap: could not resolve lockfile entry for active assistant — treating as remote hatch")
}
if isRemoteHatch {
let deleted = GuardianTokenFileReader.deleteTokenFile(assistantId: assistantId)
if !deleted {
log.warning("forceReBootstrap: failed to delete stale guardian token file — skipping file import to avoid re-arming the revoked token")
skipFileImport = true
}
} else {
log.info("forceReBootstrap: local/bare-metal hatch — preserving guardian token file as the only recovery artifact")
if !isRemoteHatch {
// Clear the guardian-init lock so /v1/guardian/init can succeed
// again. Without this, the HTTP fallback in performInitialBootstrap
// is permanently 403'd on bare-metal after the first hatch.
Expand All @@ -291,7 +274,7 @@ extension AppDelegate {
}
}
}
await performInitialBootstrap(skipFileImport: skipFileImport)
await performInitialBootstrap(skipFileImport: true)
}

/// Performs the initial actor token bootstrap, reactively waiting for a
Expand All @@ -303,10 +286,9 @@ extension AppDelegate {
/// imports it directly and skips the HTTP bootstrap entirely.
///
/// `skipFileImport`: when `true`, bypass the guardian-token.json import
/// entirely and jump straight to the HTTP fallback. Used by
/// `forceReBootstrap()` when a stale file could not be deleted — leaving
/// the import path enabled in that case would silently re-arm the revoked
/// token the re-bootstrap is meant to discard.
/// entirely and jump straight to the HTTP fallback. `forceReBootstrap()`
/// always passes `true` so recovery is driven exclusively by the HTTP
/// path.
func performInitialBootstrap(skipFileImport: Bool = false) async {
guard let assistantId = LockfileAssistant.loadActiveAssistantId() else { return }

Expand Down Expand Up @@ -390,12 +372,16 @@ extension AppDelegate {
}
}

// HTTP /v1/guardian/init does NOT gate on `connectionManager.isConnected`.
// `isConnected` flips true only after a 200 health-check response,
// which requires auth — and the whole point of this loop is to mint
// the credential that the health check would use. Gating here would
// deadlock initial bootstrap whenever an existing keychain token has
// been wiped (re-pair / refresh-token revoked / first-launch-after-
// sibling-env cleanup). `bootstrapActorToken` is itself an HTTP POST
// — it will surface gateway-down errors via its return value, and the
// retry loop handles transient failures.
while !Task.isCancelled {
if !connectionManager.isConnected {
await awaitConnectionEstablished()
guard !Task.isCancelled else { return }
}

let success = await GuardianClient().bootstrapActorToken(
platform: "macos",
deviceId: deviceId
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ final class GuardianTokenFileReaderTests: XCTestCase {
let path = tmpDir.appendingPathComponent("guardian-token.json").path
let body: [String: Any] = [
"guardianPrincipalId": "vellum-principal-test",
"accessToken": "header.payload.signature",
"accessToken": "stub.access.value",
"accessTokenExpiresAt": accessExpiresMs,
"refreshToken": "refresh-test-token",
"refreshTokenExpiresAt": refreshExpiresMs,
Expand All @@ -60,7 +60,7 @@ final class GuardianTokenFileReaderTests: XCTestCase {
)
let decision = GuardianTokenFileReader.decideImport(fromPath: path, nowMs: nowMs)
if case .importValid(let creds) = decision {
XCTAssertEqual(creds.accessToken, "header.payload.signature")
XCTAssertEqual(creds.accessToken, "stub.access.value")
XCTAssertEqual(creds.refreshToken, "refresh-test-token")
} else {
XCTFail("Expected .importValid, got \(decision)")
Expand Down Expand Up @@ -137,4 +137,89 @@ final class GuardianTokenFileReaderTests: XCTestCase {
let decision = GuardianTokenFileReader.decideImport(fromPath: path, nowMs: 1_000_000_000)
XCTAssertEqual(decision, .skipUnparseableJson)
}

// MARK: - deleteTokenFileAcrossAllEnvs

/// Per-env `VellumPaths` rooted at this test's tmp dir so the sweep
/// touches only the test sandbox, never the user's real `~/.config/`.
private func makeEnvPaths() -> [VellumPaths] {
VellumPaths.allEnvs(
homeDirectory: tmpDir,
xdgConfigHome: tmpDir.appendingPathComponent(".config")
)
}

private func plantTokenFile(at paths: VellumPaths, assistantId: String) {
let dir = paths.configDir
.appendingPathComponent("assistants")
.appendingPathComponent(assistantId)
try! FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
try! Data("{}".utf8).write(to: dir.appendingPathComponent("guardian-token.json"))
}

private func tokenFilePath(at paths: VellumPaths, assistantId: String) -> String {
paths.configDir
.appendingPathComponent("assistants")
.appendingPathComponent(assistantId)
.appendingPathComponent("guardian-token.json")
.path
}

/// Recovery flows must sweep guardian-token.json from every env config
/// dir, not just the active one. Otherwise `seedGuardianTokenFromSiblingEnv`
/// (CLI side) restores a server-revoked token from a sibling env on the
/// next `vellum wake`, silently undoing the re-pair.
func testDeleteAcrossAllEnvsRemovesSiblingCopies() {
let assistantId = "vellum-test-pike-abc123"
let envPaths = makeEnvPaths()
let plantedEnvs: [VellumEnvironment] = [.production, .dev, .local]
for paths in envPaths where plantedEnvs.contains(paths.environment) {
plantTokenFile(at: paths, assistantId: assistantId)
}

let removed = GuardianTokenFileReader.deleteTokenFileAcrossAllEnvs(
assistantId: assistantId,
envPaths: envPaths
)

XCTAssertEqual(removed, plantedEnvs.count, "Should report removing exactly the planted files")
for paths in envPaths {
XCTAssertFalse(
FileManager.default.fileExists(atPath: tokenFilePath(at: paths, assistantId: assistantId)),
"Token file should be gone in env \(paths.environment.rawValue)"
)
}
}

func testDeleteAcrossAllEnvsHandlesMissingFiles() {
let removed = GuardianTokenFileReader.deleteTokenFileAcrossAllEnvs(
assistantId: "vellum-no-files-here",
envPaths: makeEnvPaths()
)
XCTAssertEqual(removed, 0)
}

func testDeleteAcrossAllEnvsLeavesOtherAssistantsAlone() {
let target = "vellum-target-pike"
let bystander = "vellum-bystander-deer"
let envPaths = makeEnvPaths()
let plantedEnvs: [VellumEnvironment] = [.production, .dev]
for paths in envPaths where plantedEnvs.contains(paths.environment) {
plantTokenFile(at: paths, assistantId: target)
plantTokenFile(at: paths, assistantId: bystander)
}

let removed = GuardianTokenFileReader.deleteTokenFileAcrossAllEnvs(
assistantId: target,
envPaths: envPaths
)

XCTAssertEqual(removed, plantedEnvs.count, "Should remove only the target assistant's files")
for paths in envPaths where plantedEnvs.contains(paths.environment) {
XCTAssertTrue(
FileManager.default.fileExists(atPath: tokenFilePath(at: paths, assistantId: bystander)),
"Bystander assistant's token must not be touched in env \(paths.environment.rawValue)"
)
}
}
}
37 changes: 34 additions & 3 deletions clients/shared/App/Auth/GuardianTokenFileReader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ public enum GuardianTokenFileReader {
/// treated as success.
@discardableResult
public static func deleteTokenFile(assistantId: String) -> Bool {
let path = guardianTokenPath(for: assistantId)
let path = guardianTokenPath(for: assistantId, paths: VellumPaths.current)
guard FileManager.default.fileExists(atPath: path) else { return true }
do {
try FileManager.default.removeItem(atPath: path)
Expand All @@ -206,12 +206,43 @@ public enum GuardianTokenFileReader {
}
}
Comment thread
siddseethepalli marked this conversation as resolved.

/// Deletes the guardian-token file for the given assistant across every
/// `VellumEnvironment`'s config dir.
///
/// Recovery flows must invalidate every copy that the CLI's
/// `seedGuardianTokenFromSiblingEnv` could re-import on next launch. A
/// server-revoked token whose refresh window is still open is otherwise
/// silently restored from a sibling env, defeating the re-bootstrap.
///
/// - Returns: number of files actually removed (missing files are not
/// counted; deletion failures are logged but do not abort the sweep).
@discardableResult
public static func deleteTokenFileAcrossAllEnvs(
assistantId: String,
envPaths: [VellumPaths]? = nil
) -> Int {
let allEnvPaths = envPaths ?? VellumPaths.allEnvs()
var deleted = 0
for paths in allEnvPaths {
let path = guardianTokenPath(for: assistantId, paths: paths)
guard FileManager.default.fileExists(atPath: path) else { continue }
do {
try FileManager.default.removeItem(atPath: path)
log.info("Deleted guardian token file at \(path, privacy: .public)")
deleted += 1
} catch {
log.warning("Failed to delete guardian token file at \(path, privacy: .public): \(error.localizedDescription, privacy: .public)")
}
}
return deleted
}

// MARK: - Path Resolution

/// Resolves `$XDG_CONFIG_HOME/vellum{-env}/assistants/<id>/guardian-token.json`,
/// matching the CLI's `getGuardianTokenPath()`.
private static func guardianTokenPath(for assistantId: String) -> String {
return VellumPaths.current.configDir
private static func guardianTokenPath(for assistantId: String, paths: VellumPaths) -> String {
Comment thread
siddseethepalli marked this conversation as resolved.
return paths.configDir
.appendingPathComponent("assistants")
.appendingPathComponent(assistantId)
.appendingPathComponent("guardian-token.json")
Expand Down
2 changes: 1 addition & 1 deletion clients/shared/App/VellumEnvironment.swift
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import Foundation
/// Values: `local`, `dev`, `test`, `staging`, `production`.
/// Falls back to `.production` when the variable is unset (e.g. in unit
/// tests or when launched outside the normal build pipeline).
public enum VellumEnvironment: String {
public enum VellumEnvironment: String, CaseIterable {
case local
case dev
case test
Expand Down
16 changes: 16 additions & 0 deletions clients/shared/Utilities/VellumPaths.swift
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,22 @@ public struct VellumPaths {
self.xdgConfigHome = xdgConfigHome
}

/// One `VellumPaths` per `VellumEnvironment`, all sharing the same
/// home + XDG roots. Used by recovery flows that need to operate on
/// every env's config dir (e.g. sweeping stale tokens across siblings).
public static func allEnvs(
homeDirectory: URL = current.homeDirectory,
xdgConfigHome: URL = current.xdgConfigHome
) -> [VellumPaths] {
VellumEnvironment.allCases.map { env in
VellumPaths(
environment: env,
homeDirectory: homeDirectory,
xdgConfigHome: xdgConfigHome
)
}
}

// MARK: - Path getters

/// `~/.config/vellum/` for production, `~/.config/vellum-<env>/` otherwise.
Expand Down