diff --git a/apps/desktop/src/main/host-service/index.ts b/apps/desktop/src/main/host-service/index.ts index 0a757b2b55a..b4b8983b206 100644 --- a/apps/desktop/src/main/host-service/index.ts +++ b/apps/desktop/src/main/host-service/index.ts @@ -8,6 +8,7 @@ import { serve } from "@hono/node-server"; import { createApp, + installProcessSafetyNet, JwtApiAuthProvider, LocalGitCredentialProvider, LocalModelProvider, @@ -53,6 +54,10 @@ async function main(): Promise { const server = serve( { fetch: app.fetch, port: env.HOST_SERVICE_PORT, hostname: "127.0.0.1" }, (info: { port: number }) => { + // Install only after the server is listening so startup throws still + // reach `main().catch(...)` and exit with a non-zero code. + installProcessSafetyNet(); + if (env.ORGANIZATION_ID) { try { writeManifest({ diff --git a/apps/desktop/src/renderer/hooks/host-service/useFileTree/useFileTree.ts b/apps/desktop/src/renderer/hooks/host-service/useFileTree/useFileTree.ts index 1d07dd299d8..9032188bf20 100644 --- a/apps/desktop/src/renderer/hooks/host-service/useFileTree/useFileTree.ts +++ b/apps/desktop/src/renderer/hooks/host-service/useFileTree/useFileTree.ts @@ -40,6 +40,43 @@ interface FileTreeState { loadingDirectories: Set; } +interface LoadDirectoryOptions { + force?: boolean; +} + +function applyDirectoryEntries( + current: FileTreeState, + absolutePath: string, + entries: FsEntry[], +): FileTreeState { + const nextEntries = new Map(current.entriesByPath); + const nextChildren = new Map(current.childPathsByDirectory); + const nextLoaded = new Set(current.loadedDirectories); + const nextInvalidated = new Set(current.invalidatedDirectories); + const nextLoading = new Set(current.loadingDirectories); + nextLoading.delete(absolutePath); + nextLoaded.add(absolutePath); + nextInvalidated.delete(absolutePath); + + for (const entry of entries) { + nextEntries.set(entry.absolutePath, entry); + } + + nextChildren.set( + absolutePath, + entries.map((entry) => entry.absolutePath), + ); + + return { + ...current, + childPathsByDirectory: nextChildren, + entriesByPath: nextEntries, + invalidatedDirectories: nextInvalidated, + loadedDirectories: nextLoaded, + loadingDirectories: nextLoading, + }; +} + function createInitialState(): FileTreeState { return { childPathsByDirectory: new Map(), @@ -187,16 +224,15 @@ export function useFileTree({ ); const loadDirectory = useCallback( - async (absolutePath: string, force = false): Promise => { - if (!workspaceId || !absolutePath) { - return; - } + async ( + absolutePath: string, + options: LoadDirectoryOptions = {}, + ): Promise => { + const { force = false } = options; + if (!workspaceId || !absolutePath) return; const currentState = stateRef.current; - if (currentState.loadingDirectories.has(absolutePath)) { - return; - } - + if (currentState.loadingDirectories.has(absolutePath)) return; if ( !force && currentState.loadedDirectories.has(absolutePath) && @@ -205,65 +241,38 @@ export function useFileTree({ return; } - updateState((current) => { - const nextLoading = new Set(current.loadingDirectories); - nextLoading.add(absolutePath); - return { - ...current, - loadingDirectories: nextLoading, - }; - }); + const input = { workspaceId, absolutePath }; + const cachedResult = utils.filesystem.listDirectory.getData(input); + if (cachedResult) { + updateState((current) => + applyDirectoryEntries(current, absolutePath, cachedResult.entries), + ); + if (!force) return; + } - try { - const result = await utils.filesystem.listDirectory.fetch({ - workspaceId, + updateState((current) => ({ + ...current, + loadingDirectories: new Set(current.loadingDirectories).add( absolutePath, - }); - - updateState((current) => { - const nextEntries = new Map(current.entriesByPath); - const nextChildren = new Map(current.childPathsByDirectory); - const nextLoaded = new Set(current.loadedDirectories); - const nextInvalidated = new Set(current.invalidatedDirectories); - const nextLoading = new Set(current.loadingDirectories); - nextLoading.delete(absolutePath); - nextLoaded.add(absolutePath); - nextInvalidated.delete(absolutePath); - - for (const entry of result.entries) { - nextEntries.set(entry.absolutePath, entry); - } - - nextChildren.set( - absolutePath, - result.entries.map((entry) => entry.absolutePath), - ); + ), + })); - return { - ...current, - childPathsByDirectory: nextChildren, - entriesByPath: nextEntries, - invalidatedDirectories: nextInvalidated, - loadedDirectories: nextLoaded, - loadingDirectories: nextLoading, - }; - }); + try { + // Server-side timeout + React Query's TIMEOUT-aware retry handle + // hung host-service IPC; we just await the fetch and apply results. + const result = await utils.filesystem.listDirectory.fetch(input); + updateState((current) => + applyDirectoryEntries(current, absolutePath, result.entries), + ); } catch (error) { console.error( "[workspace-client/useFileTree] Failed to load directory:", - { - absolutePath, - error, - }, + { absolutePath, error }, ); - updateState((current) => { const nextLoading = new Set(current.loadingDirectories); nextLoading.delete(absolutePath); - return { - ...current, - loadingDirectories: nextLoading, - }; + return { ...current, loadingDirectories: nextLoading }; }); } }, @@ -272,7 +281,7 @@ export function useFileTree({ const refreshPath = useCallback( async (absolutePath: string): Promise => { - await loadDirectory(absolutePath, true); + await loadDirectory(absolutePath, { force: true }); }, [loadDirectory], ); @@ -288,10 +297,10 @@ export function useFileTree({ (left, right) => left.split(/[/\\]/).length - right.split(/[/\\]/).length, ); - await loadDirectory(rootPath, true); + await loadDirectory(rootPath, { force: true }); for (const absolutePath of expandedDirectories) { if (absolutePath !== rootPath) { - await loadDirectory(absolutePath, true); + await loadDirectory(absolutePath, { force: true }); } } }, [loadDirectory, rootPath]); @@ -347,11 +356,8 @@ export function useFileTree({ useEffect(() => { updateState(() => createInitialState()); - if (!rootPath) { - return; - } - - void loadDirectory(rootPath, true); + if (!rootPath) return; + void loadDirectory(rootPath, { force: true }); }, [loadDirectory, rootPath, updateState]); useWorkspaceEvent( @@ -404,16 +410,16 @@ export function useFileTree({ }); if (stateRef.current.loadedDirectories.has(oldParentPath)) { - void loadDirectory(oldParentPath, true); + void loadDirectory(oldParentPath, { force: true }); } if (stateRef.current.loadedDirectories.has(newParentPath)) { - void loadDirectory(newParentPath, true); + void loadDirectory(newParentPath, { force: true }); } if ( event.isDirectory && stateRef.current.expandedDirectories.has(event.absolutePath) ) { - void loadDirectory(event.absolutePath, true); + void loadDirectory(event.absolutePath, { force: true }); } return; } @@ -438,7 +444,7 @@ export function useFileTree({ }); if (stateRef.current.loadedDirectories.has(parentPath)) { - void loadDirectory(parentPath, true); + void loadDirectory(parentPath, { force: true }); } }, Boolean(workspaceId && rootPath), diff --git a/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/WorkspaceSidebar.tsx b/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/WorkspaceSidebar.tsx index bac06261b80..7a6d68dcd0d 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/WorkspaceSidebar.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/WorkspaceSidebar.tsx @@ -44,7 +44,6 @@ interface WorkspaceSidebarProps { selectedFilePath?: string; pendingReveal?: PendingReveal | null; workspaceId: string; - workspaceName?: string; } function IconButton({ @@ -82,7 +81,6 @@ export function WorkspaceSidebar({ selectedFilePath, pendingReveal, workspaceId, - workspaceName, }: WorkspaceSidebarProps) { const collections = useCollections(); // FORK NOTE: keep reactive `useLiveQuery` against @@ -155,7 +153,6 @@ export function WorkspaceSidebar({ selectedFilePath={selectedFilePath} pendingReveal={pendingReveal} workspaceId={workspaceId} - workspaceName={workspaceName} gitStatus={gitStatus.data} /> ), diff --git a/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/components/FilesTab/FilesTab.tsx b/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/components/FilesTab/FilesTab.tsx index b90f8e315b0..5cb23450489 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/components/FilesTab/FilesTab.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/components/WorkspaceSidebar/components/FilesTab/FilesTab.tsx @@ -5,7 +5,13 @@ import { toast } from "@superset/ui/sonner"; import { Tooltip, TooltipContent, TooltipTrigger } from "@superset/ui/tooltip"; import { workspaceTrpc } from "@superset/workspace-client"; import type { inferRouterOutputs } from "@trpc/server"; -import { FilePlus, FolderPlus, FoldVertical, RefreshCw } from "lucide-react"; +import { + FilePlus, + FolderPlus, + FoldVertical, + Loader2, + RefreshCw, +} from "lucide-react"; import { Fragment, useCallback, useEffect, useRef, useState } from "react"; import { type FileTreeNode, @@ -39,7 +45,6 @@ interface FilesTabProps { isDirectory: boolean; } | null; workspaceId: string; - workspaceName?: string; gitStatus: GitStatusData | undefined; } @@ -209,7 +214,6 @@ export function FilesTab({ selectedFilePath, pendingReveal, workspaceId, - workspaceName, gitStatus, }: FilesTabProps) { const [_isRefreshing, setIsRefreshing] = useState(false); @@ -462,10 +466,17 @@ export function FilesTab({ [workspaceId, deletePath], ); - if (!workspaceQuery.data?.worktreePath) { + if (!rootPath) { return ( -
- Workspace worktree not available +
+ {workspaceQuery.isLoading ? ( + <> + + Loading files... + + ) : ( + "Workspace worktree not available" + )}
); } @@ -503,7 +514,7 @@ export function FilesTab({ zIndex: 20, }} > - {workspaceName ?? "Explorer"} + Explorer
diff --git a/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/page.tsx b/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/page.tsx index 086ffcab8da..9658a79c190 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/page.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/_dashboard/v2-workspace/$workspaceId/page.tsx @@ -178,7 +178,6 @@ function V2WorkspacePage() { diff --git a/packages/host-service/QUERY_TIMEOUTS.md b/packages/host-service/QUERY_TIMEOUTS.md new file mode 100644 index 00000000000..0bf1c1f81e9 --- /dev/null +++ b/packages/host-service/QUERY_TIMEOUTS.md @@ -0,0 +1,77 @@ +# Query Timeouts + +Every host-service query procedure has a server-side timeout. If a query +takes longer than its budget, the procedure rejects with +`TRPCError({ code: "TIMEOUT" })`. The renderer's `QueryClient` retries +those errors a couple of times with linear backoff, so transient slow +ops self-recover without leaving the UI spinning forever. + +## Where it lives + +- **Server middleware**: `src/trpc/index.ts` — `timeoutMiddleware` races + `next()` against a per-procedure timer. +- **Builder**: `queryProcedure = protectedProcedure.use(timeoutMiddleware)` + in the same file. Use this for every `.query` procedure. +- **Per-procedure overrides**: `.meta({ timeoutMs })` on the procedure + builder. Without `meta`, the default is 5000 ms. +- **Client retry**: `WorkspaceClientProvider` in `packages/workspace-client` + retries `TIMEOUT` errors up to 2 times with `300 ms × attempt` backoff. + Other errors keep the previous single retry. + +Mutations stay on `protectedProcedure` because their latency is highly +variable (file writes, network calls) and a blanket budget would do more +harm than good. + +## Adding a new query + +```ts +// Defaults to 5s — fine for most local fs/git work. +myFastQuery: queryProcedure + .input(...) + .query(async ({ ctx, input }) => { ... }), + +// Override when the work legitimately takes longer. +mySlowQuery: queryProcedure + .meta({ timeoutMs: 30_000 }) + .input(...) + .query(async ({ ctx, input }) => { ... }), +``` + +Pick the smallest budget that fits the slowest legitimate run on real +hardware. Too generous and the UX of a hung host-service degrades; too +tight and healthy queries time out under load. + +## Current budgets + +| Procedure | Budget | Reason | +|---|---|---| +| `filesystem.listDirectory`, `filesystem.getMetadata` | 5s | Fast in practice | +| `filesystem.readFile` | 30s | Large files (e.g. lockfiles, generated bundles) | +| `filesystem.searchFiles` | 30s | ripgrep on large repos | +| `filesystem.searchContent` | 60s | content search worst case | +| `git.listBranches`, `git.getBaseBranch`, `git.getPullRequest` | 5s | Cheap reads | +| `git.getStatus`, `git.getCommitFiles` | 15s | Slow on big working trees | +| `git.listCommits`, `git.getDiff`, `git.getBranchSyncStatus`, `git.getPullRequestThreads` | 30s | Long history, big diffs, GitHub API | + +## What the timeout does *not* do + +The middleware only races a timer against the procedure's `next()` +result. It does **not** kill the underlying work — `fs.readdir`, `git` +child processes, etc. continue server-side until they finish naturally. +For ops that *can* be cancelled, the procedure should plumb the +`AbortSignal` through. `filesystem.listDirectory` does this: +the renderer's tRPC client provides `signal` automatically (and +`abortOnUnmount: true` aborts on unmount), the procedure forwards it +to the `FsService`, and `workspace-fs/fs.ts::listDirectory` checks +`signal?.throwIfAborted()` between `fs.readdir` and each batch of stat +calls. Node's `fs.readdir`/`fs.stat` themselves ignore `AbortSignal`, +so the readdir syscall is uncancellable; we can only short-circuit +between operations. + +## What a timeout looks like to the client + +`TRPCClientError` with `error.data.code === "TIMEOUT"`. The +`WorkspaceClientProvider` retry predicate keys on this. Bespoke per-hook +retry logic should not be necessary — if it is, the procedure's budget +is probably wrong, or the underlying work isn't really a single query +and should be split. diff --git a/packages/host-service/src/events/event-bus.ts b/packages/host-service/src/events/event-bus.ts index 33763322e36..b550f568e13 100644 --- a/packages/host-service/src/events/event-bus.ts +++ b/packages/host-service/src/events/event-bus.ts @@ -43,8 +43,8 @@ function parseClientMessage(data: unknown): ClientMessage | null { return parsed as ClientMessage; } } - } catch { - // Malformed message — ignore + } catch (error) { + console.warn("[event-bus] malformed client message — ignored", { error }); } return null; } @@ -139,8 +139,21 @@ export class EventBus { } private broadcast(message: ServerMessage): void { + // One bad socket must not block fan-out to the rest. Drop dead sockets + // rather than logging on every broadcast forever. + const dead: WsSocket[] = []; for (const socket of this.clients.keys()) { - sendMessage(socket, message); + try { + sendMessage(socket, message); + } catch (error) { + console.error("[event-bus:send] socket failed — dropping", { error }); + dead.push(socket); + } + } + for (const socket of dead) { + const state = this.clients.get(socket); + if (state) this.cleanupClient(socket, state); + this.clients.delete(socket); } } diff --git a/packages/host-service/src/events/git-watcher.ts b/packages/host-service/src/events/git-watcher.ts index 4f5b02d61a7..717a87048ee 100644 --- a/packages/host-service/src/events/git-watcher.ts +++ b/packages/host-service/src/events/git-watcher.ts @@ -149,7 +149,15 @@ export class GitWatcher { ? { workspaceId } : { workspaceId, paths: [...batch.paths] }; for (const listener of this.listeners) { - listener(event); + // Isolate per-listener throws so one bad subscriber can't skip + // siblings. Other escapes fall through to the process-level net. + try { + listener(event); + } catch (error) { + console.error("[git-watcher:listener] threw — contained", { + error, + }); + } } }, DEBOUNCE_MS), ); diff --git a/packages/host-service/src/index.ts b/packages/host-service/src/index.ts index e50521cf1ee..d6fd19781b2 100644 --- a/packages/host-service/src/index.ts +++ b/packages/host-service/src/index.ts @@ -19,6 +19,7 @@ export { LocalModelProvider, } from "./providers/model-providers"; export type { GitCredentialProvider, GitFactory } from "./runtime/git"; +export { installProcessSafetyNet } from "./safety"; export type { TeardownFailureCause } from "./trpc/error-types"; export type { AppRouter } from "./trpc/router"; export type { ApiClient, HostServiceContext } from "./types"; diff --git a/packages/host-service/src/runtime/pull-requests/pull-requests.ts b/packages/host-service/src/runtime/pull-requests/pull-requests.ts index 0a33bed5740..d5fa61c03cd 100644 --- a/packages/host-service/src/runtime/pull-requests/pull-requests.ts +++ b/packages/host-service/src/runtime/pull-requests/pull-requests.ts @@ -6,6 +6,7 @@ import type { HostDb } from "../../db"; import { projects, pullRequests, workspaces } from "../../db/schema"; import type { GitFactory } from "../git"; import { fetchRepositoryPullRequests } from "./utils/github-query"; +import type { GraphQLPullRequestNode } from "./utils/github-query/types"; import { type ChecksStatus, coerceChecksStatus, @@ -22,7 +23,11 @@ import { } from "./utils/pull-request-mappers"; const BRANCH_SYNC_INTERVAL_MS = 30_000; -const PROJECT_REFRESH_INTERVAL_MS = 10_000; +const PROJECT_REFRESH_INTERVAL_MS = 20_000; +// Multiple projects can target the same GitHub repo; collapse those into a +// single GraphQL call within the polling window so we don't multiply traffic +// by the number of projects. +const REPO_PULL_REQUEST_CACHE_TTL_MS = 10_000; const UNBORN_HEAD_ERROR_PATTERNS = [ "ambiguous argument 'head'", "unknown revision or path not in the working tree", @@ -197,7 +202,10 @@ export class PullRequestRuntimeManager { private branchSyncTimer: ReturnType | null = null; private projectRefreshTimer: ReturnType | null = null; private readonly inFlightProjects = new Map>(); - private readonly nextProjectRefreshAt = new Map(); + private readonly repoPullRequestCache = new Map< + string, + { promise: Promise; fetchedAt: number } + >(); constructor(options: PullRequestRuntimeManagerOptions) { this.db = options.db; @@ -216,7 +224,7 @@ export class PullRequestRuntimeManager { }, PROJECT_REFRESH_INTERVAL_MS); void this.syncWorkspaceBranches(); - void this.refreshEligibleProjects(true); + void this.refreshEligibleProjects(); } stop() { @@ -287,7 +295,9 @@ export class PullRequestRuntimeManager { const projectIds = [...new Set(rows.map((row) => row.projectId))]; await Promise.all( - projectIds.map((projectId) => this.refreshProject(projectId, true)), + projectIds.map((projectId) => + this.refreshProject(projectId, { bypassCache: true }), + ), ); } @@ -345,12 +355,12 @@ export class PullRequestRuntimeManager { await Promise.all( [...changedProjectIds].map((projectId) => - this.refreshProject(projectId, true), + this.refreshProject(projectId, { bypassCache: true }), ), ); } - private async refreshEligibleProjects(force = false): Promise { + private async refreshEligibleProjects(): Promise { const rows = this.db .select({ projectId: workspaces.projectId, @@ -359,27 +369,21 @@ export class PullRequestRuntimeManager { .all(); const projectIds = [...new Set(rows.map((row) => row.projectId))]; await Promise.all( - projectIds.map((projectId) => this.refreshProject(projectId, force)), + projectIds.map((projectId) => this.refreshProject(projectId)), ); } private async refreshProject( projectId: string, - force = false, + options: { bypassCache?: boolean } = {}, ): Promise { - const now = Date.now(); const existing = this.inFlightProjects.get(projectId); if (existing) { await existing; return; } - const nextEligibleRefreshAt = this.nextProjectRefreshAt.get(projectId) ?? 0; - if (!force && nextEligibleRefreshAt > now) { - return; - } - - const refreshPromise = this.performProjectRefresh(projectId) + const refreshPromise = this.performProjectRefresh(projectId, options) .catch((error) => { console.warn( "[host-service:pull-request-runtime] Project refresh failed", @@ -391,17 +395,16 @@ export class PullRequestRuntimeManager { }) .finally(() => { this.inFlightProjects.delete(projectId); - this.nextProjectRefreshAt.set( - projectId, - Date.now() + PROJECT_REFRESH_INTERVAL_MS, - ); }); this.inFlightProjects.set(projectId, refreshPromise); await refreshPromise; } - private async performProjectRefresh(projectId: string): Promise { + private async performProjectRefresh( + projectId: string, + options: { bypassCache?: boolean } = {}, + ): Promise { const repo = await this.getProjectRepository(projectId); if (!repo) return; @@ -426,6 +429,7 @@ export class PullRequestRuntimeManager { projectId, repo, wantedKeys, + options, ); for (const workspace of projectWorkspaces) { @@ -501,18 +505,49 @@ export class PullRequestRuntimeManager { }; } + private async getCachedRepoPullRequests( + repo: NormalizedRepoIdentity, + options: { bypassCache?: boolean } = {}, + ): Promise { + const cacheKey = `${repo.owner.toLowerCase()}/${repo.name.toLowerCase()}`; + if (!options.bypassCache) { + const cached = this.repoPullRequestCache.get(cacheKey); + if ( + cached && + Date.now() - cached.fetchedAt < REPO_PULL_REQUEST_CACHE_TTL_MS + ) { + return cached.promise; + } + } + + const fetchedAt = Date.now(); + const promise = (async () => { + const octokit = await this.github(); + return fetchRepositoryPullRequests(octokit, { + owner: repo.owner, + name: repo.name, + }); + })(); + // Evict on failure so the next caller retries instead of serving a + // poisoned cache entry for the rest of the TTL. + promise.catch(() => { + if (this.repoPullRequestCache.get(cacheKey)?.promise === promise) { + this.repoPullRequestCache.delete(cacheKey); + } + }); + this.repoPullRequestCache.set(cacheKey, { promise, fetchedAt }); + return promise; + } + private async fetchRepoPullRequests( projectId: string, repo: NormalizedRepoIdentity, wantedKeys: Set, + options: { bypassCache?: boolean } = {}, ): Promise> { if (wantedKeys.size === 0) return new Map(); - const octokit = await this.github(); - const nodes = await fetchRepositoryPullRequests(octokit, { - owner: repo.owner, - name: repo.name, - }); + const nodes = await this.getCachedRepoPullRequests(repo, options); const latestByKey = new Map(); diff --git a/packages/host-service/src/safety.ts b/packages/host-service/src/safety.ts new file mode 100644 index 00000000000..d79c5cd0eaa --- /dev/null +++ b/packages/host-service/src/safety.ts @@ -0,0 +1,32 @@ +/** + * Host-service crash isolation. + * + * Policy: the main host-service process must stay up even when a subsystem + * throws. We rely on a process-level safety net as the primary mechanism — + * Node already routes throws from `setInterval`, `setTimeout`, `EventEmitter` + * listeners, native callbacks (`pty.onData`/`onExit`), and orphaned promise + * continuations into `uncaughtException` / `unhandledRejection`, so a single + * handler covers all of them. + * + * The two places where this isn't enough are fan-out loops over multiple + * subscribers (broadcasts, listener iteration). A throw there skips the + * remaining iterations, so those sites use inline `try/catch` directly. + */ + +let safetyNetInstalled = false; + +export function installProcessSafetyNet(): void { + if (safetyNetInstalled) return; + safetyNetInstalled = true; + + process.on("uncaughtException", (error, origin) => { + console.error("[host-service] uncaughtException — staying up", { + origin, + error, + }); + }); + + process.on("unhandledRejection", (reason) => { + console.error("[host-service] unhandledRejection — staying up", { reason }); + }); +} diff --git a/packages/host-service/src/serve.ts b/packages/host-service/src/serve.ts index 7daa3784910..1090f397bc9 100644 --- a/packages/host-service/src/serve.ts +++ b/packages/host-service/src/serve.ts @@ -5,6 +5,7 @@ import { JwtApiAuthProvider } from "./providers/auth"; import { LocalGitCredentialProvider } from "./providers/git"; import { PskHostAuthProvider } from "./providers/host-auth"; import { LocalModelProvider } from "./providers/model-providers"; +import { installProcessSafetyNet } from "./safety"; import { initTerminalBaseEnv, resolveTerminalBaseEnv } from "./terminal/env"; import { connectRelay } from "./tunnel"; @@ -34,6 +35,9 @@ async function main(): Promise { }); const server = serve({ fetch: app.fetch, port: env.PORT }, (info) => { + // Install only after the server is listening so startup throws still + // reach `main().catch(...)` and exit with a non-zero code. + installProcessSafetyNet(); console.log(`[host-service] listening on http://localhost:${info.port}`); if (env.RELAY_URL) { diff --git a/packages/host-service/src/trpc/index.ts b/packages/host-service/src/trpc/index.ts index a74f14b22d7..dd09c811239 100644 --- a/packages/host-service/src/trpc/index.ts +++ b/packages/host-service/src/trpc/index.ts @@ -8,41 +8,54 @@ import { type TeardownFailureCause, } from "./error-types"; -const t = initTRPC.context().create({ - transformer: superjson, - errorFormatter({ shape, error }) { - // tRPC wraps non-Error `cause` values via getCauseFromUnknown() into a - // synthetic UnknownCauseError that carries the original fields as own - // properties. Superjson then serializes it as an Error (message/stack - // only) and drops our fields. Re-build a plain object so the wire - // format keeps `kind`, `exitCode`, `outputTail`, etc. - const teardownFailure: TeardownFailureCause | undefined = - isTeardownFailureCause(error.cause) - ? { - kind: "TEARDOWN_FAILED", - exitCode: error.cause.exitCode, - signal: error.cause.signal, - timedOut: error.cause.timedOut, - outputTail: error.cause.outputTail, - } - : undefined; - const projectNotSetup: ProjectNotSetupCause | undefined = - isProjectNotSetupCause(error.cause) - ? { - kind: "PROJECT_NOT_SETUP", - projectId: error.cause.projectId, - } - : undefined; - return { - ...shape, - data: { - ...shape.data, - teardownFailure, - projectNotSetup, - }, - }; - }, -}); +interface RouterMeta { + /** + * Per-procedure timeout in milliseconds, applied to query procedures + * via `queryProcedure`. Defaults to 5_000 when omitted. Set higher for + * procedures that legitimately take longer (e.g. searching large + * histories or shelling out to long-running commands). + */ + timeoutMs?: number; +} + +const t = initTRPC + .context() + .meta() + .create({ + transformer: superjson, + errorFormatter({ shape, error }) { + // tRPC wraps non-Error `cause` values via getCauseFromUnknown() into a + // synthetic UnknownCauseError that carries the original fields as own + // properties. Superjson then serializes it as an Error (message/stack + // only) and drops our fields. Re-build a plain object so the wire + // format keeps `kind`, `exitCode`, `outputTail`, etc. + const teardownFailure: TeardownFailureCause | undefined = + isTeardownFailureCause(error.cause) + ? { + kind: "TEARDOWN_FAILED", + exitCode: error.cause.exitCode, + signal: error.cause.signal, + timedOut: error.cause.timedOut, + outputTail: error.cause.outputTail, + } + : undefined; + const projectNotSetup: ProjectNotSetupCause | undefined = + isProjectNotSetupCause(error.cause) + ? { + kind: "PROJECT_NOT_SETUP", + projectId: error.cause.projectId, + } + : undefined; + return { + ...shape, + data: { + ...shape.data, + teardownFailure, + projectNotSetup, + }, + }; + }, + }); export const router = t.router; export const publicProcedure = t.procedure; @@ -57,6 +70,44 @@ export const protectedProcedure = t.procedure.use(async ({ ctx, next }) => { return next({ ctx }); }); +const DEFAULT_QUERY_TIMEOUT_MS = 5_000; + +const timeoutMiddleware = t.middleware(async ({ next, type, path, meta }) => { + if (type !== "query") return next(); + const timeoutMs = meta?.timeoutMs ?? DEFAULT_QUERY_TIMEOUT_MS; + + let timer: ReturnType | undefined; + const timeoutPromise = new Promise((_, reject) => { + timer = setTimeout(() => { + reject( + new TRPCError({ + code: "TIMEOUT", + message: `${path} timed out after ${timeoutMs}ms`, + }), + ); + }, timeoutMs); + }); + + try { + return await Promise.race([next(), timeoutPromise]); + } finally { + if (timer) clearTimeout(timer); + } +}); + +/** + * Query procedures with a server-side timeout. Hung filesystem/git work + * rejects after `meta.timeoutMs` (default 5s) so the renderer doesn't + * spin forever. React Query is configured to retry on `TIMEOUT` errors. + * + * Use this for `.query` procedures only — mutations have variable + * latency and shouldn't share a blanket budget. + * + * See `packages/host-service/QUERY_TIMEOUTS.md` for the policy and + * current per-procedure budgets. + */ +export const queryProcedure = protectedProcedure.use(timeoutMiddleware); + export type { ProjectNotSetupCause, TeardownFailureCause, diff --git a/packages/host-service/src/trpc/router/filesystem/filesystem.ts b/packages/host-service/src/trpc/router/filesystem/filesystem.ts index 808ca87f503..e50ff33d26e 100644 --- a/packages/host-service/src/trpc/router/filesystem/filesystem.ts +++ b/packages/host-service/src/trpc/router/filesystem/filesystem.ts @@ -3,7 +3,7 @@ import { isAbsolute, join, normalize, resolve } from "node:path"; import { TRPCError } from "@trpc/server"; import { z } from "zod"; import type { HostServiceContext } from "../../../types"; -import { protectedProcedure, router } from "../../index"; +import { protectedProcedure, queryProcedure, router } from "../../index"; function getFilesystemService(ctx: HostServiceContext, workspaceId: string) { try { @@ -79,20 +79,21 @@ const replaceContentInputSchema = z.object({ }); export const filesystemRouter = router({ - listDirectory: protectedProcedure + listDirectory: queryProcedure .input( z.object({ workspaceId: z.string(), absolutePath: z.string(), }), ) - .query(async ({ ctx, input }) => { + .query(async ({ ctx, input, signal }) => { const { workspaceId, ...serviceInput } = input; const service = getFilesystemService(ctx, workspaceId); - return await service.listDirectory(serviceInput); + return await service.listDirectory(serviceInput, { signal }); }), - readFile: protectedProcedure + readFile: queryProcedure + .meta({ timeoutMs: 30_000 }) .input( z.object({ workspaceId: z.string(), @@ -117,7 +118,7 @@ export const filesystemRouter = router({ return result; }), - getMetadata: protectedProcedure + getMetadata: queryProcedure .input( z.object({ workspaceId: z.string(), @@ -276,7 +277,8 @@ export const filesystemRouter = router({ return await service.copyPath(serviceInput); }), - searchFiles: protectedProcedure + searchFiles: queryProcedure + .meta({ timeoutMs: 30_000 }) .input( z .object({ @@ -326,7 +328,8 @@ export const filesystemRouter = router({ return await service.warmupSearchIndex(serviceInput); }), - searchContent: protectedProcedure + searchContent: queryProcedure + .meta({ timeoutMs: 60_000 }) .input(searchContentInputSchema) .query(async ({ ctx, input }) => { const trimmedQuery = input.query.trim(); diff --git a/packages/host-service/src/trpc/router/git/git.ts b/packages/host-service/src/trpc/router/git/git.ts index 4a22734b8cc..3314d6ffc7f 100644 --- a/packages/host-service/src/trpc/router/git/git.ts +++ b/packages/host-service/src/trpc/router/git/git.ts @@ -3,7 +3,7 @@ import { TRPCError } from "@trpc/server"; import { eq } from "drizzle-orm"; import { z } from "zod"; import { projects, pullRequests, workspaces } from "../../../db/schema"; -import { protectedProcedure, router } from "../../index"; +import { protectedProcedure, queryProcedure, router } from "../../index"; import type { ChangedFile, CheckConclusionState, @@ -34,7 +34,7 @@ import { import { resolveWorktreePath } from "./utils/resolve-worktree"; export const gitRouter = router({ - listBranches: protectedProcedure + listBranches: queryProcedure .input( z.object({ workspaceId: z.string(), @@ -147,7 +147,8 @@ export const gitRouter = router({ return { branches }; }), - getStatus: protectedProcedure + getStatus: queryProcedure + .meta({ timeoutMs: 15_000 }) .input( z.object({ workspaceId: z.string(), @@ -299,7 +300,8 @@ export const gitRouter = router({ }; }), - listCommits: protectedProcedure + listCommits: queryProcedure + .meta({ timeoutMs: 30_000 }) .input( z.object({ workspaceId: z.string(), @@ -336,7 +338,8 @@ export const gitRouter = router({ return { commits }; }), - getCommitFiles: protectedProcedure + getCommitFiles: queryProcedure + .meta({ timeoutMs: 15_000 }) .input( z.object({ workspaceId: z.string(), @@ -354,7 +357,7 @@ export const gitRouter = router({ return { files }; }), - getBaseBranch: protectedProcedure + getBaseBranch: queryProcedure .input(z.object({ workspaceId: z.string() })) .query(async ({ ctx, input }) => { const worktreePath = resolveWorktreePath(ctx, input.workspaceId); @@ -441,7 +444,8 @@ export const gitRouter = router({ return { name: input.newName }; }), - getDiff: protectedProcedure + getDiff: queryProcedure + .meta({ timeoutMs: 30_000 }) .input( z.object({ workspaceId: z.string(), @@ -519,7 +523,8 @@ export const gitRouter = router({ }; }), - getBranchSyncStatus: protectedProcedure + getBranchSyncStatus: queryProcedure + .meta({ timeoutMs: 30_000 }) .input(z.object({ workspaceId: z.string() })) .query(async ({ ctx, input }) => { const worktreePath = resolveWorktreePath(ctx, input.workspaceId); @@ -586,7 +591,7 @@ export const gitRouter = router({ }; }), - getPullRequest: protectedProcedure + getPullRequest: queryProcedure .input(z.object({ workspaceId: z.string() })) .query(({ ctx, input }) => { const workspace = ctx.db.query.workspaces @@ -645,7 +650,8 @@ export const gitRouter = router({ }; }), - getPullRequestThreads: protectedProcedure + getPullRequestThreads: queryProcedure + .meta({ timeoutMs: 30_000 }) .input(z.object({ workspaceId: z.string() })) .query(async ({ ctx, input }) => { const workspace = ctx.db.query.workspaces diff --git a/packages/host-service/src/trpc/router/workspace-creation/procedures/get-github-pull-request-content.ts b/packages/host-service/src/trpc/router/workspace-creation/procedures/get-github-pull-request-content.ts index 63ea9a19ea4..30b16121933 100644 --- a/packages/host-service/src/trpc/router/workspace-creation/procedures/get-github-pull-request-content.ts +++ b/packages/host-service/src/trpc/router/workspace-creation/procedures/get-github-pull-request-content.ts @@ -7,40 +7,86 @@ import { import { resolveGithubRepo } from "../shared/project-helpers"; import { execGh } from "../utils/exec-gh"; +type PullRequestContent = { + number: number; + title: string; + body: string; + url: string; + state: string; + branch: string; + baseBranch: string; + headRepositoryOwner: string | null; + isCrossRepository: boolean; + author: string | null; + isDraft: boolean; + createdAt: string | undefined; + updatedAt: string | undefined; +}; + +// Browsing the PR list re-opens the detail panel constantly; cache the +// `gh pr view` response so we don't burn the user's GitHub token bucket on +// repeat clicks. Concurrent callers share the same in-flight promise. +const PULL_REQUEST_CONTENT_CACHE_TTL_MS = 30_000; +const pullRequestContentCache = new Map< + string, + { promise: Promise; fetchedAt: number } +>(); + export const getGitHubPullRequestContent = protectedProcedure .input(githubPullRequestContentInputSchema) .query(async ({ ctx, input }) => { const repo = await resolveGithubRepo(ctx, input.projectId); - try { - const raw = await execGh([ - "pr", - "view", - String(input.prNumber), - "--repo", - `${repo.owner}/${repo.name}`, - "--json", - "number,title,body,url,state,author,headRefName,baseRefName,headRepositoryOwner,isCrossRepository,isDraft,createdAt,updatedAt", - ]); - const data = pullRequestContentSchema.parse(raw); - return { - number: data.number, - title: data.title, - body: data.body ?? "", - url: data.url, - state: data.state.toLowerCase(), - branch: data.headRefName, - baseBranch: data.baseRefName, - headRepositoryOwner: data.headRepositoryOwner?.login ?? null, - isCrossRepository: data.isCrossRepository, - author: data.author?.login ?? null, - isDraft: data.isDraft, - createdAt: data.createdAt, - updatedAt: data.updatedAt, - }; - } catch (err) { - throw new TRPCError({ - code: "INTERNAL_SERVER_ERROR", - message: `Failed to fetch PR #${input.prNumber}: ${err instanceof Error ? err.message : String(err)}`, - }); + const cacheKey = `${repo.owner.toLowerCase()}/${repo.name.toLowerCase()}#${input.prNumber}`; + const cached = pullRequestContentCache.get(cacheKey); + if ( + cached && + Date.now() - cached.fetchedAt < PULL_REQUEST_CONTENT_CACHE_TTL_MS + ) { + return cached.promise; } + + const fetchedAt = Date.now(); + const promise = (async (): Promise => { + try { + const raw = await execGh([ + "pr", + "view", + String(input.prNumber), + "--repo", + `${repo.owner}/${repo.name}`, + "--json", + "number,title,body,url,state,author,headRefName,baseRefName,headRepositoryOwner,isCrossRepository,isDraft,createdAt,updatedAt", + ]); + const data = pullRequestContentSchema.parse(raw); + return { + number: data.number, + title: data.title, + body: data.body ?? "", + url: data.url, + state: data.state.toLowerCase(), + branch: data.headRefName, + baseBranch: data.baseRefName, + headRepositoryOwner: data.headRepositoryOwner?.login ?? null, + isCrossRepository: data.isCrossRepository, + author: data.author?.login ?? null, + isDraft: data.isDraft, + createdAt: data.createdAt, + updatedAt: data.updatedAt, + }; + } catch (err) { + throw new TRPCError({ + code: "INTERNAL_SERVER_ERROR", + message: `Failed to fetch PR #${input.prNumber}: ${err instanceof Error ? err.message : String(err)}`, + }); + } + })(); + // Evict on failure so the next caller retries instead of replaying the + // same error for the rest of the TTL. + promise.catch(() => { + if (pullRequestContentCache.get(cacheKey)?.promise === promise) { + pullRequestContentCache.delete(cacheKey); + } + }); + pullRequestContentCache.set(cacheKey, { promise, fetchedAt }); + return promise; }); diff --git a/packages/workspace-client/src/providers/WorkspaceClientProvider/WorkspaceClientProvider.tsx b/packages/workspace-client/src/providers/WorkspaceClientProvider/WorkspaceClientProvider.tsx index 1ad9dfb12c4..3e359b6a871 100644 --- a/packages/workspace-client/src/providers/WorkspaceClientProvider/WorkspaceClientProvider.tsx +++ b/packages/workspace-client/src/providers/WorkspaceClientProvider/WorkspaceClientProvider.tsx @@ -1,11 +1,17 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query"; -import { httpBatchStreamLink } from "@trpc/client"; +import { httpBatchStreamLink, TRPCClientError } from "@trpc/client"; import { createContext, type ReactNode, useContext } from "react"; import superjson from "superjson"; import { workspaceTrpc } from "../../workspace-trpc"; const STALE_TIME_MS = 5_000; const GC_TIME_MS = 30 * 60 * 1_000; +const MAX_TIMEOUT_RETRIES = 2; +const TIMEOUT_RETRY_BASE_DELAY_MS = 300; + +function isTimeoutError(error: unknown): boolean { + return error instanceof TRPCClientError && error.data?.code === "TIMEOUT"; +} export interface WorkspaceClientContextValue { hostUrl: string; @@ -49,7 +55,18 @@ function getWorkspaceClients( defaultOptions: { queries: { refetchOnWindowFocus: false, - retry: 1, + // Retry server-side TIMEOUT errors a couple of times — these come + // from `queryProcedure`'s middleware when a host-service query + // (filesystem, git) takes longer than its budget. Other errors + // fall back to a single retry as before. + retry: (failureCount, error) => { + if (isTimeoutError(error)) return failureCount < MAX_TIMEOUT_RETRIES; + return failureCount < 1; + }, + retryDelay: (attempt, error) => + isTimeoutError(error) + ? TIMEOUT_RETRY_BASE_DELAY_MS * (attempt + 1) + : Math.min(1000 * 2 ** attempt, 30_000), staleTime: STALE_TIME_MS, gcTime: GC_TIME_MS, }, diff --git a/packages/workspace-client/src/workspace-trpc.ts b/packages/workspace-client/src/workspace-trpc.ts index d601513a5c2..6d38436c15f 100644 --- a/packages/workspace-client/src/workspace-trpc.ts +++ b/packages/workspace-client/src/workspace-trpc.ts @@ -1,4 +1,6 @@ import type { AppRouter } from "@superset/host-service/trpc"; import { createTRPCReact } from "@trpc/react-query"; -export const workspaceTrpc = createTRPCReact(); +export const workspaceTrpc = createTRPCReact({ + abortOnUnmount: true, +}); diff --git a/packages/workspace-fs/src/core/service.ts b/packages/workspace-fs/src/core/service.ts index 2755aed30f1..40f8e729ade 100644 --- a/packages/workspace-fs/src/core/service.ts +++ b/packages/workspace-fs/src/core/service.ts @@ -27,9 +27,10 @@ export interface FsContentStreamEvent { } export interface FsService { - listDirectory(input: { - absolutePath: string; - }): Promise<{ entries: FsEntry[] }>; + listDirectory( + input: { absolutePath: string }, + options?: { signal?: AbortSignal }, + ): Promise<{ entries: FsEntry[] }>; readFile(input: { absolutePath: string; diff --git a/packages/workspace-fs/src/fs.ts b/packages/workspace-fs/src/fs.ts index a72073b2c67..4f71767e5ae 100644 --- a/packages/workspace-fs/src/fs.ts +++ b/packages/workspace-fs/src/fs.ts @@ -368,36 +368,51 @@ async function writeAtomically({ } } +// Symlink-resolution batch size. Node's fs.readdir and fs.stat ignore +// AbortSignal, so we can only check it between operations — batching the +// per-entry stat calls bounds how much zombie work continues after an abort. +const LIST_DIRECTORY_STAT_BATCH_SIZE = 16; + export async function listDirectory({ rootPath, absolutePath, + signal, }: { rootPath: string; absolutePath: string; + signal?: AbortSignal; }): Promise { const targetPath = ensureWithinRoot({ rootPath, absolutePath }); + signal?.throwIfAborted(); const entries = await fs.readdir(targetPath, { withFileTypes: true }); - const mapped = await Promise.all( - entries.map(async (entry) => { - let kind = direntToKind(entry); - // Resolve symlinks to determine target type (e.g. symlinked dirs in node_modules) - if (kind === "symlink") { - try { - const stats = await fs.stat(path.join(targetPath, entry.name)); - if (stats.isDirectory()) kind = "directory"; - else if (stats.isFile()) kind = "file"; - } catch { - // Dangling symlink or permission error — keep as "symlink" - } - } - return { - absolutePath: path.join(targetPath, entry.name), - name: entry.name, - kind, - }; - }), - ); + const mapped: FsEntry[] = []; + for (let i = 0; i < entries.length; i += LIST_DIRECTORY_STAT_BATCH_SIZE) { + signal?.throwIfAborted(); + const batch = await Promise.all( + entries + .slice(i, i + LIST_DIRECTORY_STAT_BATCH_SIZE) + .map(async (entry) => { + let kind = direntToKind(entry); + // Resolve symlinks to determine target type (e.g. symlinked dirs in node_modules) + if (kind === "symlink") { + try { + const stats = await fs.stat(path.join(targetPath, entry.name)); + if (stats.isDirectory()) kind = "directory"; + else if (stats.isFile()) kind = "file"; + } catch { + // Dangling symlink or permission error — keep as "symlink" + } + } + return { + absolutePath: path.join(targetPath, entry.name), + name: entry.name, + kind, + }; + }), + ); + mapped.push(...batch); + } return mapped.sort((left, right) => { const leftIsDir = left.kind === "directory"; diff --git a/packages/workspace-fs/src/host/service.ts b/packages/workspace-fs/src/host/service.ts index 9a5f5bd514f..8220e7ee470 100644 --- a/packages/workspace-fs/src/host/service.ts +++ b/packages/workspace-fs/src/host/service.ts @@ -144,10 +144,11 @@ export function createFsHostService( const { rootPath } = options; return { - async listDirectory(input) { + async listDirectory(input, options) { const entries = await listDirectory({ rootPath, absolutePath: input.absolutePath, + signal: options?.signal, }); return { entries }; },