diff --git a/sky/dashboard/src/components/clusters.jsx b/sky/dashboard/src/components/clusters.jsx index 03ca76a0874..a5765e9eb6a 100755 --- a/sky/dashboard/src/components/clusters.jsx +++ b/sky/dashboard/src/components/clusters.jsx @@ -619,10 +619,15 @@ export function ClusterTable({ const activeClusters = await dashboardCache.get(getClusters); if (showHistory) { - const historyClusters = await dashboardCache.get(getClusterHistory, [ - null, - historyDays, - ]); + let historyClusters = []; + try { + historyClusters = await dashboardCache.get(getClusterHistory, [ + null, + historyDays, + ]); + } catch (error) { + console.error('Error fetching cluster history:', error); + } // Mark clusters as active or historical for UI distinction const markedActiveClusters = activeClusters.map((cluster) => ({ ...cluster, diff --git a/sky/dashboard/src/components/elements/version-display.jsx b/sky/dashboard/src/components/elements/version-display.jsx index ba2d5ac5d6b..d6d2c483d68 100644 --- a/sky/dashboard/src/components/elements/version-display.jsx +++ b/sky/dashboard/src/components/elements/version-display.jsx @@ -8,6 +8,12 @@ export function VersionDisplay() { const getVersion = async () => { const data = await apiClient.get('/api/health'); + if (!data.ok) { + console.error( + `API request /api/health failed with status ${data.status}` + ); + return; + } const healthData = await data.json(); if (healthData.version) { setVersion(healthData.version); diff --git a/sky/dashboard/src/components/jobs.jsx b/sky/dashboard/src/components/jobs.jsx index 9d1538b4118..c2931ef936f 100755 --- a/sky/dashboard/src/components/jobs.jsx +++ b/sky/dashboard/src/components/jobs.jsx @@ -29,7 +29,6 @@ import { getPoolStatus } from '@/data/connectors/jobs'; import jobsCacheManager from '@/lib/jobs-cache-manager'; import { getClusters, downloadJobLogs } from '@/data/connectors/clusters'; import { getWorkspaces } from '@/data/connectors/workspaces'; -import { getUsers } from '@/data/connectors/users'; import { CustomTooltip as Tooltip, NonCapitalizedTooltip, @@ -219,7 +218,6 @@ export function ManagedJobs() { jobsCacheManager.invalidateCache(); dashboardCache.invalidate(getPoolStatus, [{}]); dashboardCache.invalidate(getWorkspaces); - dashboardCache.invalidate(getUsers); // Trigger a re-fetch in both tables via their refreshDataRef if (jobsRefreshRef.current) { @@ -441,15 +439,13 @@ export function ManagedJobsTable({ const isDataLoading = jobsCacheManager.isDataLoading(params); if (includeStatus) { - const [jr, cd] = await Promise.all([ - jobsCacheManager.getPaginatedJobs(params), - dashboardCache.get(getClusters), - ]); - jobsResponse = jr; - clustersData = cd; - } else { - jobsResponse = await jobsCacheManager.getPaginatedJobs(params); + try { + clustersData = await dashboardCache.get(getClusters); + } catch (error) { + console.error('Error fetching clusters:', error); + } } + jobsResponse = await jobsCacheManager.getPaginatedJobs(params); // Always process the response, even if it's null const { diff --git a/sky/dashboard/src/components/users.jsx b/sky/dashboard/src/components/users.jsx index 57bb440b77e..48e6d8570ed 100644 --- a/sky/dashboard/src/components/users.jsx +++ b/sky/dashboard/src/components/users.jsx @@ -1337,23 +1337,29 @@ function UsersTable({ if (showLoading) setIsLoading(false); // Step 2: Load clusters and jobs in background and update counts - const [clustersData, managedJobsResponse] = await Promise.all([ - dashboardCache.get(getClusters), - dashboardCache.get(getManagedJobs, [ - { - allUsers: true, - skipFinished: true, - fields: [ - 'user_hash', - 'status', - 'accelerators', - 'job_name', - 'job_id', - 'infra', - ], - }, - ]), - ]); + let clustersData = []; + let managedJobsResponse = { jobs: [] }; + try { + [clustersData, managedJobsResponse] = await Promise.all([ + dashboardCache.get(getClusters), + dashboardCache.get(getManagedJobs, [ + { + allUsers: true, + skipFinished: true, + fields: [ + 'user_hash', + 'status', + 'accelerators', + 'job_name', + 'job_id', + 'infra', + ], + }, + ]), + ]); + } catch (error) { + console.error('Error fetching clusters and managed jobs:', error); + } const jobsData = managedJobsResponse.jobs || []; @@ -2332,16 +2338,28 @@ function ServiceAccountTokensView({ setTokens(tokensData || []); // Step 2: Fetch clusters and jobs data in parallel - const [clustersResponse, jobsResponse] = await Promise.all([ - dashboardCache.get(getClusters), - dashboardCache.get(getManagedJobs, [ - { - allUsers: true, - skipFinished: true, - fields: ['user_hash', 'status', 'accelerators', 'job_id', 'infra'], - }, - ]), - ]); + let clustersResponse = []; + let jobsResponse = { jobs: [] }; + try { + [clustersResponse, jobsResponse] = await Promise.all([ + dashboardCache.get(getClusters), + dashboardCache.get(getManagedJobs, [ + { + allUsers: true, + skipFinished: true, + fields: [ + 'user_hash', + 'status', + 'accelerators', + 'job_id', + 'infra', + ], + }, + ]), + ]); + } catch (error) { + console.error('Error fetching clusters and managed jobs:', error); + } const clustersData = clustersResponse || []; const jobsData = jobsResponse?.jobs || []; diff --git a/sky/dashboard/src/components/workspaces.jsx b/sky/dashboard/src/components/workspaces.jsx index 3cbfcc1aecb..95973d5f0dd 100644 --- a/sky/dashboard/src/components/workspaces.jsx +++ b/sky/dashboard/src/components/workspaces.jsx @@ -48,7 +48,10 @@ import { REFRESH_INTERVALS } from '@/lib/config'; import cachePreloader from '@/lib/cache-preloader'; import { apiClient } from '@/data/connectors/client'; import { sortData } from '@/data/utils'; -import { CLOUD_CANONICALIZATIONS } from '@/data/connectors/constants'; +import { + CLOUD_CANONICALIZATIONS, + CLUSTER_NOT_UP_ERROR, +} from '@/data/connectors/constants'; import Link from 'next/link'; // Workspace-aware API functions (cacheable) @@ -93,11 +96,9 @@ export async function getWorkspaceClusters(workspaceName) { ); return filteredClusters; } catch (error) { - console.error( - `Error fetching clusters for workspace ${workspaceName}:`, - error - ); - return []; + const msg = `Error fetching clusters for workspace ${workspaceName}: ${error}`; + console.error(msg); + throw new Error(msg); } } @@ -112,8 +113,41 @@ export async function getWorkspaceManagedJobs(workspaceName) { override_skypilot_config: { active_workspace: workspaceName }, }); + // Check if initial request succeeded + if (!response.ok) { + const msg = `Initial API request to get managed jobs failed with status ${response.status} for workspace ${workspaceName}`; + throw new Error(msg); + } + const id = response.headers.get('X-Skypilot-Request-ID'); + // Handle empty request ID + if (!id) { + const msg = `No request ID received from server for getting managed jobs for workspace ${workspaceName}`; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); + if (fetchedData.status === 500) { + try { + const data = await fetchedData.json(); + if (data.detail && data.detail.error) { + try { + const error = JSON.parse(data.detail.error); + // Handle specific error types + if (error.type && error.type === CLUSTER_NOT_UP_ERROR) { + return { jobs: [] }; + } + } catch (jsonError) { + console.error('Error parsing JSON:', jsonError); + } + } + } catch (parseError) { + console.error('Error parsing JSON:', parseError); + } + } + if (!fetchedData.ok) { + const msg = `API request to get managed jobs result failed with status ${fetchedData.status} for workspace ${workspaceName}`; + throw new Error(msg); + } const data = await fetchedData.json(); const jobsData = data.return_value ? JSON.parse(data.return_value) @@ -134,11 +168,9 @@ export async function getWorkspaceManagedJobs(workspaceName) { return jobsData; } catch (error) { - console.error( - `Error fetching managed jobs for workspace ${workspaceName}:`, - error - ); - return { jobs: [] }; + const msg = `Error fetching managed jobs for workspace ${workspaceName}: ${error}`; + console.error(msg); + throw new Error(msg); } } @@ -407,18 +439,28 @@ export function Workspaces() { // Fetch data for each workspace in parallel using workspace-aware API calls const workspaceDataPromises = configuredWorkspaceNames.map( async (wsName) => { - const [enabledClouds, clusters, managedJobs] = await Promise.all([ - dashboardCache.get(getEnabledClouds, [wsName]), - dashboardCache.get(getWorkspaceClusters, [wsName]), - dashboardCache.get(getWorkspaceManagedJobs, [wsName]), - ]); - - return { - workspaceName: wsName, - enabledClouds, - clusters: clusters || [], - managedJobs: managedJobs || { jobs: [] }, - }; + try { + const [enabledClouds, clusters, managedJobs] = await Promise.all([ + dashboardCache.get(getEnabledClouds, [wsName]), + dashboardCache.get(getWorkspaceClusters, [wsName]), + dashboardCache.get(getWorkspaceManagedJobs, [wsName]), + ]); + + return { + workspaceName: wsName, + enabledClouds, + clusters: clusters || [], + managedJobs: managedJobs || { jobs: [] }, + }; + } catch (error) { + console.error('Error fetching workspace data:', error); + return { + workspaceName: wsName, + enabledClouds: [], + clusters: [], + managedJobs: { jobs: [] }, + }; + } } ); diff --git a/sky/dashboard/src/data/connectors/client.js b/sky/dashboard/src/data/connectors/client.js index c45fa4760cb..606d411cc8c 100644 --- a/sky/dashboard/src/data/connectors/client.js +++ b/sky/dashboard/src/data/connectors/client.js @@ -4,32 +4,57 @@ import { ENDPOINT } from './constants'; export const apiClient = { fetch: async (path, body, method = 'POST') => { - const headers = - method === 'POST' - ? { - 'Content-Type': 'application/json', - } - : {}; + try { + const headers = + method === 'POST' + ? { + 'Content-Type': 'application/json', + } + : {}; - const baseUrl = window.location.origin; - const fullUrl = `${baseUrl}${ENDPOINT}${path}`; + const baseUrl = window.location.origin; + const fullUrl = `${baseUrl}${ENDPOINT}${path}`; - const response = await fetch(fullUrl, { - method, - headers, - body: method === 'POST' ? JSON.stringify(body) : undefined, - }); + const response = await fetch(fullUrl, { + method, + headers, + body: method === 'POST' ? JSON.stringify(body) : undefined, + }); + + // Check if initial request succeeded + if (!response.ok) { + const msg = `Initial API request ${path} failed with status ${response.status}`; + throw new Error(msg); + } - // Handle X-Request-ID for API requests - const id = - response.headers.get('X-Skypilot-Request-ID') || - response.headers.get('X-Request-ID'); + // Handle X-Request-ID for API requests + const id = + response.headers.get('X-Skypilot-Request-ID') || + response.headers.get('X-Request-ID'); - const fetchedData = await fetch( - `${baseUrl}${ENDPOINT}/api/get?request_id=${id}` - ); - const data = await fetchedData.json(); - return data.return_value ? JSON.parse(data.return_value) : []; + // Handle empty request ID + if (!id) { + const msg = `No request ID received from server for ${path}`; + throw new Error(msg); + } + + const fetchedData = await fetch( + `${baseUrl}${ENDPOINT}/api/get?request_id=${id}` + ); + + // Handle all error status codes (4xx, 5xx, etc.) + if (!fetchedData.ok) { + const msg = `API request to get ${path} result failed with status ${fetchedData.status}`; + throw new Error(msg); + } + + const data = await fetchedData.json(); + return data.return_value ? JSON.parse(data.return_value) : []; + } catch (error) { + const msg = `Error in apiClient.fetch for ${path}: ${error}`; + console.error(msg); + throw new Error(msg); + } }, // Helper method for POST requests @@ -51,6 +76,11 @@ export const apiClient = { // Helper method for streaming responses stream: async (path, body, onData) => { const response = await apiClient.post(path, body); + if (!response.ok) { + const msg = `API request ${path} failed with status ${response.status}`; + console.error(msg); + throw new Error(msg); + } const reader = response.body.getReader(); try { diff --git a/sky/dashboard/src/data/connectors/clusters.jsx b/sky/dashboard/src/data/connectors/clusters.jsx index f47976e32a9..2ea1026f567 100644 --- a/sky/dashboard/src/data/connectors/clusters.jsx +++ b/sky/dashboard/src/data/connectors/clusters.jsx @@ -111,7 +111,7 @@ export async function getClusters({ clusterNames = null } = {}) { return clusterData; } catch (error) { console.error('Error fetching clusters:', error); - return []; + throw error; } } @@ -186,7 +186,7 @@ export async function getClusterHistory(clusterHash = null, days = 30) { return historyData; } catch (error) { console.error('Error fetching cluster history:', error); - return []; + throw error; } } @@ -317,7 +317,7 @@ export async function getClusterJobs({ clusterName, workspace }) { return jobData; } catch (error) { console.error('Error fetching cluster jobs:', error); - return []; + throw error; } } @@ -339,8 +339,13 @@ export function useClusterDetails({ cluster, job = null }) { const data = await dashboardCache.get(getClusters, [ { clusterNames: [cluster] }, ]); - setClusterData(data[0]); // Assuming getClusters returns an array - return data[0]; // Return the data for use in fetchClusterJobData + if (data.length > 0) { + setClusterData(data[0]); // Assuming getClusters returns an array + return data[0]; // Return the data for use in fetchClusterJobData + } else { + console.error('No cluster data found for cluster:', cluster); + return null; + } } catch (error) { console.error('Error fetching cluster data:', error); return null; diff --git a/sky/dashboard/src/data/connectors/infra.jsx b/sky/dashboard/src/data/connectors/infra.jsx index b0447b12d36..616e58895af 100644 --- a/sky/dashboard/src/data/connectors/infra.jsx +++ b/sky/dashboard/src/data/connectors/infra.jsx @@ -8,11 +8,21 @@ export async function getCloudInfrastructure(forceRefresh = false) { const { getClusters } = await import('@/data/connectors/clusters'); const { getManagedJobs } = await import('@/data/connectors/jobs'); try { - const jobsData = await dashboardCache.get(getManagedJobs, [ - { allUsers: true, skipFinished: true, fields: ['cloud', 'region'] }, - ]); + let jobsData = { jobs: [] }; + try { + jobsData = await dashboardCache.get(getManagedJobs, [ + { allUsers: true, skipFinished: true, fields: ['cloud', 'region'] }, + ]); + } catch (error) { + console.error('Error fetching managed jobs:', error); + } const jobs = jobsData?.jobs || []; - const clustersData = await dashboardCache.get(getClusters); + let clustersData = []; + try { + clustersData = await dashboardCache.get(getClusters); + } catch (error) { + console.error('Error fetching clusters:', error); + } const clusters = clustersData || []; // Get enabled clouds let enabledCloudsList = []; @@ -22,14 +32,26 @@ export async function getCloudInfrastructure(forceRefresh = false) { console.log('Force refreshing clouds by running sky check...'); try { const checkResponse = await apiClient.post('/check', {}); + if (!checkResponse.ok) { + const msg = `Failed to run sky check with status ${checkResponse.status}`; + throw new Error(msg); + } const checkId = checkResponse.headers.get('X-Skypilot-Request-ID') || checkResponse.headers.get('X-Request-ID'); + if (!checkId) { + const msg = 'No request ID received from server for sky check'; + throw new Error(msg); + } // Wait for the check to complete const checkResult = await apiClient.get( `/api/get?request_id=${checkId}` ); + if (!checkResult.ok) { + const msg = `Failed to get sky check result with status ${checkResult.status}`; + throw new Error(msg); + } const checkData = await checkResult.json(); console.log('Sky check completed:', checkData); } catch (checkError) { @@ -39,11 +61,22 @@ export async function getCloudInfrastructure(forceRefresh = false) { } const enabledCloudsResponse = await apiClient.get(`/enabled_clouds`); - + if (!enabledCloudsResponse.ok) { + const msg = `Failed to get enabled clouds with status ${enabledCloudsResponse.status}`; + throw new Error(msg); + } const id = enabledCloudsResponse.headers.get('X-Skypilot-Request-ID') || enabledCloudsResponse.headers.get('X-Request-ID'); + if (!id) { + const msg = 'No request ID received from server for enabled clouds'; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); + if (!fetchedData.ok) { + const msg = `Failed to get enabled clouds result with status ${fetchedData.status}`; + throw new Error(msg); + } const data = await fetchedData.json(); enabledCloudsList = data.return_value ? JSON.parse(data.return_value) @@ -113,11 +146,7 @@ export async function getCloudInfrastructure(forceRefresh = false) { }; } catch (error) { console.error('Error fetching cloud infrastructure:', error); - return { - clouds: [], - totalClouds: CLOUDS_LIST.length, - enabledClouds: 0, - }; + throw error; } } @@ -239,12 +268,7 @@ export async function getWorkspaceInfrastructure() { `Failed to fetch infrastructure for workspace ${workspaceName}:`, error ); - workspaceInfraData[workspaceName] = { - config: workspaceConfig, - clouds: [], - contexts: [], - error: error.message, - }; + throw error; } } ) @@ -253,17 +277,36 @@ export async function getWorkspaceInfrastructure() { // Step 3: Get detailed GPU information for all contexts const { getClusters } = await import('@/data/connectors/clusters'); const dashboardCache = (await import('@/lib/cache')).default; - const clustersData = await dashboardCache.get(getClusters); + let clustersData = []; + try { + clustersData = await dashboardCache.get(getClusters); + } catch (error) { + console.error('Error fetching clusters:', error); + } const clusters = clustersData || []; // Get context stats (cluster counts) - const contextStats = await getContextClusters(clusters); + let contextStats = {}; + try { + contextStats = await getContextClusters(clusters); + } catch (error) { + console.error('Error fetching context clusters:', error); + } // Get GPU data for all contexts (filter out any undefined contexts) const validContexts = [...new Set(allContextsAcrossWorkspaces)].filter( (context) => context && typeof context === 'string' ); - const gpuData = await getKubernetesGPUsFromContexts(validContexts); + let gpuData = { + allGPUs: [], + perContextGPUs: [], + perNodeGPUs: [], + }; + try { + gpuData = await getKubernetesGPUsFromContexts(validContexts); + } catch (error) { + console.error('Error fetching Kubernetes GPUs:', error); + } const finalResult = { workspaces: workspaceInfraData, @@ -283,16 +326,7 @@ export async function getWorkspaceInfrastructure() { } catch (error) { console.error('[DEBUG] Failed to fetch workspace infrastructure:', error); console.error('[DEBUG] Error stack:', error.stack); - return { - workspaces: {}, - allContextNames: [], - allGPUs: [], - perContextGPUs: [], - perNodeGPUs: [], - contextStats: {}, - contextWorkspaceMap: {}, - error: error.message, - }; + throw error; } } @@ -455,61 +489,7 @@ async function getKubernetesGPUsFromContexts(contextNames) { }; } catch (error) { console.error('[infra.jsx] Error in getKubernetesGPUsFromContexts:', error); - return { - allGPUs: [], - perContextGPUs: [], - perNodeGPUs: [], - }; - } -} - -async function getAllContexts() { - try { - const response = await apiClient.get(`/all_contexts`); - if (!response.ok) { - console.error( - `Error fetching all contexts: ${response.status} ${response.statusText}` - ); - return []; - } - const id = - response.headers.get('X-Skypilot-Request-ID') || - response.headers.get('x-request-id'); - if (!id) { - console.error('No request ID returned for /all_contexts'); - return []; - } - const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); - const data = await fetchedData.json(); - return data.return_value ? JSON.parse(data.return_value) : []; - } catch (error) { - console.error('[infra.jsx] Error in getAllContexts:', error); - return []; - } -} - -async function getAllContextsForUser() { - try { - const response = await apiClient.get(`/all_contexts_for_user`); - if (!response.ok) { - console.error( - `Error fetching all contexts for user: ${response.status} ${response.statusText}` - ); - return []; - } - const id = - response.headers.get('X-Skypilot-Request-ID') || - response.headers.get('x-request-id'); - if (!id) { - console.error('No request ID returned for /all_contexts_for_user'); - return []; - } - const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); - const data = await fetchedData.json(); - return data.return_value ? JSON.parse(data.return_value) : []; - } catch (error) { - console.error('[infra.jsx] Error in getAllContextsForUser:', error); - return []; + throw error; } } @@ -518,9 +498,17 @@ async function getKubernetesPerNodeGPUs(context) { const response = await apiClient.post(`/kubernetes_node_info`, { context: context, }); + if (!response.ok) { + const msg = `Failed to get kubernetes node info with status ${response.status}`; + throw new Error(msg); + } const id = response.headers.get('X-Skypilot-Request-ID') || response.headers.get('x-request-id'); + if (!id) { + const msg = 'No request ID received from server for kubernetes node info'; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); if (fetchedData.status === 500) { try { @@ -528,10 +516,8 @@ async function getKubernetesPerNodeGPUs(context) { if (data.detail && data.detail.error) { try { const error = JSON.parse(data.detail.error); - console.warn( - `[infra.jsx] Context ${context} unavailable:`, - error.message - ); + const msg = `Context ${context} unavailable: ${error.message}`; + throw new Error(msg); } catch (jsonError) { console.error('Error parsing JSON:', jsonError); } @@ -539,7 +525,10 @@ async function getKubernetesPerNodeGPUs(context) { } catch (parseError) { console.error('Error parsing JSON:', parseError); } - return {}; + } + if (!fetchedData.ok) { + const msg = `Failed to get kubernetes node info result with status ${fetchedData.status}`; + throw new Error(msg); } const data = await fetchedData.json(); const nodeInfo = data.return_value ? JSON.parse(data.return_value) : {}; @@ -550,7 +539,7 @@ async function getKubernetesPerNodeGPUs(context) { `[infra.jsx] Context ${context} unavailable or timed out:`, error.message ); - return {}; + throw error; } } @@ -595,7 +584,7 @@ export async function getContextJobs(jobs) { return contextStats; } catch (error) { console.error('=== Error in getContextJobs ===', error); - return {}; + throw error; } } @@ -638,208 +627,7 @@ export async function getContextClusters(clusters) { return contextStats; } catch (error) { console.error('=== Error in getContextClusters ===', error); - return {}; - } -} - -async function getKubernetesGPUs(clusters) { - try { - // 1. Fetch all context names (Kubernetes + SSH) with workspace information - const allAvailableContextsWithWorkspaces = await getAllContextsForUser(); - - if ( - !allAvailableContextsWithWorkspaces || - allAvailableContextsWithWorkspaces.length === 0 - ) { - console.log('No contexts found from /all_contexts_for_user endpoint.'); - return { - allContextNames: [], - allGPUs: [], - perContextGPUs: [], - perNodeGPUs: [], - contextStats: {}, - contextWorkspaceMap: {}, - }; - } - - // Extract unique context names for backward compatibility - const allAvailableContextNames = [ - ...new Set(allAvailableContextsWithWorkspaces.map((ctx) => ctx.context)), - ]; - - // Create a mapping of context to workspaces - const contextWorkspaceMap = {}; - allAvailableContextsWithWorkspaces.forEach((ctx) => { - if (!contextWorkspaceMap[ctx.context]) { - contextWorkspaceMap[ctx.context] = []; - } - if (!contextWorkspaceMap[ctx.context].includes(ctx.workspace)) { - contextWorkspaceMap[ctx.context].push(ctx.workspace); - } - }); - - // 2. Fetch cluster counts per context - const contextStats = await getContextClusters(clusters); - - const allGPUsSummary = {}; - const perContextGPUsData = {}; - const perNodeGPUs_dict = {}; - - // Get all of the node info for all contexts in parallel and put them - // in a dictionary keyed by context name. - const contextNodeInfoList = await Promise.all( - allAvailableContextNames.map((context) => - getKubernetesPerNodeGPUs(context) - ) - ); - const contextToNodeInfo = {}; - for (let i = 0; i < allAvailableContextNames.length; i++) { - contextToNodeInfo[allAvailableContextNames[i]] = contextNodeInfoList[i]; - } - - // 3: Populate the gpuToData map for each context. - for (const context of allAvailableContextNames) { - const nodeInfoForContext = contextToNodeInfo[context] || {}; - if (nodeInfoForContext && Object.keys(nodeInfoForContext).length > 0) { - const gpuToData = {}; - for (const nodeName in nodeInfoForContext) { - const nodeData = nodeInfoForContext[nodeName]; - const gpuName = nodeData['accelerator_type']; - const totalCount = nodeData['total']['accelerator_count']; - const freeCount = nodeData['free']['accelerators_available']; - if (totalCount > 0) { - if (!gpuToData[gpuName]) { - gpuToData[gpuName] = { - gpu_name: gpuName, - gpu_requestable_qty_per_node: 0, - gpu_total: 0, - gpu_free: 0, - context: context, - }; - } - gpuToData[gpuName].gpu_total += totalCount; - gpuToData[gpuName].gpu_free += freeCount; - gpuToData[gpuName].gpu_requestable_qty_per_node = totalCount; - } - } - perContextGPUsData[context] = Object.values(gpuToData); - for (const gpuName in gpuToData) { - if (gpuName in allGPUsSummary) { - allGPUsSummary[gpuName].gpu_total += gpuToData[gpuName].gpu_total; - allGPUsSummary[gpuName].gpu_free += gpuToData[gpuName].gpu_free; - } else { - allGPUsSummary[gpuName] = { - gpu_total: gpuToData[gpuName].gpu_total, - gpu_free: gpuToData[gpuName].gpu_free, - gpu_name: gpuName, - }; - } - } - } - } - - // 4: Populate the perNodeGPUs_dict map for each context. - for (const context of allAvailableContextNames) { - const nodeInfoForContext = contextToNodeInfo[context]; - if (nodeInfoForContext && Object.keys(nodeInfoForContext).length > 0) { - for (const nodeName in nodeInfoForContext) { - const nodeData = nodeInfoForContext[nodeName]; - // Ensure accelerator_type, total, and free fields exist or provide defaults - const acceleratorType = nodeData['accelerator_type'] || '-'; - const totalAccelerators = - nodeData['total']?.['accelerator_count'] ?? 0; - const freeAccelerators = - nodeData['free']?.['accelerators_available'] ?? 0; - - perNodeGPUs_dict[`${context}/${nodeName}`] = { - node_name: nodeData['name'], - gpu_name: acceleratorType, - gpu_total: totalAccelerators, - gpu_free: freeAccelerators, - ip_address: nodeData['ip_address'] || null, - context: context, - }; - - // If this node provides a GPU type not found via GPU availability, - // add it to perContextGPUsData with 0/0 counts if it's not already there. - // This helps list CPU-only nodes or nodes with GPUs not picked by availability check. - if ( - acceleratorType !== '-' && - !perContextGPUsData[context].some( - (gpu) => gpu.gpu_name === acceleratorType - ) - ) { - if (!(acceleratorType in allGPUsSummary)) { - allGPUsSummary[acceleratorType] = { - gpu_total: 0, // Initialize with 0, will be summed up if multiple nodes have this - gpu_free: 0, - gpu_name: acceleratorType, - }; - } - // This ensures the GPU type is listed under the context, even if availability check missed it. - // We can't reliably sum total/free here from nodeInfo alone for per-context summary - // if the GPU availability check is the source of truth for those numbers. - // However, we must ensure the accelerator type is listed. - const existingGpuEntry = perContextGPUsData[context].find( - (gpu) => gpu.gpu_name === acceleratorType - ); - if (!existingGpuEntry) { - perContextGPUsData[context].push({ - gpu_name: acceleratorType, - gpu_requestable_qty_per_node: '-', // Or derive if possible - gpu_total: 0, // Placeholder, actual totals come from availability - gpu_free: 0, // Placeholder - context: context, - }); - } - } - } - } - // If after processing nodes and GPU availability, a context has no GPUs listed - // but nodes were found, ensure it appears in perContext data (e.g. for CPU only nodes) - if ( - perContextGPUsData[context].length === 0 && - nodeInfoForContext && - Object.keys(nodeInfoForContext).length > 0 - ) { - // This indicates a CPU-only context or one where GPU detection failed in availability check - // but nodes are present. It's already handled by allAvailableContextNames. - // We might add a placeholder if needed for UI consistency, but `allContextNames` should list it. - } - } - - const result = { - allContextNames: allAvailableContextNames.sort(), - allGPUs: Object.values(allGPUsSummary).sort((a, b) => - a.gpu_name.localeCompare(b.gpu_name) - ), - perContextGPUs: Object.values(perContextGPUsData) - .flat() - .sort( - (a, b) => - a.context.localeCompare(b.context) || - a.gpu_name.localeCompare(b.gpu_name) - ), - perNodeGPUs: Object.values(perNodeGPUs_dict).sort( - (a, b) => - a.context.localeCompare(b.context) || - a.node_name.localeCompare(b.node_name) || - a.gpu_name.localeCompare(b.gpu_name) - ), - contextStats: contextStats, - contextWorkspaceMap: contextWorkspaceMap, - }; - return result; - } catch (error) { - console.error('[infra.jsx] Outer error in getKubernetesGPUs:', error); - return { - allContextNames: [], - allGPUs: [], - perContextGPUs: [], - perNodeGPUs: [], - contextStats: {}, - contextWorkspaceMap: {}, - }; + throw error; } } @@ -849,9 +637,17 @@ export async function getCloudGPUs() { clouds: CLOUDS_LIST, gpus_only: true, }); + if (!response.ok) { + const msg = `Failed to get cloud GPUs with status ${response.status}`; + throw new Error(msg); + } const id = response.headers.get('X-Skypilot-Request-ID') || response.headers.get('x-request-id'); + if (!id) { + const msg = 'No request ID received from server for cloud GPUs'; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); if (fetchedData.status === 500) { try { @@ -859,7 +655,8 @@ export async function getCloudGPUs() { if (data.detail && data.detail.error) { try { const error = JSON.parse(data.detail.error); - console.error('Error fetching cloud GPUs:', error.message); + const msg = `Error fetching cloud GPUs: ${error.message}`; + throw new Error(msg); } catch (jsonError) { console.error('Error parsing JSON:', jsonError); } @@ -867,11 +664,10 @@ export async function getCloudGPUs() { } catch (parseError) { console.error('Error parsing JSON:', parseError); } - return { - commonGPUs: [], - tpus: [], - otherGPUs: [], - }; + } + if (!fetchedData.ok) { + const msg = `Failed to get cloud GPUs result with status ${fetchedData.status}`; + throw new Error(msg); } const data = await fetchedData.json(); const allGPUs = data.return_value ? JSON.parse(data.return_value) : {}; @@ -903,11 +699,7 @@ export async function getCloudGPUs() { }; } catch (error) { console.error('Error fetching cloud GPUs:', error); - return { - commonGPUs: [], - tpus: [], - otherGPUs: [], - }; + throw error; } } @@ -937,14 +729,21 @@ export async function getDetailedGpuInfo(filter) { case_sensitive: false, all_regions: true, }); + if (!response.ok) { + const msg = `Failed to get detailed GPU info with status ${response.status}`; + throw new Error(msg); + } const id = response.headers.get('X-Skypilot-Request-ID') || response.headers.get('X-Request-ID'); + if (!id) { + const msg = 'No request ID received from server for detailed GPU info'; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); - - if (fetchedData.status === 500) { - console.error('Error fetching detailed GPU info: Server error'); - return []; + if (!fetchedData.ok) { + const msg = `Failed to get detailed GPU info result with status ${fetchedData.status}`; + throw new Error(msg); } const data = await fetchedData.json(); @@ -970,7 +769,7 @@ export async function getDetailedGpuInfo(filter) { ); } catch (parseError) { console.error('Error parsing GPU data:', parseError); - return []; + throw parseError; } const formattedData = []; @@ -1093,6 +892,6 @@ export async function getDetailedGpuInfo(filter) { }); } catch (error) { console.error('Outer error in getDetailedGpuInfo:', error); - return []; + throw error; } } diff --git a/sky/dashboard/src/data/connectors/jobs.jsx b/sky/dashboard/src/data/connectors/jobs.jsx index a801c050238..1292cb91f69 100644 --- a/sky/dashboard/src/data/connectors/jobs.jsx +++ b/sky/dashboard/src/data/connectors/jobs.jsx @@ -73,7 +73,16 @@ export async function getManagedJobs(options = {}) { } const response = await apiClient.post(`/jobs/queue/v2`, body); + if (!response.ok) { + const msg = `Failed to get managed jobs with status ${response.status}`; + throw new Error(msg); + } const id = response.headers.get('X-Skypilot-Request-ID'); + // Handle empty request ID + if (!id) { + const msg = 'No request ID received from server for managed jobs'; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); if (fetchedData.status === 500) { try { @@ -92,13 +101,11 @@ export async function getManagedJobs(options = {}) { } catch (parseError) { console.error('Error parsing JSON:', parseError); } - // For non-CLUSTER_NOT_UP 500 errors, signal cache to skip update - return { - __skipCache: true, - jobs: [], - total: 0, - controllerStopped: false, - }; + } + // Handle all error status codes (4xx, 5xx, etc.) + if (!fetchedData.ok) { + const msg = `API request to get managed jobs result failed with status ${fetchedData.status}`; + throw new Error(msg); } // print out the response for debugging const data = await fetchedData.json(); @@ -215,14 +222,7 @@ export async function getManagedJobs(options = {}) { } catch (error) { console.error('Error fetching managed job data:', error); // Signal to the cache to not overwrite previously cached data - return { - __skipCache: true, - jobs: [], - total: 0, - totalNoFilter: 0, - controllerStopped: false, - statusCounts: {}, - }; + throw error; } } @@ -301,8 +301,7 @@ export async function getManagedJobsWithClientPagination(options) { 'Error fetching managed job data with client pagination:', error ); - // Signal to the cache to not overwrite previously cached data - return { __skipCache: true, jobs: [], controllerStopped: false, total: 0 }; + throw error; } } @@ -311,7 +310,15 @@ export async function getPoolStatus() { const response = await apiClient.post(`/jobs/pool_status`, { pool_names: null, // null means get all pools }); + if (!response.ok) { + const msg = `Initial API request to get pool status failed with status ${response.status}`; + throw new Error(msg); + } const id = response.headers.get('X-Skypilot-Request-ID'); + if (!id) { + const msg = 'No request ID received from server for getting pool status'; + throw new Error(msg); + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); if (fetchedData.status === 500) { @@ -330,7 +337,11 @@ export async function getPoolStatus() { } catch (dataError) { console.error('Failed to parse response JSON:', dataError); } - throw new Error('Server error'); + } + + if (!fetchedData.ok) { + const msg = `API request to get pool status result failed with status ${fetchedData.status}`; + throw new Error(msg); } // Parse the pools data from the response @@ -575,7 +586,12 @@ export async function handleJobAction(action, jobId, cluster) { logStarter = 'Restarting'; logMiddle = 'restarted'; apiPath = 'jobs/queue/v2'; - requestBody = { all_users: true, refresh: true }; + requestBody = { + all_users: true, + refresh: true, + skip_finished: true, + fields: ['status'], + }; jobId = 'controller'; break; default: @@ -597,8 +613,26 @@ export async function handleJobAction(action, jobId, cluster) { }, body: JSON.stringify(requestBody), }); + if (!response.ok) { + console.error( + `Initial API request ${apiPath} failed with status ${response.status}` + ); + showToast( + `${logStarter} job ${jobId} failed with status ${response.status}.`, + 'error' + ); + return; + } const id = response.headers.get('X-Skypilot-Request-ID'); + if (!id) { + console.error(`No request ID received from server for ${apiPath}`); + showToast( + `${logStarter} job ${jobId} failed with no request ID.`, + 'error' + ); + return; + } const finalResponse = await fetch( `${fullEndpoint}/api/get?request_id=${id}` ); diff --git a/sky/dashboard/src/data/connectors/users.js b/sky/dashboard/src/data/connectors/users.js index e47737fb46f..bd8997af1ea 100644 --- a/sky/dashboard/src/data/connectors/users.js +++ b/sky/dashboard/src/data/connectors/users.js @@ -1,33 +1,10 @@ import { apiClient } from '@/data/connectors/client'; -// Helper functions for username parsing -const parseUsername = (username, userId) => { - if (username && username.includes('@')) { - return username.split('@')[0]; - } - // If no email, show username with userId in parentheses only if they're different - const usernameBase = username || 'N/A'; - - // Skip showing userId if it's the same as username - if (userId && userId !== usernameBase) { - return `${usernameBase} (${userId})`; - } - - return usernameBase; -}; - -const getFullEmail = (username) => { - if (username && username.includes('@')) { - return username; - } - return '-'; -}; - export async function getUsers() { try { const response = await apiClient.get(`/users`); if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); + throw new Error(`Failed to fetch users with status ${response.status}`); } const data = await response.json(); // Data from API is: [{ id: 'user_hash', name: 'username' }, ...] @@ -42,6 +19,6 @@ export async function getUsers() { ); } catch (error) { console.error('Failed to fetch users:', error); - return []; // Return empty array on error + throw error; } } diff --git a/sky/dashboard/src/data/connectors/volumes.js b/sky/dashboard/src/data/connectors/volumes.js index 265679c1fed..3c1e29695b1 100644 --- a/sky/dashboard/src/data/connectors/volumes.js +++ b/sky/dashboard/src/data/connectors/volumes.js @@ -42,7 +42,7 @@ export async function getVolumes() { return transformedData; } catch (error) { console.error('Failed to fetch volumes:', error); - return []; + throw error; } } @@ -52,30 +52,35 @@ export async function deleteVolume(volumeName) { const response = await apiClient.post('/volumes/delete', { names: [volumeName], }); + if (!response.ok) { + console.error( + `Initial API request to delete volume failed with status ${response.status}` + ); + return { + success: false, + msg: `Failed to delete volume with status ${response.status}`, + }; + } const id = response.headers.get('X-SkyPilot-Request-ID') || response.headers.get('X-Request-ID'); + if (!id) { + console.error('No request ID received from server for deleting volume'); + return { + success: false, + msg: 'No request ID received from server for deleting volume', + }; + } const fetchedData = await apiClient.get(`/api/get?request_id=${id}`); - if (fetchedData.status === 500) { - try { - const data = await fetchedData.json(); - if (data.detail && data.detail.error) { - try { - const error = JSON.parse(data.detail.error); - // Handle specific error types - msg = error.message; - } catch (jsonError) { - console.error('Error parsing JSON:', jsonError); - } - } - } catch (parseError) { - console.error('Error parsing JSON:', parseError); - } + if (!fetchedData.ok) { + msg = `Failed to delete volume with status ${fetchedData.status}`; + console.error(msg); return { success: false, msg: msg }; } return { success: true }; } catch (error) { - console.error('Failed to delete volume:', error); - return { success: false, msg: error.message }; + msg = `Failed to delete volume: ${error}`; + console.error(msg); + return { success: false, msg: msg }; } } diff --git a/sky/dashboard/src/lib/cache.js b/sky/dashboard/src/lib/cache.js index aecdee2c335..62c6bac2f8d 100644 --- a/sky/dashboard/src/lib/cache.js +++ b/sky/dashboard/src/lib/cache.js @@ -123,7 +123,7 @@ class DashboardCache { // If fetch fails and we have stale data, return stale data if (cachedItem) { console.warn( - `Failed to fetch fresh data for ${key}, returning stale data:`, + `Failed to fetch fresh data for ${key}/${functionName}, returning stale data:`, error ); return cachedItem.data; diff --git a/sky/dashboard/src/lib/jobs-cache-manager.js b/sky/dashboard/src/lib/jobs-cache-manager.js index a966eab0952..a0dffc44ae8 100644 --- a/sky/dashboard/src/lib/jobs-cache-manager.js +++ b/sky/dashboard/src/lib/jobs-cache-manager.js @@ -178,15 +178,7 @@ class JobsCacheManager { }; } catch (error) { console.error('Error in getPaginatedJobs:', error); - return { - jobs: [], - total: 0, - totalNoFilter: 0, - controllerStopped: false, - statusCounts: {}, - fromCache: false, - cacheStatus: 'error', - }; + throw error; } }