diff --git a/README.md b/README.md index 0da7a9ddd..bb34e5491 100644 --- a/README.md +++ b/README.md @@ -352,7 +352,7 @@ In case multi-cluster support is enabled (default) and you have access to multip - `graphType` (`string`) - Type of graph to return: 'versionedApp', 'app', 'service', 'workload', 'mesh'. Default: 'versionedApp' - `namespace` (`string`) - Optional single namespace to include in the graph (alternative to namespaces) - `namespaces` (`string`) - Optional comma-separated list of namespaces to include in the graph - - `rateInterval` (`string`) - Rate interval for fetching (e.g., '10m', '5m', '1h'). Default: '60s' + - `rateInterval` (`string`) - Rate interval for fetching (e.g., '10m', '5m', '1h'). Default: '10m' - **kiali_manage_istio_config** - Manages Istio configuration objects (Gateways, VirtualServices, etc.). Can list (objects and validations), get, create, patch, or delete objects - `action` (`string`) **(required)** - Action to perform: list, get, create, patch, or delete @@ -374,7 +374,7 @@ In case multi-cluster support is enabled (default) and you have access to multip - `duration` (`string`) - Time range to get metrics for (optional string - if provided, gets metrics; if empty, get default 1800s). - `namespace` (`string`) **(required)** - Namespace to get resources from - `quantiles` (`string`) - Comma-separated list of quantiles for histogram metrics (e.g., '0.5,0.95,0.99'). Optional - - `rateInterval` (`string`) - Rate interval for metrics (e.g., '1m', '5m'). Optional, defaults to '1m' + - `rateInterval` (`string`) - Rate interval for metrics (e.g., '1m', '5m'). Optional, defaults to '10m' - `reporter` (`string`) - Metrics reporter: 'source', 'destination', or 'both'. Optional, defaults to 'source' - `requestProtocol` (`string`) - Filter by request protocol (e.g., 'http', 'grpc', 'tcp'). 
Optional - `resource_name` (`string`) **(required)** - Name of the resource to get details for (optional string - if provided, gets details; if empty, lists all). diff --git a/pkg/kiali/defaults.go b/pkg/kiali/defaults.go new file mode 100644 index 000000000..d9c1af69d --- /dev/null +++ b/pkg/kiali/defaults.go @@ -0,0 +1,8 @@ +package kiali + +// Default values for Kiali API parameters shared across this package. +const ( + // DefaultRateInterval is the default rate interval for fetching error rates and metrics. + // This value is used when rateInterval is not explicitly provided in API calls. + DefaultRateInterval = "10m" +) diff --git a/pkg/kiali/get_mesh_graph.go b/pkg/kiali/get_mesh_graph.go index 8718c8150..f45609697 100644 --- a/pkg/kiali/get_mesh_graph.go +++ b/pkg/kiali/get_mesh_graph.go @@ -7,20 +7,24 @@ import ( "sync" ) +// GetMeshGraphResponse contains the combined response from multiple Kiali API endpoints. +// Note: Health data is fetched from Kiali's health API and used internally to compute +// MeshHealthSummary, but the raw health data is not included in the response to reduce payload size. +// MeshHealthSummary contains all the key aggregated metrics needed for mesh health overview. type GetMeshGraphResponse struct { - Graph json.RawMessage `json:"graph,omitempty"` - Health json.RawMessage `json:"health,omitempty"` - MeshStatus json.RawMessage `json:"mesh_status,omitempty"` - Namespaces json.RawMessage `json:"namespaces,omitempty"` - Errors map[string]string `json:"errors,omitempty"` + Graph json.RawMessage `json:"graph,omitempty"` + MeshStatus json.RawMessage `json:"mesh_status,omitempty"` + Namespaces json.RawMessage `json:"namespaces,omitempty"` + MeshHealthSummary *MeshHealthSummary `json:"mesh_health_summary,omitempty"` // Aggregated summary computed from health data + Errors map[string]string `json:"errors,omitempty"` } // GetMeshGraph fetches multiple Kiali endpoints in parallel and returns a combined response. 
// Each field in the response corresponds to one API call result. // - graph: /api/namespaces/graph (optionally filtered by namespaces) -// - health: /api/clusters/health (optionally filtered by namespaces and queryParams) -// - status(mesh):/api/mesh/graph +// - mesh_status: /api/mesh/graph // - namespaces: /api/namespaces +// - mesh_health_summary: computed from /api/clusters/health (health data is fetched but not included in response) func (k *Kiali) GetMeshGraph(ctx context.Context, namespaces []string, queryParams map[string]string) (string, error) { cleaned := make([]string, 0, len(namespaces)) for _, ns := range namespaces { @@ -51,7 +55,7 @@ func (k *Kiali) GetMeshGraph(ctx context.Context, namespaces []string, queryPara resp.Graph = data }() - // Health + // Health - compute MeshHealthSummary inside the goroutine go func() { defer wg.Done() data, err := k.getHealth(ctx, cleaned, queryParams) @@ -61,7 +65,13 @@ func (k *Kiali) GetMeshGraph(ctx context.Context, namespaces []string, queryPara errorsMu.Unlock() return } - resp.Health = data + // Compute mesh health summary from health data + if len(data) > 0 { + summary := computeMeshHealthSummary(data, cleaned, queryParams) + if summary != nil { + resp.MeshHealthSummary = summary + } + } }() // Mesh status diff --git a/pkg/kiali/graph.go b/pkg/kiali/graph.go index c699df2d5..b7a2c717c 100644 --- a/pkg/kiali/graph.go +++ b/pkg/kiali/graph.go @@ -18,7 +18,7 @@ func (k *Kiali) Graph(ctx context.Context, namespaces []string, queryParams map[ q := u.Query() // Static graph parameters per requirements // Defaults with optional overrides via queryParams - duration := "60s" + duration := DefaultRateInterval graphType := "versionedApp" if v, ok := queryParams["rateInterval"]; ok && strings.TrimSpace(v) != "" { duration = strings.TrimSpace(v) diff --git a/pkg/kiali/health.go b/pkg/kiali/health.go index 15aba2e34..b5d17a4f7 100644 --- a/pkg/kiali/health.go +++ b/pkg/kiali/health.go @@ -12,7 +12,7 @@ import ( // - 
namespaces: comma-separated list of namespaces (optional, if empty returns health for all accessible namespaces) // - queryParams: optional query parameters map for filtering health data (e.g., "type", "rateInterval", "queryTime") // - type: health type - "app", "service", or "workload" (default: "app") -// - rateInterval: rate interval for fetching error rate (default: "1m") +// - rateInterval: rate interval for fetching error rate (default: DefaultRateInterval, which is "10m") // - queryTime: Unix timestamp for the prometheus query (optional) func (k *Kiali) Health(ctx context.Context, namespaces string, queryParams map[string]string) (string, error) { // Build query parameters @@ -34,14 +34,18 @@ func (k *Kiali) Health(ctx context.Context, namespaces string, queryParams map[s } } - // Ensure health "type" aligns with graphType (versionedApp -> app) + // Ensure health "type" aligns with graphType (versionedApp -> app, mesh -> app) + // The Kiali health API only accepts "app", "service", or "workload" as valid types healthType := "app" if gt, ok := queryParams["graphType"]; ok && strings.TrimSpace(gt) != "" { v := strings.TrimSpace(gt) if strings.EqualFold(v, "versionedApp") { healthType = "app" - } else { + } else if v == "workload" || v == "service" { healthType = v + } else { + // For "mesh" or any other graphType, default to "app" + healthType = "app" } } q.Set("type", healthType) diff --git a/pkg/kiali/health_calculation.go b/pkg/kiali/health_calculation.go new file mode 100644 index 000000000..ea9b6f2ea --- /dev/null +++ b/pkg/kiali/health_calculation.go @@ -0,0 +1,626 @@ +package kiali + +import ( + "encoding/json" + "fmt" + "strings" + "time" +) + +// computeMeshHealthSummary processes the health JSON and creates an aggregated summary. +// The health data corresponds to the type specified in queryParams (app, workload, or service). 
+func computeMeshHealthSummary(healthData json.RawMessage, requestedNamespaces []string, queryParams map[string]string) *MeshHealthSummary { + // Determine the health type from queryParams (defaults to "app") + healthType := "app" + if gt, ok := queryParams["graphType"]; ok && strings.TrimSpace(gt) != "" { + v := strings.TrimSpace(gt) + if strings.EqualFold(v, "versionedApp") { + healthType = "app" + } else if v == "workload" || v == "service" { + healthType = v + } + } + + rateInterval := queryParams["rateInterval"] + if rateInterval == "" { + rateInterval = DefaultRateInterval + } + + // Parse the health JSON into ClustersNamespaceHealth structure + var clustersHealth ClustersNamespaceHealth + if err := json.Unmarshal(healthData, &clustersHealth); err != nil { + // If parsing fails, return empty summary + return &MeshHealthSummary{ + EntityCounts: EntityHealthCounts{}, + NamespaceSummary: make(map[string]NamespaceSummary), + TopUnhealthy: []UnhealthyEntity{}, + Timestamp: time.Now().UTC().Format(time.RFC3339), + RateInterval: rateInterval, + } + } + + // Create empty health structures for types we don't have + emptyHealth := ClustersNamespaceHealth{ + AppHealth: make(map[string]NamespaceAppHealth), + ServiceHealth: make(map[string]NamespaceServiceHealth), + WorkloadHealth: make(map[string]NamespaceWorkloadHealth), + } + + // Use the appropriate health data based on type + var appHealth, svcHealth, wlHealth ClustersNamespaceHealth + switch healthType { + case "app": + appHealth = clustersHealth + svcHealth = emptyHealth + wlHealth = emptyHealth + case "service": + appHealth = emptyHealth + svcHealth = clustersHealth + wlHealth = emptyHealth + case "workload": + appHealth = emptyHealth + svcHealth = emptyHealth + wlHealth = clustersHealth + default: + appHealth = clustersHealth + svcHealth = emptyHealth + wlHealth = emptyHealth + } + + // Aggregate the selected health data into the mesh-wide summary + summary := computeHealthSummary(appHealth, svcHealth, wlHealth, 
rateInterval) + + return &summary +} + +// computeHealthSummary aggregates app, service and workload health into a MeshHealthSummary +func computeHealthSummary( + appHealth ClustersNamespaceHealth, + svcHealth ClustersNamespaceHealth, + wlHealth ClustersNamespaceHealth, + rateInterval string, +) MeshHealthSummary { + summary := MeshHealthSummary{ + EntityCounts: EntityHealthCounts{}, + NamespaceSummary: make(map[string]NamespaceSummary), + TopUnhealthy: []UnhealthyEntity{}, + Timestamp: time.Now().UTC().Format(time.RFC3339), + RateInterval: rateInterval, + } + + // Collect all namespace names + nsSet := make(map[string]bool) + for ns := range appHealth.AppHealth { + nsSet[ns] = true + } + for ns := range svcHealth.ServiceHealth { + nsSet[ns] = true + } + for ns := range wlHealth.WorkloadHealth { + nsSet[ns] = true + } + summary.NamespaceCount = len(nsSet) + + // Aggregate per namespace + for ns := range nsSet { + nsSummary := NamespaceSummary{} + + // Process apps + if nsApps, ok := appHealth.AppHealth[ns]; ok { + for appName, app := range nsApps { + summary.EntityCounts.Apps.Total++ + nsSummary.Apps.Total++ + + status, issue := evaluateAppHealth(app) + switch status { + case "HEALTHY": + summary.EntityCounts.Apps.Healthy++ + nsSummary.Apps.Healthy++ + case "NOT_READY": + summary.EntityCounts.Apps.NotReady++ + nsSummary.Apps.NotReady++ + case "DEGRADED": + summary.EntityCounts.Apps.Degraded++ + nsSummary.Apps.Degraded++ + case "UNHEALTHY": + summary.EntityCounts.Apps.Unhealthy++ + nsSummary.Apps.Unhealthy++ + summary.TopUnhealthy = append(summary.TopUnhealthy, UnhealthyEntity{ + Type: "app", + Namespace: ns, + Name: appName, + Status: status, + Issue: issue, + ErrorRate: calculateErrorRate(app.Requests), + }) + } + + nsSummary.ErrorRate += calculateErrorRate(app.Requests) + } + } + + // Process services + if nsSvcs, ok := svcHealth.ServiceHealth[ns]; ok { + for svcName, svc := range nsSvcs { + summary.EntityCounts.Services.Total++ + nsSummary.Services.Total++ + + status, issue := 
evaluateServiceHealth(svc) + switch status { + case "HEALTHY": + summary.EntityCounts.Services.Healthy++ + nsSummary.Services.Healthy++ + case "NOT_READY": + summary.EntityCounts.Services.NotReady++ + nsSummary.Services.NotReady++ + case "DEGRADED": + summary.EntityCounts.Services.Degraded++ + nsSummary.Services.Degraded++ + case "UNHEALTHY": + summary.EntityCounts.Services.Unhealthy++ + nsSummary.Services.Unhealthy++ + summary.TopUnhealthy = append(summary.TopUnhealthy, UnhealthyEntity{ + Type: "service", + Namespace: ns, + Name: svcName, + Status: status, + Issue: issue, + ErrorRate: calculateErrorRate(svc.Requests), + }) + } + + nsSummary.ErrorRate += calculateErrorRate(svc.Requests) + } + } + + // Process workloads + if nsWls, ok := wlHealth.WorkloadHealth[ns]; ok { + for wlName, wl := range nsWls { + summary.EntityCounts.Workloads.Total++ + nsSummary.Workloads.Total++ + + status, issue := evaluateWorkloadHealth(wl) + switch status { + case "HEALTHY": + summary.EntityCounts.Workloads.Healthy++ + nsSummary.Workloads.Healthy++ + case "NOT_READY": + summary.EntityCounts.Workloads.NotReady++ + nsSummary.Workloads.NotReady++ + case "DEGRADED": + summary.EntityCounts.Workloads.Degraded++ + nsSummary.Workloads.Degraded++ + case "UNHEALTHY": + summary.EntityCounts.Workloads.Unhealthy++ + nsSummary.Workloads.Unhealthy++ + summary.TopUnhealthy = append(summary.TopUnhealthy, UnhealthyEntity{ + Type: "workload", + Namespace: ns, + Name: wlName, + Status: status, + Issue: issue, + ErrorRate: calculateErrorRate(wl.Requests), + }) + } + + nsSummary.ErrorRate += calculateErrorRate(wl.Requests) + } + } + + // Compute namespace status and availability + nsSummary.Status = computeNamespaceStatus(nsSummary) + nsSummary.Availability = computeAvailability(nsSummary) + summary.NamespaceSummary[ns] = nsSummary + } + + // Compute overall stats + summary.OverallStatus = computeOverallStatus(summary.EntityCounts) + summary.Availability = computeOverallAvailability(summary.EntityCounts) + 
summary.TotalErrorRate = computeTotalErrorRate(summary.NamespaceSummary) + + // Sort and limit top unhealthy + sortUnhealthyByImpact(summary.TopUnhealthy) + if len(summary.TopUnhealthy) > 10 { + summary.TopUnhealthy = summary.TopUnhealthy[:10] + } + + return summary +} + +// evaluateAppHealth determines app health status +func evaluateAppHealth(app AppHealth) (status string, issue string) { + // Check workload statuses + totalWorkloads := len(app.WorkloadStatuses) + if totalWorkloads == 0 { + return "UNKNOWN", "no workloads found" + } + + workloadStatus := "HEALTHY" + unhealthyCount := 0 + for _, ws := range app.WorkloadStatuses { + // User has scaled down a workload, then desired replicas will be 0 and it's not an error condition + // This matches Kiali frontend logic: return NOT_READY when desiredReplicas === 0 + if ws.DesiredReplicas == 0 { + workloadStatus = "NOT_READY" + issue = "scaled to 0 replicas" + continue + } + + if ws.AvailableReplicas < ws.DesiredReplicas { + unhealthyCount++ + issue = fmt.Sprintf("%d/%d replicas available", ws.AvailableReplicas, ws.DesiredReplicas) + if ws.AvailableReplicas == 0 { + workloadStatus = "UNHEALTHY" + } else if workloadStatus != "UNHEALTHY" { + workloadStatus = "DEGRADED" + } + } + if ws.SyncedProxies >= 0 && ws.SyncedProxies < ws.AvailableReplicas { + if issue == "" { + issue = fmt.Sprintf("%d/%d proxies synced", ws.SyncedProxies, ws.AvailableReplicas) + } + if workloadStatus == "HEALTHY" { + workloadStatus = "DEGRADED" + } + } + } + + // Evaluate request health using tolerance-based logic (Kiali tolerances) + requestStatus, errorRate := evaluateRequestHealth(app.Requests) + if errorRate > 0 && issue == "" { + issue = fmt.Sprintf("error rate: %.2f%%", errorRate*100) + } + + // Merge workload and request statuses (worst wins) + finalStatus := mergeHealthStatus(workloadStatus, requestStatus) + return finalStatus, issue +} + +// evaluateServiceHealth determines service health status +func evaluateServiceHealth(svc 
ServiceHealth) (status string, issue string) { + // If there is no inbound or outbound traffic data, service health is UNKNOWN + if !hasAnyRequests(svc.Requests) { + return "UNKNOWN", "" + } + + // Evaluate request health using tolerance-based logic (Kiali tolerances) + status, errorRate := evaluateRequestHealth(svc.Requests) + + if errorRate > 0 && issue == "" { + issue = fmt.Sprintf("error rate: %.2f%%", errorRate*100) + } + return status, issue +} + +// hasAnyRequests returns true if there is any non-zero request count in inbound or outbound +func hasAnyRequests(req RequestHealth) bool { + // Check inbound + for _, codes := range req.Inbound { + for _, count := range codes { + if count > 0 { + return true + } + } + } + // Check outbound + for _, codes := range req.Outbound { + for _, count := range codes { + if count > 0 { + return true + } + } + } + return false +} + +// evaluateWorkloadHealth determines workload health status +func evaluateWorkloadHealth(wl WorkloadHealth) (status string, issue string) { + workloadStatus := "HEALTHY" + + if wl.WorkloadStatus != nil { + ws := wl.WorkloadStatus + // User has scaled down a workload, then desired replicas will be 0 and it's not an error condition + // This matches Kiali frontend logic: return NOT_READY when desiredReplicas === 0 + if ws.DesiredReplicas == 0 { + workloadStatus = "NOT_READY" + issue = "scaled to 0 replicas" + } else if ws.AvailableReplicas < ws.DesiredReplicas { + issue = fmt.Sprintf("%d/%d replicas available", ws.AvailableReplicas, ws.DesiredReplicas) + if ws.AvailableReplicas == 0 { + workloadStatus = "UNHEALTHY" + } else { + workloadStatus = "DEGRADED" + } + } + if ws.SyncedProxies >= 0 && ws.SyncedProxies < ws.AvailableReplicas { + if issue == "" { + issue = fmt.Sprintf("%d/%d proxies synced", ws.SyncedProxies, ws.AvailableReplicas) + } + if workloadStatus == "HEALTHY" { + workloadStatus = "DEGRADED" + } + } + } + + // Evaluate request health using tolerance-based logic (Kiali tolerances) + 
requestStatus, errorRate := evaluateRequestHealth(wl.Requests) + + // If there is no inbound or outbound traffic data and no workload status info, mark UNKNOWN + if !hasAnyRequests(wl.Requests) && wl.WorkloadStatus == nil { + return "UNKNOWN", "" + } + if errorRate > 0 && issue == "" { + issue = fmt.Sprintf("error rate: %.2f%%", errorRate*100) + } + + // Merge workload and request statuses (worst wins) + finalStatus := mergeHealthStatus(workloadStatus, requestStatus) + return finalStatus, issue +} + +// mergeHealthStatus returns the worst of two health statuses +// Priority matches Kiali frontend: UNHEALTHY(4) > DEGRADED(3) > NOT_READY(2) > HEALTHY(1) > UNKNOWN(0) +func mergeHealthStatus(s1, s2 string) string { + priority := map[string]int{ + "UNHEALTHY": 4, + "DEGRADED": 3, + "NOT_READY": 2, + "HEALTHY": 1, + "UNKNOWN": 0, + } + + if priority[s1] > priority[s2] { + return s1 + } + return s2 +} + +// calculateErrorRate computes the error ratio (0.0-1.0) from request health. +// This uses a simplified approach: every inbound and outbound request is counted, +// and a request counts as an error when isErrorCode flags its protocol/code pair. +func calculateErrorRate(req RequestHealth) float64 { + totalRequests := 0.0 + errorRequests := 0.0 + + // Count inbound + for protocol, codes := range req.Inbound { + for code, count := range codes { + totalRequests += count + if isErrorCode(protocol, code) { + errorRequests += count + } + } + } + + // Count outbound + for protocol, codes := range req.Outbound { + for code, count := range codes { + totalRequests += count + if isErrorCode(protocol, code) { + errorRequests += count + } + } + } + + if totalRequests == 0 { + return 0.0 + } + return errorRequests / totalRequests +} + +// isErrorCode checks if a status code represents an error +// Based on Kiali's default tolerance configuration +func isErrorCode(protocol, code string) bool { + switch protocol { + case "http": + // "-" represents aborted/fault-injected requests (always an error) + if code == "-" { + 
return true + } + // 4xx client errors + if len(code) == 3 && code[0] == '4' { + return true + } + // 5xx server errors + if len(code) == 3 && code[0] == '5' { + return true + } + case "grpc": + // "-" represents aborted requests + if code == "-" { + return true + } + // gRPC error codes (1-16, non-zero) + if code != "0" { + return true + } + } + return false +} + +// evaluateRequestHealth evaluates health status based on request metrics +// Returns status and worst error ratio found +func evaluateRequestHealth(req RequestHealth) (status string, worstRatio float64) { + status = "HEALTHY" + worstRatio = 0.0 + + // Helper to process requests (inbound or outbound) + processRequests := func(requests map[string]map[string]float64) { + for protocol, codes := range requests { + totalForProtocol := 0.0 + + // Calculate totals + for _, count := range codes { + totalForProtocol += count + } + + if totalForProtocol == 0 { + continue + } + + // Calculate error ratios for each code + for code, count := range codes { + if isErrorCode(protocol, code) { + ratio := count / totalForProtocol + + // Track worst ratio + if ratio > worstRatio { + worstRatio = ratio + } + + // Evaluate against tolerance thresholds + // Based on Kiali defaults: + // - Code "-": degraded=0%, failure=10% + // - 5xx: degraded=0%, failure=10% + // - 4xx: degraded=10%, failure=20% + // - grpc errors: degraded=0%, failure=10% + + codeStatus := getStatusForCodeRatio(protocol, code, ratio) + if codeStatus == "UNHEALTHY" { + status = "UNHEALTHY" + } else if codeStatus == "DEGRADED" && status == "HEALTHY" { + status = "DEGRADED" + } + } + } + } + } + + processRequests(req.Inbound) + processRequests(req.Outbound) + + return status, worstRatio +} + +// getStatusForCodeRatio determines health status based on error code and ratio +// Implements Kiali's default tolerance configuration +func getStatusForCodeRatio(protocol, code string, ratio float64) string { + percentage := ratio * 100 + + switch protocol { + case 
"http": + if code == "-" { + // Aborted/fault-injected: degraded=0%, failure=10% + if percentage >= 10 { + return "UNHEALTHY" + } else if percentage > 0 { + return "DEGRADED" + } + } else if len(code) == 3 && code[0] == '5' { + // 5xx errors: degraded=0%, failure=10% + if percentage >= 10 { + return "UNHEALTHY" + } else if percentage > 0 { + return "DEGRADED" + } + } else if len(code) == 3 && code[0] == '4' { + // 4xx errors: degraded=10%, failure=20% + if percentage >= 20 { + return "UNHEALTHY" + } else if percentage >= 10 { + return "DEGRADED" + } + } + case "grpc": + // gRPC errors (including "-"): degraded=0%, failure=10% + if code != "0" { + if percentage >= 10 { + return "UNHEALTHY" + } else if percentage > 0 { + return "DEGRADED" + } + } + } + + return "HEALTHY" +} + +// computeNamespaceStatus determines namespace overall status +func computeNamespaceStatus(ns NamespaceSummary) string { + totalUnhealthy := ns.Apps.Unhealthy + ns.Services.Unhealthy + ns.Workloads.Unhealthy + totalEntities := ns.Apps.Total + ns.Services.Total + ns.Workloads.Total + + if totalEntities == 0 { + return "UNKNOWN" + } + + if totalUnhealthy == 0 && ns.ErrorRate < 0.01 { + return "HEALTHY" + } else if totalUnhealthy > totalEntities/2 || ns.ErrorRate > 0.05 { + return "UNHEALTHY" + } + return "DEGRADED" +} + +// computeAvailability computes availability percentage for a namespace +func computeAvailability(ns NamespaceSummary) float64 { + total := ns.Apps.Total + ns.Services.Total + ns.Workloads.Total + if total == 0 { + return 100.0 + } + + healthy := ns.Apps.Healthy + ns.Services.Healthy + ns.Workloads.Healthy + degraded := ns.Apps.Degraded + ns.Services.Degraded + ns.Workloads.Degraded + + return (float64(healthy) + float64(degraded)*0.5) / float64(total) * 100.0 +} + +// computeOverallStatus determines overall mesh status +func computeOverallStatus(counts EntityHealthCounts) string { + total := counts.Apps.Total + counts.Services.Total + counts.Workloads.Total + unhealthy := 
counts.Apps.Unhealthy + counts.Services.Unhealthy + counts.Workloads.Unhealthy + degraded := counts.Apps.Degraded + counts.Services.Degraded + counts.Workloads.Degraded + + if total == 0 { + return "UNKNOWN" + } + + // If there are any unhealthy entities + if unhealthy > 0 { + if unhealthy > total/2 { + return "UNHEALTHY" + } + return "DEGRADED" + } + + // If there are degraded entities but no unhealthy + if degraded > 0 { + return "DEGRADED" + } + + return "HEALTHY" +} + +// computeOverallAvailability computes overall mesh availability +func computeOverallAvailability(counts EntityHealthCounts) float64 { + total := counts.Apps.Total + counts.Services.Total + counts.Workloads.Total + if total == 0 { + return 100.0 + } + + healthy := counts.Apps.Healthy + counts.Services.Healthy + counts.Workloads.Healthy + degraded := counts.Apps.Degraded + counts.Services.Degraded + counts.Workloads.Degraded + + return (float64(healthy) + float64(degraded)*0.5) / float64(total) * 100.0 +} + +// computeTotalErrorRate sums error rates across namespaces +func computeTotalErrorRate(nsSummaries map[string]NamespaceSummary) float64 { + total := 0.0 + for _, ns := range nsSummaries { + total += ns.ErrorRate + } + return total +} + +// sortUnhealthyByImpact sorts unhealthy entities by error rate +func sortUnhealthyByImpact(unhealthy []UnhealthyEntity) { + // Simple in-place O(n^2) exchange sort, highest error rate first + for i := 0; i < len(unhealthy); i++ { + for j := i + 1; j < len(unhealthy); j++ { + if unhealthy[j].ErrorRate > unhealthy[i].ErrorRate { + unhealthy[i], unhealthy[j] = unhealthy[j], unhealthy[i] + } + } + } +} diff --git a/pkg/kiali/services.go b/pkg/kiali/services.go index 1b8dc9be1..6be585b37 100644 --- a/pkg/kiali/services.go +++ b/pkg/kiali/services.go @@ -9,7 +9,7 @@ import ( // ServicesList returns the list of services across specified namespaces. 
func (k *Kiali) ServicesList(ctx context.Context, namespaces string) (string, error) { - endpoint := ServicesEndpoint + "?health=true&istioResources=true&rateInterval=60s&onlyDefinitions=false" + endpoint := ServicesEndpoint + "?health=true&istioResources=true&rateInterval=" + DefaultRateInterval + "&onlyDefinitions=false" if namespaces != "" { endpoint += "&namespaces=" + url.QueryEscape(namespaces) } @@ -25,7 +25,7 @@ func (k *Kiali) ServiceDetails(ctx context.Context, namespace string, service st if service == "" { return "", fmt.Errorf("service name is required") } - endpoint := fmt.Sprintf(ServiceDetailsEndpoint, url.PathEscape(namespace), url.PathEscape(service)) + "?validate=true&rateInterval=60s" + endpoint := fmt.Sprintf(ServiceDetailsEndpoint, url.PathEscape(namespace), url.PathEscape(service)) + "?validate=true&rateInterval=" + DefaultRateInterval return k.executeRequest(ctx, http.MethodGet, endpoint, "", nil) } diff --git a/pkg/kiali/types.go b/pkg/kiali/types.go new file mode 100644 index 000000000..e3dda8f55 --- /dev/null +++ b/pkg/kiali/types.go @@ -0,0 +1,99 @@ +package kiali + +// MeshHealthSummary represents aggregated health across the mesh +type MeshHealthSummary struct { + OverallStatus string `json:"overallStatus"` // HEALTHY, DEGRADED, UNHEALTHY, or UNKNOWN (no entities) + Availability float64 `json:"availability"` // Percentage 0-100 + TotalErrorRate float64 `json:"totalErrorRate"` + NamespaceCount int `json:"namespaceCount"` + EntityCounts EntityHealthCounts `json:"entityCounts"` + NamespaceSummary map[string]NamespaceSummary `json:"namespaceSummary"` + TopUnhealthy []UnhealthyEntity `json:"topUnhealthy,omitempty"` + Timestamp string `json:"timestamp"` + RateInterval string `json:"rateInterval"` +} + +// EntityHealthCounts contains health counts for all entity types +type EntityHealthCounts struct { + Apps HealthCounts `json:"apps"` + Services HealthCounts `json:"services"` + Workloads HealthCounts `json:"workloads"` +} + +// HealthCounts represents health status 
counts +type HealthCounts struct { + Total int `json:"total"` + Healthy int `json:"healthy"` + Degraded int `json:"degraded"` + Unhealthy int `json:"unhealthy"` + NotReady int `json:"notReady"` +} + +// NamespaceSummary contains health summary for a namespace +type NamespaceSummary struct { + Status string `json:"status"` + Availability float64 `json:"availability"` + ErrorRate float64 `json:"errorRate"` + Apps HealthCounts `json:"apps"` + Services HealthCounts `json:"services"` + Workloads HealthCounts `json:"workloads"` +} + +// UnhealthyEntity represents an unhealthy entity +type UnhealthyEntity struct { + Type string `json:"type"` // app, service, workload + Namespace string `json:"namespace"` + Name string `json:"name"` + Status string `json:"status"` + Issue string `json:"issue"` + ErrorRate float64 `json:"errorRate,omitempty"` +} + +// ClustersNamespaceHealth matches Kiali's response structure +type ClustersNamespaceHealth struct { + AppHealth map[string]NamespaceAppHealth `json:"namespaceAppHealth"` + ServiceHealth map[string]NamespaceServiceHealth `json:"namespaceServiceHealth"` + WorkloadHealth map[string]NamespaceWorkloadHealth `json:"namespaceWorkloadHealth"` +} + +// NamespaceAppHealth is a map of app name to health +type NamespaceAppHealth map[string]AppHealth + +// NamespaceServiceHealth is a map of service name to health +type NamespaceServiceHealth map[string]ServiceHealth + +// NamespaceWorkloadHealth is a map of workload name to health +type NamespaceWorkloadHealth map[string]WorkloadHealth + +// AppHealth contains health information for an app +type AppHealth struct { + WorkloadStatuses []WorkloadStatus `json:"workloadStatuses"` + Requests RequestHealth `json:"requests"` +} + +// ServiceHealth contains health information for a service +type ServiceHealth struct { + Requests RequestHealth `json:"requests"` +} + +// WorkloadHealth contains health information for a workload +type WorkloadHealth struct { + WorkloadStatus *WorkloadStatus 
`json:"workloadStatus"` + Requests RequestHealth `json:"requests"` +} + +// WorkloadStatus represents workload replica status +type WorkloadStatus struct { + Name string `json:"name"` + DesiredReplicas int32 `json:"desiredReplicas"` + CurrentReplicas int32 `json:"currentReplicas"` + AvailableReplicas int32 `json:"availableReplicas"` + SyncedProxies int32 `json:"syncedProxies"` +} + +// RequestHealth holds request health metrics +type RequestHealth struct { + Inbound map[string]map[string]float64 `json:"inbound"` + Outbound map[string]map[string]float64 `json:"outbound"` + HealthAnnotations map[string]string `json:"healthAnnotations"` +} diff --git a/pkg/kiali/workloads.go b/pkg/kiali/workloads.go index ccd5538a4..926096ee8 100644 --- a/pkg/kiali/workloads.go +++ b/pkg/kiali/workloads.go @@ -10,7 +10,7 @@ import ( // WorkloadsList returns the list of workloads across specified namespaces. func (k *Kiali) WorkloadsList(ctx context.Context, namespaces string) (string, error) { - endpoint := WorkloadsEndpoint + "?health=true&istioResources=true&rateInterval=60s" + endpoint := WorkloadsEndpoint + "?health=true&istioResources=true&rateInterval=" + DefaultRateInterval if namespaces != "" { endpoint += "&namespaces=" + url.QueryEscape(namespaces) } @@ -26,7 +26,7 @@ func (k *Kiali) WorkloadDetails(ctx context.Context, namespace string, workload if workload == "" { return "", fmt.Errorf("workload name is required") } - endpoint := fmt.Sprintf(WorkloadDetailsEndpoint, url.PathEscape(namespace), url.PathEscape(workload)) + "?validate=true&rateInterval=60s&health=true" + endpoint := fmt.Sprintf(WorkloadDetailsEndpoint, url.PathEscape(namespace), url.PathEscape(workload)) + "?validate=true&rateInterval=" + DefaultRateInterval + "&health=true" return k.executeRequest(ctx, http.MethodGet, endpoint, "", nil) } diff --git a/pkg/toolsets/kiali/get_mesh_graph.go b/pkg/toolsets/kiali/get_mesh_graph.go index 8981e514a..c2f8e6ab1 100644 --- a/pkg/toolsets/kiali/get_mesh_graph.go +++ 
b/pkg/toolsets/kiali/get_mesh_graph.go @@ -8,6 +8,7 @@ import ( "k8s.io/utils/ptr" "github.com/containers/kubernetes-mcp-server/pkg/api" + kialiclient "github.com/containers/kubernetes-mcp-server/pkg/kiali" ) func initGetMeshGraph() []api.ServerTool { @@ -15,7 +16,7 @@ func initGetMeshGraph() []api.ServerTool { ret = append(ret, api.ServerTool{ Tool: api.Tool{ Name: "kiali_get_mesh_graph", - Description: "Returns the topology of a specific namespaces, health, status of the mesh and namespaces. Use this for high-level overviews", + Description: "Returns the topology of a specific namespaces, health, status of the mesh and namespaces. Includes a mesh health summary overview with aggregated counts of healthy, degraded, and failing apps, workloads, and services. Use this for high-level overviews", InputSchema: &jsonschema.Schema{ Type: "object", Properties: map[string]*jsonschema.Schema{ @@ -29,7 +30,7 @@ func initGetMeshGraph() []api.ServerTool { }, "rateInterval": { Type: "string", - Description: "Rate interval for fetching (e.g., '10m', '5m', '1h'). Default: '60s'", + Description: "Rate interval for fetching (e.g., '10m', '5m', '1h'). 
Default: '10m'", }, "graphType": { Type: "string", @@ -88,9 +89,11 @@ func getMeshGraphHandler(params api.ToolHandlerParams) (*api.ToolCallResult, err // Extract optional query parameters queryParams := make(map[string]string) - if rateInterval, ok := params.GetArguments()["rateInterval"].(string); ok && rateInterval != "" { - queryParams["rateInterval"] = rateInterval + rateInterval := kialiclient.DefaultRateInterval // default + if v, ok := params.GetArguments()["rateInterval"].(string); ok && v != "" { + rateInterval = v } + queryParams["rateInterval"] = rateInterval if graphType, ok := params.GetArguments()["graphType"].(string); ok && graphType != "" { queryParams["graphType"] = graphType } diff --git a/pkg/toolsets/kiali/get_metrics.go b/pkg/toolsets/kiali/get_metrics.go index acedaf651..a67000c0a 100644 --- a/pkg/toolsets/kiali/get_metrics.go +++ b/pkg/toolsets/kiali/get_metrics.go @@ -65,7 +65,7 @@ func initGetMetrics() []api.ServerTool { }, "rateInterval": { Type: "string", - Description: "Rate interval for metrics (e.g., '1m', '5m'). Optional, defaults to '1m'", + Description: "Rate interval for metrics (e.g., '1m', '5m'). Optional, defaults to '10m'", }, "direction": { Type: "string", @@ -135,9 +135,11 @@ func resourceMetricsHandler(params api.ToolHandlerParams) (*api.ToolCallResult, if step, ok := params.GetArguments()["step"].(string); ok && step != "" { queryParams["step"] = step } - if rateInterval, ok := params.GetArguments()["rateInterval"].(string); ok && rateInterval != "" { - queryParams["rateInterval"] = rateInterval + rateInterval := kialiclient.DefaultRateInterval // default + if v, ok := params.GetArguments()["rateInterval"].(string); ok && v != "" { + rateInterval = v } + queryParams["rateInterval"] = rateInterval if direction, ok := params.GetArguments()["direction"].(string); ok && direction != "" { queryParams["direction"] = direction }