diff --git a/pkg/toolsets/netedge/internal/defaults/defaults.go b/pkg/toolsets/netedge/internal/defaults/defaults.go new file mode 100644 index 000000000..a1f92d7fe --- /dev/null +++ b/pkg/toolsets/netedge/internal/defaults/defaults.go @@ -0,0 +1,9 @@ +package defaults + +func ToolsetName() string { + return "netedge" +} + +func ToolsetDescription() string { + return "NetEdge troubleshooting tools for OpenShift" +} diff --git a/pkg/toolsets/netedge/tools/query_prometheus.go b/pkg/toolsets/netedge/tools/query_prometheus.go new file mode 100644 index 000000000..441e67c6c --- /dev/null +++ b/pkg/toolsets/netedge/tools/query_prometheus.go @@ -0,0 +1,183 @@ +package tools + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" + + "github.com/google/jsonschema-go/jsonschema" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/prometheus" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets/netedge/internal/defaults" +) + +const ( + // defaultMonitoringNamespace is the default namespace for OpenShift monitoring components + defaultMonitoringNamespace = "openshift-monitoring" + + // thanosQuerierRoute is the route name for Thanos Querier + thanosQuerierRoute = "thanos-querier" +) + +var routeGVR = schema.GroupVersionResource{ + Group: "route.openshift.io", + Version: "v1", + Resource: "routes", +} + +func InitQueryPrometheus() []api.ServerTool { + return []api.ServerTool{ + { + Tool: api.Tool{ + Name: defaults.ToolsetName() + "_query_prometheus", + Description: "Executes specialized diagnostic queries for specific NetEdge components (ingress, dns).", + InputSchema: &jsonschema.Schema{ + Type: "object", + Properties: map[string]*jsonschema.Schema{ + "diagnostic_target": { + Type: "string", + Description: "Run specialized diagnostics for a specific component.", + Enum: []any{"ingress", "dns", "operators"}, + }, + }, + Required: []string{"diagnostic_target"}, + }, + Annotations: api.ToolAnnotations{ + Title: "NetEdge Diagnostics", + ReadOnlyHint: ptr.To(true), + DestructiveHint: ptr.To(false), + IdempotentHint: ptr.To(true), + OpenWorldHint: ptr.To(true), + }, + }, + Handler: queryPrometheusHandler, + }, + } +} + +type DiagnosticResult struct { + Name string `json:"name"` + Query string `json:"query"` + Result interface{} `json:"result,omitempty"` + Error string `json:"error,omitempty"` + Timestamp time.Time `json:"timestamp"` +} + +func queryPrometheusHandler(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + diagnosticTarget, ok := params.GetArguments()["diagnostic_target"].(string) + if !ok || diagnosticTarget == "" { + return api.NewToolCallResult("", fmt.Errorf("diagnostic_target is required")), nil + } + + return handleDiagnosticTarget(params, diagnosticTarget) +} + +func handleDiagnosticTarget(params api.ToolHandlerParams, target string) (*api.ToolCallResult, error) { + var queries map[string]string + + switch target { + case "ingress": + queries = map[string]string{ + "ingress_error_rate": `sum(rate(haproxy_server_http_responses_total{code!~"2.."}[5m]))`, + "ingress_active_conns": `sum(haproxy_backend_connections_active_total)`, + "ingress_reloads_last_day": `changes(haproxy_server_start_time_seconds[1d])`, + "ingress_top_error_routes": `topk(5, sum by (route) (rate(haproxy_server_http_responses_total{code!~"2.."}[5m])))`, + } + case "dns": + queries = map[string]string{ + "dns_request_rate": `sum(rate(coredns_dns_request_count_total[5m]))`, + "dns_nxdomain_rate": `sum(rate(coredns_dns_request_count_total{rcode="NXDOMAIN"}[5m]))`, + "dns_servfail_rate": `sum(rate(coredns_dns_request_count_total{rcode="SERVFAIL"}[5m]))`, + "dns_panic_recovery": `sum(rate(coredns_panic_count_total[5m]))`, + "dns_error_breakdown": `sum by (rcode) (rate(coredns_dns_request_count_total{rcode!="NOERROR"}[5m]))`, + "dns_rewrite_count": `sum(rate(coredns_plugin_rewrite_request_count_total[5m]))`, + } + case "operators": + queries = map[string]string{ + "active_alerts": `ALERTS{alertstate="firing", namespace=~"openshift-ingress-operator|openshift-dns"}`, + "operator_up": `up{job=~"cluster-ingress-operator|dns-operator"}`, + } + default: + return api.NewToolCallResult("", fmt.Errorf("unknown diagnostic target: %s", target)), nil + } + + // Resolve Thanos URL + baseURL, err := getRouteURL(params.Context, params, thanosQuerierRoute, defaultMonitoringNamespace) + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("failed to get Thanos Querier route: %v", err)), nil + } + + opts := []prometheus.ClientOption{ + prometheus.WithBearerTokenFromRESTConfig(params.RESTConfig()), + prometheus.WithTLSFromRESTConfig(params.RESTConfig()), + } + + // Explicitly handle insecure config since WithTLSFromRESTConfig might not cover it strictly if CAs fail to load but system pool works + if params.RESTConfig().Insecure { + opts = append(opts, prometheus.WithInsecure(true)) + } + + client := prometheus.NewClient(baseURL, opts...) + + results := make([]DiagnosticResult, 0, len(queries)) + var wg sync.WaitGroup + var mu sync.Mutex + + for name, q := range queries { + wg.Add(1) + go func(n, qStr string) { + defer wg.Done() + res := DiagnosticResult{ + Name: n, + Query: qStr, + Timestamp: time.Now(), + } + + // Using generic Query since these are instant queries + promResp, err := client.Query(params.Context, qStr, "") + if err != nil { + res.Error = err.Error() + } else { + res.Result = promResp.Data.Result + } + + mu.Lock() + results = append(results, res) + mu.Unlock() + }(name, q) + } + + wg.Wait() + + jsonResult, err := json.MarshalIndent(results, "", " ") + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("failed to marshal diagnostic results: %v", err)), nil + } + + return api.NewToolCallResult(string(jsonResult), nil), nil +} + +// getRouteURL retrieves the URL for an OpenShift route. +func getRouteURL(ctx context.Context, params api.ToolHandlerParams, routeName, namespace string) (string, error) { + route, err := params.DynamicClient().Resource(routeGVR).Namespace(namespace).Get(ctx, routeName, metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("failed to get route %s/%s: %w", namespace, routeName, err) + } + + host, found, err := unstructured.NestedString(route.Object, "spec", "host") + if err != nil { + return "", fmt.Errorf("failed to read route host: %w", err) + } + if !found || host == "" { + return "", fmt.Errorf("route %s/%s has no host configured", namespace, routeName) + } + + return fmt.Sprintf("https://%s", host), nil +} diff --git a/pkg/toolsets/netedge/tools/query_prometheus_test.go b/pkg/toolsets/netedge/tools/query_prometheus_test.go new file mode 100644 index 000000000..1c653c0fd --- /dev/null +++ b/pkg/toolsets/netedge/tools/query_prometheus_test.go @@ -0,0 +1,233 @@ +package tools + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/prometheus" + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/dynamic" + fakedynamic "k8s.io/client-go/dynamic/fake" + "k8s.io/client-go/rest" +) + +// MockConfigProvider partially implements api.ExtendedConfigProvider +type MockConfigProvider struct { + api.ExtendedConfigProvider // Embed to satisfy interface +} + +func (m *MockConfigProvider) GetToolsetConfig(name string) (api.ExtendedConfig, bool) { + return nil, false +} + +// MockKubernetesClient implements api.KubernetesClient with DynamicClient support +type MockKubernetesClient struct { + api.KubernetesClient // Embed to satisfy interface + Token string + DynClient dynamic.Interface +} + +func (m *MockKubernetesClient) RESTConfig() *rest.Config { + return &rest.Config{ + BearerToken: m.Token, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + }, + } +} + +func (m *MockKubernetesClient) DynamicClient() dynamic.Interface { + return m.DynClient +} + +// MockToolCallRequest implements api.ToolCallRequest +type MockToolCallRequest struct { + Args map[string]any +} + +func (m *MockToolCallRequest) GetArguments() map[string]any { + return m.Args +} + +func TestQueryPrometheusHandler_Diagnostics(t *testing.T) { + // 1. Setup mock Prometheus server (TLS) + ts := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Verify Authentication + auth := r.Header.Get("Authorization") + if auth != "Bearer fake-token" { + w.WriteHeader(http.StatusUnauthorized) + return + } + + // Check what query we received + query := r.URL.Query().Get("query") + + // Parse the query to ensure we return correct metric names + // This is a rough simulation + var metricName string + if strings.Contains(query, "haproxy_server_http_responses_total") { + metricName = "haproxy_server_http_responses_total" + } else if strings.Contains(query, "coredns_dns_request_count_total") { + metricName = "coredns_dns_request_count_total" + } else if strings.Contains(query, "ALERTS") { + metricName = "ALERTS" + } else if strings.Contains(query, "up") { + metricName = "up" + } else if strings.Contains(query, "coredns_plugin_rewrite_request_count_total") { + metricName = "coredns_plugin_rewrite_request_count_total" + } else { + metricName = "result" + } + + // Respond with mock data + resp := prometheus.QueryResult{ + Status: "success", + Data: prometheus.Data{ + ResultType: "vector", + Result: []prometheus.Result{ + { + Metric: map[string]string{"__name__": metricName}, + Value: []interface{}{float64(time.Now().Unix()), "123.45"}, + }, + }, + }, + } + + // Add specific metrics for validation + if strings.Contains(query, "haproxy_server_http_responses_total") { + resp.Data.Result[0].Metric["check"] = "ingress-error-rate" + } else if strings.Contains(query, "coredns_dns_request_count_total") { + resp.Data.Result[0].Metric["check"] = "dns-request-rate" + } else if strings.Contains(query, "ALERTS") { + resp.Data.Result[0].Metric["check"] = "operators-alerts" + } + + _ = json.NewEncoder(w).Encode(resp) + })) + defer ts.Close() + + // Extract host from URL for Route object + u, _ := url.Parse(ts.URL) + host := u.Host + + // 2. Setup Fake Dynamic Client with Route + route := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": map[string]interface{}{ + "name": "thanos-querier", + "namespace": "openshift-monitoring", + }, + "spec": map[string]interface{}{ + "host": host, + }, + }, + } + + scheme := runtime.NewScheme() + dynClient := fakedynamic.NewSimpleDynamicClient(scheme, route) + + tests := []struct { + name string + diagTarget string + expectedContains []string + expectError bool + }{ + { + name: "Ingress Diagnostics", + diagTarget: "ingress", + expectedContains: []string{ + "ingress_error_rate", + "ingress_active_conns", + "ingress_reloads_last_day", + "ingress_top_error_routes", + "check", "ingress-error-rate", + }, + }, + { + name: "DNS Diagnostics", + diagTarget: "dns", + expectedContains: []string{ + "dns_request_rate", + "dns_nxdomain_rate", + "dns_servfail_rate", + "dns_panic_recovery", + "dns_error_breakdown", + "dns_rewrite_count", + "check", "dns-request-rate", + }, + }, + { + name: "Operators Diagnostics", + diagTarget: "operators", + expectedContains: []string{ + "active_alerts", + "operator_up", + "check", "operators-alerts", + }, + }, + { + name: "Unknown Target", + diagTarget: "foo", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Setup dependencies + cfgProvider := &MockConfigProvider{} + kubeClient := &MockKubernetesClient{ + Token: "fake-token", + DynClient: dynClient, + } + toolReq := &MockToolCallRequest{ + Args: map[string]any{ + "diagnostic_target": tt.diagTarget, + }, + } + + // Construct params + params := api.ToolHandlerParams{ + Context: context.Background(), + ExtendedConfigProvider: cfgProvider, + KubernetesClient: kubeClient, + ToolCallRequest: toolReq, + } + + result, err := queryPrometheusHandler(params) + + // Validation + if tt.expectError { + if err == nil { + // Check if result has Error field (ToolCallResult) + if result == nil || result.Error == nil { + t.Errorf("expected error but got nil") + } + } else { + // Logic error in test assumption: handler returns (result, nil) usually + // but check if err is returned + assert.NoError(t, err) + } + } else { + assert.NoError(t, err) + if assert.NotNil(t, result) { + assert.NoError(t, result.Error) + for _, expected := range tt.expectedContains { + assert.Contains(t, result.Content, expected) + } + } + } + }) + } +} diff --git a/pkg/toolsets/netedge/toolset.go b/pkg/toolsets/netedge/toolset.go new file mode 100644 index 000000000..d30b0889b --- /dev/null +++ b/pkg/toolsets/netedge/toolset.go @@ -0,0 +1,36 @@ +package netedge + +import ( + "slices" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets/netedge/internal/defaults" + netedgeTools "github.com/containers/kubernetes-mcp-server/pkg/toolsets/netedge/tools" +) + +type Toolset struct{} + +var _ api.Toolset = (*Toolset)(nil) + +func (t *Toolset) GetName() string { + return defaults.ToolsetName() +} + +func (t *Toolset) GetDescription() string { + return defaults.ToolsetDescription() +} + +func (t *Toolset) GetTools(_ api.Openshift) []api.ServerTool { + return slices.Concat( + netedgeTools.InitQueryPrometheus(), + ) +} + +func (t *Toolset) GetPrompts() []api.ServerPrompt { + return nil +} + +func init() { + toolsets.Register(&Toolset{}) +}