diff --git a/README.md b/README.md index 01d739844..048a3d697 100644 --- a/README.md +++ b/README.md @@ -328,13 +328,14 @@ The following sets of tools are available (toolsets marked with ✓ in the Defau -| Toolset | Description | Default | -|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------| -| config | View and manage the current local Kubernetes configuration (kubeconfig) | ✓ | -| core | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.) | ✓ | -| kiali | Most common tools for managing Kiali, check the [Kiali documentation](https://github.com/containers/kubernetes-mcp-server/blob/main/docs/KIALI.md) for more details. | | -| kubevirt | KubeVirt virtual machine management tools | | -| helm | Tools for managing Helm charts and releases | ✓ | +| Toolset | Description | Default | +|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------| +| config | View and manage the current local Kubernetes configuration (kubeconfig) | ✓ | +| core | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.) | ✓ | +| ossm | Most common tools for managing OSSM, check the [OSSM documentation](https://github.com/openshift/openshift-mcp-server/blob/main/docs/OSSM.md) for more details. 
| | +| kubevirt | KubeVirt virtual machine management tools | | +| observability | Cluster observability tools for querying Prometheus metrics and Alertmanager alerts | ✓ | +| helm | Tools for managing Helm charts and releases | ✓ | @@ -350,6 +351,8 @@ In case multi-cluster support is enabled (default) and you have access to multip - **configuration_contexts_list** - List all available context names and associated server urls from the kubeconfig file +- **targets_list** - List all available targets + - **configuration_view** - Get the current Kubernetes configuration content as a kubeconfig YAML - `minified` (`boolean`) - Return a minified version of the configuration. If set to true, keeps only the current-context and the relevant pieces of the configuration for that context. If set to false, all contexts, clusters, auth-infos, and users are returned in the configuration. (Optional, default true) @@ -379,9 +382,11 @@ In case multi-cluster support is enabled (default) and you have access to multip - `name` (`string`) - Name of the Node to get the resource consumption from (Optional, all Nodes if not provided) - **pods_list** - List all the Kubernetes pods in the current cluster from all namespaces + - `fieldSelector` (`string`) - Optional Kubernetes field selector to filter pods by field values (e.g. 'status.phase=Running', 'spec.nodeName=node1'). Supported fields: metadata.name, metadata.namespace, spec.nodeName, spec.restartPolicy, spec.schedulerName, spec.serviceAccountName, status.phase (Pending/Running/Succeeded/Failed/Unknown), status.podIP, status.nominatedNodeName. Note: CrashLoopBackOff is a container state, not a pod phase, so it cannot be filtered directly. See https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 
'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the pods by label - **pods_list_in_namespace** - List all the Kubernetes pods in the specified namespace in the current cluster + - `fieldSelector` (`string`) - Optional Kubernetes field selector to filter pods by field values (e.g. 'status.phase=Running', 'spec.nodeName=node1'). Supported fields: metadata.name, metadata.namespace, spec.nodeName, spec.restartPolicy, spec.schedulerName, spec.serviceAccountName, status.phase (Pending/Running/Succeeded/Failed/Unknown), status.podIP, status.nominatedNodeName. Note: CrashLoopBackOff is a container state, not a pod phase, so it cannot be filtered directly. See https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the pods by label - `namespace` (`string`) **(required)** - Namespace to list pods from @@ -421,8 +426,9 @@ In case multi-cluster support is enabled (default) and you have access to multip - **resources_list** - List Kubernetes resources and objects in the current cluster by providing their apiVersion and kind and optionally the namespace and label selector (common apiVersion and kind include: v1 Pod, v1 Service, v1 Node, apps/v1 Deployment, networking.k8s.io/v1 Ingress, route.openshift.io/v1 Route) - `apiVersion` (`string`) **(required)** - apiVersion of the resources (examples of valid apiVersion are: v1, apps/v1, networking.k8s.io/v1) + - `fieldSelector` (`string`) - Optional Kubernetes field selector to filter resources by field values (e.g. 'status.phase=Running', 'metadata.name=myresource'). Supported fields vary by resource type. 
For Pods: metadata.name, metadata.namespace, spec.nodeName, spec.restartPolicy, spec.schedulerName, spec.serviceAccountName, status.phase (Pending/Running/Succeeded/Failed/Unknown), status.podIP, status.nominatedNodeName. See https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ - `kind` (`string`) **(required)** - kind of the resources (examples of valid kind are: Pod, Service, Deployment, Ingress) - - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the pods by label + - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the resources by label - `namespace` (`string`) - Optional Namespace to retrieve the namespaced resources from (ignored in case of cluster scoped resources). If not provided, will list resources from all namespaces - **resources_get** - Get a Kubernetes resource in the current cluster by providing its apiVersion, kind, optionally the namespace, and its name @@ -454,15 +460,15 @@ In case multi-cluster support is enabled (default) and you have access to multip
-kiali +ossm -- **kiali_mesh_graph** - Returns the topology of a specific namespaces, health, status of the mesh and namespaces. Includes a mesh health summary overview with aggregated counts of healthy, degraded, and failing apps, workloads, and services. Use this for high-level overviews +- **ossm_mesh_graph** - Returns the topology of specific namespaces, health, status of the mesh and namespaces. Includes a mesh health summary overview with aggregated counts of healthy, degraded, and failing apps, workloads, and services. Use this for high-level overviews - `graphType` (`string`) - Optional type of graph to return: 'versionedApp', 'app', 'service', 'workload', 'mesh' - `namespace` (`string`) - Optional single namespace to include in the graph (alternative to namespaces) - `namespaces` (`string`) - Optional comma-separated list of namespaces to include in the graph - `rateInterval` (`string`) - Optional rate interval for fetching (e.g., '10m', '5m', '1h'). -- **kiali_manage_istio_config** - Manages Istio configuration objects (Gateways, VirtualServices, etc.). Can list (objects and validations), get, create, patch, or delete objects +- **ossm_manage_istio_config** - Manages Istio configuration objects (Gateways, VirtualServices, etc.). 
Can list (objects and validations), get, create, patch, or delete objects - `action` (`string`) **(required)** - Action to perform: list, get, create, patch, or delete - `group` (`string`) - API group of the Istio object (e.g., 'networking.istio.io', 'gateway.networking.k8s.io') - `json_data` (`string`) - JSON data to apply or create the object @@ -471,12 +477,12 @@ In case multi-cluster support is enabled (default) and you have access to multip - `namespace` (`string`) - Namespace containing the Istio object - `version` (`string`) - API version of the Istio object (e.g., 'v1', 'v1beta1') -- **kiali_get_resource_details** - Gets lists or detailed info for Kubernetes resources (services, workloads) within the mesh +- **ossm_get_resource_details** - Gets lists or detailed info for Kubernetes resources (services, workloads) within the mesh - `namespaces` (`string`) - Comma-separated list of namespaces to get services from (e.g. 'bookinfo' or 'bookinfo,default'). If not provided, will list services from all accessible namespaces - `resource_name` (`string`) - Name of the resource to get details for (optional string - if provided, gets details; if empty, lists all). - `resource_type` (`string`) - Type of resource to get details for (service, workload) -- **kiali_get_metrics** - Gets lists or detailed info for Kubernetes resources (services, workloads) within the mesh +- **ossm_get_metrics** - Gets metrics for Kubernetes resources (services, workloads) within the mesh - `byLabels` (`string`) - Comma-separated list of labels to group metrics by (e.g., 'source_workload,destination_service'). Optional - `direction` (`string`) - Traffic direction: 'inbound' or 'outbound'. Optional, defaults to 'outbound' - `duration` (`string`) - Time range to get metrics for (optional string - if provided, gets metrics (e.g., '1m', '5m', '1h'); if empty, get default 30m). 
@@ -489,14 +495,14 @@ In case multi-cluster support is enabled (default) and you have access to multip - `resource_type` (`string`) **(required)** - Type of resource to get details for (service, workload) - `step` (`string`) - Step between data points in seconds (e.g., '15'). Optional, defaults to 15 seconds -- **kiali_workload_logs** - Get logs for a specific workload's pods in a namespace. Only requires namespace and workload name - automatically discovers pods and containers. Optionally filter by container name, time range, and other parameters. Container is auto-detected if not specified. +- **ossm_workload_logs** - Get logs for a specific workload's pods in a namespace. Only requires namespace and workload name - automatically discovers pods and containers. Optionally filter by container name, time range, and other parameters. Container is auto-detected if not specified. - `container` (`string`) - Optional container name to filter logs. If not provided, automatically detects and uses the main application container (excludes istio-proxy and istio-init) - `namespace` (`string`) **(required)** - Namespace containing the workload - `since` (`string`) - Time duration to fetch logs from (e.g., '5m', '1h', '30s'). If not provided, returns recent logs - `tail` (`integer`) - Number of lines to retrieve from the end of logs (default: 100) - `workload` (`string`) **(required)** - Name of the workload to get logs for -- **kiali_get_traces** - Gets traces for a specific resource (app, service, workload) in a namespace, or gets detailed information for a specific trace by its ID. If traceId is provided, it returns detailed trace information and other parameters are not required. +- **ossm_get_traces** - Gets traces for a specific resource (app, service, workload) in a namespace, or gets detailed information for a specific trace by its ID. If traceId is provided, it returns detailed trace information and other parameters are not required. 
- `clusterName` (`string`) - Cluster name for multi-cluster environments (optional, only used when traceId is not provided) - `endMicros` (`string`) - End time for traces in microseconds since epoch (optional, defaults to 10 minutes after startMicros if not provided, only used when traceId is not provided) - `limit` (`integer`) - Maximum number of traces to return (default: 100, only used when traceId is not provided) @@ -534,6 +540,53 @@ In case multi-cluster support is enabled (default) and you have access to multip
+observability + +- **prometheus_query** - Execute an instant PromQL query against the cluster's Thanos Querier. +Returns current metric values at the specified time (or current time if not specified). +Use this for point-in-time metric values. + +Common queries: +- up{job="apiserver"} - Check if API server is up +- sum by(namespace) (container_memory_usage_bytes) - Memory usage by namespace +- rate(container_cpu_usage_seconds_total[5m]) - CPU usage rate +- kube_pod_status_phase{phase="Running"} - Running pods count + - `query` (`string`) **(required)** - PromQL query string (e.g., 'up{job="apiserver"}', 'sum by(namespace) (container_memory_usage_bytes)') + - `time` (`string`) - Optional evaluation timestamp. Accepts RFC3339 format (e.g., '2024-01-01T12:00:00Z') or Unix timestamp. If not provided, uses current time. + +- **prometheus_query_range** - Execute a range PromQL query against the cluster's Thanos Querier. +Returns metric values over a time range with specified resolution. +Use this for time-series data, trends, and historical analysis. + +Supports relative times: +- 'now' for current time +- '-10m', '-1h', '-1d' for relative past times + +Example: Get CPU usage over the last hour with 1-minute resolution. + - `end` (`string`) **(required)** - End time. Accepts RFC3339 timestamp, Unix timestamp, 'now', or relative time + - `query` (`string`) **(required)** - PromQL query string (e.g., 'rate(container_cpu_usage_seconds_total[5m])') + - `start` (`string`) **(required)** - Start time. Accepts RFC3339 timestamp (e.g., '2024-01-01T12:00:00Z'), Unix timestamp, or relative time (e.g., '-1h', '-30m', '-1d') + - `step` (`string`) - Query resolution step width (e.g., '15s', '1m', '5m'). Determines the granularity of returned data points. Default: '1m' + +- **alertmanager_alerts** - Query active and pending alerts from the cluster's Alertmanager. +Useful for monitoring cluster health, detecting issues, and incident response. 
+ +Returns alerts with their labels, annotations, status, and timing information. +Can filter by active/silenced/inhibited state. + +Common use cases: +- Check for critical alerts affecting the cluster +- Monitor for specific alert types (e.g., high CPU, disk pressure) +- Verify alert silences are working correctly + - `active` (`boolean`) - Filter for active (firing) alerts. Default: true + - `filter` (`string`) - Optional filter using Alertmanager filter syntax. Examples: 'alertname=Watchdog', 'severity=critical', 'namespace=openshift-monitoring' + - `inhibited` (`boolean`) - Include inhibited alerts in the results. Default: false + - `silenced` (`boolean`) - Include silenced alerts in the results. Default: false + +
+ +
+ helm - **helm_install** - Install a Helm chart in the current or provided namespace diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md new file mode 100644 index 000000000..71d49372a --- /dev/null +++ b/docs/OBSERVABILITY.md @@ -0,0 +1,247 @@ +# Observability Toolset + +This toolset provides tools for querying OpenShift cluster observability data including Prometheus metrics and Alertmanager alerts. + +## Tools + +### prometheus_query + +Execute instant PromQL queries against the cluster's Thanos Querier. + +**Parameters:** +- `query` (required) - PromQL query string +- `time` (optional) - Evaluation timestamp (RFC3339, Unix timestamp, or relative like `-5m`, `now`) + +**Example:** +``` +Query: up{job="apiserver"} +``` + +### prometheus_query_range + +Execute range PromQL queries for time-series data. + +**Parameters:** +- `query` (required) - PromQL query string +- `start` (required) - Start time (RFC3339, Unix timestamp, or relative like `-1h`) +- `end` (required) - End time (RFC3339, Unix timestamp, or relative like `now`) +- `step` (optional) - Query resolution step (default: `1m`) + +**Example:** +``` +Query: rate(container_cpu_usage_seconds_total[5m]) +Start: -1h +End: now +Step: 1m +``` + +### alertmanager_alerts + +Query alerts from the cluster's Alertmanager. 
+ +**Parameters:** +- `active` (optional) - Include active alerts (default: true) +- `silenced` (optional) - Include silenced alerts (default: false) +- `inhibited` (optional) - Include inhibited alerts (default: false) +- `filter` (optional) - Label filter in PromQL format (e.g., `alertname="Watchdog"`) + +**Example:** +``` +Active: true +Filter: severity="critical" +``` + +## Enable the Observability Toolset + +### Option 1: Command Line + +```bash +kubernetes-mcp-server --toolsets core,config,helm,observability +``` + +### Option 2: Configuration File + +```toml +toolsets = ["core", "config", "helm", "observability"] +``` + +### Option 3: MCP Client Configuration + +```json +{ + "mcpServers": { + "kubernetes": { + "command": "npx", + "args": ["-y", "kubernetes-mcp-server@latest", "--toolsets", "core,config,helm,observability"] + } + } +} +``` + +## Configuration + +The observability toolset supports optional configuration via the config file: + +```toml +[observability] +# Custom monitoring namespace (default: "openshift-monitoring") +monitoring_namespace = "custom-monitoring" +``` + +| Option | Default | Description | +|--------|---------|-------------| +| `monitoring_namespace` | `openshift-monitoring` | Namespace where Prometheus and Alertmanager routes are located | + +## Prerequisites + +The observability tools require: + +1. **OpenShift cluster** - These tools are designed for OpenShift and rely on OpenShift-specific routes +2. **Monitoring stack enabled** - The cluster must have the monitoring stack deployed (default in OpenShift) +3. 
**Proper RBAC** - The user/service account must have permissions to: + - Read routes in `openshift-monitoring` namespace + - Access the Thanos Querier and Alertmanager APIs + +## How It Works + +### Route Discovery + +The tools automatically discover the Prometheus (Thanos Querier) and Alertmanager endpoints by reading OpenShift routes: + +- **Thanos Querier**: `thanos-querier` route in `openshift-monitoring` namespace +- **Alertmanager**: `alertmanager-main` route in `openshift-monitoring` namespace + +### Authentication + +The tools use the bearer token from your Kubernetes configuration to authenticate with the monitoring endpoints. This is the same credential used to access the cluster. + +### Relative Time Support + +Time parameters support multiple formats: + +| Format | Example | Description | +|--------|---------|-------------| +| RFC3339 | `2024-01-15T10:00:00Z` | Absolute timestamp | +| Unix | `1705312800` | Unix timestamp in seconds | +| Relative | `-10m`, `-1h`, `-1d` | Relative to current time | +| Keyword | `now` | Current time | + +## Security Considerations + +### Allowed Prometheus Endpoints + +Only read-only Prometheus API endpoints are allowed: +- `/api/v1/query` - Instant queries +- `/api/v1/query_range` - Range queries +- `/api/v1/series` - Series metadata +- `/api/v1/labels` - Label names +- `/api/v1/label//values` - Label values + +Administrative endpoints (like `/api/v1/admin/*`) are blocked. 
+ +### Allowed Alertmanager Endpoints + +Only alert query endpoints are allowed: +- `/api/v2/alerts` - List alerts +- `/api/v2/silences` - List silences +- `/api/v1/alerts` - Legacy alert endpoint + +### Query Limits + +- Maximum query length: 10,000 characters +- Maximum response size: 10MB + +## Common Use Cases + +### Cluster Health + +**Check if all API servers are up:** +``` +Query: up{job="apiserver"} +``` + +**API server request latency (99th percentile):** +``` +Query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket[5m])) by (le, verb)) +``` + +### Node and Pod Metrics + +**Node CPU usage percentage:** +``` +Query: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) +``` + +**Pods in CrashLoopBackOff:** +``` +Query: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0 +``` + +**Container memory usage by namespace:** +``` +Query: sum by(namespace) (container_memory_working_set_bytes{container!=""}) +``` + +### Alerting + +**Get all firing critical alerts:** +``` +Tool: alertmanager_alerts +Active: true +Filter: severity="critical" +``` + +**Count alerts by severity:** +``` +Query: count by(severity) (ALERTS{alertstate="firing"}) +``` + +### Network + +**Network receive rate by pod:** +``` +Query: rate(container_network_receive_bytes_total[5m]) +Start: -1h +End: now +Step: 1m +``` + +### etcd Health + +**etcd leader changes:** +``` +Query: changes(etcd_server_leader_changes_seen_total[1h]) +``` + +**etcd disk sync duration:** +``` +Query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) +``` + +## Troubleshooting + +### "failed to get route" Error + +The monitoring routes may not exist or the user lacks permissions: +```bash +oc get routes -n openshift-monitoring +``` + +### "no bearer token available" Error + +Ensure your kubeconfig has a valid token: +```bash +oc whoami +oc get pods -n openshift-monitoring +``` + +### Empty Results from Prometheus + 
+Verify the query works in the OpenShift console: +1. Go to **Observe** > **Metrics** +2. Enter your PromQL query +3. Check for results + +### TLS Certificate Errors + +The tools use `InsecureSkipVerify` for route access. If you need strict TLS verification, this would require additional configuration. diff --git a/internal/tools/update-readme/main.go b/internal/tools/update-readme/main.go index 9273d15a5..b2181483c 100644 --- a/internal/tools/update-readme/main.go +++ b/internal/tools/update-readme/main.go @@ -18,6 +18,7 @@ import ( _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kiali" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt" + _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/observability" ) type OpenShift struct{} diff --git a/pkg/config/config_default.go b/pkg/config/config_default.go index febea70cf..67c5f82fc 100644 --- a/pkg/config/config_default.go +++ b/pkg/config/config_default.go @@ -9,7 +9,7 @@ import ( func Default() *StaticConfig { defaultConfig := StaticConfig{ ListOutput: "table", - Toolsets: []string{"core", "config", "helm"}, + Toolsets: []string{"core", "config", "helm", "observability"}, } overrides := defaultOverrides() mergedConfig := mergeConfig(defaultConfig, overrides) diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 862551773..e3a88f75c 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -247,8 +247,8 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { s.Equalf("table", config.ListOutput, "Expected ListOutput to be table, got %s", config.ListOutput) }) s.Run("toolsets defaulted correctly", func() { - s.Require().Lenf(config.Toolsets, 3, "Expected 3 toolsets, got %d", len(config.Toolsets)) - for _, toolset := range []string{"core", "config", "helm"} { + s.Require().Lenf(config.Toolsets, 4, "Expected 4 toolsets, got %d", len(config.Toolsets)) + for 
_, toolset := range []string{"core", "config", "helm", "observability"} { s.Containsf(config.Toolsets, toolset, "Expected toolsets to contain %s", toolset) } }) @@ -568,7 +568,7 @@ func (s *ConfigSuite) TestStandaloneConfigDirPreservesDefaults() { s.Run("preserves default values", func() { s.Equal("9999", config.Port, "port should be from drop-in") s.Equal("table", config.ListOutput, "list_output should be default") - s.Equal([]string{"core", "config", "helm"}, config.Toolsets, "toolsets should be default") + s.Equal([]string{"core", "config", "helm", "observability"}, config.Toolsets, "toolsets should be default") }) } @@ -585,7 +585,7 @@ func (s *ConfigSuite) TestStandaloneConfigDirEmpty() { s.Run("returns defaults for empty directory", func() { s.Equal("table", config.ListOutput, "list_output should be default") - s.Equal([]string{"core", "config", "helm"}, config.Toolsets, "toolsets should be default") + s.Equal([]string{"core", "config", "helm", "observability"}, config.Toolsets, "toolsets should be default") }) } @@ -914,7 +914,7 @@ func (s *ConfigSuite) TestBothConfigAndConfigDirEmpty() { s.Run("returns default configuration", func() { s.Equal("table", config.ListOutput) - s.Equal([]string{"core", "config", "helm"}, config.Toolsets) + s.Equal([]string{"core", "config", "helm", "observability"}, config.Toolsets) s.Equal(0, config.LogLevel) }) } @@ -1034,7 +1034,7 @@ func (s *ConfigSuite) TestEmptyConfigFile() { s.Equal("9999", config.Port, "port should be from drop-in") // Defaults should still be applied for unset values s.Equal("table", config.ListOutput, "list_output should be default") - s.Equal([]string{"core", "config", "helm"}, config.Toolsets, "toolsets should be default") + s.Equal([]string{"core", "config", "helm", "observability"}, config.Toolsets, "toolsets should be default") }) } diff --git a/pkg/mcp/modules.go b/pkg/mcp/modules.go index 255f42177..7502c2e12 100644 --- a/pkg/mcp/modules.go +++ b/pkg/mcp/modules.go @@ -6,4 +6,5 @@ import ( _ 
"github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kiali" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt" + _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/observability" ) diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json index da8e244a3..ce0ef9a61 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json @@ -1,4 +1,47 @@ [ + { + "annotations": { + "title": "Alertmanager: Get Alerts", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Query active and pending alerts from the cluster's Alertmanager.\nUseful for monitoring cluster health, detecting issues, and incident response.\n\nReturns alerts with their labels, annotations, status, and timing information.\nCan filter by active/silenced/inhibited state.\n\nCommon use cases:\n- Check for critical alerts affecting the cluster\n- Monitor for specific alert types (e.g., high CPU, disk pressure)\n- Verify alert silences are working correctly", + "inputSchema": { + "type": "object", + "properties": { + "active": { + "default": true, + "description": "Filter for active (firing) alerts. Default: true", + "type": "boolean" + }, + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "enum": [ + "extra-cluster", + "fake-context" + ], + "type": "string" + }, + "filter": { + "description": "Optional filter using Alertmanager filter syntax. Examples: 'alertname=Watchdog', 'severity=critical', 'namespace=openshift-monitoring'", + "type": "string" + }, + "inhibited": { + "default": false, + "description": "Include inhibited alerts in the results. 
Default: false", + "type": "boolean" + }, + "silenced": { + "default": false, + "description": "Include silenced alerts in the results. Default: false", + "type": "boolean" + } + } + }, + "name": "alertmanager_alerts" + }, { "annotations": { "title": "Configuration: Contexts List", @@ -610,6 +653,87 @@ }, "name": "pods_top" }, + { + "annotations": { + "title": "Prometheus: Instant Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute an instant PromQL query against the cluster's Thanos Querier.\nReturns current metric values at the specified time (or current time if not specified).\nUse this for point-in-time metric values.\n\nCommon queries:\n- up{job=\"apiserver\"} - Check if API server is up\n- sum by(namespace) (container_memory_usage_bytes) - Memory usage by namespace\n- rate(container_cpu_usage_seconds_total[5m]) - CPU usage rate\n- kube_pod_status_phase{phase=\"Running\"} - Running pods count", + "inputSchema": { + "type": "object", + "properties": { + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "enum": [ + "extra-cluster", + "fake-context" + ], + "type": "string" + }, + "query": { + "description": "PromQL query string (e.g., 'up{job=\"apiserver\"}', 'sum by(namespace) (container_memory_usage_bytes)')", + "type": "string" + }, + "time": { + "description": "Optional evaluation timestamp. Accepts RFC3339 format (e.g., '2024-01-01T12:00:00Z') or Unix timestamp. 
If not provided, uses current time.", + "type": "string" + } + }, + "required": [ + "query" + ] + }, + "name": "prometheus_query" + }, + { + "annotations": { + "title": "Prometheus: Range Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute a range PromQL query against the cluster's Thanos Querier.\nReturns metric values over a time range with specified resolution.\nUse this for time-series data, trends, and historical analysis.\n\nSupports relative times:\n- 'now' for current time\n- '-10m', '-1h', '-1d' for relative past times\n\nExample: Get CPU usage over the last hour with 1-minute resolution.", + "inputSchema": { + "type": "object", + "properties": { + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "enum": [ + "extra-cluster", + "fake-context" + ], + "type": "string" + }, + "end": { + "description": "End time. Accepts RFC3339 timestamp, Unix timestamp, 'now', or relative time", + "type": "string" + }, + "query": { + "description": "PromQL query string (e.g., 'rate(container_cpu_usage_seconds_total[5m])')", + "type": "string" + }, + "start": { + "description": "Start time. Accepts RFC3339 timestamp (e.g., '2024-01-01T12:00:00Z'), Unix timestamp, or relative time (e.g., '-1h', '-30m', '-1d')", + "type": "string" + }, + "step": { + "default": "1m", + "description": "Query resolution step width (e.g., '15s', '1m', '5m'). Determines the granularity of returned data points. 
Default: '1m'", + "type": "string" + } + }, + "required": [ + "query", + "start", + "end" + ] + }, + "name": "prometheus_query_range" + }, { "annotations": { "title": "Resources: Create or Update", diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json index 691cccaee..559573122 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json @@ -1,4 +1,43 @@ [ + { + "annotations": { + "title": "Alertmanager: Get Alerts", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Query active and pending alerts from the cluster's Alertmanager.\nUseful for monitoring cluster health, detecting issues, and incident response.\n\nReturns alerts with their labels, annotations, status, and timing information.\nCan filter by active/silenced/inhibited state.\n\nCommon use cases:\n- Check for critical alerts affecting the cluster\n- Monitor for specific alert types (e.g., high CPU, disk pressure)\n- Verify alert silences are working correctly", + "inputSchema": { + "type": "object", + "properties": { + "active": { + "default": true, + "description": "Filter for active (firing) alerts. Default: true", + "type": "boolean" + }, + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "type": "string" + }, + "filter": { + "description": "Optional filter using Alertmanager filter syntax. Examples: 'alertname=Watchdog', 'severity=critical', 'namespace=openshift-monitoring'", + "type": "string" + }, + "inhibited": { + "default": false, + "description": "Include inhibited alerts in the results. Default: false", + "type": "boolean" + }, + "silenced": { + "default": false, + "description": "Include silenced alerts in the results. 
Default: false", + "type": "boolean" + } + } + }, + "name": "alertmanager_alerts" + }, { "annotations": { "title": "Configuration: Contexts List", @@ -546,6 +585,79 @@ }, "name": "pods_top" }, + { + "annotations": { + "title": "Prometheus: Instant Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute an instant PromQL query against the cluster's Thanos Querier.\nReturns current metric values at the specified time (or current time if not specified).\nUse this for point-in-time metric values.\n\nCommon queries:\n- up{job=\"apiserver\"} - Check if API server is up\n- sum by(namespace) (container_memory_usage_bytes) - Memory usage by namespace\n- rate(container_cpu_usage_seconds_total[5m]) - CPU usage rate\n- kube_pod_status_phase{phase=\"Running\"} - Running pods count", + "inputSchema": { + "type": "object", + "properties": { + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "type": "string" + }, + "query": { + "description": "PromQL query string (e.g., 'up{job=\"apiserver\"}', 'sum by(namespace) (container_memory_usage_bytes)')", + "type": "string" + }, + "time": { + "description": "Optional evaluation timestamp. Accepts RFC3339 format (e.g., '2024-01-01T12:00:00Z') or Unix timestamp. 
If not provided, uses current time.", + "type": "string" + } + }, + "required": [ + "query" + ] + }, + "name": "prometheus_query" + }, + { + "annotations": { + "title": "Prometheus: Range Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute a range PromQL query against the cluster's Thanos Querier.\nReturns metric values over a time range with specified resolution.\nUse this for time-series data, trends, and historical analysis.\n\nSupports relative times:\n- 'now' for current time\n- '-10m', '-1h', '-1d' for relative past times\n\nExample: Get CPU usage over the last hour with 1-minute resolution.", + "inputSchema": { + "type": "object", + "properties": { + "context": { + "description": "Optional parameter selecting which context to run the tool in. Defaults to fake-context if not set", + "type": "string" + }, + "end": { + "description": "End time. Accepts RFC3339 timestamp, Unix timestamp, 'now', or relative time", + "type": "string" + }, + "query": { + "description": "PromQL query string (e.g., 'rate(container_cpu_usage_seconds_total[5m])')", + "type": "string" + }, + "start": { + "description": "Start time. Accepts RFC3339 timestamp (e.g., '2024-01-01T12:00:00Z'), Unix timestamp, or relative time (e.g., '-1h', '-30m', '-1d')", + "type": "string" + }, + "step": { + "default": "1m", + "description": "Query resolution step width (e.g., '15s', '1m', '5m'). Determines the granularity of returned data points. 
Default: '1m'", + "type": "string" + } + }, + "required": [ + "query", + "start", + "end" + ] + }, + "name": "prometheus_query_range" + }, { "annotations": { "title": "Resources: Create or Update", diff --git a/pkg/mcp/testdata/toolsets-full-tools-openshift.json b/pkg/mcp/testdata/toolsets-full-tools-openshift.json index 21e7fd600..6b5ef3112 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-openshift.json +++ b/pkg/mcp/testdata/toolsets-full-tools-openshift.json @@ -1,4 +1,39 @@ [ + { + "annotations": { + "title": "Alertmanager: Get Alerts", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Query active and pending alerts from the cluster's Alertmanager.\nUseful for monitoring cluster health, detecting issues, and incident response.\n\nReturns alerts with their labels, annotations, status, and timing information.\nCan filter by active/silenced/inhibited state.\n\nCommon use cases:\n- Check for critical alerts affecting the cluster\n- Monitor for specific alert types (e.g., high CPU, disk pressure)\n- Verify alert silences are working correctly", + "inputSchema": { + "type": "object", + "properties": { + "active": { + "default": true, + "description": "Filter for active (firing) alerts. Default: true", + "type": "boolean" + }, + "filter": { + "description": "Optional filter using Alertmanager filter syntax. Examples: 'alertname=Watchdog', 'severity=critical', 'namespace=openshift-monitoring'", + "type": "string" + }, + "inhibited": { + "default": false, + "description": "Include inhibited alerts in the results. Default: false", + "type": "boolean" + }, + "silenced": { + "default": false, + "description": "Include silenced alerts in the results. 
Default: false", + "type": "boolean" + } + } + }, + "name": "alertmanager_alerts" + }, { "annotations": { "title": "Configuration: View", @@ -479,6 +514,71 @@ }, "name": "projects_list" }, + { + "annotations": { + "title": "Prometheus: Instant Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute an instant PromQL query against the cluster's Thanos Querier.\nReturns current metric values at the specified time (or current time if not specified).\nUse this for point-in-time metric values.\n\nCommon queries:\n- up{job=\"apiserver\"} - Check if API server is up\n- sum by(namespace) (container_memory_usage_bytes) - Memory usage by namespace\n- rate(container_cpu_usage_seconds_total[5m]) - CPU usage rate\n- kube_pod_status_phase{phase=\"Running\"} - Running pods count", + "inputSchema": { + "type": "object", + "properties": { + "query": { + "description": "PromQL query string (e.g., 'up{job=\"apiserver\"}', 'sum by(namespace) (container_memory_usage_bytes)')", + "type": "string" + }, + "time": { + "description": "Optional evaluation timestamp. Accepts RFC3339 format (e.g., '2024-01-01T12:00:00Z') or Unix timestamp. 
If not provided, uses current time.", + "type": "string" + } + }, + "required": [ + "query" + ] + }, + "name": "prometheus_query" + }, + { + "annotations": { + "title": "Prometheus: Range Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute a range PromQL query against the cluster's Thanos Querier.\nReturns metric values over a time range with specified resolution.\nUse this for time-series data, trends, and historical analysis.\n\nSupports relative times:\n- 'now' for current time\n- '-10m', '-1h', '-1d' for relative past times\n\nExample: Get CPU usage over the last hour with 1-minute resolution.", + "inputSchema": { + "type": "object", + "properties": { + "end": { + "description": "End time. Accepts RFC3339 timestamp, Unix timestamp, 'now', or relative time", + "type": "string" + }, + "query": { + "description": "PromQL query string (e.g., 'rate(container_cpu_usage_seconds_total[5m])')", + "type": "string" + }, + "start": { + "description": "Start time. Accepts RFC3339 timestamp (e.g., '2024-01-01T12:00:00Z'), Unix timestamp, or relative time (e.g., '-1h', '-30m', '-1d')", + "type": "string" + }, + "step": { + "default": "1m", + "description": "Query resolution step width (e.g., '15s', '1m', '5m'). Determines the granularity of returned data points. 
Default: '1m'", + "type": "string" + } + }, + "required": [ + "query", + "start", + "end" + ] + }, + "name": "prometheus_query_range" + }, { "annotations": { "title": "Resources: Create or Update", diff --git a/pkg/mcp/testdata/toolsets-full-tools.json b/pkg/mcp/testdata/toolsets-full-tools.json index fcb890919..73e53dc78 100644 --- a/pkg/mcp/testdata/toolsets-full-tools.json +++ b/pkg/mcp/testdata/toolsets-full-tools.json @@ -1,4 +1,39 @@ [ + { + "annotations": { + "title": "Alertmanager: Get Alerts", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Query active and pending alerts from the cluster's Alertmanager.\nUseful for monitoring cluster health, detecting issues, and incident response.\n\nReturns alerts with their labels, annotations, status, and timing information.\nCan filter by active/silenced/inhibited state.\n\nCommon use cases:\n- Check for critical alerts affecting the cluster\n- Monitor for specific alert types (e.g., high CPU, disk pressure)\n- Verify alert silences are working correctly", + "inputSchema": { + "type": "object", + "properties": { + "active": { + "default": true, + "description": "Filter for active (firing) alerts. Default: true", + "type": "boolean" + }, + "filter": { + "description": "Optional filter using Alertmanager filter syntax. Examples: 'alertname=Watchdog', 'severity=critical', 'namespace=openshift-monitoring'", + "type": "string" + }, + "inhibited": { + "default": false, + "description": "Include inhibited alerts in the results. Default: false", + "type": "boolean" + }, + "silenced": { + "default": false, + "description": "Include silenced alerts in the results. 
Default: false", + "type": "boolean" + } + } + }, + "name": "alertmanager_alerts" + }, { "annotations": { "title": "Configuration: View", @@ -466,6 +501,71 @@ }, "name": "pods_top" }, + { + "annotations": { + "title": "Prometheus: Instant Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute an instant PromQL query against the cluster's Thanos Querier.\nReturns current metric values at the specified time (or current time if not specified).\nUse this for point-in-time metric values.\n\nCommon queries:\n- up{job=\"apiserver\"} - Check if API server is up\n- sum by(namespace) (container_memory_usage_bytes) - Memory usage by namespace\n- rate(container_cpu_usage_seconds_total[5m]) - CPU usage rate\n- kube_pod_status_phase{phase=\"Running\"} - Running pods count", + "inputSchema": { + "type": "object", + "properties": { + "query": { + "description": "PromQL query string (e.g., 'up{job=\"apiserver\"}', 'sum by(namespace) (container_memory_usage_bytes)')", + "type": "string" + }, + "time": { + "description": "Optional evaluation timestamp. Accepts RFC3339 format (e.g., '2024-01-01T12:00:00Z') or Unix timestamp. 
If not provided, uses current time.", + "type": "string" + } + }, + "required": [ + "query" + ] + }, + "name": "prometheus_query" + }, + { + "annotations": { + "title": "Prometheus: Range Query", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + }, + "description": "Execute a range PromQL query against the cluster's Thanos Querier.\nReturns metric values over a time range with specified resolution.\nUse this for time-series data, trends, and historical analysis.\n\nSupports relative times:\n- 'now' for current time\n- '-10m', '-1h', '-1d' for relative past times\n\nExample: Get CPU usage over the last hour with 1-minute resolution.", + "inputSchema": { + "type": "object", + "properties": { + "end": { + "description": "End time. Accepts RFC3339 timestamp, Unix timestamp, 'now', or relative time", + "type": "string" + }, + "query": { + "description": "PromQL query string (e.g., 'rate(container_cpu_usage_seconds_total[5m])')", + "type": "string" + }, + "start": { + "description": "Start time. Accepts RFC3339 timestamp (e.g., '2024-01-01T12:00:00Z'), Unix timestamp, or relative time (e.g., '-1h', '-30m', '-1d')", + "type": "string" + }, + "step": { + "default": "1m", + "description": "Query resolution step width (e.g., '15s', '1m', '5m'). Determines the granularity of returned data points. Default: '1m'", + "type": "string" + } + }, + "required": [ + "query", + "start", + "end" + ] + }, + "name": "prometheus_query_range" + }, { "annotations": { "title": "Resources: Create or Update", diff --git a/pkg/prometheus/alertmanager.go b/pkg/prometheus/alertmanager.go new file mode 100644 index 000000000..1b06941e1 --- /dev/null +++ b/pkg/prometheus/alertmanager.go @@ -0,0 +1,40 @@ +package prometheus + +import ( + "context" + "encoding/json" + "fmt" + "net/url" +) + +// buildAlertsParams constructs query parameters for Alertmanager alerts API. 
+func buildAlertsParams(active, silenced, inhibited bool, filter string) url.Values { + params := url.Values{} + params.Set("active", fmt.Sprintf("%t", active)) + params.Set("silenced", fmt.Sprintf("%t", silenced)) + params.Set("inhibited", fmt.Sprintf("%t", inhibited)) + if filter != "" { + params.Add("filter", filter) + } + return params +} + +// GetAlerts retrieves alerts from Alertmanager. +func (c *Client) GetAlerts(ctx context.Context, active, silenced, inhibited bool, filter string) ([]Alert, error) { + body, err := c.executeRequest(ctx, "/api/v2/alerts", buildAlertsParams(active, silenced, inhibited, filter)) + if err != nil { + return nil, err + } + + var alerts []Alert + if err := json.Unmarshal(body, &alerts); err != nil { + return nil, fmt.Errorf("failed to parse alerts response: %w", err) + } + + return alerts, nil +} + +// GetAlertsRaw retrieves raw JSON alerts from Alertmanager. +func (c *Client) GetAlertsRaw(ctx context.Context, active, silenced, inhibited bool, filter string) ([]byte, error) { + return c.executeRequest(ctx, "/api/v2/alerts", buildAlertsParams(active, silenced, inhibited, filter)) +} diff --git a/pkg/prometheus/client.go b/pkg/prometheus/client.go new file mode 100644 index 000000000..10ecbf2c4 --- /dev/null +++ b/pkg/prometheus/client.go @@ -0,0 +1,155 @@ +package prometheus + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "time" +) + +const ( + // DefaultTimeout is the default HTTP timeout. + DefaultTimeout = 30 * time.Second + + // MaxResponseSize is the maximum response size (10MB). + MaxResponseSize = 10 * 1024 * 1024 +) + +// Client is an HTTP client for Prometheus and Alertmanager APIs. +type Client struct { + baseURL string + bearerToken string + tlsConfig *tls.Config + timeout time.Duration +} + +// NewClient creates a new Prometheus client with the specified base URL and options. 
+func NewClient(baseURL string, opts ...ClientOption) *Client { + c := &Client{ + baseURL: baseURL, + tlsConfig: newDefaultTLSConfig(), + timeout: DefaultTimeout, + } + + for _, opt := range opts { + opt(c) + } + + return c +} + +// Query executes an instant PromQL query at the specified time. +// If timeStr is empty, the current time is used. +func (c *Client) Query(ctx context.Context, query string, timeStr string) (*QueryResult, error) { + params := url.Values{} + params.Set("query", query) + if timeStr != "" { + params.Set("time", timeStr) + } + + body, err := c.executeRequest(ctx, "/api/v1/query", params) + if err != nil { + return nil, err + } + + var result QueryResult + if err := json.Unmarshal(body, &result); err != nil { + return nil, fmt.Errorf("failed to parse query response: %w", err) + } + + return &result, nil +} + +// QueryRange executes a range PromQL query over the specified time range. +func (c *Client) QueryRange(ctx context.Context, query, start, end, step string) (*QueryResult, error) { + params := url.Values{} + params.Set("query", query) + params.Set("start", start) + params.Set("end", end) + params.Set("step", step) + + body, err := c.executeRequest(ctx, "/api/v1/query_range", params) + if err != nil { + return nil, err + } + + var result QueryResult + if err := json.Unmarshal(body, &result); err != nil { + return nil, fmt.Errorf("failed to parse query_range response: %w", err) + } + + return &result, nil +} + +// QueryRaw executes a query and returns the raw JSON response. +func (c *Client) QueryRaw(ctx context.Context, endpoint string, params url.Values) ([]byte, error) { + return c.executeRequest(ctx, endpoint, params) +} + +// executeRequest executes an HTTP GET request with authentication. +func (c *Client) executeRequest(ctx context.Context, endpoint string, params url.Values) ([]byte, error) { + // Build URL + requestURL := c.baseURL + endpoint + if len(params) > 0 { + requestURL += "?" 
+ params.Encode() + } + + // Create request + req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + // Add authentication + if c.bearerToken != "" { + req.Header.Set("Authorization", "Bearer "+c.bearerToken) + } + + // Execute request + client := c.createHTTPClient() + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + // Read response with size limit + limitedReader := io.LimitReader(resp.Body, MaxResponseSize+1) + body, err := io.ReadAll(limitedReader) + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + if len(body) > MaxResponseSize { + return nil, fmt.Errorf("response size exceeds maximum of %d bytes", MaxResponseSize) + } + + // Check HTTP status + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, truncateString(string(body), 200)) + } + + return body, nil +} + +// createHTTPClient creates an HTTP client with the configured TLS and timeout settings. +func (c *Client) createHTTPClient() *http.Client { + return &http.Client{ + Timeout: c.timeout, + Transport: &http.Transport{ + TLSClientConfig: c.tlsConfig, + }, + } +} + +// truncateString truncates a string to the specified length. +func truncateString(s string, maxLen int) string { + if len(s) <= maxLen { + return s + } + return s[:maxLen] + "..." 
+} diff --git a/pkg/prometheus/client_test.go b/pkg/prometheus/client_test.go new file mode 100644 index 000000000..1fed6efd9 --- /dev/null +++ b/pkg/prometheus/client_test.go @@ -0,0 +1,467 @@ +package prometheus + +import ( + "context" + "crypto/tls" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/suite" + "k8s.io/client-go/rest" +) + +type PrometheusSuite struct { + suite.Suite +} + +func (s *PrometheusSuite) TestNewClient() { + s.Run("creates client with defaults", func() { + client := NewClient("https://prometheus.example.com") + + s.Equal("https://prometheus.example.com", client.baseURL) + s.Equal("", client.bearerToken) + s.Equal(DefaultTimeout, client.timeout) + s.NotNil(client.tlsConfig) + }) + + s.Run("applies bearer token option", func() { + client := NewClient("https://prometheus.example.com", + WithBearerToken("test-token"), + ) + + s.Equal("test-token", client.bearerToken) + }) + + s.Run("applies timeout option", func() { + client := NewClient("https://prometheus.example.com", + WithTimeout(60*time.Second), + ) + + s.Equal(60*time.Second, client.timeout) + }) + + s.Run("applies insecure option", func() { + client := NewClient("https://prometheus.example.com", + WithInsecure(true), + ) + + s.True(client.tlsConfig.InsecureSkipVerify) + }) + + s.Run("trims whitespace from bearer token", func() { + client := NewClient("https://prometheus.example.com", + WithBearerToken(" test-token "), + ) + + s.Equal("test-token", client.bearerToken) + }) +} + +func (s *PrometheusSuite) TestWithBearerTokenFromRESTConfig() { + s.Run("uses token from BearerToken field", func() { + config := &rest.Config{ + BearerToken: "direct-token", + } + + client := NewClient("https://prometheus.example.com", + WithBearerTokenFromRESTConfig(config), + ) + + s.Equal("direct-token", client.bearerToken) + }) + + s.Run("handles nil config gracefully", func() { + client := NewClient("https://prometheus.example.com", + 
WithBearerTokenFromRESTConfig(nil), + ) + + s.Equal("", client.bearerToken) + }) +} + +func (s *PrometheusSuite) TestWithTLSFromRESTConfig() { + s.Run("handles nil config gracefully", func() { + client := NewClient("https://prometheus.example.com", + WithTLSFromRESTConfig(nil), + ) + + s.NotNil(client.tlsConfig) + }) + + s.Run("uses CAData when available", func() { + // Create a minimal PEM certificate for testing + caPEM := []byte(`-----BEGIN CERTIFICATE----- +MIIBkTCB+wIJAKHBfpegPjMCMA0GCSqGSIb3DQEBCwUAMBExDzANBgNVBAMMBnRl +c3RjYTAeFw0yNDAxMDEwMDAwMDBaFw0yNTAxMDEwMDAwMDBaMBExDzANBgNVBAMM +BnRlc3RjYTBcMA0GCSqGSIb3DQEBAQUAA0sAMEgCQQC7o96FCFhP2RxnNwj7mVXh +qGYXt9L9BJVjjTpD2hCRVEJgqGYb3bSoGiK4MYpqnLJDt9IBSfJz7JBkjHDvDZLX +AgMBAAGjUzBRMB0GA1UdDgQWBBQS0P3hKf3cG8XKBQMO3F/3GmZ7wjAfBgNVHSME +GDAWgBQS0P3hKf3cG8XKBQMO3F/3GmZ7wjAPBgNVHRMBAf8EBTADAQH/MA0GCSqG +SIb3DQEBCwUAA0EAFHbN1pWPxvCqVTH1gHCJdNlHqY3hg3PA2PIzv1NiaP3qmJk0 +cDq6b5fP0Z3e6Q1OvH5hEYnD6W8fXG5M8CxHjg== +-----END CERTIFICATE-----`) + + config := &rest.Config{ + TLSClientConfig: rest.TLSClientConfig{ + CAData: caPEM, + }, + } + + client := NewClient("https://prometheus.example.com", + WithTLSFromRESTConfig(config), + ) + + s.NotNil(client.tlsConfig.RootCAs) + }) +} + +func (s *PrometheusSuite) TestQuery() { + s.Run("executes instant query", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.Equal("/api/v1/query", r.URL.Path) + s.Equal("up", r.URL.Query().Get("query")) + + response := QueryResult{ + Status: "success", + Data: Data{ + ResultType: "vector", + Result: []Result{ + { + Metric: map[string]string{"__name__": "up", "job": "apiserver"}, + Value: []any{1234567890.0, "1"}, + }, + }, + }, + } + _ = json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + client := NewClient(server.URL) + result, err := client.Query(context.Background(), "up", "") + + s.NoError(err) + s.Equal("success", result.Status) + s.Len(result.Data.Result, 1) + 
s.Equal("up", result.Data.Result[0].Metric["__name__"]) + }) + + s.Run("includes time parameter when specified", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.Equal("1234567890", r.URL.Query().Get("time")) + + response := QueryResult{Status: "success"} + _ = json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + client := NewClient(server.URL) + _, err := client.Query(context.Background(), "up", "1234567890") + + s.NoError(err) + }) + + s.Run("includes bearer token in request", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.Equal("Bearer test-token", r.Header.Get("Authorization")) + + response := QueryResult{Status: "success"} + _ = json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + client := NewClient(server.URL, WithBearerToken("test-token")) + _, err := client.Query(context.Background(), "up", "") + + s.NoError(err) + }) + + s.Run("returns error for HTTP error status", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "Internal Server Error", http.StatusInternalServerError) + })) + defer server.Close() + + client := NewClient(server.URL) + _, err := client.Query(context.Background(), "up", "") + + s.Error(err) + s.Contains(err.Error(), "500") + }) +} + +func (s *PrometheusSuite) TestQueryRange() { + s.Run("executes range query with all parameters", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.Equal("/api/v1/query_range", r.URL.Path) + s.Equal("rate(http_requests_total[5m])", r.URL.Query().Get("query")) + s.Equal("2024-01-01T00:00:00Z", r.URL.Query().Get("start")) + s.Equal("2024-01-01T01:00:00Z", r.URL.Query().Get("end")) + s.Equal("1m", r.URL.Query().Get("step")) + + response := QueryResult{ + Status: "success", + Data: Data{ + ResultType: "matrix", + Result: []Result{ + { + 
Metric: map[string]string{"__name__": "http_requests_total"}, + Values: [][]any{ + {1234567890.0, "10"}, + {1234567950.0, "15"}, + }, + }, + }, + }, + } + _ = json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + client := NewClient(server.URL) + result, err := client.QueryRange(context.Background(), + "rate(http_requests_total[5m])", + "2024-01-01T00:00:00Z", + "2024-01-01T01:00:00Z", + "1m", + ) + + s.NoError(err) + s.Equal("success", result.Status) + s.Equal("matrix", result.Data.ResultType) + s.Len(result.Data.Result, 1) + s.Len(result.Data.Result[0].Values, 2) + }) +} + +func (s *PrometheusSuite) TestGetAlerts() { + s.Run("retrieves alerts with parameters", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.Equal("/api/v2/alerts", r.URL.Path) + s.Equal("true", r.URL.Query().Get("active")) + s.Equal("false", r.URL.Query().Get("silenced")) + s.Equal("false", r.URL.Query().Get("inhibited")) + + alerts := []Alert{ + { + Labels: map[string]string{"alertname": "HighCPU", "severity": "warning"}, + Annotations: map[string]string{"summary": "CPU usage is high"}, + StartsAt: "2024-01-01T00:00:00Z", + Status: AlertStatus{ + State: "active", + }, + }, + } + _ = json.NewEncoder(w).Encode(alerts) + })) + defer server.Close() + + client := NewClient(server.URL) + alerts, err := client.GetAlerts(context.Background(), true, false, false, "") + + s.NoError(err) + s.Len(alerts, 1) + s.Equal("HighCPU", alerts[0].Labels["alertname"]) + s.Equal("active", alerts[0].Status.State) + }) + + s.Run("includes filter parameter when specified", func() { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.Equal("alertname=Watchdog", r.URL.Query().Get("filter")) + + alerts := []Alert{} + _ = json.NewEncoder(w).Encode(alerts) + })) + defer server.Close() + + client := NewClient(server.URL) + _, err := client.GetAlerts(context.Background(), true, false, false, 
"alertname=Watchdog") + + s.NoError(err) + }) +} + +func (s *PrometheusSuite) TestConvertRelativeTime() { + s.Run("handles 'now' keyword", func() { + before := time.Now().UTC() + result, err := ConvertRelativeTime("now") + after := time.Now().UTC() + + s.NoError(err) + s.Contains(result, "T", "Result should be RFC3339 format") + + // Parse and verify it's within the expected time range + parsed, err := time.Parse(time.RFC3339, result) + s.NoError(err) + s.True(parsed.After(before.Add(-time.Second)) && parsed.Before(after.Add(time.Second)), + "Parsed time should be close to current time") + }) + + s.Run("handles RFC3339 timestamp unchanged", func() { + input := "2024-01-01T12:00:00Z" + result, err := ConvertRelativeTime(input) + + s.NoError(err) + s.Equal(input, result, "RFC3339 timestamp should be returned unchanged") + }) + + s.Run("handles Unix timestamp unchanged", func() { + input := "1704110400" + result, err := ConvertRelativeTime(input) + + s.NoError(err) + s.Equal(input, result, "Unix timestamp should be returned unchanged") + }) + + s.Run("handles relative time -10m", func() { + before := time.Now().UTC().Add(-10 * time.Minute) + result, err := ConvertRelativeTime("-10m") + after := time.Now().UTC().Add(-10 * time.Minute) + + s.NoError(err) + s.Contains(result, "T", "Result should be RFC3339 format") + + parsed, err := time.Parse(time.RFC3339, result) + s.NoError(err) + s.True(parsed.After(before.Add(-time.Second)) && parsed.Before(after.Add(time.Second)), + "Parsed time should be approximately 10 minutes ago") + }) + + s.Run("handles relative time -1h", func() { + before := time.Now().UTC().Add(-1 * time.Hour) + result, err := ConvertRelativeTime("-1h") + after := time.Now().UTC().Add(-1 * time.Hour) + + s.NoError(err) + s.Contains(result, "T", "Result should be RFC3339 format") + + parsed, err := time.Parse(time.RFC3339, result) + s.NoError(err) + s.True(parsed.After(before.Add(-time.Second)) && parsed.Before(after.Add(time.Second)), + "Parsed time 
should be approximately 1 hour ago") + }) + + s.Run("handles relative time -1d (days)", func() { + before := time.Now().UTC().Add(-24 * time.Hour) + result, err := ConvertRelativeTime("-1d") + after := time.Now().UTC().Add(-24 * time.Hour) + + s.NoError(err) + s.Contains(result, "T", "Result should be RFC3339 format") + + parsed, err := time.Parse(time.RFC3339, result) + s.NoError(err) + s.True(parsed.After(before.Add(-time.Second)) && parsed.Before(after.Add(time.Second)), + "Parsed time should be approximately 1 day ago") + }) + + s.Run("handles relative time -30s (seconds)", func() { + before := time.Now().UTC().Add(-30 * time.Second) + result, err := ConvertRelativeTime("-30s") + after := time.Now().UTC().Add(-30 * time.Second) + + s.NoError(err) + s.Contains(result, "T", "Result should be RFC3339 format") + + parsed, err := time.Parse(time.RFC3339, result) + s.NoError(err) + s.True(parsed.After(before.Add(-time.Second)) && parsed.Before(after.Add(time.Second)), + "Parsed time should be approximately 30 seconds ago") + }) + + s.Run("handles whitespace around input", func() { + result, err := ConvertRelativeTime(" now ") + + s.NoError(err) + s.Contains(result, "T", "Result should be RFC3339 format") + }) + + s.Run("returns error for invalid format", func() { + _, err := ConvertRelativeTime("invalid") + + s.Error(err) + s.Contains(err.Error(), "invalid time format") + }) + + s.Run("returns error for malformed relative time", func() { + _, err := ConvertRelativeTime("-abc") + + s.Error(err) + s.Contains(err.Error(), "invalid relative time format") + }) +} + +func (s *PrometheusSuite) TestTruncateString() { + s.Run("returns original string if shorter than max", func() { + result := truncateString("hello", 10) + s.Equal("hello", result) + }) + + s.Run("returns original string if equal to max", func() { + result := truncateString("hello", 5) + s.Equal("hello", result) + }) + + s.Run("truncates and adds ellipsis if longer than max", func() { + result := 
truncateString("hello world", 5) + s.Equal("hello...", result) + }) +} + +func (s *PrometheusSuite) TestCreateHTTPClient() { + s.Run("creates client with timeout", func() { + client := NewClient("https://example.com", WithTimeout(60*time.Second)) + httpClient := client.createHTTPClient() + + s.Equal(60*time.Second, httpClient.Timeout) + }) + + s.Run("creates client with TLS config", func() { + client := NewClient("https://example.com", WithInsecure(true)) + httpClient := client.createHTTPClient() + + transport, ok := httpClient.Transport.(*http.Transport) + s.True(ok) + s.True(transport.TLSClientConfig.InsecureSkipVerify) + }) +} + +func (s *PrometheusSuite) TestNewDefaultTLSConfig() { + s.Run("sets minimum TLS version", func() { + config := newDefaultTLSConfig() + s.Equal(uint16(tls.VersionTLS12), config.MinVersion) + }) +} + +func (s *PrometheusSuite) TestParseIntFromString() { + s.Run("returns error for empty string", func() { + _, err := parseIntFromString("") + s.Error(err) + s.Contains(err.Error(), "empty string") + }) + + s.Run("returns error for number too large", func() { + _, err := parseIntFromString("12345678901") // 11 digits + s.Error(err) + s.Contains(err.Error(), "number too large") + }) + + s.Run("parses valid number", func() { + result, err := parseIntFromString("365") + s.NoError(err) + s.Equal(365, result) + }) + + s.Run("parses max allowed digits", func() { + result, err := parseIntFromString("1234567890") // exactly 10 digits + s.NoError(err) + s.Equal(1234567890, result) + }) +} + +func TestPrometheusSuite(t *testing.T) { + suite.Run(t, new(PrometheusSuite)) +} diff --git a/pkg/prometheus/options.go b/pkg/prometheus/options.go new file mode 100644 index 000000000..60f569eca --- /dev/null +++ b/pkg/prometheus/options.go @@ -0,0 +1,163 @@ +package prometheus + +import ( + "crypto/tls" + "crypto/x509" + "os" + "strings" + "time" + + "k8s.io/client-go/rest" + "k8s.io/klog/v2" +) + +// ClientOption is a function that configures a Client. 
+type ClientOption func(*Client) + +// WithBearerToken sets the bearer token for authentication. +func WithBearerToken(token string) ClientOption { + return func(c *Client) { + c.bearerToken = strings.TrimSpace(token) + } +} + +// WithBearerTokenFromRESTConfig extracts and sets the bearer token from a Kubernetes REST config. +// It tries the token directly first, then falls back to reading from a token file. +func WithBearerTokenFromRESTConfig(config *rest.Config) ClientOption { + return func(c *Client) { + if config == nil { + return + } + + // Try bearer token directly + if config.BearerToken != "" { + c.bearerToken = config.BearerToken + return + } + + // Try bearer token file + if config.BearerTokenFile != "" { + token, err := os.ReadFile(config.BearerTokenFile) + if err != nil { + klog.V(2).Infof("Failed to read token file %s: %v", config.BearerTokenFile, err) + return + } + c.bearerToken = strings.TrimSpace(string(token)) + } + } +} + +// WithTLSFromRESTConfig configures TLS using the CA from a Kubernetes REST config. +// It tries CAData first, then CAFile, then system cert pool, and finally falls back to insecure. 
+func WithTLSFromRESTConfig(config *rest.Config) ClientOption { + return func(c *Client) { + if config == nil { + return + } + + // Try to build a cert pool with the cluster CA + var certPool *x509.CertPool + var caLoaded bool + + // First, try to load CA from REST config's CAData + if len(config.CAData) > 0 { + // Start with system cert pool if available + if systemPool, err := x509.SystemCertPool(); err == nil && systemPool != nil { + certPool = systemPool + } else { + certPool = x509.NewCertPool() + } + if ok := certPool.AppendCertsFromPEM(config.CAData); ok { + c.tlsConfig.RootCAs = certPool + caLoaded = true + klog.V(4).Info("Loaded cluster CA from REST config CAData") + } else { + klog.V(2).Info("Failed to parse CA certificates from REST config CAData") + } + } + + // If CAData wasn't available or didn't work, try CAFile + if !caLoaded && config.CAFile != "" { + caPEM, err := os.ReadFile(config.CAFile) + if err != nil { + klog.V(2).Infof("Failed to read CA file %s: %v", config.CAFile, err) + } else { + // Start with system cert pool if available + if systemPool, err := x509.SystemCertPool(); err == nil && systemPool != nil { + certPool = systemPool + } else { + certPool = x509.NewCertPool() + } + if ok := certPool.AppendCertsFromPEM(caPEM); ok { + c.tlsConfig.RootCAs = certPool + caLoaded = true + klog.V(4).Infof("Loaded cluster CA from file %s", config.CAFile) + } else { + klog.V(2).Infof("Failed to parse CA certificates from file %s", config.CAFile) + } + } + } + + // If no CA was loaded, try system cert pool alone (for routes with public CAs) + if !caLoaded { + if systemPool, err := x509.SystemCertPool(); err == nil && systemPool != nil { + c.tlsConfig.RootCAs = systemPool + klog.V(4).Info("Using system certificate pool for TLS verification") + } else { + // Last resort: skip verification with a warning + klog.Warning("No cluster CA available and system cert pool failed; using insecure TLS (skip verification)") + c.tlsConfig.InsecureSkipVerify = true + } + 
} + } +} + +// WithCustomCA configures TLS using a custom CA certificate file. +func WithCustomCA(caFile string) ClientOption { + return func(c *Client) { + caFile = strings.TrimSpace(caFile) + if caFile == "" { + return + } + + caPEM, err := os.ReadFile(caFile) + if err != nil { + klog.Errorf("Failed to read CA certificate from file %s: %v; proceeding without custom CA", caFile, err) + return + } + + // Start with the host system pool when possible so we don't drop system roots + var certPool *x509.CertPool + if systemPool, err := x509.SystemCertPool(); err == nil && systemPool != nil { + certPool = systemPool + } else { + certPool = x509.NewCertPool() + } + if ok := certPool.AppendCertsFromPEM(caPEM); ok { + c.tlsConfig.RootCAs = certPool + } else { + klog.V(0).Infof("Failed to append provided certificate authority; proceeding without custom CA") + } + } +} + +// WithInsecure configures whether to skip TLS verification. +func WithInsecure(insecure bool) ClientOption { + return func(c *Client) { + c.tlsConfig.InsecureSkipVerify = insecure + } +} + +// WithTimeout sets the HTTP client timeout. +func WithTimeout(timeout time.Duration) ClientOption { + return func(c *Client) { + c.timeout = timeout + } +} + +// newDefaultTLSConfig creates a default TLS configuration. +func newDefaultTLSConfig() *tls.Config { + return &tls.Config{ + MinVersion: tls.VersionTLS12, + } +} diff --git a/pkg/prometheus/time.go b/pkg/prometheus/time.go new file mode 100644 index 000000000..49034d130 --- /dev/null +++ b/pkg/prometheus/time.go @@ -0,0 +1,80 @@ +package prometheus + +import ( + "fmt" + "strings" + "time" +) + +// ConvertRelativeTime converts relative time strings to RFC3339 timestamps. +// Supports: "now", "-10m", "-1h", "-1d", or passthrough for RFC3339/Unix timestamps. 
func ConvertRelativeTime(timeStr string) (string, error) {
	timeStr = strings.TrimSpace(timeStr)

	// Already absolute: RFC3339 contains 'T'; bare digits are a Unix timestamp.
	// Both are passed through unchanged for the Prometheus API to interpret.
	if strings.Contains(timeStr, "T") || isNumeric(timeStr) {
		return timeStr, nil
	}

	if timeStr == "now" {
		return time.Now().UTC().Format(time.RFC3339), nil
	}

	// Relative offsets into the past: '-30s', '-10m', '-1h', '-1d'.
	if strings.HasPrefix(timeStr, "-") {
		durationStr := timeStr[1:] // drop leading '-'

		// Go's time.ParseDuration has no day unit, so handle 'd' ourselves.
		if strings.HasSuffix(durationStr, "d") {
			days, err := parseDays(strings.TrimSuffix(durationStr, "d"))
			if err != nil {
				return "", fmt.Errorf("invalid relative time format: %s", timeStr)
			}
			// AddDate in UTC is exactly days*24h earlier and, unlike
			// -time.Duration(days)*24*time.Hour, cannot silently overflow
			// int64 nanoseconds for large day counts.
			return time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339), nil
		}

		// Standard units (s, m, h, including compounds like '1h30m').
		duration, err := time.ParseDuration(durationStr)
		if err != nil {
			return "", fmt.Errorf("invalid relative time format: %s", timeStr)
		}
		return time.Now().UTC().Add(-duration).Format(time.RFC3339), nil
	}

	return "", fmt.Errorf("invalid time format: %s; expected 'now', relative time like '-10m', '-1h', '-1d', or RFC3339 timestamp", timeStr)
}

// maxLookbackDays bounds relative day offsets to the span representable by
// time.Duration (~292 years). Larger counts previously wrapped int64
// nanoseconds and produced a garbage timestamp with no error.
const maxLookbackDays = 106751

// parseDays parses a non-negative, digits-only day count, rejecting empty
// input, non-digit characters, and counts above maxLookbackDays.
func parseDays(s string) (int, error) {
	if s == "" {
		return 0, fmt.Errorf("empty day count")
	}
	days := 0
	for _, c := range s {
		if c < '0' || c > '9' {
			return 0, fmt.Errorf("invalid day count: %s", s)
		}
		days = days*10 + int(c-'0')
		if days > maxLookbackDays {
			return 0, fmt.Errorf("day count too large: %s", s)
		}
	}
	return days, nil
}

// isNumeric checks if a string is non-empty and contains only ASCII digits.
func isNumeric(s string) bool {
	if len(s) == 0 {
		return false
	}
	for _, c := range s {
		if c < '0' || c > '9' {
			return false
		}
	}
	return true
}

// parseIntFromString parses an integer from a string with overflow protection.
+func parseIntFromString(s string) (int, error) {
+	if len(s) == 0 {
+		return 0, fmt.Errorf("empty string")
+	}
+	if len(s) > 10 { // cap digit count so the value fits in int64; NOTE(review): 10-digit values can still exceed int32 max (2147483647), so this guards overflow only where int is 64-bit — confirm 32-bit targets are out of scope
+		return 0, fmt.Errorf("number too large: %s", s)
+	}
+	var result int
+	for _, c := range s {
+		if c < '0' || c > '9' { // digits only: signs, spaces, and separators are rejected
+			return 0, fmt.Errorf("invalid number: %s", s)
+		}
+		result = result*10 + int(c-'0')
+	}
+	return result, nil
+}
diff --git a/pkg/prometheus/types.go b/pkg/prometheus/types.go
new file mode 100644
index 000000000..2ce405d6d
--- /dev/null
+++ b/pkg/prometheus/types.go
@@ -0,0 +1,54 @@
+// Package prometheus provides a shared HTTP client for Prometheus and Alertmanager APIs.
+// It supports flexible authentication (bearer token), TLS configuration (REST config CA,
+// custom CA file, or insecure mode), and can be used by multiple toolsets with different
+// URL discovery mechanisms.
+package prometheus
+
+// QueryResult represents a Prometheus API query response.
+type QueryResult struct {
+	Status    string   `json:"status"`
+	Data      Data     `json:"data"`
+	ErrorType string   `json:"errorType,omitempty"`
+	Error     string   `json:"error,omitempty"`
+	Warnings  []string `json:"warnings,omitempty"`
+}
+
+// Data contains the query result data.
+type Data struct {
+	ResultType string   `json:"resultType"`
+	Result     []Result `json:"result"`
+}
+
+// Result represents a single result in a query response. Exactly one of
+// Value or Values is populated, depending on the query type.
+type Result struct {
+	Metric map[string]string `json:"metric"`
+	// Value is used for instant queries - [timestamp, value]
+	Value []any `json:"value,omitempty"`
+	// Values is used for range queries - [[timestamp, value], ...]
+	Values [][]any `json:"values,omitempty"`
+}
+
+// Alert represents an Alertmanager alert.
+type Alert struct {
+	// NOTE(review): timestamp fields appear to be RFC3339 strings as emitted by the Alertmanager v2 API — confirm against the API schema
+	Annotations  map[string]string `json:"annotations"`
+	EndsAt       string            `json:"endsAt"`
+	Fingerprint  string            `json:"fingerprint"`
+	Receivers    []Receiver        `json:"receivers"`
+	StartsAt     string            `json:"startsAt"`
+	Status       AlertStatus       `json:"status"`
+	UpdatedAt    string            `json:"updatedAt"`
+	GeneratorURL string            `json:"generatorURL,omitempty"`
+	Labels       map[string]string `json:"labels"`
+}
+
+// Receiver represents an Alertmanager receiver.
+type Receiver struct {
+	Name string `json:"name"`
+}
+
+// AlertStatus represents the status of an alert.
+type AlertStatus struct {
+	InhibitedBy []string `json:"inhibitedBy"`
+	SilencedBy  []string `json:"silencedBy"`
+	State       string   `json:"state"`
+}
diff --git a/pkg/toolsets/observability/alertmanager.go b/pkg/toolsets/observability/alertmanager.go
new file mode 100644
index 000000000..47e68fc7b
--- /dev/null
+++ b/pkg/toolsets/observability/alertmanager.go
@@ -0,0 +1,126 @@
+package observability
+
+import (
+	"fmt"
+
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+)
+
+// initAlertmanager returns the Alertmanager tools contributed by the observability toolset.
+func initAlertmanager() []api.ServerTool {
+	return []api.ServerTool{
+		initAlertmanagerAlerts(),
+	}
+}
+
+// initAlertmanagerAlerts creates the alertmanager_alerts tool definition: its input schema, read-only annotations, and handler wiring.
+func initAlertmanagerAlerts() api.ServerTool {
+	return api.ServerTool{
+		Tool: api.Tool{
+			Name: "alertmanager_alerts",
+			Description: `Query active and pending alerts from the cluster's Alertmanager.
+Useful for monitoring cluster health, detecting issues, and incident response.
+
+Returns alerts with their labels, annotations, status, and timing information.
+Can filter by active/silenced/inhibited state.
+
+Common use cases:
+- Check for critical alerts affecting the cluster
+- Monitor for specific alert types (e.g., high CPU, disk pressure)
+- Verify alert silences are working correctly`,
+			InputSchema: &jsonschema.Schema{
+				Type: "object",
+				Properties: map[string]*jsonschema.Schema{
+					"active": {
+						Type:        "boolean",
+						Description: "Filter for active (firing) alerts. Default: true",
+						Default:     api.ToRawMessage(true),
+					},
+					"silenced": {
+						Type:        "boolean",
+						Description: "Include silenced alerts in the results. Default: false",
+						Default:     api.ToRawMessage(false),
+					},
+					"inhibited": {
+						Type:        "boolean",
+						Description: "Include inhibited alerts in the results. Default: false",
+						Default:     api.ToRawMessage(false),
+					},
+					"filter": {
+						Type:        "string",
+						Description: "Optional filter using Alertmanager filter syntax. Examples: 'alertname=Watchdog', 'severity=critical', 'namespace=openshift-monitoring'",
+					},
+				},
+			},
+			// Read-only, idempotent query; OpenWorldHint because it reaches an external Alertmanager endpoint.
+			Annotations: api.ToolAnnotations{
+				Title:           "Alertmanager: Get Alerts",
+				ReadOnlyHint:    ptr.To(true),
+				DestructiveHint: ptr.To(false),
+				IdempotentHint:  ptr.To(true),
+				OpenWorldHint:   ptr.To(true),
+			},
+		},
+		Handler: alertmanagerAlertsHandler,
+	}
+}
+
+// alertmanagerAlertsHandler handles Alertmanager alerts queries.
+func alertmanagerAlertsHandler(params api.ToolHandlerParams) (*api.ToolCallResult, error) {
+	// Validate endpoint (security check). The path is a constant here, so this is
+	// defense in depth; NOTE(review): validateAlertmanagerEndpoint is defined elsewhere — confirm it allow-lists /api/v2/alerts
+	endpoint := "/api/v2/alerts"
+	if err := validateAlertmanagerEndpoint(endpoint); err != nil {
+		return api.NewToolCallResult("", err), nil
+	}
+
+	// Get Alertmanager URL. Tool-level failures are embedded in the result with a
+	// nil Go error, matching the pattern used throughout this handler.
+	baseURL, err := getRouteURL(params.Context, params, alertmanagerRoute, getMonitoringNamespace(params))
+	if err != nil {
+		return api.NewToolCallResult("", fmt.Errorf("failed to get Alertmanager route: %w", err)), nil
+	}
+
+	// Handle active parameter (default: true). Non-boolean values fall back to the default.
+	active := true
+	if v, ok := params.GetArguments()["active"].(bool); ok {
+		active = v
+	}
+
+	// Handle silenced parameter (default: false)
+	silenced := false
+	if v, ok := params.GetArguments()["silenced"].(bool); ok {
+		silenced = v
+	}
+
+	// Handle inhibited parameter (default: false)
+	inhibited := false
+	if v, ok := params.GetArguments()["inhibited"].(bool); ok {
+		inhibited = v
+	}
+
+	// Handle optional filter
+	filter := ""
+	if f, ok := params.GetArguments()["filter"].(string); ok && f != "" {
+		// Validate filter length to bound request size (see maxQueryLength)
+		if len(f) > maxQueryLength {
+			return api.NewToolCallResult("", fmt.Errorf("filter exceeds maximum length of %d characters", maxQueryLength)), nil
+		}
+		filter = f
+	}
+
+	// Create client and execute request
+	client := newPrometheusClient(baseURL, params)
+	body, err := client.GetAlertsRaw(params.Context, active, silenced, inhibited, filter)
+	if err != nil {
+		return api.NewToolCallResult("", fmt.Errorf("alertmanager query failed: %w", err)), nil
+	}
+
+	// Format response: pretty-print when possible, fall back to the raw body on formatting errors
+	result, err := prettyJSON(body)
+	if err != nil {
+		return api.NewToolCallResult(string(body), nil), nil
+	}
+
+	return api.NewToolCallResult(result, nil), nil
+}
diff --git a/pkg/toolsets/observability/config.go b/pkg/toolsets/observability/config.go
new file mode 100644
index 000000000..64653fdd1
--- /dev/null
+++ b/pkg/toolsets/observability/config.go
@@ -0,0 +1,36 @@
+package observability
+
+import (
+	"context"
+
+	"github.com/BurntSushi/toml"
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/config"
+)
+
+// Config holds observability toolset configuration
+type Config struct {
+	// MonitoringNamespace is the namespace where monitoring components are deployed.
+	// Defaults to "openshift-monitoring" if not specified.
+	MonitoringNamespace string `toml:"monitoring_namespace,omitempty"`
+}
+
+// Compile-time check that Config satisfies api.ExtendedConfig.
+var _ api.ExtendedConfig = (*Config)(nil)
+
+// Validate checks that the configuration values are valid.
+func (c *Config) Validate() error {
+	// All fields are optional with sensible defaults, no validation required
+	return nil
+}
+
+// observabilityToolsetParser decodes the [toolsets.observability] TOML table into a Config.
+func observabilityToolsetParser(_ context.Context, primitive toml.Primitive, md toml.MetaData) (api.ExtendedConfig, error) {
+	var cfg Config
+	if err := md.PrimitiveDecode(primitive, &cfg); err != nil {
+		return nil, err
+	}
+	return &cfg, nil
+}
+
+// init registers the parser so the config loader recognizes this toolset's section.
+func init() {
+	config.RegisterToolsetConfig("observability", observabilityToolsetParser)
+}
diff --git a/pkg/toolsets/observability/helpers.go b/pkg/toolsets/observability/helpers.go
new file mode 100644
index 000000000..f5420ac69
--- /dev/null
+++ b/pkg/toolsets/observability/helpers.go
@@ -0,0 +1,168 @@
+package observability
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"regexp"
+	"sync"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/klog/v2"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/prometheus"
+)
+
+const (
+	// defaultMonitoringNamespace is the default namespace for OpenShift monitoring components
+	defaultMonitoringNamespace = "openshift-monitoring"
+
+	// thanosQuerierRoute is the route name for Thanos Querier
+	thanosQuerierRoute = "thanos-querier"
+
+	// alertmanagerRoute is the route name for Alertmanager
+	alertmanagerRoute = "alertmanager-main"
+
+	// maxQueryLength is the maximum allowed query length to prevent DoS
+	maxQueryLength = 10000
+)
+
+// routeGVR is the GroupVersionResource for OpenShift Routes
+var routeGVR = schema.GroupVersionResource{
+	Group:    "route.openshift.io",
+	Version:  "v1",
+	Resource: "routes",
+}
+
+// routeURLCache caches resolved route URLs for the lifetime of the server process.
+// This avoids repeated Kubernetes API calls since routes rarely change.
+// Key format: "apiServerHost/namespace/routeName", value: URL string.
+// The API server host is included to support multi-cluster (ACM) environments.
+// NOTE(review): cache is unbounded and never invalidated — acceptable only if the clusters×routes key space stays small; confirm
+var routeURLCache sync.Map
+
+// allowedPrometheusEndpoints is a whitelist of allowed Prometheus API endpoints
+var allowedPrometheusEndpoints = map[string]bool{
+	"/api/v1/query":       true,
+	"/api/v1/query_range": true,
+	"/api/v1/series":      true,
+	"/api/v1/labels":      true,
+}
+
+// allowedPrometheusLabelPattern matches /api/v1/label/