diff --git a/docker-compose.yml b/docker-compose.yml index 5df7e10f93..86bb931434 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -63,7 +63,7 @@ services: # Only to test prometheus integration prometheus: - image: prom/prometheus:v3.0.0-beta.1 + image: prom/prometheus:v3.4.2 command: - --web.enable-remote-write-receiver - --enable-feature=native-histograms @@ -85,13 +85,12 @@ services: - debug grafana: - image: grafana/grafana:11.3.1 + image: grafana/grafana:12.0.2 ports: - '9300:3000' volumes: - grafana-storage:/var/lib/grafana - ./docker/grafana/provisioning:/etc/grafana/provisioning - container_name: grafana restart: unless-stopped networks: - primary @@ -100,6 +99,8 @@ services: - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource - CLICKHOUSE_USER=${CLICKHOUSE_USER:-default} - CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-changeme} + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin profiles: - debug @@ -261,7 +262,7 @@ services: volumes: - ./docker/redis/redis-cluster.conf:/usr/local/etc/redis/redis.conf healthcheck: - test: ["CMD", "redis-cli", "-p", "6379", "ping"] + test: ['CMD', 'redis-cli', '-p', '6379', 'ping'] interval: 10s timeout: 5s retries: 3 @@ -279,7 +280,7 @@ services: volumes: - ./docker/redis/redis-cluster.conf:/usr/local/etc/redis/redis.conf healthcheck: - test: ["CMD", "redis-cli", "-p", "6379", "ping"] + test: ['CMD', 'redis-cli', '-p', '6379', 'ping'] interval: 10s timeout: 5s retries: 3 @@ -297,7 +298,7 @@ services: volumes: - ./docker/redis/redis-cluster.conf:/usr/local/etc/redis/redis.conf healthcheck: - test: ["CMD", "redis-cli", "-p", "6379", "ping"] + test: ['CMD', 'redis-cli', '-p', '6379', 'ping'] interval: 10s timeout: 5s retries: 3 diff --git a/router/bench-limited-cardinality.js b/router/bench-limited-cardinality.js new file mode 100644 index 0000000000..99bd9a41a2 --- /dev/null +++ b/router/bench-limited-cardinality.js @@ -0,0 +1,185 @@ +import http from 'k6/http'; +import { check } from 'k6'; +import { randomString } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js'; + +/* + Benchmarking script to run a graphql query with a random operation name from a fixed size pool. + Useful to test metric attributes. + */ + +export const options = { + stages: [ + { duration: '15s', target: 20 }, + { duration: '15s', target: 50 }, + { duration: '20s', target: 100 }, + { duration: '30m', target: 100 }, + ], +}; + +// in the simple case from a clean state it's around (operationName)*5 series per metric +// mostly due to wg_subgraph_id and wg_subgraph_name array exploding + +// 300 should be under the default cardinality limit (1500 < 2000) +// 500 should be slightly over the default cardinality limit (2500 > 2000) +const distinctNames = 300; + +export function setup() { + let randomNames = []; + + for (let i = 0; i < distinctNames; i++) { + randomNames.push(randomString(10, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')); + } + + console.log('Generated ' + distinctNames + ' random names'); + + return { randomNames }; +} + +export default function ({ randomNames }) { + let query = ` + query $$__REPLACE_ME__$$ { + employees { + # resolved through employees subgraph + id + # overridden by the products subgraph + notes + details { + # resolved through either employees or family subgraph + forename + surname + # resolved through employees subgraph + location { + key { + name + } + } + # resolved through family subgraph + hasChildren + # maritalStatus can return null + maritalStatus + nationality + # pets can return null + pets { + class + gender + name + ... on Cat { + type + } + ... on Dog { + breed + } + ... on Alligator { + dangerous + } + } + } + # resolved through employees subgraph + role { + departments + title + ... on Engineer { + engineerType + } + ... on Operator { + operatorType + } + } + # resolved through hobbies subgraph + hobbies { + ... on Exercise { + category + } + ... on Flying { + planeModels + yearsOfExperience + } + ... on Gaming { + genres + name + yearsOfExperience + } + ... on Other { + name + } + ... on Programming { + languages + } + ... on Travelling { + countriesLived { + key { + name + } + } + } + } + # resolved through products subgraph + products + } + # can return null + employee(id: 1) { + # resolved through employees subgraph + id + details { + forename + location { + key { + name + } + } + } + } + teammates(team: OPERATIONS) { + # resolved through employees subgraph + id + ...EmployeeNameFragment + # resolved through products subgraph + products + } + productTypes { + ... on Documentation { + url(product: SDK) + urls(products: [COSMO, MARKETING]) + } + ... on Consultancy { + lead { + ...EmployeeNameFragment + } + name + } + } + a: findEmployees(criteria: { + hasPets: true, nationality: UKRAINIAN, nested: { maritalStatus: ENGAGED } + }) { + ...EmployeeNameFragment + } + b: findEmployees(criteria: { + hasPets: true, nationality: GERMAN, nested: { maritalStatus: MARRIED, hasChildren: true } + }) { + ...EmployeeNameFragment + } +} + +fragment EmployeeNameFragment on Employee { + details { + forename + } +}`; + + let headers = { + 'Content-Type': 'application/json', + 'GraphQL-Client-Name': 'k6', + 'GraphQL-Client-Version': '0.0.1', + }; + + let operationName = randomNames[Math.floor(Math.random() * randomNames.length)]; + + query = query.replace(/\$\$__REPLACE_ME__\$\$/g, operationName); + + let res = http.post('http://localhost:3002/graphql', JSON.stringify({ query: query, operationName: operationName }), { + headers: headers, + }); + check(res, { + 'is status 200': (r) => r.status === 200 && r.body.includes('errors') === false, + }); +} diff --git a/router/core/graph_server.go b/router/core/graph_server.go index a346246c13..0c5b7cd493 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -743,7 +743,7 @@ func (s *graphServer) buildGraphMux(ctx context.Context, rmetric.WithBaseAttributes(baseMetricAttributes), rmetric.WithLogger(s.logger), rmetric.WithProcessStartTime(s.processStartTime), - rmetric.WithCardinalityLimit(rmetric.DefaultCardinalityLimit), + rmetric.WithCardinalityLimit(s.metricConfig.CardinalityLimit), rmetric.WithRouterInfoAttributes(routerInfoBaseAttrs), ) if err != nil { diff --git a/router/core/router.go b/router/core/router.go index f3e3b9c04e..473f61a7f1 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -2175,6 +2175,7 @@ func MetricConfigFromTelemetry(cfg *config.Telemetry) *rmetric.Config { Version: Version, Attributes: cfg.Metrics.Attributes, ResourceAttributes: buildResourceAttributes(cfg.ResourceAttributes), + CardinalityLimit: cfg.Metrics.CardinalityLimit, OpenTelemetry: rmetric.OpenTelemetry{ Enabled: cfg.Metrics.OTLP.Enabled, RouterRuntime: cfg.Metrics.OTLP.RouterRuntime, diff --git a/router/pkg/config/config.go b/router/pkg/config/config.go index b375abcbd3..b83f0360ac 100644 --- a/router/pkg/config/config.go +++ b/router/pkg/config/config.go @@ -123,9 +123,10 @@ type MetricsOTLPExporter struct { } type Metrics struct { - Attributes []CustomAttribute `yaml:"attributes"` - OTLP MetricsOTLP `yaml:"otlp"` - Prometheus Prometheus `yaml:"prometheus"` + Attributes []CustomAttribute `yaml:"attributes"` + OTLP MetricsOTLP `yaml:"otlp"` + Prometheus Prometheus `yaml:"prometheus"` + CardinalityLimit int `yaml:"experiment_cardinality_limit" envDefault:"2000" env:"METRICS_EXPERIMENT_CARDINALITY_LIMIT"` } type MetricsOTLP struct { diff --git a/router/pkg/config/config.schema.json b/router/pkg/config/config.schema.json index 07d53fcee7..6fbe8851ec 100644 --- a/router/pkg/config/config.schema.json +++ b/router/pkg/config/config.schema.json @@ -972,6 +972,12 @@ "description": "The configuration for the collection and export of metrics. The metrics are collected and exported using the OpenTelemetry protocol (OTLP) and Prometheus.", "additionalProperties": false, "properties": { + "experiment_cardinality_limit": { + "type": "integer", + "description": "Sets a hard limit on the number of Metric Points that can be collected during a collection cycle. NOTE: This option is experimental and may change in future versions.", + "minimum": 1, + "default": 2000 + }, "attributes": { "type": "array", "description": "The attributes to add to OTLP Metrics and Prometheus.", diff --git a/router/pkg/config/testdata/config_defaults.json b/router/pkg/config/testdata/config_defaults.json index 0d7fdd4c41..2501daddae 100644 --- a/router/pkg/config/testdata/config_defaults.json +++ b/router/pkg/config/testdata/config_defaults.json @@ -59,7 +59,8 @@ "Enabled": false, "IncludeOperationSha": false } - } + }, + "CardinalityLimit": 2000 } }, "GraphqlMetrics": { diff --git a/router/pkg/config/testdata/config_full.json b/router/pkg/config/testdata/config_full.json index fc7a0b45f4..7869972aef 100644 --- a/router/pkg/config/testdata/config_full.json +++ b/router/pkg/config/testdata/config_full.json @@ -89,7 +89,8 @@ "Enabled": true, "IncludeOperationSha": false } - } + }, + "CardinalityLimit": 2000 } }, "GraphqlMetrics": { diff --git a/router/pkg/metric/config.go b/router/pkg/metric/config.go index 5c95a5ec9c..027fa971b4 100644 --- a/router/pkg/metric/config.go +++ b/router/pkg/metric/config.go @@ -14,6 +14,9 @@ import ( // DefaultServerName Default resource name. const DefaultServerName = "cosmo-router" +// DefaultCardinalityLimit is the hard limit on the number of metric streams that can be collected for a single instrument. +const DefaultCardinalityLimit = 2000 + type PrometheusConfig struct { Enabled bool ConnectionStats bool @@ -115,6 +118,9 @@ type Config struct { Attributes []config.CustomAttribute + // CardinalityLimit is the hard limit on the number of metric streams that can be collected for a single instrument. + CardinalityLimit int + // IsUsingCloudExporter indicates whether the cloud exporter is used. // This value is used for tests to enable/disable the simulated cloud exporter. IsUsingCloudExporter bool @@ -126,12 +132,12 @@ func (c *Config) IsEnabled() bool { // DefaultConfig returns the default config. func DefaultConfig(serviceVersion string) *Config { - return &Config{ Name: DefaultServerName, Version: serviceVersion, ResourceAttributes: make([]attribute.KeyValue, 0), Attributes: make([]config.CustomAttribute, 0), + CardinalityLimit: DefaultCardinalityLimit, OpenTelemetry: OpenTelemetry{ Enabled: false, RouterRuntime: true, diff --git a/router/pkg/metric/metric_store.go b/router/pkg/metric/metric_store.go index 6c84b93f42..80b0341601 100644 --- a/router/pkg/metric/metric_store.go +++ b/router/pkg/metric/metric_store.go @@ -15,9 +15,6 @@ import ( "go.opentelemetry.io/otel/sdk/metric" ) -// DefaultCardinalityLimit is the hard limit on the number of metric streams that can be collected for a single instrument. -const DefaultCardinalityLimit = 2000 - // Server HTTP metrics. const ( RequestCounter = "router.http.requests" // Incoming request count total diff --git a/router/pkg/metric/prometheus_server.go b/router/pkg/metric/prometheus_server.go index 1bebe1d536..b3c2b85cdb 100644 --- a/router/pkg/metric/prometheus_server.go +++ b/router/pkg/metric/prometheus_server.go @@ -1,6 +1,11 @@ package metric import ( + "net/http" + "strings" + "time" + "unicode" + "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/prometheus/client_golang/prometheus" @@ -8,10 +13,6 @@ import ( "github.com/wundergraph/cosmo/router/pkg/otel" "go.opentelemetry.io/otel/attribute" "go.uber.org/zap" - "net/http" - "strings" - "time" - "unicode" ) // Excluded by default from Prometheus export because of high cardinality @@ -23,11 +24,30 @@ var defaultExcludedOtelKeys = []attribute.Key{ func NewPrometheusServer(logger *zap.Logger, listenAddr string, path string, registry *prometheus.Registry) *http.Server { r := chi.NewRouter() r.Use(middleware.Recoverer) + + handlerLogger, err := zap.NewStdLogAt( + logger.With(zap.String("component", "prometheus_handler")), + zap.ErrorLevel, + ) + if err != nil { + logger.Error("Failed to create Prometheus handler logger", zap.Error(err)) + return nil + } + + serverLogger, err := zap.NewStdLogAt( + logger.With(zap.String("component", "prometheus_server")), + zap.ErrorLevel, + ) + if err != nil { + logger.Error("Failed to create Prometheus server logger", zap.Error(err)) + return nil + } + r.Handle(path, promhttp.HandlerFor(registry, promhttp.HandlerOpts{ EnableOpenMetrics: true, - ErrorLog: zap.NewStdLog(logger), + ErrorLog: handlerLogger, Registry: registry, - Timeout: 10 * time.Second, + Timeout: 60 * time.Second, })) svr := &http.Server{ @@ -36,7 +56,7 @@ func NewPrometheusServer(logger *zap.Logger, listenAddr string, path string, reg WriteTimeout: 1 * time.Minute, ReadHeaderTimeout: 2 * time.Second, IdleTimeout: 30 * time.Second, - ErrorLog: zap.NewStdLog(logger), + ErrorLog: serverLogger, Handler: r, }