From c62c679ea22563e9f8ca82b3ae123c5f4f9cc186 Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 09:33:10 +0100 Subject: [PATCH 1/7] Update grafana, prometheus, remove container names --- docker-compose.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5df7e10f93..92a357b06f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -63,7 +63,7 @@ services: # Only to test prometheus integration prometheus: - image: prom/prometheus:v3.0.0-beta.1 + image: prom/prometheus:v3.4.2 command: - --web.enable-remote-write-receiver - --enable-feature=native-histograms @@ -85,13 +85,12 @@ services: - debug grafana: - image: grafana/grafana:11.3.1 + image: grafana/grafana:12.0.2 ports: - '9300:3000' volumes: - grafana-storage:/var/lib/grafana - ./docker/grafana/provisioning:/etc/grafana/provisioning - container_name: grafana restart: unless-stopped networks: - primary @@ -261,7 +260,7 @@ services: volumes: - ./docker/redis/redis-cluster.conf:/usr/local/etc/redis/redis.conf healthcheck: - test: ["CMD", "redis-cli", "-p", "6379", "ping"] + test: ['CMD', 'redis-cli', '-p', '6379', 'ping'] interval: 10s timeout: 5s retries: 3 @@ -279,7 +278,7 @@ services: volumes: - ./docker/redis/redis-cluster.conf:/usr/local/etc/redis/redis.conf healthcheck: - test: ["CMD", "redis-cli", "-p", "6379", "ping"] + test: ['CMD', 'redis-cli', '-p', '6379', 'ping'] interval: 10s timeout: 5s retries: 3 @@ -297,7 +296,7 @@ services: volumes: - ./docker/redis/redis-cluster.conf:/usr/local/etc/redis/redis.conf healthcheck: - test: ["CMD", "redis-cli", "-p", "6379", "ping"] + test: ['CMD', 'redis-cli', '-p', '6379', 'ping'] interval: 10s timeout: 5s retries: 3 From 58b011270fe5e4d75a8647b5ecf55061f045d07c Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 09:36:06 +0100 Subject: [PATCH 2/7] Put prometheus server error logs at error level --- router/pkg/metric/prometheus_server.go | 34 
++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/router/pkg/metric/prometheus_server.go b/router/pkg/metric/prometheus_server.go index 1bebe1d536..b3c2b85cdb 100644 --- a/router/pkg/metric/prometheus_server.go +++ b/router/pkg/metric/prometheus_server.go @@ -1,6 +1,11 @@ package metric import ( + "net/http" + "strings" + "time" + "unicode" + "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/prometheus/client_golang/prometheus" @@ -8,10 +13,6 @@ import ( "github.com/wundergraph/cosmo/router/pkg/otel" "go.opentelemetry.io/otel/attribute" "go.uber.org/zap" - "net/http" - "strings" - "time" - "unicode" ) // Excluded by default from Prometheus export because of high cardinality @@ -23,11 +24,30 @@ var defaultExcludedOtelKeys = []attribute.Key{ func NewPrometheusServer(logger *zap.Logger, listenAddr string, path string, registry *prometheus.Registry) *http.Server { r := chi.NewRouter() r.Use(middleware.Recoverer) + + handlerLogger, err := zap.NewStdLogAt( + logger.With(zap.String("component", "prometheus_handler")), + zap.ErrorLevel, + ) + if err != nil { + logger.Error("Failed to create Prometheus handler logger", zap.Error(err)) + return nil + } + + serverLogger, err := zap.NewStdLogAt( + logger.With(zap.String("component", "prometheus_server")), + zap.ErrorLevel, + ) + if err != nil { + logger.Error("Failed to create Prometheus server logger", zap.Error(err)) + return nil + } + r.Handle(path, promhttp.HandlerFor(registry, promhttp.HandlerOpts{ EnableOpenMetrics: true, - ErrorLog: zap.NewStdLog(logger), + ErrorLog: handlerLogger, Registry: registry, - Timeout: 10 * time.Second, + Timeout: 60 * time.Second, })) svr := &http.Server{ @@ -36,7 +56,7 @@ func NewPrometheusServer(logger *zap.Logger, listenAddr string, path string, reg WriteTimeout: 1 * time.Minute, ReadHeaderTimeout: 2 * time.Second, IdleTimeout: 30 * time.Second, - ErrorLog: zap.NewStdLog(logger), + ErrorLog: serverLogger, Handler: r, } From 
116e3d77779748e48a1eb1e2a19b002d52f9526a Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 09:36:18 +0100 Subject: [PATCH 3/7] Add benchmark with limited but high cardinality --- router/bench-limited-cardinality.js | 177 ++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 router/bench-limited-cardinality.js diff --git a/router/bench-limited-cardinality.js b/router/bench-limited-cardinality.js new file mode 100644 index 0000000000..3b07f66994 --- /dev/null +++ b/router/bench-limited-cardinality.js @@ -0,0 +1,177 @@ +import http from 'k6/http'; +import { check } from 'k6'; +import { randomString } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js'; + +/* + Benchmarking script to run a graphql query with a random operation name from a fixed size pool. + Useful to test metric attributes. + */ + +export const options = { + stages: [ + { duration: '15s', target: 20 }, + { duration: '15s', target: 50 }, + { duration: '20s', target: 100 }, + { duration: '30m', target: 100 }, + ], +}; + +export function setup() { + let randomNames = []; + + // should be under the default cardinality limit + for (let i = 0; i < 1500; i++) { + randomNames.push(randomString(10, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')); + } + + return { randomNames }; +} + +export default function ({ randomNames }) { + let query = ` + query $$__REPLACE_ME__$$ { + employees { + # resolved through employees subgraph + id + # overridden by the products subgraph + notes + details { + # resolved through either employees or family subgraph + forename + surname + # resolved through employees subgraph + location { + key { + name + } + } + # resolved through family subgraph + hasChildren + # maritalStatus can return null + maritalStatus + nationality + # pets can return null + pets { + class + gender + name + ... on Cat { + type + } + ... on Dog { + breed + } + ... 
on Alligator { + dangerous + } + } + } + # resolved through employees subgraph + role { + departments + title + ... on Engineer { + engineerType + } + ... on Operator { + operatorType + } + } + # resolved through hobbies subgraph + hobbies { + ... on Exercise { + category + } + ... on Flying { + planeModels + yearsOfExperience + } + ... on Gaming { + genres + name + yearsOfExperience + } + ... on Other { + name + } + ... on Programming { + languages + } + ... on Travelling { + countriesLived { + key { + name + } + } + } + } + # resolved through products subgraph + products + } + # can return null + employee(id: 1) { + # resolved through employees subgraph + id + details { + forename + location { + key { + name + } + } + } + } + teammates(team: OPERATIONS) { + # resolved through employees subgraph + id + ...EmployeeNameFragment + # resolved through products subgraph + products + } + productTypes { + ... on Documentation { + url(product: SDK) + urls(products: [COSMO, MARKETING]) + } + ... 
on Consultancy { + lead { + ...EmployeeNameFragment + } + name + } + } + a: findEmployees(criteria: { + hasPets: true, nationality: UKRAINIAN, nested: { maritalStatus: ENGAGED } + }) { + ...EmployeeNameFragment + } + b: findEmployees(criteria: { + hasPets: true, nationality: GERMAN, nested: { maritalStatus: MARRIED, hasChildren: true } + }) { + ...EmployeeNameFragment + } +} + +fragment EmployeeNameFragment on Employee { + details { + forename + } +}`; + + let headers = { + 'Content-Type': 'application/json', + 'GraphQL-Client-Name': 'k6', + 'GraphQL-Client-Version': '0.0.1', + }; + + let operationName = randomNames[Math.floor(Math.random() * randomNames.length)]; + + query = query.replace(/\$\$__REPLACE_ME__\$\$/g, operationName); + + let res = http.post('http://localhost:3002/graphql', JSON.stringify({ query: query, operationName: operationName }), { + headers: headers, + }); + check(res, { + 'is status 200': (r) => r.status === 200 && r.body.includes('errors') === false, + }); +} From 90666f7c92b44a5c1931782bbe39c949ac2008f1 Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 11:23:05 +0100 Subject: [PATCH 4/7] feat: configurable cardinality limit for otel --- router/bench-limited-cardinality.js | 12 ++++++++++-- router/core/graph_server.go | 2 +- router/core/router.go | 1 + router/pkg/config/config.go | 7 ++++--- router/pkg/config/config.schema.json | 6 ++++++ router/pkg/config/testdata/config_defaults.json | 3 ++- router/pkg/config/testdata/config_full.json | 3 ++- router/pkg/metric/config.go | 8 +++++++- router/pkg/metric/metric_store.go | 3 --- 9 files changed, 33 insertions(+), 12 deletions(-) diff --git a/router/bench-limited-cardinality.js b/router/bench-limited-cardinality.js index 3b07f66994..99bd9a41a2 100644 --- a/router/bench-limited-cardinality.js +++ b/router/bench-limited-cardinality.js @@ -16,14 +16,22 @@ export const options = { ], }; +// in the simple case from a clean state it's around (operationName)*5 series per metric +// mostly 
due to wg_subgraph_id and wg_subgraph_name array exploding + +// 300 should be under the default cardinality limit (1500 < 2000) +// 500 should be slightly over the default cardinality limit (2500 > 2000) +const distinctNames = 300; + export function setup() { let randomNames = []; - // should be under the default cardinality limit - for (let i = 0; i < 1500; i++) { + for (let i = 0; i < distinctNames; i++) { randomNames.push(randomString(10, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')); } + console.log('Generated ' + distinctNames + ' random names'); + return { randomNames }; } diff --git a/router/core/graph_server.go b/router/core/graph_server.go index a346246c13..0c5b7cd493 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -743,7 +743,7 @@ func (s *graphServer) buildGraphMux(ctx context.Context, rmetric.WithBaseAttributes(baseMetricAttributes), rmetric.WithLogger(s.logger), rmetric.WithProcessStartTime(s.processStartTime), - rmetric.WithCardinalityLimit(rmetric.DefaultCardinalityLimit), + rmetric.WithCardinalityLimit(s.metricConfig.CardinalityLimit), rmetric.WithRouterInfoAttributes(routerInfoBaseAttrs), ) if err != nil { diff --git a/router/core/router.go b/router/core/router.go index f3e3b9c04e..473f61a7f1 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -2175,6 +2175,7 @@ func MetricConfigFromTelemetry(cfg *config.Telemetry) *rmetric.Config { Version: Version, Attributes: cfg.Metrics.Attributes, ResourceAttributes: buildResourceAttributes(cfg.ResourceAttributes), + CardinalityLimit: cfg.Metrics.CardinalityLimit, OpenTelemetry: rmetric.OpenTelemetry{ Enabled: cfg.Metrics.OTLP.Enabled, RouterRuntime: cfg.Metrics.OTLP.RouterRuntime, diff --git a/router/pkg/config/config.go b/router/pkg/config/config.go index b375abcbd3..684b29ae88 100644 --- a/router/pkg/config/config.go +++ b/router/pkg/config/config.go @@ -123,9 +123,10 @@ type MetricsOTLPExporter struct { } type Metrics struct { - Attributes 
[]CustomAttribute `yaml:"attributes"` - OTLP MetricsOTLP `yaml:"otlp"` - Prometheus Prometheus `yaml:"prometheus"` + Attributes []CustomAttribute `yaml:"attributes"` + OTLP MetricsOTLP `yaml:"otlp"` + Prometheus Prometheus `yaml:"prometheus"` + CardinalityLimit int `yaml:"cardinality_limit" envDefault:"2000" env:"METRICS_CARDINALITY_LIMIT"` } type MetricsOTLP struct { diff --git a/router/pkg/config/config.schema.json b/router/pkg/config/config.schema.json index 07d53fcee7..d6d781acc4 100644 --- a/router/pkg/config/config.schema.json +++ b/router/pkg/config/config.schema.json @@ -972,6 +972,12 @@ "description": "The configuration for the collection and export of metrics. The metrics are collected and exported using the OpenTelemetry protocol (OTLP) and Prometheus.", "additionalProperties": false, "properties": { + "cardinality_limit": { + "type": "integer", + "description": "Sets a hard limit on the number of Metric Points that can be collected during a collection cycle", + "minimum": 1, + "default": 2000 + }, "attributes": { "type": "array", "description": "The attributes to add to OTLP Metrics and Prometheus.", diff --git a/router/pkg/config/testdata/config_defaults.json b/router/pkg/config/testdata/config_defaults.json index 0d7fdd4c41..2501daddae 100644 --- a/router/pkg/config/testdata/config_defaults.json +++ b/router/pkg/config/testdata/config_defaults.json @@ -59,7 +59,8 @@ "Enabled": false, "IncludeOperationSha": false } - } + }, + "CardinalityLimit": 2000 } }, "GraphqlMetrics": { diff --git a/router/pkg/config/testdata/config_full.json b/router/pkg/config/testdata/config_full.json index fc7a0b45f4..7869972aef 100644 --- a/router/pkg/config/testdata/config_full.json +++ b/router/pkg/config/testdata/config_full.json @@ -89,7 +89,8 @@ "Enabled": true, "IncludeOperationSha": false } - } + }, + "CardinalityLimit": 2000 } }, "GraphqlMetrics": { diff --git a/router/pkg/metric/config.go b/router/pkg/metric/config.go index 5c95a5ec9c..027fa971b4 100644 --- 
a/router/pkg/metric/config.go +++ b/router/pkg/metric/config.go @@ -14,6 +14,9 @@ import ( // DefaultServerName Default resource name. const DefaultServerName = "cosmo-router" +// DefaultCardinalityLimit is the hard limit on the number of metric streams that can be collected for a single instrument. +const DefaultCardinalityLimit = 2000 + type PrometheusConfig struct { Enabled bool ConnectionStats bool @@ -115,6 +118,9 @@ type Config struct { Attributes []config.CustomAttribute + // CardinalityLimit is the hard limit on the number of metric streams that can be collected for a single instrument. + CardinalityLimit int + // IsUsingCloudExporter indicates whether the cloud exporter is used. // This value is used for tests to enable/disable the simulated cloud exporter. IsUsingCloudExporter bool @@ -126,12 +132,12 @@ func (c *Config) IsEnabled() bool { // DefaultConfig returns the default config. func DefaultConfig(serviceVersion string) *Config { - return &Config{ Name: DefaultServerName, Version: serviceVersion, ResourceAttributes: make([]attribute.KeyValue, 0), Attributes: make([]config.CustomAttribute, 0), + CardinalityLimit: DefaultCardinalityLimit, OpenTelemetry: OpenTelemetry{ Enabled: false, RouterRuntime: true, diff --git a/router/pkg/metric/metric_store.go b/router/pkg/metric/metric_store.go index 6c84b93f42..80b0341601 100644 --- a/router/pkg/metric/metric_store.go +++ b/router/pkg/metric/metric_store.go @@ -15,9 +15,6 @@ import ( "go.opentelemetry.io/otel/sdk/metric" ) -// DefaultCardinalityLimit is the hard limit on the number of metric streams that can be collected for a single instrument. -const DefaultCardinalityLimit = 2000 - // Server HTTP metrics. 
const ( RequestCounter = "router.http.requests" // Incoming request count total From 8d3dc5aca5dae8a7140d06b7a0d28d0f7a9fb644 Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 11:23:26 +0100 Subject: [PATCH 5/7] chore: make grafana accessible without login in debug compose --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 92a357b06f..86bb931434 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -99,6 +99,8 @@ services: - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource - CLICKHOUSE_USER=${CLICKHOUSE_USER:-default} - CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-changeme} + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin profiles: - debug From 4d06cac279f37018b46d36be11cd75a5fc4eb87d Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 11:32:13 +0100 Subject: [PATCH 6/7] Mark option as experimental --- router/pkg/config/config.go | 2 +- router/pkg/config/config.schema.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/router/pkg/config/config.go b/router/pkg/config/config.go index 684b29ae88..2e9a4fd270 100644 --- a/router/pkg/config/config.go +++ b/router/pkg/config/config.go @@ -126,7 +126,7 @@ type Metrics struct { Attributes []CustomAttribute `yaml:"attributes"` OTLP MetricsOTLP `yaml:"otlp"` Prometheus Prometheus `yaml:"prometheus"` - CardinalityLimit int `yaml:"cardinality_limit" envDefault:"2000" env:"METRICS_CARDINALITY_LIMIT"` + CardinalityLimit int `yaml:"experimental_cardinality_limit" envDefault:"2000" env:"METRICS_X_CARDINALITY_LIMIT"` } type MetricsOTLP struct { diff --git a/router/pkg/config/config.schema.json b/router/pkg/config/config.schema.json index d6d781acc4..b82dee3027 100644 --- a/router/pkg/config/config.schema.json +++ b/router/pkg/config/config.schema.json @@ -972,9 +972,9 @@ "description": "The configuration for the collection and export of metrics. 
The metrics are collected and exported using the OpenTelemetry protocol (OTLP) and Prometheus.", "additionalProperties": false, "properties": { - "cardinality_limit": { + "experimental_cardinality_limit": { "type": "integer", - "description": "Sets a hard limit on the number of Metric Points that can be collected during a collection cycle", + "description": "Sets a hard limit on the number of Metric Points that can be collected during a collection cycle. NOTE: This option is experimental and may change in future versions.", "minimum": 1, "default": 2000 }, From 682c92bc401388b1a99f327776a17a81f245da0a Mon Sep 17 00:00:00 2001 From: endigma Date: Thu, 3 Jul 2025 11:40:02 +0100 Subject: [PATCH 7/7] s/experimental/experiment and expand X to EXPERIMENT in envar --- router/pkg/config/config.go | 2 +- router/pkg/config/config.schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/router/pkg/config/config.go b/router/pkg/config/config.go index 2e9a4fd270..b83f0360ac 100644 --- a/router/pkg/config/config.go +++ b/router/pkg/config/config.go @@ -126,7 +126,7 @@ type Metrics struct { Attributes []CustomAttribute `yaml:"attributes"` OTLP MetricsOTLP `yaml:"otlp"` Prometheus Prometheus `yaml:"prometheus"` - CardinalityLimit int `yaml:"experimental_cardinality_limit" envDefault:"2000" env:"METRICS_X_CARDINALITY_LIMIT"` + CardinalityLimit int `yaml:"experiment_cardinality_limit" envDefault:"2000" env:"METRICS_EXPERIMENT_CARDINALITY_LIMIT"` } type MetricsOTLP struct { diff --git a/router/pkg/config/config.schema.json b/router/pkg/config/config.schema.json index b82dee3027..6fbe8851ec 100644 --- a/router/pkg/config/config.schema.json +++ b/router/pkg/config/config.schema.json @@ -972,7 +972,7 @@ "description": "The configuration for the collection and export of metrics. 
The metrics are collected and exported using the OpenTelemetry protocol (OTLP) and Prometheus.", "additionalProperties": false, "properties": { - "experimental_cardinality_limit": { + "experiment_cardinality_limit": { "type": "integer", "description": "Sets a hard limit on the number of Metric Points that can be collected during a collection cycle. NOTE: This option is experimental and may change in future versions.", "minimum": 1,