3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,9 @@
- Cortex / Queries: added "Lazy loaded index-headers" and "Index-header lazy load duration"
- Cortex / Compactor: added "Tenants compaction progress", "Average blocks / tenant" and "Tenants with largest number of blocks"
- Alerts: added "CortexMemoryMapAreasTooHigh"
* [ENHANCEMENT] Fine-tuned gRPC keepalive pings to work nicely with Cortex default settings.
- `-server.grpc.keepalive.min-time-between-pings=10s`
- `-server.grpc.keepalive.ping-without-stream-allowed=true`
* [BUGFIX] Fixed workingset memory panel while rolling out a StatefulSet. #229
* [BUGFIX] Fixed `CortexRequestErrors` alert to not include `ready` route. #230

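The rest of this diff applies one pattern: the two keepalive flags are defined once as a shared `grpcConfig` object and object-merged into every component's argument map. A minimal, standalone Jsonnet sketch of that merge (simplified from the library code below; the component names here are only illustrative):

```jsonnet
// Standalone sketch of the pattern used in this PR: one shared flag object,
// merged into each component's args with `+`. Keys on the right-hand side
// win, so a component can still override an individual keepalive setting.
local grpcConfig = {
  'server.grpc.keepalive.min-time-between-pings': '10s',
  'server.grpc.keepalive.ping-without-stream-allowed': true,
};

{
  ingester_args: grpcConfig + {
    target: 'ingester',
  },
  // Per-component override still works because the right-hand side wins:
  querier_args: grpcConfig + {
    target: 'querier',
    'server.grpc.keepalive.min-time-between-pings': '20s',
  },
}
```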
1 change: 1 addition & 0 deletions cortex/alertmanager.libsonnet
@@ -17,6 +17,7 @@
else [],

alertmanager_args::
$._config.grpcConfig +
{
target: 'alertmanager',
'log.level': 'debug',
5 changes: 5 additions & 0 deletions cortex/config.libsonnet
@@ -139,6 +139,11 @@
}
else {},

grpcConfig:: {
'server.grpc.keepalive.min-time-between-pings': '10s',
'server.grpc.keepalive.ping-without-stream-allowed': true,
},

storageConfig:
$._config.client_configs.aws +
$._config.client_configs.cassandra +
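Because `grpcConfig` is declared as a hidden field under `_config`, downstream environments can tune these values without forking the library. A hypothetical override file is sketched below; the import path and the `15s` value are assumptions for illustration, not part of this PR:

```jsonnet
// Hypothetical downstream environment file; assumes the usual cortex-jsonnet
// layout where the library is imported and configured through _config.
local cortex = import 'cortex/cortex.libsonnet';

cortex {
  _config+:: {
    grpcConfig+:: {
      // Example only: relax the minimum interval between client pings.
      'server.grpc.keepalive.min-time-between-pings': '15s',
    },
  },
}
```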
1 change: 1 addition & 0 deletions cortex/distributor.libsonnet
@@ -3,6 +3,7 @@
local containerPort = $.core.v1.containerPort,

distributor_args::
$._config.grpcConfig +
$._config.ringConfig +
$._config.distributorConfig +
{
1 change: 1 addition & 0 deletions cortex/ingester.libsonnet
@@ -1,5 +1,6 @@
{
ingester_args::
$._config.grpcConfig +
$._config.ringConfig +
$._config.storeConfig +
$._config.storageConfig +
1 change: 1 addition & 0 deletions cortex/querier.libsonnet
@@ -2,6 +2,7 @@
local container = $.core.v1.container,

querier_args::
$._config.grpcConfig +
$._config.ringConfig +
$._config.storeConfig +
$._config.storageConfig +
91 changes: 48 additions & 43 deletions cortex/query-frontend.libsonnet
@@ -1,49 +1,54 @@
{
local container = $.core.v1.container,

query_frontend_args:: {
target: 'query-frontend',

// Need log.level=debug so all queries are logged, needed for analyse.py.
'log.level': 'debug',

// Increase HTTP server response write timeout, as we were seeing some
// queries that return a lot of data timeing out.
'server.http-write-timeout': '1m',

// Split long queries up into multiple day-long queries.
'querier.split-queries-by-interval': '24h',

// Cache query results.
'querier.align-querier-with-step': true,
'querier.cache-results': true,
'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace,
'frontend.memcached.service': 'memcached-client',
'frontend.memcached.timeout': '500ms',

// So that exporters like cloudwatch can still send in data and be un-cached.
'frontend.max-cache-freshness': '10m',

// Compress HTTP responses; improves latency for very big results and slow
// connections.
'querier.compress-http-responses': true,

// So it can recieve big responses from the querier.
'server.grpc-max-recv-msg-size-bytes': 100 << 20,

// Limit queries to 500 days, allow this to be override per-user.
'store.max-query-length': '12000h', // 500 Days
'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
} + if $._config.queryFrontend.sharded_queries_enabled then {
'querier.parallelise-shardable-queries': 'true',

// in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
// basically base * shard_factor * query_split_factor / num_frontends where
'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),

'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
} + $._config.storageConfig
else {},
query_frontend_args::
$._config.grpcConfig +
{
target: 'query-frontend',

// Need log.level=debug so all queries are logged, needed for analyse.py.
'log.level': 'debug',

// Increase HTTP server response write timeout, as we were seeing some
// queries that return a lot of data timing out.
'server.http-write-timeout': '1m',

// Split long queries up into multiple day-long queries.
'querier.split-queries-by-interval': '24h',

// Cache query results.
'querier.align-querier-with-step': true,
'querier.cache-results': true,
'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace,
'frontend.memcached.service': 'memcached-client',
'frontend.memcached.timeout': '500ms',

// So that exporters like cloudwatch can still send in data and be un-cached.
'frontend.max-cache-freshness': '10m',

// Compress HTTP responses; improves latency for very big results and slow
// connections.
'querier.compress-http-responses': true,

// So it can receive big responses from the querier.
'server.grpc-max-recv-msg-size-bytes': 100 << 20,

// Limit queries to 500 days, allow this to be overridden per-user.
'store.max-query-length': '12000h', // 500 Days
'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
} + (
if $._config.queryFrontend.sharded_queries_enabled then
{
'querier.parallelise-shardable-queries': 'true',

// In-process tenant queues on frontends. We divide by the number of frontends (2 in this case) in order to apply the global limit in aggregate.
// Basically base * shard_factor * query_split_factor / num_frontends.
'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),

'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
} + $._config.storageConfig
else {}
),

query_frontend_container::
container.new('query-frontend', $._images.query_frontend) +
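The `querier.max-outstanding-requests-per-tenant` expression in the new `query_frontend_args` above is easier to read with concrete numbers plugged in. The values below are illustrative placeholders, not the defaults shipped by this library:

```jsonnet
// Illustrative only: base * shard_factor * query_split_factor / replicas,
// floored, as computed for 'querier.max-outstanding-requests-per-tenant'.
// These numbers are placeholders, not the repository defaults.
local base = 200;
local shard_factor = 16;
local query_split_factor = 3;
local replicas = 2;

{
  // 200 * 16 * 3 / 2 = 4800
  'querier.max-outstanding-requests-per-tenant':
    std.floor(base * shard_factor * query_split_factor / replicas),
}
```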
1 change: 1 addition & 0 deletions cortex/ruler.libsonnet
@@ -2,6 +2,7 @@
local container = $.core.v1.container,

ruler_args::
$._config.grpcConfig +
$._config.ringConfig +
$._config.storeConfig +
$._config.storageConfig +
2 changes: 2 additions & 0 deletions cortex/tsdb.libsonnet
@@ -123,6 +123,7 @@
pvc.mixin.metadata.withName('compactor-data'),

compactor_args::
$._config.grpcConfig +
$._config.storageConfig +
$._config.blocksStorageConfig +
{
@@ -178,6 +179,7 @@
pvc.mixin.metadata.withName('store-gateway-data'),

store_gateway_args::
$._config.grpcConfig +
$._config.storageConfig +
$._config.blocksStorageConfig +
{