cortexproject · pracucci · Oct 30, 2020 · Oct 19, 2020 · Oct 27, 2020 · Oct 27, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
   - `cortex_ingester_tsdb_head_truncations_total`
   - `cortex_ingester_tsdb_head_gc_duration_seconds`
 * [ENHANCEMENT] Added `cortex_alertmanager_config_hash` metric to expose hash of Alertmanager Config loaded per user. #3388
+* [ENHANCEMENT] Query-Frontend / Query-Scheduler: A new component called "Query-Scheduler" has been introduced. Query-Scheduler is simply a queue of requests, moved outside of Query-Frontend. This allows Query-Frontend to be scaled separately from the number of queues. To make Query-Frontend and Querier use Query-Scheduler, they need to be started with the `-frontend.scheduler-address` and `-querier.scheduler-address` options respectively. #3374
 
 ## 1.5.0 in progress
 

diff --git a/development/tsdb-blocks-storage-s3/config/cortex.yaml b/development/tsdb-blocks-storage-s3/config/cortex.yaml
@@ -120,6 +120,10 @@ store_gateway:
 
 frontend_worker:
   frontend_address: "query-frontend:9007"
+  match_max_concurrent: true
+
+  # By setting scheduler_address, querier worker would use scheduler instead of frontend.
+  # scheduler_address: "query-scheduler:9011"
 
 query_range:
   split_queries_by_interval: 24h

diff --git a/development/tsdb-blocks-storage-s3/config/grafana-agent.yaml b/development/tsdb-blocks-storage-s3/config/grafana-agent.yaml
@@ -25,7 +25,7 @@ prometheus:
                 namespace: 'tsdb-blocks-storage-s3'
         - job_name: tsdb-blocks-storage-s3/querier
           static_configs:
-            - targets: ['querier:8004']
+            - targets: ['querier:8004', 'querier-with-scheduler:8013']
               labels:
                 cluster: 'docker-compose'
                 namespace: 'tsdb-blocks-storage-s3'
@@ -43,7 +43,7 @@ prometheus:
                 namespace: 'tsdb-blocks-storage-s3'
         - job_name: tsdb-blocks-storage-s3/query-frontend
           static_configs:
-            - targets: ['query-frontend:8007']
+            - targets: ['query-frontend:8007', 'query-frontend-with-scheduler:8012']
               labels:
                 cluster: 'docker-compose'
                 namespace: 'tsdb-blocks-storage-s3'
@@ -53,6 +53,12 @@ prometheus:
               labels:
                 cluster: 'docker-compose'
                 namespace: 'tsdb-blocks-storage-s3'
+        - job_name: tsdb-blocks-storage-s3/query-scheduler
+          static_configs:
+            - targets: ['query-scheduler:8011']
+              labels:
+                cluster: 'docker-compose'
+                namespace: 'tsdb-blocks-storage-s3'
 
       remote_write:
         - url: http://distributor:8001/api/prom/push
diff --git a/development/tsdb-blocks-storage-s3/config/prometheus.yaml b/development/tsdb-blocks-storage-s3/config/prometheus.yaml
@@ -18,7 +18,7 @@ scrape_configs:
           namespace: 'tsdb-blocks-storage-s3'
   - job_name: tsdb-blocks-storage-s3/querier
     static_configs:
-      - targets: ['querier:8004']
+      - targets: ['querier:8004', 'querier-with-scheduler:8013']
         labels:
           cluster: 'docker-compose'
           namespace: 'tsdb-blocks-storage-s3'
@@ -36,7 +36,7 @@ scrape_configs:
           namespace: 'tsdb-blocks-storage-s3'
   - job_name: tsdb-blocks-storage-s3/query-frontend
     static_configs:
-      - targets: ['query-frontend:8007']
+      - targets: ['query-frontend:8007', 'query-frontend-with-scheduler:8012']
         labels:
           cluster: 'docker-compose'
           namespace: 'tsdb-blocks-storage-s3'
@@ -46,6 +46,12 @@ scrape_configs:
         labels:
           cluster: 'docker-compose'
           namespace: 'tsdb-blocks-storage-s3'
+  - job_name: tsdb-blocks-storage-s3/query-scheduler
+    static_configs:
+      - targets: ['query-scheduler:8011']
+        labels:
+          cluster: 'docker-compose'
+          namespace: 'tsdb-blocks-storage-s3'
 
 remote_write:
   - url: http://distributor:8001/api/prom/push
diff --git a/development/tsdb-blocks-storage-s3/docker-compose.yml b/development/tsdb-blocks-storage-s3/docker-compose.yml
@@ -271,3 +271,68 @@ services:
       - 18022:18022
     volumes:
       - ./config:/cortex/config
+
+  query-scheduler:
+    build:
+      context:    .
+      dockerfile: dev.dockerfile
+    image: cortex
+    command: ["sh", "-c", "sleep 3 && exec ./dlv exec ./cortex --listen=:18011 --headless=true --api-version=2 --accept-multiclient --continue -- -config.file=./config/cortex.yaml -target=query-scheduler -server.http-listen-port=8011 -server.grpc-listen-port=9011 -store.max-query-length=8760h -log.level=debug"]
+    depends_on:
+      - consul
+      - minio
+    environment:
+      - JAEGER_AGENT_HOST=jaeger
+      - JAEGER_AGENT_PORT=6831
+      - JAEGER_TAGS=app=query-scheduler
+      - JAEGER_SAMPLER_TYPE=const
+      - JAEGER_SAMPLER_PARAM=1
+    ports:
+      - 8011:8011
+      - 18011:18011
+    volumes:
+      - ./config:/cortex/config
+
+  # This frontend uses query-scheduler, activated by `-frontend.scheduler-address` option.
+  query-frontend-with-scheduler:
+    build:
+      context:    .
+      dockerfile: dev.dockerfile
+    image: cortex
+    command: ["sh", "-c", "sleep 3 && exec ./dlv exec ./cortex --listen=:18012 --headless=true --api-version=2 --accept-multiclient --continue -- -config.file=./config/cortex.yaml -target=query-frontend -server.http-listen-port=8012 -server.grpc-listen-port=9012 -store.max-query-length=8760h -frontend.scheduler-address=query-scheduler:9011 -log.level=debug"]
+    depends_on:
+      - consul
+      - minio
+    environment:
+      - JAEGER_AGENT_HOST=jaeger
+      - JAEGER_AGENT_PORT=6831
+      - JAEGER_TAGS=app=query-frontend2
+      - JAEGER_SAMPLER_TYPE=const
+      - JAEGER_SAMPLER_PARAM=1
+    ports:
+      - 8012:8012
+      - 18012:18012
+    volumes:
+      - ./config:/cortex/config
+
+  # This querier is connecting to query-scheduler, instead of query-frontend. This is achieved by setting -querier.scheduler-address="..."
+  querier-with-scheduler:
+    build:
+      context:    .
+      dockerfile: dev.dockerfile
+    image: cortex
+    command: ["sh", "-c", "sleep 3 && exec ./dlv exec ./cortex --listen=:18013 --headless=true --api-version=2 --accept-multiclient --continue -- -config.file=./config/cortex.yaml -target=querier -server.http-listen-port=8013 -server.grpc-listen-port=9013 -querier.scheduler-address=query-scheduler:9011 -log.level=debug"]
+    depends_on:
+      - consul
+      - minio
+    environment:
+      - JAEGER_AGENT_HOST=jaeger
+      - JAEGER_AGENT_PORT=6831
+      - JAEGER_TAGS=app=querier-scheduler
+      - JAEGER_SAMPLER_TYPE=const
+      - JAEGER_SAMPLER_PARAM=1
+    ports:
+      - 8013:8013
+      - 18013:18013
+    volumes:
+      - ./config:/cortex/config
diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md
@@ -157,6 +157,13 @@ runtime_config:
 
 # The memberlist_config configures the Gossip memberlist.
 [memberlist: <memberlist_config>]
+
+query_scheduler:
+  # Maximum number of outstanding requests per tenant per query-scheduler.
+  # In-flight requests above this limit will fail with HTTP response status code
+  # 429.
+  # CLI flag: -query-scheduler.max-outstanding-requests-per-tenant
+  [max_outstanding_requests_per_tenant: <int> | default = 100]
 ```
 
 ### `server_config`
@@ -757,27 +764,109 @@ store_gateway_client:
 The `query_frontend_config` configures the Cortex query-frontend.
 
 ```yaml
+# Log queries that are slower than the specified duration. Set to 0 to disable.
+# Set to < 0 to enable on all queries.
+# CLI flag: -frontend.log-queries-longer-than
+[log_queries_longer_than: <duration> | default = 0s]
+
+# Max body size for downstream prometheus.
+# CLI flag: -frontend.max-body-size
+[max_body_size: <int> | default = 10485760]
+
 # Maximum number of outstanding requests per tenant per frontend; requests
 # beyond this error with HTTP 429.
 # CLI flag: -querier.max-outstanding-requests-per-tenant
 [max_outstanding_per_tenant: <int> | default = 100]
 
+# DNS hostname used for finding query-schedulers.
+# CLI flag: -frontend.scheduler-address
+[scheduler_address: <string> | default = ""]
+
+# How often to resolve the scheduler-address, in order to look for new
+# query-scheduler instances.
+# CLI flag: -frontend.scheduler-dns-lookup-period
+[scheduler_dns_lookup_period: <duration> | default = 10s]
+
+# Number of concurrent workers forwarding queries to single query-scheduler.
+# CLI flag: -frontend.scheduler-worker-concurrency
+[scheduler_worker_concurrency: <int> | default = 5]
+
+grpc_client_config:
+  # gRPC client max receive message size (bytes).
+  # CLI flag: -frontend.grpc-client-config.grpc-max-recv-msg-size
+  [max_recv_msg_size: <int> | default = 104857600]
+
+  # gRPC client max send message size (bytes).
+  # CLI flag: -frontend.grpc-client-config.grpc-max-send-msg-size
+  [max_send_msg_size: <int> | default = 16777216]
+
+  # Deprecated: Use gzip compression when sending messages.  If true, overrides
+  # grpc-compression flag.
+  # CLI flag: -frontend.grpc-client-config.grpc-use-gzip-compression
+  [use_gzip_compression: <boolean> | default = false]
+
+  # Use compression when sending messages. Supported values are: 'gzip',
+  # 'snappy' and '' (disable compression)
+  # CLI flag: -frontend.grpc-client-config.grpc-compression
+  [grpc_compression: <string> | default = ""]
+
+  # Rate limit for gRPC client; 0 means disabled.
+  # CLI flag: -frontend.grpc-client-config.grpc-client-rate-limit
+  [rate_limit: <float> | default = 0]
+
+  # Rate limit burst for gRPC client.
+  # CLI flag: -frontend.grpc-client-config.grpc-client-rate-limit-burst
+  [rate_limit_burst: <int> | default = 0]
+
+  # Enable backoff and retry when we hit ratelimits.
+  # CLI flag: -frontend.grpc-client-config.backoff-on-ratelimits
+  [backoff_on_ratelimits: <boolean> | default = false]
+
+  backoff_config:
+    # Minimum delay when backing off.
+    # CLI flag: -frontend.grpc-client-config.backoff-min-period
+    [min_period: <duration> | default = 100ms]
+
+    # Maximum delay when backing off.
+    # CLI flag: -frontend.grpc-client-config.backoff-max-period
+    [max_period: <duration> | default = 10s]
+
+    # Number of times to backoff and retry before failing.
+    # CLI flag: -frontend.grpc-client-config.backoff-retries
+    [max_retries: <int> | default = 10]
+
+  # Path to the client certificate file, which will be used for authenticating
+  # with the server. Also requires the key path to be configured.
+  # CLI flag: -frontend.grpc-client-config.tls-cert-path
+  [tls_cert_path: <string> | default = ""]
+
+  # Path to the key file for the client certificate. Also requires the client
+  # certificate to be configured.
+  # CLI flag: -frontend.grpc-client-config.tls-key-path
+  [tls_key_path: <string> | default = ""]
+
+  # Path to the CA certificates file to validate server certificate against. If
+  # not set, the host's root CA certificates are used.
+  # CLI flag: -frontend.grpc-client-config.tls-ca-path
+  [tls_ca_path: <string> | default = ""]
+
+  # Skip validating server certificate.
+  # CLI flag: -frontend.grpc-client-config.tls-insecure-skip-verify
+  [tls_insecure_skip_verify: <boolean> | default = false]
+
+# Name of network interface to read address from. This address is sent to
+# query-scheduler and querier, which uses it to send the query response back to
+# query-frontend.
+# CLI flag: -frontend.instance-interface-names
+[instance_interface_names: <list of string> | default = [eth0 en0]]
+
 # Compress HTTP responses.
 # CLI flag: -querier.compress-http-responses
 [compress_responses: <boolean> | default = false]
 
 # URL of downstream Prometheus.
 # CLI flag: -frontend.downstream-url
 [downstream_url: <string> | default = ""]
-
-# Max body size for downstream prometheus.
-# CLI flag: -frontend.max-body-size
-[max_body_size: <int> | default = 10485760]
-
-# Log queries that are slower than the specified duration. Set to 0 to disable.
-# Set to < 0 to enable on all queries.
-# CLI flag: -frontend.log-queries-longer-than
-[log_queries_longer_than: <duration> | default = 0s]
 ```
 
 ### `query_range_config`
@@ -2454,7 +2543,10 @@ grpc_client_config:
 The `frontend_worker_config` configures the worker - running within the Cortex querier - picking up and executing queries enqueued by the query-frontend.
 
 ```yaml
-# Address of query frontend service, in host:port format.
+# Address of query frontend service, in host:port format. If
+# -querier.scheduler-address is set as well, querier will use scheduler instead.
+# If neither -querier.frontend-address nor -querier.scheduler-address is set,
+# queries must arrive via HTTP endpoint.
 # CLI flag: -querier.frontend-address
 [frontend_address: <string> | default = ""]
 
@@ -2538,6 +2630,17 @@ grpc_client_config:
   # Skip validating server certificate.
   # CLI flag: -querier.frontend-client.tls-insecure-skip-verify
   [tls_insecure_skip_verify: <boolean> | default = false]
+
+# Hostname (and port) of scheduler that querier will periodically resolve,
+# connect to and receive queries from. If set, takes precedence over
+# -querier.frontend-address.
+# CLI flag: -querier.scheduler-address
+[scheduler_address: <string> | default = ""]
+
+# How often to resolve the scheduler-address, in order to look for new
+# query-scheduler instances.
+# CLI flag: -querier.scheduler-dns-lookup-period
+[scheduler_dns_lookup_period: <duration> | default = 10s]
 ```
 
 ### `etcd_config`
@@ -2903,10 +3006,11 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
 
 # Maximum number of queriers that can handle requests for a single tenant. If
 # set to 0 or value higher than number of available queriers, *all* queriers
-# will handle requests for the tenant. Each frontend will select the same set of
-# queriers for the same tenant (given that all queriers are connected to all
-# frontends). This option only works with queriers connecting to the
-# query-frontend, not when using downstream URL.
+# will handle requests for the tenant. Each frontend (or query-scheduler, if
+# used) will select the same set of queriers for the same tenant (given that all
+# queriers are connected to all frontends / query-schedulers). This option only
+# works with queriers connecting to the query-frontend / query-scheduler, not
+# when using downstream URL.
 # CLI flag: -frontend.max-queriers-per-tenant
 [max_queriers_per_tenant: <int> | default = 0]
 

diff --git a/docs/configuration/v1-guarantees.md b/docs/configuration/v1-guarantees.md
@@ -55,3 +55,4 @@ Currently experimental features are:
 - Blocksconvert tools
 - OpenStack Swift storage support.
 - Metric relabeling in the distributor.
+- Scalable query-frontend (when using query-scheduler)
diff --git a/docs/guides/shuffle-sharding.md b/docs/guides/shuffle-sharding.md
@@ -80,7 +80,7 @@ _The shard size can be overridden on a per-tenant basis in the limits overrides
 
 By default all Cortex queriers can execute received queries for given tenant.
 
-When shuffle sharding is **enabled** by setting `-frontend.max-queriers-per-tenant` (or its respective YAML config option) to a value higher than 0 and lower than the number of available queriers, only specified number of queriers will execute queries for single tenant. Note that this distribution happens in query-frontend. When not using query-frontend, this option is not available.
+When shuffle sharding is **enabled** by setting `-frontend.max-queriers-per-tenant` (or its respective YAML config option) to a value higher than 0 and lower than the number of available queriers, only the specified number of queriers will execute queries for a single tenant. Note that this distribution happens in the query-frontend, or in the query-scheduler if used. When using the query-scheduler, the `-frontend.max-queriers-per-tenant` option must be set for the query-scheduler component. When not using the query-frontend (with or without the scheduler), this option is not available.
 
 _The maximum number of queriers can be overridden on a per-tenant basis in the limits overrides configuration._
 

diff --git a/pkg/api/api.go b/pkg/api/api.go
@@ -22,6 +22,7 @@ import (
 	"github.com/cortexproject/cortex/pkg/ingester/client"
 	"github.com/cortexproject/cortex/pkg/querier"
 	"github.com/cortexproject/cortex/pkg/querier/frontend"
+	"github.com/cortexproject/cortex/pkg/querier/frontend2"
 	"github.com/cortexproject/cortex/pkg/ring"
 	"github.com/cortexproject/cortex/pkg/ruler"
 	"github.com/cortexproject/cortex/pkg/storegateway"
@@ -308,9 +309,27 @@ func (a *API) RegisterQueryAPI(handler http.Handler) {
-// RegisterQueryFrontend registers the Prometheus routes supported by the
-// Cortex querier service. Currently this can not be registered simultaneously
-// with the Querier.
-func (a *API) RegisterQueryFrontend(f *frontend.Frontend) {
+// RegisterQueryFrontendHandler registers the Prometheus routes supported by the
+// Cortex querier service. Currently this can not be registered simultaneously
+// with the Querier.
+func (a *API) RegisterQueryFrontendHandler(h http.Handler) {
+	a.RegisterQueryAPI(h)
+}
+
+// RegisterQueryFrontend1 registers the given frontend with the API's gRPC
+// server by calling frontend.RegisterFrontendServer.
+func (a *API) RegisterQueryFrontend1(f *frontend.Frontend) {
 	frontend.RegisterFrontendServer(a.server.GRPC, f)
-	a.RegisterQueryAPI(f.Handler())
+}
+
+// RegisterQueryFrontend2 registers the given frontend with the API's gRPC
+// server by calling frontend2.RegisterFrontendForQuerierServer.
+func (a *API) RegisterQueryFrontend2(f *frontend2.Frontend2) {
+	frontend2.RegisterFrontendForQuerierServer(a.server.GRPC, f)
+}
+
+// RegisterQueryScheduler registers the given scheduler with the API's gRPC
+// server by calling both frontend2.RegisterSchedulerForFrontendServer and
+// frontend2.RegisterSchedulerForQuerierServer.
+func (a *API) RegisterQueryScheduler(f *frontend2.Scheduler) {
+	frontend2.RegisterSchedulerForFrontendServer(a.server.GRPC, f)
+	frontend2.RegisterSchedulerForQuerierServer(a.server.GRPC, f)
 }
 
 // RegisterServiceMapHandler registers the Cortex structs service handler