Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- `cortex_ingester_tsdb_head_truncations_total`
- `cortex_ingester_tsdb_head_gc_duration_seconds`
* [ENHANCEMENT] Added `cortex_alertmanager_config_hash` metric to expose hash of Alertmanager Config loaded per user. #3388
* [ENHANCEMENT] Query-Frontend / Query-Scheduler: New component called "Query-Scheduler" has been introduced. Query-Scheduler is simply a queue of requests, moved outside of Query-Frontend. This allows Query-Frontend to be scaled separately from number of queues. To make Query-Frontend and Querier use Query-Scheduler, they need to be started with `-frontend.scheduler-address` and `-querier.scheduler-address` options respectively. #3374

## 1.5.0 in progress

Expand Down
4 changes: 4 additions & 0 deletions development/tsdb-blocks-storage-s3/config/cortex.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ store_gateway:

frontend_worker:
frontend_address: "query-frontend:9007"
match_max_concurrent: true

# By setting scheduler_address, querier worker would use scheduler instead of frontend.
# scheduler_address: "query-scheduler:9011"

query_range:
split_queries_by_interval: 24h
Expand Down
10 changes: 8 additions & 2 deletions development/tsdb-blocks-storage-s3/config/grafana-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ prometheus:
namespace: 'tsdb-blocks-storage-s3'
- job_name: tsdb-blocks-storage-s3/querier
static_configs:
- targets: ['querier:8004']
- targets: ['querier:8004', 'querier-with-scheduler:8013']
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'
Expand All @@ -43,7 +43,7 @@ prometheus:
namespace: 'tsdb-blocks-storage-s3'
- job_name: tsdb-blocks-storage-s3/query-frontend
static_configs:
- targets: ['query-frontend:8007']
- targets: ['query-frontend:8007', 'query-frontend-with-scheduler:8012']
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'
Expand All @@ -53,6 +53,12 @@ prometheus:
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'
- job_name: tsdb-blocks-storage-s3/query-scheduler
static_configs:
- targets: ['query-scheduler:8011']
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'

remote_write:
- url: http://distributor:8001/api/prom/push
10 changes: 8 additions & 2 deletions development/tsdb-blocks-storage-s3/config/prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ scrape_configs:
namespace: 'tsdb-blocks-storage-s3'
- job_name: tsdb-blocks-storage-s3/querier
static_configs:
- targets: ['querier:8004']
- targets: ['querier:8004', 'querier-with-scheduler:8013']
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'
Expand All @@ -36,7 +36,7 @@ scrape_configs:
namespace: 'tsdb-blocks-storage-s3'
- job_name: tsdb-blocks-storage-s3/query-frontend
static_configs:
- targets: ['query-frontend:8007']
- targets: ['query-frontend:8007', 'query-frontend-with-scheduler:8012']
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'
Expand All @@ -46,6 +46,12 @@ scrape_configs:
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'
- job_name: tsdb-blocks-storage-s3/query-scheduler
static_configs:
- targets: ['query-scheduler:8011']
labels:
cluster: 'docker-compose'
namespace: 'tsdb-blocks-storage-s3'

remote_write:
- url: http://distributor:8001/api/prom/push
65 changes: 65 additions & 0 deletions development/tsdb-blocks-storage-s3/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,68 @@ services:
- 18022:18022
volumes:
- ./config:/cortex/config

query-scheduler:
build:
context: .
dockerfile: dev.dockerfile
image: cortex
command: ["sh", "-c", "sleep 3 && exec ./dlv exec ./cortex --listen=:18011 --headless=true --api-version=2 --accept-multiclient --continue -- -config.file=./config/cortex.yaml -target=query-scheduler -server.http-listen-port=8011 -server.grpc-listen-port=9011 -store.max-query-length=8760h -log.level=debug"]
depends_on:
- consul
- minio
environment:
- JAEGER_AGENT_HOST=jaeger
- JAEGER_AGENT_PORT=6831
- JAEGER_TAGS=app=query-scheduler
- JAEGER_SAMPLER_TYPE=const
- JAEGER_SAMPLER_PARAM=1
ports:
- 8011:8011
- 18011:18011
volumes:
- ./config:/cortex/config

# This frontend uses query-scheduler, activated by `-frontend.scheduler-address` option.
query-frontend-with-scheduler:
build:
context: .
dockerfile: dev.dockerfile
image: cortex
command: ["sh", "-c", "sleep 3 && exec ./dlv exec ./cortex --listen=:18012 --headless=true --api-version=2 --accept-multiclient --continue -- -config.file=./config/cortex.yaml -target=query-frontend -server.http-listen-port=8012 -server.grpc-listen-port=9012 -store.max-query-length=8760h -frontend.scheduler-address=query-scheduler:9011 -log.level=debug"]
depends_on:
- consul
- minio
environment:
- JAEGER_AGENT_HOST=jaeger
- JAEGER_AGENT_PORT=6831
- JAEGER_TAGS=app=query-frontend2
- JAEGER_SAMPLER_TYPE=const
- JAEGER_SAMPLER_PARAM=1
ports:
- 8012:8012
- 18012:18012
volumes:
- ./config:/cortex/config

# This querier is connecting to query-scheduler, instead of query-frontend. This is achieved by setting -querier.scheduler-address="..."
querier-with-scheduler:
build:
context: .
dockerfile: dev.dockerfile
image: cortex
command: ["sh", "-c", "sleep 3 && exec ./dlv exec ./cortex --listen=:18013 --headless=true --api-version=2 --accept-multiclient --continue -- -config.file=./config/cortex.yaml -target=querier -server.http-listen-port=8013 -server.grpc-listen-port=9013 -querier.scheduler-address=query-scheduler:9011 -log.level=debug"]
depends_on:
- consul
- minio
environment:
- JAEGER_AGENT_HOST=jaeger
- JAEGER_AGENT_PORT=6831
- JAEGER_TAGS=app=querier-scheduler
- JAEGER_SAMPLER_TYPE=const
- JAEGER_SAMPLER_PARAM=1
ports:
- 8013:8013
- 18013:18013
volumes:
- ./config:/cortex/config
132 changes: 118 additions & 14 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ runtime_config:

# The memberlist_config configures the Gossip memberlist.
[memberlist: <memberlist_config>]

query_scheduler:
# Maximum number of outstanding requests per tenant per query-scheduler.
# In-flight requests above this limit will fail with HTTP response status code
# 429.
# CLI flag: -query-scheduler.max-outstanding-requests-per-tenant
[max_outstanding_requests_per_tenant: <int> | default = 100]
```

### `server_config`
Expand Down Expand Up @@ -757,27 +764,109 @@ store_gateway_client:
The `query_frontend_config` configures the Cortex query-frontend.

```yaml
# Log queries that are slower than the specified duration. Set to 0 to disable.
# Set to < 0 to enable on all queries.
# CLI flag: -frontend.log-queries-longer-than
[log_queries_longer_than: <duration> | default = 0s]

# Max body size for downstream prometheus.
# CLI flag: -frontend.max-body-size
[max_body_size: <int> | default = 10485760]

# Maximum number of outstanding requests per tenant per frontend; requests
# beyond this error with HTTP 429.
# CLI flag: -querier.max-outstanding-requests-per-tenant
[max_outstanding_per_tenant: <int> | default = 100]

# DNS hostname used for finding query-schedulers.
# CLI flag: -frontend.scheduler-address
[scheduler_address: <string> | default = ""]

# How often to resolve the scheduler-address, in order to look for new
# query-scheduler instances.
# CLI flag: -frontend.scheduler-dns-lookup-period
[scheduler_dns_lookup_period: <duration> | default = 10s]

# Number of concurrent workers forwarding queries to single query-scheduler.
# CLI flag: -frontend.scheduler-worker-concurrency
[scheduler_worker_concurrency: <int> | default = 5]

grpc_client_config:
# gRPC client max receive message size (bytes).
# CLI flag: -frontend.grpc-client-config.grpc-max-recv-msg-size
[max_recv_msg_size: <int> | default = 104857600]

# gRPC client max send message size (bytes).
# CLI flag: -frontend.grpc-client-config.grpc-max-send-msg-size
[max_send_msg_size: <int> | default = 16777216]

# Deprecated: Use gzip compression when sending messages. If true, overrides
# grpc-compression flag.
# CLI flag: -frontend.grpc-client-config.grpc-use-gzip-compression
[use_gzip_compression: <boolean> | default = false]

# Use compression when sending messages. Supported values are: 'gzip',
# 'snappy' and '' (disable compression)
# CLI flag: -frontend.grpc-client-config.grpc-compression
[grpc_compression: <string> | default = ""]

# Rate limit for gRPC client; 0 means disabled.
# CLI flag: -frontend.grpc-client-config.grpc-client-rate-limit
[rate_limit: <float> | default = 0]

# Rate limit burst for gRPC client.
# CLI flag: -frontend.grpc-client-config.grpc-client-rate-limit-burst
[rate_limit_burst: <int> | default = 0]

# Enable backoff and retry when we hit ratelimits.
# CLI flag: -frontend.grpc-client-config.backoff-on-ratelimits
[backoff_on_ratelimits: <boolean> | default = false]

backoff_config:
# Minimum delay when backing off.
# CLI flag: -frontend.grpc-client-config.backoff-min-period
[min_period: <duration> | default = 100ms]

# Maximum delay when backing off.
# CLI flag: -frontend.grpc-client-config.backoff-max-period
[max_period: <duration> | default = 10s]

# Number of times to backoff and retry before failing.
# CLI flag: -frontend.grpc-client-config.backoff-retries
[max_retries: <int> | default = 10]

# Path to the client certificate file, which will be used for authenticating
# with the server. Also requires the key path to be configured.
# CLI flag: -frontend.grpc-client-config.tls-cert-path
[tls_cert_path: <string> | default = ""]

# Path to the key file for the client certificate. Also requires the client
# certificate to be configured.
# CLI flag: -frontend.grpc-client-config.tls-key-path
[tls_key_path: <string> | default = ""]

# Path to the CA certificates file to validate server certificate against. If
# not set, the host's root CA certificates are used.
# CLI flag: -frontend.grpc-client-config.tls-ca-path
[tls_ca_path: <string> | default = ""]

# Skip validating server certificate.
# CLI flag: -frontend.grpc-client-config.tls-insecure-skip-verify
[tls_insecure_skip_verify: <boolean> | default = false]

# Name of network interface to read address from. This address is sent to
# query-scheduler and querier, which use it to send the query response back to
# query-frontend.
# CLI flag: -frontend.instance-interface-names
[instance_interface_names: <list of string> | default = [eth0 en0]]

# Compress HTTP responses.
# CLI flag: -querier.compress-http-responses
[compress_responses: <boolean> | default = false]

# URL of downstream Prometheus.
# CLI flag: -frontend.downstream-url
[downstream_url: <string> | default = ""]

# Max body size for downstream prometheus.
# CLI flag: -frontend.max-body-size
[max_body_size: <int> | default = 10485760]

# Log queries that are slower than the specified duration. Set to 0 to disable.
# Set to < 0 to enable on all queries.
# CLI flag: -frontend.log-queries-longer-than
[log_queries_longer_than: <duration> | default = 0s]
```

### `query_range_config`
Expand Down Expand Up @@ -2454,7 +2543,10 @@ grpc_client_config:
The `frontend_worker_config` configures the worker - running within the Cortex querier - picking up and executing queries enqueued by the query-frontend.

```yaml
# Address of query frontend service, in host:port format.
# Address of query frontend service, in host:port format. If
# -querier.scheduler-address is set as well, querier will use scheduler instead.
# If neither -querier.frontend-address nor -querier.scheduler-address is set,
# queries must arrive via HTTP endpoint.
# CLI flag: -querier.frontend-address
[frontend_address: <string> | default = ""]

Expand Down Expand Up @@ -2538,6 +2630,17 @@ grpc_client_config:
# Skip validating server certificate.
# CLI flag: -querier.frontend-client.tls-insecure-skip-verify
[tls_insecure_skip_verify: <boolean> | default = false]

# Hostname (and port) of scheduler that querier will periodically resolve,
# connect to and receive queries from. If set, takes precedence over
# -querier.frontend-address.
# CLI flag: -querier.scheduler-address
[scheduler_address: <string> | default = ""]

# How often to resolve the scheduler-address, in order to look for new
# query-scheduler instances.
# CLI flag: -querier.scheduler-dns-lookup-period
[scheduler_dns_lookup_period: <duration> | default = 10s]
```

### `etcd_config`
Expand Down Expand Up @@ -2903,10 +3006,11 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s

# Maximum number of queriers that can handle requests for a single tenant. If
# set to 0 or value higher than number of available queriers, *all* queriers
# will handle requests for the tenant. Each frontend will select the same set of
# queriers for the same tenant (given that all queriers are connected to all
# frontends). This option only works with queriers connecting to the
# query-frontend, not when using downstream URL.
# will handle requests for the tenant. Each frontend (or query-scheduler, if
# used) will select the same set of queriers for the same tenant (given that all
# queriers are connected to all frontends / query-schedulers). This option only
# works with queriers connecting to the query-frontend / query-scheduler, not
# when using downstream URL.
# CLI flag: -frontend.max-queriers-per-tenant
[max_queriers_per_tenant: <int> | default = 0]

Expand Down
1 change: 1 addition & 0 deletions docs/configuration/v1-guarantees.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,4 @@ Currently experimental features are:
- Blocksconvert tools
- OpenStack Swift storage support.
- Metric relabeling in the distributor.
- Scalable query-frontend (when using query-scheduler)
2 changes: 1 addition & 1 deletion docs/guides/shuffle-sharding.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ _The shard size can be overridden on a per-tenant basis in the limits overrides

By default all Cortex queriers can execute received queries for given tenant.

When shuffle sharding is **enabled** by setting `-frontend.max-queriers-per-tenant` (or its respective YAML config option) to a value higher than 0 and lower than the number of available queriers, only specified number of queriers will execute queries for single tenant. Note that this distribution happens in query-frontend. When not using query-frontend, this option is not available.
When shuffle sharding is **enabled** by setting `-frontend.max-queriers-per-tenant` (or its respective YAML config option) to a value higher than 0 and lower than the number of available queriers, only the specified number of queriers will execute queries for a single tenant. Note that this distribution happens in the query-frontend, or in the query-scheduler if one is used. When using query-scheduler, the `-frontend.max-queriers-per-tenant` option must be set on the query-scheduler component. When not using query-frontend (with or without scheduler), this option is not available.

_The maximum number of queriers can be overridden on a per-tenant basis in the limits overrides configuration._

Expand Down
17 changes: 15 additions & 2 deletions pkg/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cortexproject/cortex/pkg/ingester/client"
"github.com/cortexproject/cortex/pkg/querier"
"github.com/cortexproject/cortex/pkg/querier/frontend"
"github.com/cortexproject/cortex/pkg/querier/frontend2"
"github.com/cortexproject/cortex/pkg/ring"
"github.com/cortexproject/cortex/pkg/ruler"
"github.com/cortexproject/cortex/pkg/storegateway"
Expand Down Expand Up @@ -308,9 +309,21 @@ func (a *API) RegisterQueryAPI(handler http.Handler) {
// RegisterQueryFrontendHandler registers the Prometheus routes supported by the
// Cortex querier service by passing h to RegisterQueryAPI. Currently this can
// not be registered simultaneously with the Querier.
func (a *API) RegisterQueryFrontend(f *frontend.Frontend) {
func (a *API) RegisterQueryFrontendHandler(h http.Handler) {
a.RegisterQueryAPI(h)
}

// RegisterQueryFrontend1 registers f with the API's gRPC server
// (a.server.GRPC) via frontend.RegisterFrontendServer.
func (a *API) RegisterQueryFrontend1(f *frontend.Frontend) {
frontend.RegisterFrontendServer(a.server.GRPC, f)
// NOTE(review): this listing is a rendered diff that reports 2 deletions; the
// call below looks like one of the deleted lines, since handler registration
// is exposed separately as RegisterQueryFrontendHandler. Confirm it still
// exists before relying on it.
a.RegisterQueryAPI(f.Handler())
}

// RegisterQueryFrontend2 registers f with the API's gRPC server
// (a.server.GRPC) via frontend2.RegisterFrontendForQuerierServer. No HTTP
// handler is registered here.
func (a *API) RegisterQueryFrontend2(f *frontend2.Frontend2) {
frontend2.RegisterFrontendForQuerierServer(a.server.GRPC, f)
}

// RegisterQueryScheduler registers f with the API's gRPC server
// (a.server.GRPC) twice: once via frontend2.RegisterSchedulerForFrontendServer
// and once via frontend2.RegisterSchedulerForQuerierServer. Judging by the
// names, the first is presumably the service query-frontends connect to and the
// second the one queriers connect to; confirm against the frontend2 service
// definitions.
func (a *API) RegisterQueryScheduler(f *frontend2.Scheduler) {
frontend2.RegisterSchedulerForFrontendServer(a.server.GRPC, f)
frontend2.RegisterSchedulerForQuerierServer(a.server.GRPC, f)
}

// RegisterServiceMapHandler registers the Cortex structs service handler
Expand Down
Loading