diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f392f62f..953de1378 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 - Liquid Context Debugging policy. It's a policy only meant for debugging purposes, returns the context available when evaluating liquid [PR #849](https://github.com/3scale/apicast/pull/849)
 - Logging policy. It allows to enable/disable access logs per service [PR #856](https://github.com/3scale/apicast/pull/856), [THREESCALE-1148](https://issues.jboss.org/browse/THREESCALE-1148)
 - Support JWK through OIDC Discovery [PR #850](https://github.com/3scale/apicast/pull/850)
+- Initial Prometheus metrics policy (backend responses and nginx metrics) [PR #860](https://github.com/3scale/apicast/pull/860), [THREESCALE-1230](https://issues.jboss.org/browse/THREESCALE-1230)
 
 ### Changed
 
diff --git a/gateway/config/production.lua b/gateway/config/production.lua
index 99a946242..1597149ca 100644
--- a/gateway/config/production.lua
+++ b/gateway/config/production.lua
@@ -4,4 +4,5 @@ return {
   configuration_loader = 'boot',
   configuration_cache = os.getenv('APICAST_CONFIGURATION_CACHE') or 5*60,
   timer_resolution = '100ms',
+  port = { metrics = 9421 },
 }
diff --git a/gateway/config/staging.lua b/gateway/config/staging.lua
index fc0649b3a..ceab8ff6c 100644
--- a/gateway/config/staging.lua
+++ b/gateway/config/staging.lua
@@ -3,4 +3,5 @@ return {
   lua_code_cache = 'on',
   configuration_loader = 'lazy',
   configuration_cache = os.getenv('APICAST_CONFIGURATION_CACHE'),
+  port = { metrics = 9421 }, -- see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
 }
diff --git a/gateway/http.d/lua_capture_error_log.conf b/gateway/http.d/lua_capture_error_log.conf
new file mode 100644
index 000000000..adcbdcac5
--- /dev/null
+++ b/gateway/http.d/lua_capture_error_log.conf
@@ -0,0 +1,3 @@
+# To be able to use the ngx.errlog methods that we call from the Metrics policy
+# Ref: https://github.com/openresty/lua-nginx-module#lua_capture_error_log
+lua_capture_error_log 4k;
diff --git a/gateway/src/apicast/backend_client.lua b/gateway/src/apicast/backend_client.lua
index 84ff89569..277aa8be1 100644
--- a/gateway/src/apicast/backend_client.lua
+++ b/gateway/src/apicast/backend_client.lua
@@ -20,6 +20,7 @@ local http_ng = require('resty.http_ng')
 local user_agent = require('apicast.user_agent')
 local resty_url = require('resty.url')
 local resty_env = require('resty.env')
+local threescale_backend_status_counters = require('apicast.metrics.3scale_backend_status')
 
 local http_proxy = require('resty.http.proxy')
 local http_ng_ngx = require('resty.http_ng.backend.ngx')
@@ -97,6 +98,10 @@
   }, mt)
 end
 
+local function inc_backend_status_metric(status)
+  threescale_backend_status_counters.inc(status)
+end
+
 local function build_args(args)
   local query = {}
 
@@ -133,6 +138,8 @@ local function call_backend_transaction(self, path, options, ...)
 
   ngx.log(ngx.INFO, 'backend client uri: ', url, ' ok: ', res.ok, ' status: ', res.status, ' body: ', res.body, ' error: ', res.error)
 
+  inc_backend_status_metric(res.status)
+
   return res
 end
 
@@ -229,6 +236,8 @@ function _M:report(reports_batch)
 
   local report_body = format_transactions(reports_batch)
   local res = http_client.post(report_uri, report_body)
 
+  inc_backend_status_metric(res.status)
+
   return res
 end
diff --git a/gateway/src/apicast/metrics/3scale_backend_status.lua b/gateway/src/apicast/metrics/3scale_backend_status.lua
new file mode 100644
index 000000000..836e0ff54
--- /dev/null
+++ b/gateway/src/apicast/metrics/3scale_backend_status.lua
@@ -0,0 +1,27 @@
+local prometheus = require('apicast.prometheus')
+local metrics_updater = require('apicast.metrics.updater')
+
+local format = string.format
+
+local _M = {}
+
+local backend_response_metric = prometheus(
+  'counter',
+  'backend_response',
+  "Response status codes from 3scale's backend",
+  { 'status' }
+)
+
+local function label_for_status(status)
+  if not status or status == 0 then
+    return 'invalid_status'
+  else
+    return format("%dxx", status/100)
+  end
+end
+
+function _M.inc(status)
+  metrics_updater.inc(backend_response_metric, label_for_status(status))
+end
+
+return _M
diff --git a/gateway/src/apicast/metrics/updater.lua b/gateway/src/apicast/metrics/updater.lua
new file mode 100644
index 000000000..aa63f19f6
--- /dev/null
+++ b/gateway/src/apicast/metrics/updater.lua
@@ -0,0 +1,20 @@
+local tonumber = tonumber
+
+local _M = {}
+
+local function metric_op(op, metric, value, label)
+  local metric_labels = {}
+  if not metric then return end
+  metric_labels[1] = label
+  metric[op](metric, tonumber(value) or 0, metric_labels)
+end
+
+function _M.set(metric, value, label)
+  return metric_op('set', metric, value, label)
+end
+
+function _M.inc(metric, label)
+  return metric_op('inc', metric, 1, label)
+end
+
+return _M
diff --git a/gateway/src/apicast/policy/nginx_metrics/init.lua b/gateway/src/apicast/policy/nginx_metrics/init.lua
new file mode 100644
index 000000000..f225bf0d5
--- /dev/null
+++ b/gateway/src/apicast/policy/nginx_metrics/init.lua
@@ -0,0 +1 @@
+return require('nginx_metrics')
diff --git a/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua b/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua
new file mode 100644
index 000000000..158b726ea
--- /dev/null
+++ b/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua
@@ -0,0 +1,113 @@
+local _M = require('apicast.policy').new('Metrics')
+
+local resty_env = require('resty.env')
+local errlog = require('ngx.errlog')
+local prometheus = require('apicast.prometheus')
+local metrics_updater = require('apicast.metrics.updater')
+local tonumber = tonumber
+local select = select
+local find = string.find
+local pairs = pairs
+
+local new = _M.new
+
+local log_levels_list = {
+  'emerg',
+  'alert',
+  'crit',
+  'error',
+  'warn',
+  'notice',
+  'info',
+  'debug',
+}
+
+local log_level_env = 'NGINX_METRICS_LOG_LEVEL'
+local max_logs_env = 'NGINX_METRICS_MAX_LOGS'
+
+local log_level_default = 'error'
+local max_logs_default = 100
+
+local function find_i(t, value)
+  for i=1, #t do
+    if t[i] == value then return i end
+  end
+end
+
+local empty = {}
+
+local function get_logs(max)
+  return errlog.get_logs(max) or empty
+end
+
+local function filter_level()
+  local level = resty_env.value(log_level_env) or log_level_default
+
+  local level_index = find_i(log_levels_list, level)
+
+  if not level_index then
+    ngx.log(ngx.WARN, _M._NAME, ': invalid level: ', level,
+      ' using error instead')
+    level_index = find_i(log_levels_list, 'error')
+  end
+
+  return level_index
+end
+
+function _M.new(configuration)
+  local m = new()
+
+  local config = configuration or empty
+
+  -- how many logs to take in one iteration
+  m.max_logs = tonumber(config.max_logs) or
+    resty_env.value(max_logs_env) or
+    max_logs_default
+
+  return m
+end
+
+local logs_metric = prometheus('counter', 'nginx_error_log', "Items in nginx error log", {'level'})
+local http_connections_metric = prometheus('gauge', 'nginx_http_connections', 'Number of HTTP connections', {'state'})
+local shdict_capacity_metric = prometheus('gauge', 'openresty_shdict_capacity', 'OpenResty shared dictionary capacity', {'dict'})
+local shdict_free_space_metric = prometheus('gauge', 'openresty_shdict_free_space', 'OpenResty shared dictionary free space', {'dict'})
+
+function _M.init()
+  errlog.set_filter_level(filter_level())
+
+  get_logs(100) -- to throw them away after setting the filter level (and get rid of debug ones)
+
+  for name,dict in pairs(ngx.shared) do
+    metrics_updater.set(shdict_capacity_metric, dict:capacity(), name)
+  end
+end
+
+function _M:metrics()
+  local logs = get_logs(self.max_logs)
+
+  for i = 1, #logs, 3 do
+    metrics_updater.inc(logs_metric, log_levels_list[logs[i]] or 'unknown')
+  end
+
+  local response = ngx.location.capture("/nginx_status")
+
+  if response.status == 200 then
+    local accepted, handled, total = select(3, find(response.body, [[accepts handled requests%s+(%d+) (%d+) (%d+)]]))
+    local var = ngx.var
+
+    metrics_updater.set(http_connections_metric, var.connections_reading, 'reading')
+    metrics_updater.set(http_connections_metric, var.connections_waiting, 'waiting')
+    metrics_updater.set(http_connections_metric, var.connections_writing, 'writing')
+    metrics_updater.set(http_connections_metric, var.connections_active, 'active')
+    metrics_updater.set(http_connections_metric, accepted, 'accepted')
+    metrics_updater.set(http_connections_metric, handled, 'handled')
+    metrics_updater.set(http_connections_metric, total, 'total')
+  else
+    prometheus:log_error('Could not get status from nginx')
+  end
+
+  for name,dict in pairs(ngx.shared) do
+    metrics_updater.set(shdict_free_space_metric, dict:free_space(), name)
+  end
+end
+
+return _M
diff --git a/gateway/src/apicast/policy_chain.lua b/gateway/src/apicast/policy_chain.lua
index 27b6af42e..ec5632e35 100644
--- a/gateway/src/apicast/policy_chain.lua
+++ b/gateway/src/apicast/policy_chain.lua
@@ -60,7 +60,8 @@ end
 local DEFAULT_POLICIES = {
   'apicast.policy.load_configuration',
   'apicast.policy.find_service',
-  'apicast.policy.local_chain'
+  'apicast.policy.local_chain',
+  'apicast.policy.nginx_metrics'
 }
 
 --- Return new policy chain with default policies.
diff --git a/t/fixtures/configs/without_nginx_metrics.lua b/t/fixtures/configs/without_nginx_metrics.lua
new file mode 100644
index 000000000..739e68449
--- /dev/null
+++ b/t/fixtures/configs/without_nginx_metrics.lua
@@ -0,0 +1,14 @@
+local PolicyChain = require('apicast.policy_chain')
+
+local policies = {
+  'apicast.policy.load_configuration',
+  'apicast.policy.find_service',
+  'apicast.policy.local_chain'
+}
+
+local policy_chain = PolicyChain.build(policies)
+
+return {
+  policy_chain = policy_chain,
+  port = { metrics = 9421 },
+}
diff --git a/t/prometheus-metrics.t b/t/prometheus-metrics.t
new file mode 100644
index 000000000..005691e60
--- /dev/null
+++ b/t/prometheus-metrics.t
@@ -0,0 +1,121 @@
+use lib 't';
+use Test::APIcast::Blackbox 'no_plan';
+
+# The output varies between requests, so run only once
+repeat_each(1);
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: metrics endpoint works
+--- configuration
+{
+}
+--- request
+GET /metrics
+--- more_headers
+Host: metrics
+--- response_body
+# HELP nginx_http_connections Number of HTTP connections
+# TYPE nginx_http_connections gauge
+nginx_http_connections{state="accepted"} 1
+nginx_http_connections{state="active"} 1
+nginx_http_connections{state="handled"} 1
+nginx_http_connections{state="reading"} 0
+nginx_http_connections{state="total"} 1
+nginx_http_connections{state="waiting"} 0
+nginx_http_connections{state="writing"} 1
+# HELP nginx_metric_errors_total Number of nginx-lua-prometheus errors
+# TYPE nginx_metric_errors_total counter
+nginx_metric_errors_total 0
+# HELP openresty_shdict_capacity OpenResty shared dictionary capacity
+# TYPE openresty_shdict_capacity gauge
+openresty_shdict_capacity{dict="api_keys"} 10485760
+openresty_shdict_capacity{dict="batched_reports"} 1048576
+openresty_shdict_capacity{dict="batched_reports_locks"} 1048576
+openresty_shdict_capacity{dict="cached_auths"} 1048576
+openresty_shdict_capacity{dict="configuration"} 10485760
+openresty_shdict_capacity{dict="init"} 16384
+openresty_shdict_capacity{dict="limiter"} 1048576
+openresty_shdict_capacity{dict="locks"} 1048576
+openresty_shdict_capacity{dict="prometheus_metrics"} 16777216
+# HELP openresty_shdict_free_space OpenResty shared dictionary free space
+# TYPE openresty_shdict_free_space gauge
+openresty_shdict_free_space{dict="api_keys"} 10412032
+openresty_shdict_free_space{dict="batched_reports"} 1032192
+openresty_shdict_free_space{dict="batched_reports_locks"} 1032192
+openresty_shdict_free_space{dict="cached_auths"} 1032192
+openresty_shdict_free_space{dict="configuration"} 10412032
+openresty_shdict_free_space{dict="init"} 4096
+openresty_shdict_free_space{dict="limiter"} 1032192
+openresty_shdict_free_space{dict="locks"} 1032192
+openresty_shdict_free_space{dict="prometheus_metrics"} 16662528
+--- error_code: 200
+--- no_error_log
+[error]
+
+=== TEST 2: metrics endpoint shows backend responses when the APIcast policy is in the chain
+We do a couple of authorized requests to the backend (2xx) and a couple of
+unauthorized ones (4xx) and check that those metrics are shown correctly when
+calling the prometheus metrics endpoint.
+To simplify the output of the metrics endpoint, we use an environment config
+that does not include the nginx metrics (tested in the previous test).
+--- environment_file: t/fixtures/configs/without_nginx_metrics.lua
+--- configuration
+{
+  "services": [
+    {
+      "id": 42,
+      "backend_version": 1,
+      "backend_authentication_type": "service_token",
+      "backend_authentication_value": "token-value",
+      "proxy": {
+        "api_backend": "http://test:$TEST_NGINX_SERVER_PORT/",
+        "proxy_rules": [
+          { "pattern": "/", "http_method": "GET", "metric_system_name": "hits", "delta": 1 }
+        ],
+        "policy_chain": [
+          { "name": "apicast.policy.apicast" }
+        ]
+      }
+    }
+  ]
+}
+--- upstream
+  location / {
+    content_by_lua_block {
+      ngx.say('yay, api backend');
+    }
+  }
+--- backend
+  location /transactions/authrep.xml {
+    content_by_lua_block {
+      -- Check only the user key and assume the rest of params are OK
+      if ngx.req.get_uri_args(0)['user_key'] == 'invalid' then
+        ngx.exit(403)
+      else
+        ngx.exit(200)
+      end
+    }
+  }
+--- request eval
+["GET /?user_key=valid", "GET /?user_key=valid", "GET /?user_key=invalid", "GET /?user_key=invalid", "GET /metrics"]
+--- more_headers eval
+["", "", "", "", "Host: metrics"]
+--- error_code eval
+[ 200, 200, 403, 403, 200 ]
+--- response_body eval
+[ "yay, api backend\x{0a}", "yay, api backend\x{0a}", "Authentication failed", "Authentication failed",
+<<'METRICS_OUTPUT'
+# HELP backend_response Response status codes from 3scale's backend
+# TYPE backend_response counter
+backend_response{status="2xx"} 2
+backend_response{status="4xx"} 2
+# HELP nginx_metric_errors_total Number of nginx-lua-prometheus errors
+# TYPE nginx_metric_errors_total counter
+nginx_metric_errors_total 0
+METRICS_OUTPUT
+]
+--- no_error_log
+[error]
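
Usage sketch (illustrative only, not part of the patch above): the apicast.prometheus and apicast.metrics.updater helpers introduced in this PR could be reused from other policies. The metric name 'example_hits' below is hypothetical; the calls simply mirror the ones in 3scale_backend_status.lua and nginx_metrics.lua, and metric_op() in updater.lua returns early when the metric is nil.

-- Hypothetical Lua sketch reusing the helpers added by this PR.
local prometheus = require('apicast.prometheus')
local metrics_updater = require('apicast.metrics.updater')

-- Declare a counter with a single 'status' label, as the PR does for backend_response.
local example_hits = prometheus('counter', 'example_hits', 'Hypothetical example counter', { 'status' })

-- Increment it for a given label value; this is a no-op if the metric could not be created.
metrics_updater.inc(example_hits, '2xx')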