
Commit

Merge pull request #860 from 3scale/metrics-policy
Metrics policy
davidor authored Aug 30, 2018
2 parents 8813f78 + 77cf346 commit c27e7f5
Showing 12 changed files with 313 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -34,6 +34,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Liquid Context Debugging policy. It's a policy meant only for debugging purposes; it returns the context available when evaluating Liquid [PR #849](https://github.com/3scale/apicast/pull/849)
- Logging policy. It allows enabling/disabling access logs per service [PR #856](https://github.com/3scale/apicast/pull/856), [THREESCALE-1148](https://issues.jboss.org/browse/THREESCALE-1148)
- Support JWK through OIDC Discovery [PR #850](https://github.com/3scale/apicast/pull/850)
- Initial Prometheus metrics policy (backend responses and nginx metrics) [PR #860](https://github.com/3scale/apicast/pull/860), [THREESCALE-1230](https://issues.jboss.org/browse/THREESCALE-1230)

### Changed

1 change: 1 addition & 0 deletions gateway/config/production.lua
@@ -4,4 +4,5 @@ return {
configuration_loader = 'boot',
configuration_cache = os.getenv('APICAST_CONFIGURATION_CACHE') or 5*60,
timer_resolution = '100ms',
port = { metrics = 9421 },
}
1 change: 1 addition & 0 deletions gateway/config/staging.lua
@@ -3,4 +3,5 @@ return {
lua_code_cache = 'on',
configuration_loader = 'lazy',
configuration_cache = os.getenv('APICAST_CONFIGURATION_CACHE'),
port = { metrics = 9421 }, -- see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
}
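Both environments now expose the metrics endpoint on port 9421 (a port taken from the Prometheus default-port-allocations list linked above). As a hedged sketch with a hypothetical file name, a custom environment file can set the same key, just as the test fixture later in this diff does:

-- custom.lua (hypothetical environment file)
return {
  port = { metrics = 9100 }, -- serve /metrics on a different port
}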
3 changes: 3 additions & 0 deletions gateway/http.d/lua_capture_error_log.conf
@@ -0,0 +1,3 @@
# To be able to use the ngx.errlog methods that we call from the Metrics policy
# Ref: https://github.com/openresty/lua-nginx-module#lua_capture_error_log
lua_capture_error_log 4k;
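For reference, a minimal sketch of the ngx.errlog calls this capture buffer enables (the same calls the nginx_metrics policy further down makes; the warn level here is only an illustrative choice):

local errlog = require('ngx.errlog')

-- Capture only entries at 'warn' severity or more severe (illustrative choice).
local ok, err = errlog.set_filter_level(ngx.WARN)
if not ok then ngx.log(ngx.ERR, 'failed to set the errlog filter level: ', err) end

-- Drain up to 100 captured entries; the result is a flat array of
-- (level, timestamp, message) triples, which is why the policy walks it in steps of 3.
local logs = errlog.get_logs(100)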
9 changes: 9 additions & 0 deletions gateway/src/apicast/backend_client.lua
@@ -20,6 +20,7 @@ local http_ng = require('resty.http_ng')
local user_agent = require('apicast.user_agent')
local resty_url = require('resty.url')
local resty_env = require('resty.env')
local threescale_backend_status_counters = require('apicast.metrics.3scale_backend_status')

local http_proxy = require('resty.http.proxy')
local http_ng_ngx = require('resty.http_ng.backend.ngx')
@@ -97,6 +98,10 @@ function _M:new(service, http_client)
}, mt)
end

local function inc_backend_status_metric(status)
threescale_backend_status_counters.inc(status)
end

local function build_args(args)
local query = {}

@@ -133,6 +138,8 @@ local function call_backend_transaction(self, path, options, ...)

ngx.log(ngx.INFO, 'backend client uri: ', url, ' ok: ', res.ok, ' status: ', res.status, ' body: ', res.body, ' error: ', res.error)

inc_backend_status_metric(res.status)

return res
end

@@ -229,6 +236,8 @@ function _M:report(reports_batch)
local report_body = format_transactions(reports_batch)
local res = http_client.post(report_uri, report_body)

inc_backend_status_metric(res.status)

return res
end

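With these two hooks, every call that goes through call_backend_transaction (authorize/authrep) and every batched report posted by _M:report bumps the shared backend_response counter, labelled by status class. For example, after two authorized calls and one denied one, a scrape of /metrics would be expected to contain lines like the following (hypothetical values, same format as TEST 2 at the end of this diff):

backend_response{status="2xx"} 2
backend_response{status="4xx"} 1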
27 changes: 27 additions & 0 deletions gateway/src/apicast/metrics/3scale_backend_status.lua
@@ -0,0 +1,27 @@
local prometheus = require('apicast.prometheus')
local metrics_updater = require('apicast.metrics.updater')

local format = string.format

local _M = {}

local backend_response_metric = prometheus(
'counter',
'backend_response',
"Response status codes from 3scale's backend",
{ 'status' }
)

local function label_for_status(status)
if not status or status == 0 then
return 'invalid_status'
else
return format("%dxx", status/100)
end
end

function _M.inc(status)
metrics_updater.inc(backend_response_metric, label_for_status(status))
end

return _M
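A minimal usage sketch of this module (purely illustrative; the real callers are the two hooks in backend_client.lua above), showing how label_for_status buckets raw status codes into classes:

local backend_status = require('apicast.metrics.3scale_backend_status')

backend_status.inc(200) -- counted as backend_response{status="2xx"}
backend_status.inc(403) -- counted as backend_response{status="4xx"}
backend_status.inc(503) -- counted as backend_response{status="5xx"}
backend_status.inc(nil) -- no status (for example, a connection error): counted as "invalid_status"
backend_status.inc(0)   -- a status of 0 is also counted as "invalid_status"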
20 changes: 20 additions & 0 deletions gateway/src/apicast/metrics/updater.lua
@@ -0,0 +1,20 @@
local tonumber = tonumber

local _M = {}

local function metric_op(op, metric, value, label)
local metric_labels = {}
if not metric then return end
metric_labels[1] = label
metric[op](metric, tonumber(value) or 0, metric_labels)
end

function _M.set(metric, value, label)
return metric_op('set', metric, value, label)
end

function _M.inc(metric, label)
return metric_op('inc', metric, 1, label)
end

return _M
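A short sketch of how this helper is meant to be used (the metrics here are hypothetical; the nil guard above makes both calls no-ops when the metric could not be created):

local prometheus = require('apicast.prometheus')
local metrics_updater = require('apicast.metrics.updater')

-- Hypothetical metrics, declared the same way as the real ones in this commit.
local events = prometheus('counter', 'example_events', 'Example event counter', { 'type' })
local queue_size = prometheus('gauge', 'example_queue_size', 'Example queue size', { 'queue' })

metrics_updater.inc(events, 'flush')           -- example_events{type="flush"} += 1
metrics_updater.set(queue_size, 42, 'reports') -- example_queue_size{queue="reports"} = 42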
1 change: 1 addition & 0 deletions gateway/src/apicast/policy/nginx_metrics/init.lua
@@ -0,0 +1 @@
return require('nginx_metrics')
113 changes: 113 additions & 0 deletions gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua
@@ -0,0 +1,113 @@
local _M = require('apicast.policy').new('Metrics')

local resty_env = require('resty.env')
local errlog = require('ngx.errlog')
local prometheus = require('apicast.prometheus')
local metrics_updater = require('apicast.metrics.updater')
local tonumber = tonumber
local select = select
local find = string.find
local pairs = pairs

local new = _M.new

local log_levels_list = {
'emerg',
'alert',
'crit',
'error',
'warn',
'notice',
'info',
'debug',
}

local log_level_env = 'NGINX_METRICS_LOG_LEVEL'
local max_logs_env = 'NGINX_METRICS_MAX_LOGS'

local log_level_default = 'error'
local max_logs_default = 100

local function find_i(t, value)
for i=1, #t do
if t[i] == value then return i end
end
end

local empty = {}

local function get_logs(max)
return errlog.get_logs(max) or empty
end

local function filter_level()
local level = resty_env.value(log_level_env) or log_level_default

local level_index = find_i(log_levels_list, level)

if not level_index then
ngx.log(ngx.WARN, _M._NAME, ': invalid level: ', level, ' using error instead')
level_index = find_i(log_levels_list, 'error')
end

return level_index
end

function _M.new(configuration)
local m = new()

local config = configuration or empty

-- how many logs to take in one iteration
m.max_logs = tonumber(config.max_logs) or
resty_env.value(max_logs_env) or
max_logs_default

return m
end

local logs_metric = prometheus('counter', 'nginx_error_log', "Items in nginx error log", {'level'})
local http_connections_metric = prometheus('gauge', 'nginx_http_connections', 'Number of HTTP connections', {'state'})
local shdict_capacity_metric = prometheus('gauge', 'openresty_shdict_capacity', 'OpenResty shared dictionary capacity', {'dict'})
local shdict_free_space_metric = prometheus('gauge', 'openresty_shdict_free_space', 'OpenResty shared dictionary free space', {'dict'})

function _M.init()
errlog.set_filter_level(filter_level())

get_logs(100) -- to throw them away after setting the filter level (and get rid of debug ones)

for name,dict in pairs(ngx.shared) do
metrics_updater.set(shdict_capacity_metric, dict:capacity(), name)
end
end

function _M:metrics()
local logs = get_logs(self.max_logs)

for i = 1, #logs, 3 do
metrics_updater.inc(logs_metric, log_levels_list[logs[i]] or 'unknown')
end

local response = ngx.location.capture("/nginx_status")

if response.status == 200 then
local accepted, handled, total = select(3, find(response.body, [[accepts handled requests%s+(%d+) (%d+) (%d+)]]))
local var = ngx.var

metrics_updater.set(http_connections_metric, var.connections_reading, 'reading')
metrics_updater.set(http_connections_metric, var.connections_waiting, 'waiting')
metrics_updater.set(http_connections_metric, var.connections_writing, 'writing')
metrics_updater.set(http_connections_metric, var.connections_active, 'active')
metrics_updater.set(http_connections_metric, accepted, 'accepted')
metrics_updater.set(http_connections_metric, handled, 'handled')
metrics_updater.set(http_connections_metric, total, 'total')
else
prometheus:log_error('Could not get status from nginx')
end

for name,dict in pairs(ngx.shared) do
metrics_updater.set(shdict_free_space_metric, dict:free_space(), name)
end
end

return _M
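The policy has two knobs, both visible in the code above: NGINX_METRICS_LOG_LEVEL (default 'error') sets the least severe error-log level that is still captured and counted in nginx_error_log, and NGINX_METRICS_MAX_LOGS (or the max_logs field of the policy configuration, default 100) caps how many captured entries are drained on each /metrics request. A hypothetical direct instantiation, just to illustrate the configuration shape (in practice the policy chain builds it):

local NginxMetrics = require('apicast.policy.nginx_metrics')

-- Drain at most 50 captured error-log entries per /metrics request.
local policy = NginxMetrics.new({ max_logs = 50 })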
3 changes: 2 additions & 1 deletion gateway/src/apicast/policy_chain.lua
@@ -60,7 +60,8 @@ end
local DEFAULT_POLICIES = {
'apicast.policy.load_configuration',
'apicast.policy.find_service',
'apicast.policy.local_chain'
'apicast.policy.local_chain',
'apicast.policy.nginx_metrics'
}

--- Return new policy chain with default policies.
14 changes: 14 additions & 0 deletions t/fixtures/configs/without_nginx_metrics.lua
@@ -0,0 +1,14 @@
local PolicyChain = require('apicast.policy_chain')

local policies = {
'apicast.policy.load_configuration',
'apicast.policy.find_service',
'apicast.policy.local_chain'
}

local policy_chain = PolicyChain.build(policies)

return {
policy_chain = policy_chain,
port = { metrics = 9421 },
}
121 changes: 121 additions & 0 deletions t/prometheus-metrics.t
@@ -0,0 +1,121 @@
use lib 't';
use Test::APIcast::Blackbox 'no_plan';

# The output varies between requests, so run only once
repeat_each(1);

run_tests();

__DATA__
=== TEST 1: metrics endpoint works
--- configuration
{
}
--- request
GET /metrics
--- more_headers
Host: metrics
--- response_body
# HELP nginx_http_connections Number of HTTP connections
# TYPE nginx_http_connections gauge
nginx_http_connections{state="accepted"} 1
nginx_http_connections{state="active"} 1
nginx_http_connections{state="handled"} 1
nginx_http_connections{state="reading"} 0
nginx_http_connections{state="total"} 1
nginx_http_connections{state="waiting"} 0
nginx_http_connections{state="writing"} 1
# HELP nginx_metric_errors_total Number of nginx-lua-prometheus errors
# TYPE nginx_metric_errors_total counter
nginx_metric_errors_total 0
# HELP openresty_shdict_capacity OpenResty shared dictionary capacity
# TYPE openresty_shdict_capacity gauge
openresty_shdict_capacity{dict="api_keys"} 10485760
openresty_shdict_capacity{dict="batched_reports"} 1048576
openresty_shdict_capacity{dict="batched_reports_locks"} 1048576
openresty_shdict_capacity{dict="cached_auths"} 1048576
openresty_shdict_capacity{dict="configuration"} 10485760
openresty_shdict_capacity{dict="init"} 16384
openresty_shdict_capacity{dict="limiter"} 1048576
openresty_shdict_capacity{dict="locks"} 1048576
openresty_shdict_capacity{dict="prometheus_metrics"} 16777216
# HELP openresty_shdict_free_space OpenResty shared dictionary free space
# TYPE openresty_shdict_free_space gauge
openresty_shdict_free_space{dict="api_keys"} 10412032
openresty_shdict_free_space{dict="batched_reports"} 1032192
openresty_shdict_free_space{dict="batched_reports_locks"} 1032192
openresty_shdict_free_space{dict="cached_auths"} 1032192
openresty_shdict_free_space{dict="configuration"} 10412032
openresty_shdict_free_space{dict="init"} 4096
openresty_shdict_free_space{dict="limiter"} 1032192
openresty_shdict_free_space{dict="locks"} 1032192
openresty_shdict_free_space{dict="prometheus_metrics"} 16662528
--- error_code: 200
--- no_error_log
[error]
=== TEST 2: metrics endpoint shows backend responses when the APIcast policy is in the chain
We do a couple of authorized requests to backend (2xx) and a couple of
unauthorized ones (4xx) and check that those metrics are shown correctly when
calling the prometheus metrics endpoint.
To simplify the output of the metrics endpoint, we use an environment config
that does not include the nginx metrics (tested in the previous test).
--- environment_file: t/fixtures/configs/without_nginx_metrics.lua
--- configuration
{
"services": [
{
"id": 42,
"backend_version": 1,
"backend_authentication_type": "service_token",
"backend_authentication_value": "token-value",
"proxy": {
"api_backend": "http://test:$TEST_NGINX_SERVER_PORT/",
"proxy_rules": [
{ "pattern": "/", "http_method": "GET", "metric_system_name": "hits", "delta": 1 }
],
"policy_chain": [
{ "name": "apicast.policy.apicast" }
]
}
}
]
}
--- upstream
location / {
content_by_lua_block {
ngx.say('yay, api backend');
}
}
--- backend
location /transactions/authrep.xml {
content_by_lua_block {
-- Check only the user key and assume the rest of params are OK
if ngx.req.get_uri_args(0)['user_key'] == 'invalid' then
ngx.exit(403)
else
ngx.exit(200)
end
}
}
--- request eval
["GET /?user_key=valid", "GET /?user_key=valid", "GET /?user_key=invalid", "GET /?user_key=invalid", "GET /metrics"]
--- more_headers eval
["", "", "", "", "Host: metrics"]
--- error_code eval
[ 200, 200, 403, 403, 200 ]
--- response_body eval
[ "yay, api backend\x{0a}", "yay, api backend\x{0a}", "Authentication failed", "Authentication failed",
<<'METRICS_OUTPUT'
# HELP backend_response Response status codes from 3scale's backend
# TYPE backend_response counter
backend_response{status="2xx"} 2
backend_response{status="4xx"} 2
# HELP nginx_metric_errors_total Number of nginx-lua-prometheus errors
# TYPE nginx_metric_errors_total counter
nginx_metric_errors_total 0
METRICS_OUTPUT
]
--- no_error_log
[error]
