diff --git a/docs/setup/settings.asciidoc b/docs/setup/settings.asciidoc index 04c2de1f58cc1..193cb3f1f8135 100644 --- a/docs/setup/settings.asciidoc +++ b/docs/setup/settings.asciidoc @@ -69,6 +69,8 @@ information and all requests. The minimum value is 100. `status.allowAnonymous`:: *Default: false* If authentication is enabled, setting this to `true` allows unauthenticated users to access the Kibana server status API and status page. +`cpu.cgroup.path.override`:: Override for cgroup cpu path when mounted in a manner that is inconsistent with `/proc/self/cgroup`. +`cpuacct.cgroup.path.override`:: Override for cgroup cpuacct path when mounted in a manner that is inconsistent with `/proc/self/cgroup`. `console.enabled`:: *Default: true* Set to false to disable Console. Toggling this will cause the server to regenerate assets on the next startup, which may cause a delay before pages start being served. `elasticsearch.tribe.url:`:: Optional URL of the Elasticsearch tribe instance to use for all your diff --git a/package.json b/package.json index 31dfc510ba34e..40bf1c137564c 100644 --- a/package.json +++ b/package.json @@ -249,6 +249,7 @@ "makelogs": "3.2.3", "marked-text-renderer": "0.1.0", "mocha": "2.5.3", + "mock-fs": "4.0.0", "murmurhash3js": "3.0.1", "ncp": "2.0.0", "nock": "8.0.0", diff --git a/src/server/config/schema.js b/src/server/config/schema.js index a0340c141c644..e10db656dfe43 100644 --- a/src/server/config/schema.js +++ b/src/server/config/schema.js @@ -27,6 +27,21 @@ module.exports = () => Joi.object({ exclusive: Joi.boolean().default(false) }).default(), + cpu: Joi.object({ + cgroup: Joi.object({ + path: Joi.object({ + override: Joi.string().default() + }) + }) + }), + + cpuacct: Joi.object({ + cgroup: Joi.object({ + path: Joi.object({ + override: Joi.string().default() + }) + }) + }), server: Joi.object({ uuid: Joi.string().guid().default(), diff --git a/src/server/status/__tests__/cgroup.js b/src/server/status/__tests__/cgroup.js new file mode 100644 
index 0000000000000..8818e9b553b53
--- /dev/null
+++ b/src/server/status/__tests__/cgroup.js
@@ -0,0 +1,169 @@
+import expect from 'expect.js';
+import mockFs from 'mock-fs';
+import { cGroups as cGroupsFsStub } from './fs_stubs';
+import { getAllStats, readControlGroups, readCPUStat } from '../cgroup';
+
+describe('Control Group', function () {
+  const fsStub = cGroupsFsStub();
+
+  afterEach(() => {
+    mockFs.restore();
+  });
+
+  describe('readControlGroups', () => {
+    it('parses the file', async () => {
+      mockFs({ '/proc/self/cgroup': fsStub.cGroupContents });
+      const cGroup = await readControlGroups();
+
+      expect(cGroup).to.eql({
+        freezer: '/',
+        net_cls: '/',
+        net_prio: '/',
+        pids: '/',
+        blkio: '/',
+        memory: '/',
+        devices: '/user.slice',
+        hugetlb: '/',
+        perf_event: '/',
+        cpu: `/${fsStub.hierarchy}`,
+        cpuacct: `/${fsStub.hierarchy}`,
+        cpuset: `/${fsStub.hierarchy}`,
+        'name=systemd': '/user.slice/user-1000.slice/session-2359.scope'
+      });
+    });
+  });
+
+  describe('readCPUStat', () => {
+    it('parses the file', async () => {
+      mockFs({ '/sys/fs/cgroup/cpu/fakeGroup/cpu.stat': fsStub.cpuStatContents });
+      const cpuStat = await readCPUStat('fakeGroup');
+
+      expect(cpuStat).to.eql({
+        number_of_elapsed_periods: 0,
+        number_of_times_throttled: 10,
+        time_throttled_nanos: 20
+      });
+    });
+
+    it('returns default stats for missing file', async () => {
+      mockFs();
+      const cpuStat = await readCPUStat('fakeGroup');
+
+      expect(cpuStat).to.eql({
+        number_of_elapsed_periods: -1,
+        number_of_times_throttled: -1,
+        time_throttled_nanos: -1
+      });
+    });
+  });
+
+  describe('getAllStats', () => {
+    it('can override the cpu group path', async () => {
+      mockFs({
+        '/proc/self/cgroup': fsStub.cGroupContents,
+        [`${fsStub.cpuAcctDir}/cpuacct.usage`]: '357753491408',
+        '/sys/fs/cgroup/cpu/docker/cpu.cfs_period_us': '100000',
+        '/sys/fs/cgroup/cpu/docker/cpu.cfs_quota_us': '5000',
+        '/sys/fs/cgroup/cpu/docker/cpu.stat': fsStub.cpuStatContents,
+      });
+
+      const stats = await getAllStats({ cpuPath: '/docker' });
+
+      expect(stats).to.eql({
+        cpuacct: {
+          control_group: `/${fsStub.hierarchy}`,
+          usage_nanos: 357753491408,
+        },
+        cpu: {
+          control_group: '/docker',
+          cfs_period_micros: 100000,
+          cfs_quota_micros: 5000,
+          stat: {
+            number_of_elapsed_periods: 0,
+            number_of_times_throttled: 10,
+            time_throttled_nanos: 20
+          }
+        }
+      });
+    });
+
+    it('can override the cpuacct group path', async () => {
+      mockFs({
+        '/proc/self/cgroup': fsStub.cGroupContents,
+        '/sys/fs/cgroup/cpuacct/docker/cpuacct.usage': '357753491408',
+        [`${fsStub.cpuDir}/cpu.cfs_period_us`]: '100000',
+        [`${fsStub.cpuDir}/cpu.cfs_quota_us`]: '5000',
+        [`${fsStub.cpuDir}/cpu.stat`]: fsStub.cpuStatContents,
+      });
+
+      const stats = await getAllStats({ cpuAcctPath: '/docker' });
+
+      expect(stats).to.eql({
+        cpuacct: {
+          control_group: '/docker',
+          usage_nanos: 357753491408,
+        },
+        cpu: {
+          control_group: `/${fsStub.hierarchy}`,
+          cfs_period_micros: 100000,
+          cfs_quota_micros: 5000,
+          stat: {
+            number_of_elapsed_periods: 0,
+            number_of_times_throttled: 10,
+            time_throttled_nanos: 20
+          }
+        }
+      });
+    });
+
+    it('extracts control group stats', async () => {
+      mockFs(fsStub.files);
+      const stats = await getAllStats();
+
+      expect(stats).to.eql({
+        cpuacct: {
+          control_group: `/${fsStub.hierarchy}`,
+          usage_nanos: 357753491408,
+        },
+        cpu: {
+          control_group: `/${fsStub.hierarchy}`,
+          cfs_period_micros: 100000,
+          cfs_quota_micros: 5000,
+          stat: {
+            number_of_elapsed_periods: 0,
+            number_of_times_throttled: 10,
+            time_throttled_nanos: 20
+          }
+        }
+      });
+    });
+
+    it('returns null when all files are missing', async () => {
+      mockFs({});
+      const stats = await getAllStats();
+      expect(stats).to.be.null;
+    });
+
+    it('returns null if CPU accounting files are missing', async () => {
+      mockFs({
+        '/proc/self/cgroup': fsStub.cGroupContents,
+        [`${fsStub.cpuDir}/cpu.stat`]: fsStub.cpuStatContents
+      });
+      const stats = await getAllStats();
+
+      expect(stats).to.be.null;
+    });
+
+    it('returns null if cpuStat file is missing', async () => {
+      mockFs({
+        '/proc/self/cgroup': fsStub.cGroupContents,
+        [`${fsStub.cpuAcctDir}/cpuacct.usage`]: '357753491408',
+        [`${fsStub.cpuDir}/cpu.cfs_period_us`]: '100000',
+        [`${fsStub.cpuDir}/cpu.cfs_quota_us`]: '5000'
+      });
+      const stats = await getAllStats();
+
+      expect(stats).to.be.null;
+    });
+  });
+});
diff --git a/src/server/status/__tests__/fs_stubs.js b/src/server/status/__tests__/fs_stubs.js
new file mode 100644
index 0000000000000..592e6203e66a3
--- /dev/null
+++ b/src/server/status/__tests__/fs_stubs.js
@@ -0,0 +1,42 @@
+export function cGroups(hierarchy) {
+  if (!hierarchy) {
+    hierarchy = Math.random().toString(36).substring(7);
+  }
+
+  const cpuAcctDir = `/sys/fs/cgroup/cpuacct/${hierarchy}`;
+  const cpuDir = `/sys/fs/cgroup/cpu/${hierarchy}`;
+
+  const cGroupContents = [
+    '10:freezer:/',
+    '9:net_cls,net_prio:/',
+    '8:pids:/',
+    '7:blkio:/',
+    '6:memory:/',
+    '5:devices:/user.slice',
+    '4:hugetlb:/',
+    '3:perf_event:/',
+    '2:cpu,cpuacct,cpuset:/' + hierarchy,
+    '1:name=systemd:/user.slice/user-1000.slice/session-2359.scope'
+  ].join('\n');
+
+  const cpuStatContents = [
+    'nr_periods 0',
+    'nr_throttled 10',
+    'throttled_time 20'
+  ].join('\n');
+
+  return {
+    hierarchy,
+    cGroupContents,
+    cpuStatContents,
+    cpuAcctDir,
+    cpuDir,
+    files: {
+      '/proc/self/cgroup': cGroupContents,
+      [`${cpuAcctDir}/cpuacct.usage`]: '357753491408',
+      [`${cpuDir}/cpu.cfs_period_us`]: '100000',
+      [`${cpuDir}/cpu.cfs_quota_us`]: '5000',
+      [`${cpuDir}/cpu.stat`]: cpuStatContents,
+    }
+  };
+}
diff --git a/src/server/status/__tests__/metrics.js b/src/server/status/__tests__/metrics.js
index 67d5167e3cfb6..8d1d3d65a6a35 100644
--- a/src/server/status/__tests__/metrics.js
+++ b/src/server/status/__tests__/metrics.js
@@ -1,5 +1,8 @@
 import _ from 'lodash';
 import expect from 'expect.js';
+import sinon from 'sinon';
+import mockFs from 'mock-fs';
+import { cGroups as cGroupsFsStub } from './fs_stubs';
 
 import { getMetrics } from '../metrics';
 
@@ -20,7 +23,8 @@ describe('Metrics', function () {
     'psdelay': 1.6091690063476562,
     'host': '123'
   };
-  const config = {
+
+  const sampleConfig = {
     ops: {
       interval: 5000
     },
@@ -29,28 +33,104 @@ describe('Metrics', function () {
     }
   };
 
-  let metrics;
-  beforeEach(() => {
-    metrics = getMetrics({
-      event: _.cloneDeep(mockOps),
-      config: {
-        get: path => _.get(config, path)
-      }
+  describe('with cgroups', () => {
+    it('should provide cgroups', async () => {
+      const fsStub = cGroupsFsStub();
+      const event = _.cloneDeep(mockOps);
+      const config = { get: path => _.get(sampleConfig, path) };
+      const kbnServer = { log: sinon.mock() };
+
+      mockFs(fsStub.files);
+      const metrics = await getMetrics(event, config, kbnServer);
+      mockFs.restore();
+
+      expect(_.get(metrics, 'os.cgroup')).to.eql({
+        cpuacct: {
+          control_group: `/${fsStub.hierarchy}`,
+          usage_nanos: 357753491408,
+        },
+        cpu: {
+          control_group: `/${fsStub.hierarchy}`,
+          cfs_period_micros: 100000,
+          cfs_quota_micros: 5000,
+          stat: {
+            number_of_elapsed_periods: 0,
+            number_of_times_throttled: 10,
+            time_throttled_nanos: 20
+          }
+        }
+      });
     });
-  });
 
-  it('should snake case the request object', () => {
-    expect(metrics.requests.status_codes).not.to.be(undefined);
-    expect(metrics.requests.statusCodes).to.be(undefined);
-  });
+    it('can override cgroup path', async () => {
+      const fsStub = cGroupsFsStub('foo');
+      const event = _.cloneDeep(mockOps);
+      const configOverride = Object.assign({}, sampleConfig, {
+        cpu: {
+          cgroup: {
+            path: {
+              override: '/foo'
+            }
+          }
+        },
+
+        cpuacct: {
+          cgroup: {
+            path: {
+              override: '/foo'
+            }
+          }
+        },
+      });
+      const config = { get: path => _.get(configOverride, path) };
+      const kbnServer = { log: sinon.mock() };
+
+      mockFs(fsStub.files);
+      const metrics = await getMetrics(event, config, kbnServer);
+      mockFs.restore();
 
-  it('should provide defined metrics', () => {
-    (function checkMetrics(currentMetric) {
-      _.forOwn(currentMetric, value => {
-        if (typeof value === 'object') return checkMetrics(value);
-        expect(currentMetric).not.to.be(undefined);
+      expect(_.get(metrics, 'os.cgroup')).to.eql({
+        cpuacct: {
+          control_group: `/foo`,
+          usage_nanos: 357753491408,
+        },
+        cpu: {
+          control_group: `/foo`,
+          cfs_period_micros: 100000,
+          cfs_quota_micros: 5000,
+          stat: {
+            number_of_elapsed_periods: 0,
+            number_of_times_throttled: 10,
+            time_throttled_nanos: 20
+          }
+        }
       });
+    });
+  });
+
+  describe('without cgroups', () => {
+    let metrics;
+    beforeEach(async () => {
+      const event = _.cloneDeep(mockOps);
+      const config = { get: path => _.get(sampleConfig, path) };
+      const kbnServer = { log: sinon.mock() };
+
+      metrics = await getMetrics(event, config, kbnServer);
+    });
 
-    }(metrics));
+    it('should snake case the request object', () => {
+      expect(metrics.requests.status_codes).not.to.be(undefined);
+      expect(metrics.requests.statusCodes).to.be(undefined);
+    });
+
+    it('should provide defined metrics', () => {
+      (function checkMetrics(currentMetric) {
+        _.forOwn(currentMetric, value => {
+          if (typeof value === 'object') return checkMetrics(value);
+          expect(currentMetric).not.to.be(undefined);
+        });
+
+      }(metrics));
+    });
   });
 });
diff --git a/src/server/status/cgroup.js b/src/server/status/cgroup.js
new file mode 100644
index 0000000000000..0cba85cc8dfad
--- /dev/null
+++ b/src/server/status/cgroup.js
@@ -0,0 +1,168 @@
+import fs from 'fs';
+import { promisify } from 'bluebird';
+import { join as joinPath } from 'path';
+
+// Logic from elasticsearch/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java
+
+const CONTROL_GROUP_RE = new RegExp('\\d+:([^:]+):(/.*)');
+const CONTROLLER_SEPERATOR_RE = ',';
+
+const PROC_SELF_CGROUP_FILE = '/proc/self/cgroup';
+const PROC_CGROUP_CPU_DIR = '/sys/fs/cgroup/cpu';
+const PROC_CGROUP_CPUACCT_DIR = '/sys/fs/cgroup/cpuacct';
+
+const GROUP_CPUACCT = 'cpuacct';
+const CPUACCT_USAGE_FILE = 'cpuacct.usage';
+
+const GROUP_CPU = 'cpu';
+const CPU_FS_PERIOD_US_FILE = 'cpu.cfs_period_us';
+const CPU_FS_QUOTA_US_FILE = 'cpu.cfs_quota_us';
+const CPU_STATS_FILE = 'cpu.stat';
+
+const readFile = promisify(fs.readFile);
+
+/**
+ * Parses /proc/self/cgroup into a map of controller name to control group path.
+ * A line listing several comma-separated controllers yields one entry per controller.
+ *
+ * @return {Promise<Object>} resolves with e.g. `{ cpu: '/docker', cpuacct: '/docker' }`;
+ *   rejects with the read error when the file cannot be read
+ */
+export function readControlGroups() {
+  return readFile(PROC_SELF_CGROUP_FILE)
+    .then(data => {
+      const response = {};
+
+      data.toString().split(/\n/).forEach(line => {
+        const matches = line.match(CONTROL_GROUP_RE);
+
+        if (matches === null) {
+          return;
+        }
+
+        const controllers = matches[1].split(CONTROLLER_SEPERATOR_RE);
+        controllers.forEach(controller => {
+          response[controller] = matches[2];
+        });
+      });
+
+      return response;
+    });
+}
+
+function fileContentsToInteger(path) {
+  return readFile(path).then(data => {
+    return parseInt(data.toString(), 10);
+  });
+}
+
+function readCPUAcctUsage(controlGroup) {
+  return fileContentsToInteger(joinPath(PROC_CGROUP_CPUACCT_DIR, controlGroup, CPUACCT_USAGE_FILE));
+}
+
+function readCPUFsPeriod(controlGroup) {
+  return fileContentsToInteger(joinPath(PROC_CGROUP_CPU_DIR, controlGroup, CPU_FS_PERIOD_US_FILE));
+}
+
+function readCPUFsQuota(controlGroup) {
+  return fileContentsToInteger(joinPath(PROC_CGROUP_CPU_DIR, controlGroup, CPU_FS_QUOTA_US_FILE));
+}
+
+/**
+ * Reads the throttling counters from the cpu.stat file of a cpu control group.
+ *
+ * @param {String} controlGroup - control group path, relative to /sys/fs/cgroup/cpu
+ * @return {Promise<Object>} resolves with the parsed counters; a counter keeps its
+ *   default of -1 when its field is absent, or when the file does not exist
+ */
+export function readCPUStat(controlGroup) {
+  return new Promise((resolve, reject) => {
+    const stat = {
+      number_of_elapsed_periods: -1,
+      number_of_times_throttled: -1,
+      time_throttled_nanos: -1
+    };
+
+    readFile(joinPath(PROC_CGROUP_CPU_DIR, controlGroup, CPU_STATS_FILE)).then(data => {
+      data.toString().split(/\n/).forEach(line => {
+        const fields = line.split(/\s+/);
+
+        switch(fields[0]) {
+          case 'nr_periods':
+            stat.number_of_elapsed_periods = parseInt(fields[1], 10);
+            break;
+
+          case 'nr_throttled':
+            stat.number_of_times_throttled = parseInt(fields[1], 10);
+            break;
+
+          case 'throttled_time':
+            stat.time_throttled_nanos = parseInt(fields[1], 10);
+            break;
+        }
+      });
+
+      resolve(stat);
+    }).catch(err => {
+      if (err.code === 'ENOENT') {
+        return resolve(stat);
+      }
+
+      reject(err);
+    });
+  });
+}
+
+/**
+ * Collects cpu and cpuacct control group statistics for the current process.
+ *
+ * @param {Object} [options]
+ * @param {String} [options.cpuPath] - overrides the cpu control group path from /proc/self/cgroup
+ * @param {String} [options.cpuAcctPath] - overrides the cpuacct control group path from /proc/self/cgroup
+ * @return {Promise<Object|null>} resolves with the stats, or with null when control groups
+ *   are unavailable (a required file is missing or no cpu/cpuacct group is known);
+ *   rejects on any other error
+ */
+export function getAllStats(options = {}) {
+  return new Promise((resolve, reject) => {
+    readControlGroups().then(groups => {
+      const cpuPath = options.cpuPath || groups[GROUP_CPU];
+      const cpuAcctPath = options.cpuAcctPath || groups[GROUP_CPUACCT];
+
+      // Without both paths there is nothing to read (e.g. the controllers are not
+      // listed in /proc/self/cgroup); joining an undefined path segment would throw.
+      if (!cpuPath || !cpuAcctPath) {
+        return resolve(null);
+      }
+
+      return Promise.all([
+        readCPUAcctUsage(cpuAcctPath),
+        readCPUFsPeriod(cpuPath),
+        readCPUFsQuota(cpuPath),
+        readCPUStat(cpuPath)
+      ]).then(([ cpuAcctUsage, cpuFsPeriod, cpuFsQuota, cpuStat ]) => {
+        resolve({
+          cpuacct: {
+            control_group: cpuAcctPath,
+            usage_nanos: cpuAcctUsage
+          },
+
+          cpu: {
+            control_group: cpuPath,
+            cfs_period_micros: cpuFsPeriod,
+            cfs_quota_micros: cpuFsQuota,
+            stat: cpuStat
+          }
+        });
+      }).catch(rejectUnlessFileNotFound);
+    }).catch(rejectUnlessFileNotFound);
+
+    function rejectUnlessFileNotFound(err) {
+      if (err.code === 'ENOENT') {
+        return resolve(null);
+      }
+
+      reject(err);
+    }
+  });
+}
diff --git a/src/server/status/metrics.js b/src/server/status/metrics.js
index 790fe7ac5f2d4..8c12e666e9282 100644
--- a/src/server/status/metrics.js
+++ b/src/server/status/metrics.js
@@ -1,39 +1,86 @@
-import _ from 'lodash';
+import { get, set, isObject } from 'lodash';
 import { keysToSnakeCaseShallow } from '../../utils/case_conversion';
+import { getAllStats as cGroupStats } from './cgroup';
+
+// Cleared once a cgroup lookup yields no stats, so later collections skip it
+let cGroupStatsAvailable = true;
 
+/**
+ * Refreshes kbnServer.metrics whenever the even-better monitor emits an 'ops' event.
+ */
 export function collectMetrics(kbnServer, server, config) {
-  server.plugins['even-better'].monitor.on('ops', function (event) {
-    kbnServer.metrics = getMetrics({ event, config });
+  server.plugins['even-better'].monitor.on('ops', event => {
+    getMetrics(event, config, server)
+      .then(data => { kbnServer.metrics = data; })
+      // keep a failed collection from surfacing as an unhandled rejection
+      .catch(err => { server.log(['error', 'metrics'], err); });
   });
 }
 
-export function getMetrics({ event, config }) {
+/**
+ * Builds the metrics payload for a single ops event.
+ *
+ * @param {Object} event - ops event to read memory, load, response time and request data from
+ * @param {Object} config - provides `get(key)` for server.port, ops.interval and the cgroup overrides
+ * @param {Object} server - provides `log(tags, data)`, used to report cgroup collection errors
+ * @return {Promise<Object>} resolves with the metrics; `os.cgroup` is set only when cgroup stats were read
+ */
+export async function getMetrics(event, config, server) {
   const port = config.get('server.port');
   const timestamp = new Date().toISOString();
-  return {
+  const cgroup = await cGroupStatsIfAvailable();
+
+  const metrics = {
     last_updated: timestamp,
     collection_interval_in_millis: config.get('ops.interval'),
     uptime_in_millis: process.uptime() * 1000,
     process: {
       mem: {
-        heap_max_in_bytes: _.get(event, 'psmem.heapTotal'),
-        heap_used_in_bytes: _.get(event, 'psmem.heapUsed')
+        heap_max_in_bytes: get(event, 'psmem.heapTotal'),
+        heap_used_in_bytes: get(event, 'psmem.heapUsed')
       }
     },
     os: {
       cpu: {
         load_average: {
-          '1m': _.get(event, 'osload.0'),
-          '5m': _.get(event, 'osload.1'),
-          '15m': _.get(event, 'osload.1')
+          '1m': get(event, 'osload.0'),
+          '5m': get(event, 'osload.1'),
+          '15m': get(event, 'osload.2')
         }
       }
     },
     response_times: {
-      avg_in_millis: _.get(event, ['responseTimes', port, 'avg']),
-      max_in_millis: _.get(event, ['responseTimes', port, 'max'])
+      avg_in_millis: get(event, ['responseTimes', port, 'avg']),
+      max_in_millis: get(event, ['responseTimes', port, 'max'])
     },
-    requests: keysToSnakeCaseShallow(_.get(event, ['requests', port])),
-    concurrent_connections: _.get(event, ['concurrents', port])
+    requests: keysToSnakeCaseShallow(get(event, ['requests', port])),
+    concurrent_connections: get(event, ['concurrents', port])
   };
+
+  async function cGroupStatsIfAvailable() {
+    if (!cGroupStatsAvailable) {
+      return;
+    }
+
+    try {
+      const cgroup = await cGroupStats({
+        cpuPath: config.get('cpu.cgroup.path.override'),
+        cpuAcctPath: config.get('cpuacct.cgroup.path.override')
+      });
+
+      if (isObject(cgroup)) {
+        return cgroup;
+      }
+
+      cGroupStatsAvailable = false;
+    } catch (e) {
+      server.log(['error', 'metrics', 'cgroup'], e);
+    }
+  }
+
+  if (isObject(cgroup)) {
+    set(metrics, 'os.cgroup', cgroup);
+  }
+
+  return metrics;
 }