From ecf33135b34b158454fe090a62e0b56077a9997c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20C=C3=B4t=C3=A9?= Date: Fri, 21 Aug 2020 11:42:39 -0400 Subject: [PATCH 01/42] Make task manager maxWorkers and pollInterval observables (#75293) * WIP step 1 * WIP step 2 * Cleanup * Make maxWorkers an observable for the task pool * Cleanup * Fix test failures * Use BehaviorSubject * Add some tests --- .../server/polling/task_poller.test.ts | 72 ++++++++++++++++--- .../server/polling/task_poller.ts | 15 ++-- .../task_manager/server/task_manager.ts | 9 ++- .../task_manager/server/task_pool.test.ts | 33 ++++++--- .../plugins/task_manager/server/task_pool.ts | 9 ++- 5 files changed, 105 insertions(+), 33 deletions(-) diff --git a/x-pack/plugins/task_manager/server/polling/task_poller.test.ts b/x-pack/plugins/task_manager/server/polling/task_poller.test.ts index 607e2ac2b80fa..1c6aff2ad58b9 100644 --- a/x-pack/plugins/task_manager/server/polling/task_poller.test.ts +++ b/x-pack/plugins/task_manager/server/polling/task_poller.test.ts @@ -5,7 +5,7 @@ */ import _ from 'lodash'; -import { Subject } from 'rxjs'; +import { Subject, of, BehaviorSubject } from 'rxjs'; import { Option, none, some } from 'fp-ts/lib/Option'; import { createTaskPoller, PollingError, PollingErrorType } from './task_poller'; import { fakeSchedulers } from 'rxjs-marbles/jest'; @@ -24,10 +24,11 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, getCapacity: () => 1, work, + workTimeout: pollInterval * 5, pollRequests$: new Subject>(), }).subscribe(() => {}); @@ -40,9 +41,51 @@ describe('TaskPoller', () => { await sleep(0); expect(work).toHaveBeenCalledTimes(1); + await sleep(0); + await sleep(0); + advance(pollInterval + 10); + await sleep(0); + expect(work).toHaveBeenCalledTimes(2); + }) + ); + + test( + 'poller adapts to pollInterval changes', + fakeSchedulers(async (advance) => { + const 
pollInterval = 100; + const pollInterval$ = new BehaviorSubject(pollInterval); + const bufferCapacity = 5; + + const work = jest.fn(async () => true); + createTaskPoller({ + pollInterval$, + bufferCapacity, + getCapacity: () => 1, + work, + workTimeout: pollInterval * 5, + pollRequests$: new Subject>(), + }).subscribe(() => {}); + + // `work` is async, we have to force a node `tick` await sleep(0); advance(pollInterval); + expect(work).toHaveBeenCalledTimes(1); + + pollInterval$.next(pollInterval * 2); + + // `work` is async, we have to force a node `tick` + await sleep(0); + advance(pollInterval); + expect(work).toHaveBeenCalledTimes(1); + advance(pollInterval); expect(work).toHaveBeenCalledTimes(2); + + pollInterval$.next(pollInterval / 2); + + // `work` is async, we have to force a node `tick` + await sleep(0); + advance(pollInterval / 2); + expect(work).toHaveBeenCalledTimes(3); }) ); @@ -56,9 +99,10 @@ describe('TaskPoller', () => { let hasCapacity = true; createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work, + workTimeout: pollInterval * 5, getCapacity: () => (hasCapacity ? 1 : 0), pollRequests$: new Subject>(), }).subscribe(() => {}); @@ -113,9 +157,10 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work, + workTimeout: pollInterval * 5, getCapacity: () => 1, pollRequests$, }).subscribe(jest.fn()); @@ -157,9 +202,10 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work, + workTimeout: pollInterval * 5, getCapacity: () => (hasCapacity ? 
1 : 0), pollRequests$, }).subscribe(() => {}); @@ -200,9 +246,10 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work, + workTimeout: pollInterval * 5, getCapacity: () => 1, pollRequests$, }).subscribe(() => {}); @@ -235,7 +282,7 @@ describe('TaskPoller', () => { const handler = jest.fn(); const pollRequests$ = new Subject>(); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work: async (...args) => { await worker; @@ -285,7 +332,7 @@ describe('TaskPoller', () => { type ResolvableTupple = [string, PromiseLike & Resolvable]; const pollRequests$ = new Subject>(); createTaskPoller<[string, Resolvable], string[]>({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work: async (...resolvables) => { await Promise.all(resolvables.map(([, future]) => future)); @@ -344,11 +391,12 @@ describe('TaskPoller', () => { const handler = jest.fn(); const pollRequests$ = new Subject>(); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work: async (...args) => { throw new Error('failed to work'); }, + workTimeout: pollInterval * 5, getCapacity: () => 5, pollRequests$, }).subscribe(handler); @@ -383,9 +431,10 @@ describe('TaskPoller', () => { return callCount; }); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work, + workTimeout: pollInterval * 5, getCapacity: () => 5, pollRequests$, }).subscribe(handler); @@ -424,9 +473,10 @@ describe('TaskPoller', () => { const work = jest.fn(async () => {}); const pollRequests$ = new Subject>(); createTaskPoller({ - pollInterval, + pollInterval$: of(pollInterval), bufferCapacity, work, + workTimeout: pollInterval * 5, getCapacity: () => 5, pollRequests$, }).subscribe(handler); diff --git a/x-pack/plugins/task_manager/server/polling/task_poller.ts 
b/x-pack/plugins/task_manager/server/polling/task_poller.ts index a1435ffafe8f8..867d01691c41d 100644 --- a/x-pack/plugins/task_manager/server/polling/task_poller.ts +++ b/x-pack/plugins/task_manager/server/polling/task_poller.ts @@ -11,7 +11,7 @@ import { performance } from 'perf_hooks'; import { after } from 'lodash'; import { Subject, merge, interval, of, Observable } from 'rxjs'; -import { mapTo, filter, scan, concatMap, tap, catchError } from 'rxjs/operators'; +import { mapTo, filter, scan, concatMap, tap, catchError, switchMap } from 'rxjs/operators'; import { pipe } from 'fp-ts/lib/pipeable'; import { Option, none, map as mapOptional, getOrElse } from 'fp-ts/lib/Option'; @@ -30,12 +30,12 @@ import { timeoutPromiseAfter } from './timeout_promise_after'; type WorkFn = (...params: T[]) => Promise; interface Opts { - pollInterval: number; + pollInterval$: Observable; bufferCapacity: number; getCapacity: () => number; pollRequests$: Observable>; work: WorkFn; - workTimeout?: number; + workTimeout: number; } /** @@ -52,7 +52,7 @@ interface Opts { * of unique request argumets of type T. The queue holds all the buffered request arguments streamed in via pollRequests$ */ export function createTaskPoller({ - pollInterval, + pollInterval$, getCapacity, pollRequests$, bufferCapacity, @@ -67,7 +67,10 @@ export function createTaskPoller({ // emit a polling event on demand pollRequests$, // emit a polling event on a fixed interval - interval(pollInterval).pipe(mapTo(none)) + pollInterval$.pipe( + switchMap((period) => interval(period)), + mapTo(none) + ) ).pipe( // buffer all requests in a single set (to remove duplicates) as we don't want // work to take place in parallel (it could cause Task Manager to pull in the same @@ -95,7 +98,7 @@ export function createTaskPoller({ await promiseResult( timeoutPromiseAfter( work(...pullFromSet(set, getCapacity())), - workTimeout ?? 
pollInterval, + workTimeout, () => new Error(`work has timed out`) ) ), diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index fb2d5e07030a4..6a39f2a762e75 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -import { Subject, Observable, Subscription } from 'rxjs'; +import { BehaviorSubject, Subject, Observable, Subscription } from 'rxjs'; import { filter } from 'rxjs/operators'; import { performance } from 'perf_hooks'; @@ -149,6 +149,9 @@ export class TaskManager { // pipe store events into the TaskManager's event stream this.store.events.subscribe((event) => this.events$.next(event)); + const maxWorkers$ = new BehaviorSubject(opts.config.max_workers); + const pollInterval$ = new BehaviorSubject(opts.config.poll_interval); + this.bufferedStore = new BufferedTaskStore(this.store, { bufferMaxOperations: opts.config.max_workers, logger: this.logger, @@ -156,7 +159,7 @@ export class TaskManager { this.pool = new TaskPool({ logger: this.logger, - maxWorkers: opts.config.max_workers, + maxWorkers$, }); const { @@ -166,7 +169,7 @@ export class TaskManager { this.poller$ = createObservableMonitor>, Error>( () => createTaskPoller({ - pollInterval, + pollInterval$, bufferCapacity: opts.config.request_capacity, getCapacity: () => this.pool.availableWorkers, pollRequests$: this.claimRequests$, diff --git a/x-pack/plugins/task_manager/server/task_pool.test.ts b/x-pack/plugins/task_manager/server/task_pool.test.ts index 8b2bce455589e..ec6613ece4eed 100644 --- a/x-pack/plugins/task_manager/server/task_pool.test.ts +++ b/x-pack/plugins/task_manager/server/task_pool.test.ts @@ -5,6 +5,7 @@ */ import sinon from 'sinon'; +import { of, Subject } from 'rxjs'; import { TaskPool, 
TaskPoolRunResult } from './task_pool'; import { mockLogger, resolvable, sleep } from './test_utils'; import { asOk } from './lib/result_type'; @@ -14,7 +15,7 @@ import moment from 'moment'; describe('TaskPool', () => { test('occupiedWorkers are a sum of running tasks', async () => { const pool = new TaskPool({ - maxWorkers: 200, + maxWorkers$: of(200), logger: mockLogger(), }); @@ -26,7 +27,7 @@ describe('TaskPool', () => { test('availableWorkers are a function of total_capacity - occupiedWorkers', async () => { const pool = new TaskPool({ - maxWorkers: 10, + maxWorkers$: of(10), logger: mockLogger(), }); @@ -36,9 +37,21 @@ describe('TaskPool', () => { expect(pool.availableWorkers).toEqual(7); }); + test('availableWorkers is 0 until maxWorkers$ pushes a value', async () => { + const maxWorkers$ = new Subject(); + const pool = new TaskPool({ + maxWorkers$, + logger: mockLogger(), + }); + + expect(pool.availableWorkers).toEqual(0); + maxWorkers$.next(10); + expect(pool.availableWorkers).toEqual(10); + }); + test('does not run tasks that are beyond its available capacity', async () => { const pool = new TaskPool({ - maxWorkers: 2, + maxWorkers$: of(2), logger: mockLogger(), }); @@ -60,7 +73,7 @@ describe('TaskPool', () => { test('should log when marking a Task as running fails', async () => { const logger = mockLogger(); const pool = new TaskPool({ - maxWorkers: 2, + maxWorkers$: of(2), logger, }); @@ -83,7 +96,7 @@ describe('TaskPool', () => { test('should log when running a Task fails', async () => { const logger = mockLogger(); const pool = new TaskPool({ - maxWorkers: 3, + maxWorkers$: of(3), logger, }); @@ -106,7 +119,7 @@ describe('TaskPool', () => { test('should not log when running a Task fails due to the Task SO having been deleted while in flight', async () => { const logger = mockLogger(); const pool = new TaskPool({ - maxWorkers: 3, + maxWorkers$: of(3), logger, }); @@ -130,7 +143,7 @@ describe('TaskPool', () => { test('Running a task which fails still 
takes up capacity', async () => { const logger = mockLogger(); const pool = new TaskPool({ - maxWorkers: 1, + maxWorkers$: of(1), logger, }); @@ -147,7 +160,7 @@ describe('TaskPool', () => { test('clears up capacity when a task completes', async () => { const pool = new TaskPool({ - maxWorkers: 1, + maxWorkers$: of(1), logger: mockLogger(), }); @@ -193,7 +206,7 @@ describe('TaskPool', () => { test('run cancels expired tasks prior to running new tasks', async () => { const logger = mockLogger(); const pool = new TaskPool({ - maxWorkers: 2, + maxWorkers$: of(2), logger, }); @@ -251,7 +264,7 @@ describe('TaskPool', () => { const logger = mockLogger(); const pool = new TaskPool({ logger, - maxWorkers: 20, + maxWorkers$: of(20), }); const cancelled = resolvable(); diff --git a/x-pack/plugins/task_manager/server/task_pool.ts b/x-pack/plugins/task_manager/server/task_pool.ts index 92374908c60f7..c029349c13b77 100644 --- a/x-pack/plugins/task_manager/server/task_pool.ts +++ b/x-pack/plugins/task_manager/server/task_pool.ts @@ -8,6 +8,7 @@ * This module contains the logic that ensures we don't run too many * tasks at once in a given Kibana instance. */ +import { Observable } from 'rxjs'; import moment, { Duration } from 'moment'; import { performance } from 'perf_hooks'; import { padStart } from 'lodash'; @@ -16,7 +17,7 @@ import { TaskRunner } from './task_runner'; import { isTaskSavedObjectNotFoundError } from './lib/is_task_not_found_error'; interface Opts { - maxWorkers: number; + maxWorkers$: Observable; logger: Logger; } @@ -31,7 +32,7 @@ const VERSION_CONFLICT_MESSAGE = 'Task has been claimed by another Kibana servic * Runs tasks in batches, taking costs into account. */ export class TaskPool { - private maxWorkers: number; + private maxWorkers: number = 0; private running = new Set(); private logger: Logger; @@ -44,8 +45,10 @@ export class TaskPool { * @prop {Logger} logger - The task manager logger. 
*/ constructor(opts: Opts) { - this.maxWorkers = opts.maxWorkers; this.logger = opts.logger; + opts.maxWorkers$.subscribe((maxWorkers) => { + this.maxWorkers = maxWorkers; + }); } /** From d721fea15ca7d597ca97f0a15ff6128b8c541e9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20C=C3=B4t=C3=A9?= Date: Wed, 9 Sep 2020 13:38:40 -0400 Subject: [PATCH 02/42] Make the task manager store emit error events (#75679) * Add errors$ observable to the task store * Add unit tests --- .../task_manager/server/task_store.test.ts | 229 ++++++++++++++---- .../plugins/task_manager/server/task_store.ts | 119 ++++++--- 2 files changed, 269 insertions(+), 79 deletions(-) diff --git a/x-pack/plugins/task_manager/server/task_store.test.ts b/x-pack/plugins/task_manager/server/task_store.test.ts index a02123c4a3f8d..78be118507954 100644 --- a/x-pack/plugins/task_manager/server/task_store.test.ts +++ b/x-pack/plugins/task_manager/server/task_store.test.ts @@ -7,7 +7,7 @@ import _ from 'lodash'; import sinon from 'sinon'; import uuid from 'uuid'; -import { filter } from 'rxjs/operators'; +import { filter, first } from 'rxjs/operators'; import { Option, some, none } from 'fp-ts/lib/Option'; import { @@ -66,8 +66,21 @@ const mockedDate = new Date('2019-02-12T21:01:22.479Z'); describe('TaskStore', () => { describe('schedule', () => { + let store: TaskStore; + + beforeAll(() => { + store = new TaskStore({ + index: 'tasky', + taskManagerId: '', + serializer, + callCluster: jest.fn(), + maxAttempts: 2, + definitions: taskDefinitions, + savedObjectsRepository: savedObjectsClient, + }); + }); + async function testSchedule(task: unknown) { - const callCluster = jest.fn(); savedObjectsClient.create.mockImplementation(async (type: string, attributes: unknown) => ({ id: 'testid', type, @@ -75,15 +88,6 @@ describe('TaskStore', () => { references: [], version: '123', })); - const store = new TaskStore({ - index: 'tasky', - taskManagerId: '', - serializer, - callCluster, - maxAttempts: 2, - definitions: 
taskDefinitions, - savedObjectsRepository: savedObjectsClient, - }); const result = await store.schedule(task as TaskInstance); expect(savedObjectsClient.create).toHaveBeenCalledTimes(1); @@ -176,12 +180,28 @@ describe('TaskStore', () => { /Unsupported task type "nope"/i ); }); + + test('pushes error from saved objects client to errors$', async () => { + const task: TaskInstance = { + id: 'id', + params: { hello: 'world' }, + state: { foo: 'bar' }, + taskType: 'report', + }; + + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + savedObjectsClient.create.mockRejectedValue(new Error('Failure')); + await expect(store.schedule(task)).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); }); describe('fetch', () => { - async function testFetch(opts?: SearchOpts, hits: unknown[] = []) { - const callCluster = sinon.spy(async (name: string, params?: unknown) => ({ hits: { hits } })); - const store = new TaskStore({ + let store: TaskStore; + const callCluster = jest.fn(); + + beforeAll(() => { + store = new TaskStore({ index: 'tasky', taskManagerId: '', serializer, @@ -190,15 +210,19 @@ describe('TaskStore', () => { definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); + }); + + async function testFetch(opts?: SearchOpts, hits: unknown[] = []) { + callCluster.mockResolvedValue({ hits: { hits } }); const result = await store.fetch(opts); - sinon.assert.calledOnce(callCluster); - sinon.assert.calledWith(callCluster, 'search'); + expect(callCluster).toHaveBeenCalledTimes(1); + expect(callCluster).toHaveBeenCalledWith('search', expect.anything()); return { result, - args: callCluster.args[0][1], + args: callCluster.mock.calls[0][1], }; } @@ -230,6 +254,13 @@ describe('TaskStore', () => { }, }); }); + + test('pushes error from call cluster to errors$', async () => { + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + 
callCluster.mockRejectedValue(new Error('Failure')); + await expect(store.fetch()).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); }); describe('claimAvailableTasks', () => { @@ -831,9 +862,46 @@ if (doc['task.runAt'].size()!=0) { }, ]); }); + + test('pushes error from saved objects client to errors$', async () => { + const callCluster = jest.fn(); + const store = new TaskStore({ + index: 'tasky', + taskManagerId: '', + serializer, + callCluster, + definitions: taskDefinitions, + maxAttempts: 2, + savedObjectsRepository: savedObjectsClient, + }); + + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + callCluster.mockRejectedValue(new Error('Failure')); + await expect( + store.claimAvailableTasks({ + claimOwnershipUntil: new Date(), + size: 10, + }) + ).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); }); describe('update', () => { + let store: TaskStore; + + beforeAll(() => { + store = new TaskStore({ + index: 'tasky', + taskManagerId: '', + serializer, + callCluster: jest.fn(), + maxAttempts: 2, + definitions: taskDefinitions, + savedObjectsRepository: savedObjectsClient, + }); + }); + test('refreshes the index, handles versioning', async () => { const task = { runAt: mockedDate, @@ -862,16 +930,6 @@ if (doc['task.runAt'].size()!=0) { } ); - const store = new TaskStore({ - index: 'tasky', - taskManagerId: '', - serializer, - callCluster: jest.fn(), - maxAttempts: 2, - definitions: taskDefinitions, - savedObjectsRepository: savedObjectsClient, - }); - const result = await store.update(task); expect(savedObjectsClient.update).toHaveBeenCalledWith( @@ -905,28 +963,116 @@ if (doc['task.runAt'].size()!=0) { version: '123', }); }); + + test('pushes error from saved objects client to errors$', async () => { + const task = { + runAt: mockedDate, + scheduledAt: mockedDate, + 
startedAt: null, + retryAt: null, + id: 'task:324242', + params: { hello: 'world' }, + state: { foo: 'bar' }, + taskType: 'report', + attempts: 3, + status: 'idle' as TaskStatus, + version: '123', + ownerId: null, + }; + + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + savedObjectsClient.update.mockRejectedValue(new Error('Failure')); + await expect(store.update(task)).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); + }); + + describe('bulkUpdate', () => { + let store: TaskStore; + + beforeAll(() => { + store = new TaskStore({ + index: 'tasky', + taskManagerId: '', + serializer, + callCluster: jest.fn(), + maxAttempts: 2, + definitions: taskDefinitions, + savedObjectsRepository: savedObjectsClient, + }); + }); + + test('pushes error from saved objects client to errors$', async () => { + const task = { + runAt: mockedDate, + scheduledAt: mockedDate, + startedAt: null, + retryAt: null, + id: 'task:324242', + params: { hello: 'world' }, + state: { foo: 'bar' }, + taskType: 'report', + attempts: 3, + status: 'idle' as TaskStatus, + version: '123', + ownerId: null, + }; + + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + savedObjectsClient.bulkUpdate.mockRejectedValue(new Error('Failure')); + await expect(store.bulkUpdate([task])).rejects.toThrowErrorMatchingInlineSnapshot( + `"Failure"` + ); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); }); describe('remove', () => { - test('removes the task with the specified id', async () => { - const id = `id-${_.random(1, 20)}`; - const callCluster = jest.fn(); - const store = new TaskStore({ + let store: TaskStore; + + beforeAll(() => { + store = new TaskStore({ index: 'tasky', taskManagerId: '', serializer, - callCluster, + callCluster: jest.fn(), maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); + }); + + 
test('removes the task with the specified id', async () => { + const id = `id-${_.random(1, 20)}`; const result = await store.remove(id); expect(result).toBeUndefined(); expect(savedObjectsClient.delete).toHaveBeenCalledWith('task', id); }); + + test('pushes error from saved objects client to errors$', async () => { + const id = `id-${_.random(1, 20)}`; + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + savedObjectsClient.delete.mockRejectedValue(new Error('Failure')); + await expect(store.remove(id)).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); }); describe('get', () => { + let store: TaskStore; + + beforeAll(() => { + store = new TaskStore({ + index: 'tasky', + taskManagerId: '', + serializer, + callCluster: jest.fn(), + maxAttempts: 2, + definitions: taskDefinitions, + savedObjectsRepository: savedObjectsClient, + }); + }); + test('gets the task with the specified id', async () => { const id = `id-${_.random(1, 20)}`; const task = { @@ -944,7 +1090,6 @@ if (doc['task.runAt'].size()!=0) { ownerId: null, }; - const callCluster = jest.fn(); savedObjectsClient.get.mockImplementation(async (type: string, objectId: string) => ({ id: objectId, type, @@ -956,22 +1101,20 @@ if (doc['task.runAt'].size()!=0) { version: '123', })); - const store = new TaskStore({ - index: 'tasky', - taskManagerId: '', - serializer, - callCluster, - maxAttempts: 2, - definitions: taskDefinitions, - savedObjectsRepository: savedObjectsClient, - }); - const result = await store.get(id); expect(result).toEqual(task); expect(savedObjectsClient.get).toHaveBeenCalledWith('task', id); }); + + test('pushes error from saved objects client to errors$', async () => { + const id = `id-${_.random(1, 20)}`; + const firstErrorPromise = store.errors$.pipe(first()).toPromise(); + savedObjectsClient.get.mockRejectedValue(new Error('Failure')); + await 
expect(store.get(id)).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); + expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); + }); }); describe('getLifecycle', () => { diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index f2da41053e6ab..4f193c9401cf9 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -121,6 +121,7 @@ export class TaskStore { public readonly maxAttempts: number; public readonly index: string; public readonly taskManagerId: string; + public readonly errors$ = new Subject(); private callCluster: ElasticJs; private definitions: TaskDictionary; @@ -171,11 +172,17 @@ export class TaskStore { ); } - const savedObject = await this.savedObjectsRepository.create( - 'task', - taskInstanceToAttributes(taskInstance), - { id: taskInstance.id, refresh: false } - ); + let savedObject; + try { + savedObject = await this.savedObjectsRepository.create( + 'task', + taskInstanceToAttributes(taskInstance), + { id: taskInstance.id, refresh: false } + ); + } catch (e) { + this.errors$.next(e); + throw e; + } return savedObjectToConcreteTaskInstance(savedObject); } @@ -333,12 +340,22 @@ export class TaskStore { */ public async update(doc: ConcreteTaskInstance): Promise { const attributes = taskInstanceToAttributes(doc); - const updatedSavedObject = await this.savedObjectsRepository.update< - SerializedConcreteTaskInstance - >('task', doc.id, attributes, { - refresh: false, - version: doc.version, - }); + + let updatedSavedObject; + try { + updatedSavedObject = await this.savedObjectsRepository.update( + 'task', + doc.id, + attributes, + { + refresh: false, + version: doc.version, + } + ); + } catch (e) { + this.errors$.next(e); + throw e; + } return savedObjectToConcreteTaskInstance( // The SavedObjects update api forces a Partial on the `attributes` on the response, @@ -362,8 +379,11 @@ export class TaskStore 
{ return attrsById; }, new Map()); - const updatedSavedObjects: Array = ( - await this.savedObjectsRepository.bulkUpdate( + let updatedSavedObjects: Array; + try { + ({ saved_objects: updatedSavedObjects } = await this.savedObjectsRepository.bulkUpdate< + SerializedConcreteTaskInstance + >( docs.map((doc) => ({ type: 'task', id: doc.id, @@ -373,8 +393,11 @@ export class TaskStore { { refresh: false, } - ) - ).saved_objects; + )); + } catch (e) { + this.errors$.next(e); + throw e; + } return updatedSavedObjects.map((updatedSavedObject, index) => isSavedObjectsUpdateResponse(updatedSavedObject) @@ -404,7 +427,12 @@ export class TaskStore { * @returns {Promise} */ public async remove(id: string): Promise { - await this.savedObjectsRepository.delete('task', id); + try { + await this.savedObjectsRepository.delete('task', id); + } catch (e) { + this.errors$.next(e); + throw e; + } } /** @@ -414,7 +442,14 @@ export class TaskStore { * @returns {Promise} */ public async get(id: string): Promise { - return savedObjectToConcreteTaskInstance(await this.savedObjectsRepository.get('task', id)); + let result; + try { + result = await this.savedObjectsRepository.get('task', id); + } catch (e) { + this.errors$.next(e); + throw e; + } + return savedObjectToConcreteTaskInstance(result); } /** @@ -438,14 +473,20 @@ export class TaskStore { private async search(opts: SearchOpts = {}): Promise { const { query } = ensureQueryOnlyReturnsTaskObjects(opts); - const result = await this.callCluster('search', { - index: this.index, - ignoreUnavailable: true, - body: { - ...opts, - query, - }, - }); + let result; + try { + result = await this.callCluster('search', { + index: this.index, + ignoreUnavailable: true, + body: { + ...opts, + query, + }, + }); + } catch (e) { + this.errors$.next(e); + throw e; + } const rawDocs = (result as SearchResponse).hits.hits; @@ -463,17 +504,23 @@ export class TaskStore { { max_docs }: UpdateByQueryOpts = {} ): Promise { const { query } = 
ensureQueryOnlyReturnsTaskObjects(opts); - const result = await this.callCluster('updateByQuery', { - index: this.index, - ignoreUnavailable: true, - refresh: true, - max_docs, - conflicts: 'proceed', - body: { - ...opts, - query, - }, - }); + let result; + try { + result = await this.callCluster('updateByQuery', { + index: this.index, + ignoreUnavailable: true, + refresh: true, + max_docs, + conflicts: 'proceed', + body: { + ...opts, + query, + }, + }); + } catch (e) { + this.errors$.next(e); + throw e; + } // eslint-disable-next-line @typescript-eslint/naming-convention const { total, updated, version_conflicts } = result as UpdateDocumentByQueryResponse; From 5f850dbf74a6fa0601a57a368373767b3acd7006 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Fri, 18 Sep 2020 11:21:39 +0100 Subject: [PATCH 03/42] moved TM integration tests to TS --- .../task_manager/{index.js => index.ts} | 6 +- ...ager_integration.js => task_management.ts} | 67 +++++++++++-------- 2 files changed, 44 insertions(+), 29 deletions(-) rename x-pack/test/plugin_api_integration/test_suites/task_manager/{index.js => index.ts} (64%) rename x-pack/test/plugin_api_integration/test_suites/task_manager/{task_manager_integration.js => task_management.ts} (91%) diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.js b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts similarity index 64% rename from x-pack/test/plugin_api_integration/test_suites/task_manager/index.js rename to x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts index 93350ad4d2c80..c6d817119d415 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.js +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts @@ -4,9 +4,11 @@ * you may not use this file except in compliance with the Elastic License. 
*/ -export default function ({ loadTestFile }) { +import { FtrProviderContext } from '../../ftr_provider_context'; + +export default function ({ loadTestFile }: FtrProviderContext) { describe('task_manager', function taskManagerSuite() { this.tags('ciGroup2'); - loadTestFile(require.resolve('./task_manager_integration')); + loadTestFile(require.resolve('./task_management')); }); } diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_manager_integration.js b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts similarity index 91% rename from x-pack/test/plugin_api_integration/test_suites/task_manager/task_manager_integration.js rename to x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts index c87a5039360b8..fcf2d5b235123 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_manager_integration.js +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts @@ -8,19 +8,33 @@ import _ from 'lodash'; import expect from '@kbn/expect'; import url from 'url'; import supertestAsPromised from 'supertest-as-promised'; +import { FtrProviderContext } from '../../ftr_provider_context'; +import TaskManagerMapping from '../../../../plugins/task_manager/server/saved_objects/mappings.json'; +import { + DEFAULT_MAX_WORKERS, + DEFAULT_POLL_INTERVAL, +} from '../../../../plugins/task_manager/server/config'; +import { ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; +import { SavedObjectsRawDoc } from '../../../../../src/core/server'; const { task: { properties: taskManagerIndexMapping }, -} = require('../../../../plugins/task_manager/server/saved_objects/mappings.json'); +} = TaskManagerMapping; -const { - DEFAULT_MAX_WORKERS, - DEFAULT_POLL_INTERVAL, -} = require('../../../../plugins/task_manager/server/config.ts'); +const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -const delay = (ms) => new 
Promise((resolve) => setTimeout(resolve, ms)); +export interface RawDoc { + _id: string; + _source: any; + _type?: string; +} +export interface SearchResults { + hits: { + hits: RawDoc[]; + }; +} -export default function ({ getService }) { +export default function ({ getService }: FtrProviderContext) { const es = getService('legacyEs'); const log = getService('log'); const retry = getService('retry'); @@ -53,14 +67,16 @@ export default function ({ getService }) { } }); - function currentTasks() { + function currentTasks(): Promise<{ + docs: ConcreteTaskInstance[]; + }> { return supertest .get('/api/sample_tasks') .expect(200) .then((response) => response.body); } - function currentTask(task) { + function currentTask(task: string): ConcreteTaskInstance { return supertest .get(`/api/sample_tasks/task/${task}`) .send({ task }) @@ -68,30 +84,26 @@ export default function ({ getService }) { .then((response) => response.body); } - function ensureTasksIndexRefreshed() { - return supertest - .get(`/api/ensure_tasks_index_refreshed`) - .send({}) - .expect(200) - .then((response) => response.body); + function ensureTasksIndexRefreshed(): Promise { + return supertest.get(`/api/ensure_tasks_index_refreshed`).send({}).expect(200); } - function historyDocs(taskId) { + function historyDocs(taskId: string) { return es .search({ index: testHistoryIndex, q: taskId ? 
`taskId:${taskId}` : 'type:task', }) - .then((result) => result.hits.hits); + .then((result: SearchResults) => result.hits.hits); } - function scheduleTask(task) { + function scheduleTask(task: string): ConcreteTaskInstance { return supertest .post('/api/sample_tasks/schedule') .set('kbn-xsrf', 'xxx') .send({ task }) .expect(200) - .then((response) => response.body); + .then((response: { body: ConcreteTaskInstance }) => response.body); } function runTaskNow(task) { @@ -109,7 +121,7 @@ export default function ({ getService }) { .set('kbn-xsrf', 'xxx') .send({ task }) .expect(200) - .then((response) => response.body); + .then((response: { body: ConcreteTaskInstance }) => response.body); } function releaseTasksWaitingForEventToComplete(event) { @@ -120,11 +132,14 @@ export default function ({ getService }) { .expect(200); } - function getTaskById(tasks, id) { + function getTaskById(tasks: ConcreteTaskInstance[], id: string) { return tasks.filter((task) => task.id === id)[0]; } - async function provideParamsToTasksWaitingForParams(taskId, data = {}) { + async function provideParamsToTasksWaitingForParams( + taskId: string, + data: Record = {} + ) { // wait for task to start running and stall on waitForParams await retry.try(async () => { const tasks = (await currentTasks()).docs; @@ -564,12 +579,10 @@ export default function ({ getService }) { expect(await runNowResultWithExpectedFailure).to.eql({ id: taskThatFailsBeforeRunNow.id }); }); - async function expectReschedule(originalRunAt, currentTask, expectedDiff) { + async function expectReschedule(originalRunAt: number, task: Task, expectedDiff: number) { const buffer = 10000; - expect(Date.parse(currentTask.runAt) - originalRunAt).to.be.greaterThan( - expectedDiff - buffer - ); - expect(Date.parse(currentTask.runAt) - originalRunAt).to.be.lessThan(expectedDiff + buffer); + expect(Date.parse(task.runAt) - originalRunAt).to.be.greaterThan(expectedDiff - buffer); + expect(Date.parse(task.runAt) - 
originalRunAt).to.be.lessThan(expectedDiff + buffer); } it('should run tasks in parallel, allowing for long running tasks along side faster tasks', async () => { From c0a7038f94f8c3a6ca74ded9c8c84561d50ed51a Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 22 Sep 2020 12:36:07 +0100 Subject: [PATCH 04/42] introduce health endpoint in TM --- .../server/action_type_registry.test.ts | 4 +- .../actions/server/actions_client.test.ts | 4 +- .../server/builtin_action_types/index.test.ts | 4 +- .../server/create_execute_function.test.ts | 2 +- .../usage/actions_usage_collector.test.ts | 4 +- .../alerts/server/alert_type_registry.test.ts | 4 +- .../alerts/server/alerts_client.test.ts | 4 +- .../server/alerts_client_factory.test.ts | 4 +- .../usage/alerts_usage_collector.test.ts | 4 +- .../task_manager/server/config.test.ts | 1 + x-pack/plugins/task_manager/server/config.ts | 9 + .../task_manager/server/monitoring/index.ts | 18 ++ .../runtime_statistics_aggregator.ts | 14 ++ .../monitoring/workload_statistics.test.ts | 118 +++++++++++ .../server/monitoring/workload_statistics.ts | 65 ++++++ x-pack/plugins/task_manager/server/plugin.ts | 16 +- .../server/queries/aggregation_clauses.ts | 84 ++++++++ .../server/routes/_mock_handler_arguments.ts | 33 +++ .../task_manager/server/routes/health.test.ts | 188 ++++++++++++++++++ .../task_manager/server/routes/health.ts | 108 ++++++++++ .../task_manager/server/routes/index.ts | 7 + .../task_manager/server/task_manager.mock.ts | 39 ++-- .../task_manager/server/task_manager.ts | 15 ++ .../task_manager/server/task_store.mock.ts | 1 + .../plugins/task_manager/server/task_store.ts | 38 ++++ x-pack/test/plugin_api_integration/config.ts | 1 + .../test_suites/task_manager/health_route.ts | 86 ++++++++ .../test_suites/task_manager/index.ts | 1 + .../task_manager/task_management.ts | 84 +++++--- 29 files changed, 891 insertions(+), 69 deletions(-) create mode 100644 x-pack/plugins/task_manager/server/monitoring/index.ts create mode 
100644 x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts create mode 100644 x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts create mode 100644 x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts create mode 100644 x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts create mode 100644 x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts create mode 100644 x-pack/plugins/task_manager/server/routes/health.test.ts create mode 100644 x-pack/plugins/task_manager/server/routes/health.ts create mode 100644 x-pack/plugins/task_manager/server/routes/index.ts create mode 100644 x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts diff --git a/x-pack/plugins/actions/server/action_type_registry.test.ts b/x-pack/plugins/actions/server/action_type_registry.test.ts index b25e33400df5d..52cf6b6473f74 100644 --- a/x-pack/plugins/actions/server/action_type_registry.test.ts +++ b/x-pack/plugins/actions/server/action_type_registry.test.ts @@ -4,7 +4,7 @@ * you may not use this file except in compliance with the Elastic License. 
*/ -import { taskManagerMock } from '../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; import { ActionTypeRegistry, ActionTypeRegistryOpts } from './action_type_registry'; import { ActionType, ExecutorType } from './types'; import { ActionExecutor, ExecutorError, ILicenseState, TaskRunnerFactory } from './lib'; @@ -12,7 +12,7 @@ import { actionsConfigMock } from './actions_config.mock'; import { licenseStateMock } from './lib/license_state.mock'; import { ActionsConfigurationUtilities } from './actions_config'; -const mockTaskManager = taskManagerMock.setup(); +const mockTaskManager = taskManagerMock.createSetup(); let mockedLicenseState: jest.Mocked; let mockedActionsConfig: jest.Mocked; let actionTypeRegistryParams: ActionTypeRegistryOpts; diff --git a/x-pack/plugins/actions/server/actions_client.test.ts b/x-pack/plugins/actions/server/actions_client.test.ts index adef12454f2d5..7ea215c0a5922 100644 --- a/x-pack/plugins/actions/server/actions_client.test.ts +++ b/x-pack/plugins/actions/server/actions_client.test.ts @@ -10,7 +10,7 @@ import { ActionTypeRegistry, ActionTypeRegistryOpts } from './action_type_regist import { ActionsClient } from './actions_client'; import { ExecutorType } from './types'; import { ActionExecutor, TaskRunnerFactory, ILicenseState } from './lib'; -import { taskManagerMock } from '../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; import { actionsConfigMock } from './actions_config.mock'; import { getActionsConfigurationUtilities } from './actions_config'; import { licenseStateMock } from './lib/license_state.mock'; @@ -33,7 +33,7 @@ const authorization = actionsAuthorizationMock.create(); const executionEnqueuer = jest.fn(); const request = {} as KibanaRequest; -const mockTaskManager = taskManagerMock.setup(); +const mockTaskManager = taskManagerMock.createSetup(); let actionsClient: ActionsClient; let 
mockedLicenseState: jest.Mocked; diff --git a/x-pack/plugins/actions/server/builtin_action_types/index.test.ts b/x-pack/plugins/actions/server/builtin_action_types/index.test.ts index acab6dd41b4b3..1088a4e0d9062 100644 --- a/x-pack/plugins/actions/server/builtin_action_types/index.test.ts +++ b/x-pack/plugins/actions/server/builtin_action_types/index.test.ts @@ -6,7 +6,7 @@ import { ActionExecutor, TaskRunnerFactory } from '../lib'; import { ActionTypeRegistry } from '../action_type_registry'; -import { taskManagerMock } from '../../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../../task_manager/server/mocks'; import { registerBuiltInActionTypes } from './index'; import { Logger } from '../../../../../src/core/server'; import { loggingSystemMock } from '../../../../../src/core/server/mocks'; @@ -21,7 +21,7 @@ export function createActionTypeRegistry(): { } { const logger = loggingSystemMock.create().get() as jest.Mocked; const actionTypeRegistry = new ActionTypeRegistry({ - taskManager: taskManagerMock.setup(), + taskManager: taskManagerMock.createSetup(), taskRunnerFactory: new TaskRunnerFactory( new ActionExecutor({ isESOUsingEphemeralEncryptionKey: false }) ), diff --git a/x-pack/plugins/actions/server/create_execute_function.test.ts b/x-pack/plugins/actions/server/create_execute_function.test.ts index 7682f01ed769d..cfbc68879ae0e 100644 --- a/x-pack/plugins/actions/server/create_execute_function.test.ts +++ b/x-pack/plugins/actions/server/create_execute_function.test.ts @@ -6,7 +6,7 @@ import { KibanaRequest } from 'src/core/server'; import uuid from 'uuid'; -import { taskManagerMock } from '../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; import { createExecutionEnqueuerFunction } from './create_execute_function'; import { savedObjectsClientMock } from '../../../../src/core/server/mocks'; import { actionTypeRegistryMock } from './action_type_registry.mock'; 
diff --git a/x-pack/plugins/actions/server/usage/actions_usage_collector.test.ts b/x-pack/plugins/actions/server/usage/actions_usage_collector.test.ts index 2e2944aab425c..0e6c2ff37eb02 100644 --- a/x-pack/plugins/actions/server/usage/actions_usage_collector.test.ts +++ b/x-pack/plugins/actions/server/usage/actions_usage_collector.test.ts @@ -6,9 +6,9 @@ import { UsageCollectionSetup } from 'src/plugins/usage_collection/server'; import { registerActionsUsageCollector } from './actions_usage_collector'; -import { taskManagerMock } from '../../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../../task_manager/server/mocks'; -const mockTaskManagerStart = taskManagerMock.start(); +const mockTaskManagerStart = taskManagerMock.createStart(); beforeEach(() => jest.resetAllMocks()); diff --git a/x-pack/plugins/alerts/server/alert_type_registry.test.ts b/x-pack/plugins/alerts/server/alert_type_registry.test.ts index 048cc3d5a4440..020b4f55619b7 100644 --- a/x-pack/plugins/alerts/server/alert_type_registry.test.ts +++ b/x-pack/plugins/alerts/server/alert_type_registry.test.ts @@ -7,9 +7,9 @@ import { TaskRunnerFactory } from './task_runner'; import { AlertTypeRegistry } from './alert_type_registry'; import { AlertType } from './types'; -import { taskManagerMock } from '../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; -const taskManager = taskManagerMock.setup(); +const taskManager = taskManagerMock.createSetup(); const alertTypeRegistryParams = { taskManager, taskRunnerFactory: new TaskRunnerFactory(), diff --git a/x-pack/plugins/alerts/server/alerts_client.test.ts b/x-pack/plugins/alerts/server/alerts_client.test.ts index a6cffb0284815..250adb83dad46 100644 --- a/x-pack/plugins/alerts/server/alerts_client.test.ts +++ b/x-pack/plugins/alerts/server/alerts_client.test.ts @@ -9,7 +9,7 @@ import { AlertsClient, CreateOptions, ConstructorOptions } from './alerts_client import { 
savedObjectsClientMock, loggingSystemMock } from '../../../../src/core/server/mocks'; import { nodeTypes } from '../../../../src/plugins/data/common'; import { esKuery } from '../../../../src/plugins/data/server'; -import { taskManagerMock } from '../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; import { alertTypeRegistryMock } from './alert_type_registry.mock'; import { alertsAuthorizationMock } from './authorization/alerts_authorization.mock'; import { TaskStatus } from '../../task_manager/server'; @@ -24,7 +24,7 @@ import { QueryEventsBySavedObjectResult } from '../../event_log/server'; import { SavedObject } from 'kibana/server'; import { EventsFactory } from './lib/alert_instance_summary_from_event_log.test'; -const taskManager = taskManagerMock.start(); +const taskManager = taskManagerMock.createStart(); const alertTypeRegistry = alertTypeRegistryMock.create(); const unsecuredSavedObjectsClient = savedObjectsClientMock.create(); const eventLogClient = eventLogClientMock.create(); diff --git a/x-pack/plugins/alerts/server/alerts_client_factory.test.ts b/x-pack/plugins/alerts/server/alerts_client_factory.test.ts index ac91d689798c9..770658fdde108 100644 --- a/x-pack/plugins/alerts/server/alerts_client_factory.test.ts +++ b/x-pack/plugins/alerts/server/alerts_client_factory.test.ts @@ -7,7 +7,7 @@ import { Request } from 'hapi'; import { AlertsClientFactory, AlertsClientFactoryOpts } from './alerts_client_factory'; import { alertTypeRegistryMock } from './alert_type_registry.mock'; -import { taskManagerMock } from '../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; import { KibanaRequest } from '../../../../src/core/server'; import { savedObjectsClientMock, @@ -35,7 +35,7 @@ const features = featuresPluginMock.createStart(); const securityPluginSetup = securityMock.createSetup(); const alertsClientFactoryParams: jest.Mocked = { logger: 
loggingSystemMock.create().get(), - taskManager: taskManagerMock.start(), + taskManager: taskManagerMock.createStart(), alertTypeRegistry: alertTypeRegistryMock.create(), getSpaceId: jest.fn(), getSpace: jest.fn(), diff --git a/x-pack/plugins/alerts/server/usage/alerts_usage_collector.test.ts b/x-pack/plugins/alerts/server/usage/alerts_usage_collector.test.ts index b48d173ba36d9..a5f83bc393d4e 100644 --- a/x-pack/plugins/alerts/server/usage/alerts_usage_collector.test.ts +++ b/x-pack/plugins/alerts/server/usage/alerts_usage_collector.test.ts @@ -6,8 +6,8 @@ import { UsageCollectionSetup } from 'src/plugins/usage_collection/server'; import { registerAlertsUsageCollector } from './alerts_usage_collector'; -import { taskManagerMock } from '../../../task_manager/server/task_manager.mock'; -const taskManagerStart = taskManagerMock.start(); +import { taskManagerMock } from '../../../task_manager/server/mocks'; +const taskManagerStart = taskManagerMock.createStart(); beforeEach(() => jest.resetAllMocks()); diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index d5bbbe65582f1..2eb132185ff70 100644 --- a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -15,6 +15,7 @@ describe('config validation', () => { "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, + "monitored_aggregated_stats_refresh_rate": 60000, "poll_interval": 3000, "request_capacity": 1000, } diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index aa78cf3baa96d..1b79c17220f4e 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -10,6 +10,9 @@ export const DEFAULT_MAX_WORKERS = 10; export const DEFAULT_POLL_INTERVAL = 3000; export const DEFAULT_MAX_POLL_INACTIVITY_CYCLES = 10; +// Refresh "pull based" monitored stats at a default rate of once a minute 
+export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000; + export const configSchema = schema.object({ enabled: schema.boolean({ defaultValue: true }), /* The maximum number of times a task will be attempted before being abandoned as failed */ @@ -48,6 +51,12 @@ export const configSchema = schema.object({ // disable the task manager rather than trying to specify it with 0 workers min: 1, }), + /* The rate at which we refresh monitored stats that require aggregation queries against ES. */ + monitored_aggregated_stats_refresh_rate: schema.number({ + defaultValue: DEFAULT_MONITORING_REFRESH_RATE, + /* don't run monitored stat aggregations any faster than once every 5 seconds */ + min: 5000, + }), }); export type TaskManagerConfig = TypeOf; diff --git a/x-pack/plugins/task_manager/server/monitoring/index.ts b/x-pack/plugins/task_manager/server/monitoring/index.ts new file mode 100644 index 0000000000000..cf75294be1266 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/index.ts @@ -0,0 +1,18 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ +import { TaskManager } from '../task_manager'; +import { AggregatedStatProvider } from './runtime_statistics_aggregator'; +import { createWorkloadAggregator } from './workload_statistics'; +import { TaskManagerConfig } from '../config'; + +export { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; + +export function createAggregatedStatsStream( + taskManager: TaskManager, + config: TaskManagerConfig +): AggregatedStatProvider { + return createWorkloadAggregator(taskManager, config.monitored_aggregated_stats_refresh_rate); +} diff --git a/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts b/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts new file mode 100644 index 0000000000000..f895bf2b02e6a --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts @@ -0,0 +1,14 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +import { Observable } from 'rxjs'; +import { JsonObject, JsonValue } from 'src/plugins/kibana_utils/common'; + +export interface AggregatedStat { + key: string; + value: JsonObject | JsonValue; +} + +export type AggregatedStatProvider = Observable; diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts new file mode 100644 index 0000000000000..32e8c21112398 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -0,0 +1,118 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { createWorkloadAggregator } from './workload_statistics'; +import { taskManagerMock } from '../task_manager.mock'; +import { first } from 'rxjs/operators'; +import { AggregationResult } from '../queries/aggregation_clauses'; + +describe('Workload Statistics Aggregator', () => { + test('queries the Task Store at a fixed interval for the current workload', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(({ + task: { + doc_count: 0, + taskType: { + buckets: [], + }, + }, + } as unknown) as AggregationResult); + + const workloadAggregator = createWorkloadAggregator(taskManager, 10); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe(() => { + expect(taskManager.aggregate).toHaveBeenCalledWith({ + aggs: { + taskType: { + terms: { field: 'task.taskType' }, + aggs: { + status: { + terms: { field: 'task.status' }, + }, + }, + }, + }, + }); + resolve(); + }); + }); + }); + + test('returns a summary of the workload by task type', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(({ + task: { + doc_count: 4, + taskType: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'actions_telemetry', + doc_count: 2, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'idle', + doc_count: 2, + }, + ], + }, + }, + { + key: 'alerting_telemetry', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + { + key: 'session_cleanup', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + ], + }, + }, + } as unknown) as AggregationResult); + + const workloadAggregator = createWorkloadAggregator(taskManager, 10); + + return new Promise((resolve) => { + 
workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + sum: 4, + types: { + actions_telemetry: { sum: 2, status: { idle: 2 } }, + alerting_telemetry: { sum: 1, status: { idle: 1 } }, + session_cleanup: { sum: 1, status: { idle: 1 } }, + }, + }); + resolve(); + }); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts new file mode 100644 index 0000000000000..8e73d88bea25b --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -0,0 +1,65 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { interval } from 'rxjs'; +import { concatMap, map } from 'rxjs/operators'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { keyBy, mapValues } from 'lodash'; +import { AggregatedStatProvider } from './runtime_statistics_aggregator'; +import { TaskManager } from '../task_manager'; +import { + AggregationResult, + AggregationBucketWithSubAgg, + AggregationBucket, +} from '../queries/aggregation_clauses'; + +export function createWorkloadAggregator( + taskManager: TaskManager, + refreshInterval: number +): AggregatedStatProvider { + return interval(refreshInterval).pipe( + concatMap(() => + taskManager.aggregate({ + aggs: { + taskType: { + terms: { field: 'task.taskType' }, + aggs: { + status: { + terms: { field: 'task.status' }, + }, + }, + }, + }, + }) + ), + map( + ({ + task: { + doc_count: sum, + taskType: { buckets: types }, + }, + }: AggregationResult<'task' | 'taskType' | 'status'>) => { + const summary: JsonObject = { + sum, + types: mapValues( + keyBy>( + types as Array>, + 'key' + ), + ({ doc_count: docCount, 
status }) => ({ + sum: docCount, + status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'), + }) + ), + }; + return { + key: 'workload', + value: summary, + }; + } + ) + ); +} diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index d7dcf779376bf..715d8cf1b4d00 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -11,6 +11,8 @@ import { TaskManager } from './task_manager'; import { TaskManagerConfig } from './config'; import { Middleware } from './lib/middleware'; import { setupSavedObjects } from './saved_objects'; +import { healthRoute } from './routes'; +import { createAggregatedStatsStream } from './monitoring'; export type TaskManagerSetupContract = Pick< TaskManager, @@ -36,14 +38,24 @@ export class TaskManagerPlugin } public async setup(core: CoreSetup): Promise { - this.config = await this.initContext.config + const config = (this.config = await this.initContext.config .create() .pipe(first()) - .toPromise(); + .toPromise()); setupSavedObjects(core.savedObjects, this.config); this.taskManagerId = this.initContext.env.instanceUuid; + // Routes + const router = core.http.createRouter(); + healthRoute( + router, + config, + this.taskManager.then((tm) => createAggregatedStatsStream(tm, config)), + // if health is any more stale than the pollInterval (+1s buffer) consider the system unhealthy + config.poll_interval + 1000 + ); + return { addMiddleware: (middleware: Middleware) => { this.taskManager.then((tm) => tm.addMiddleware(middleware)); diff --git a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts new file mode 100644 index 0000000000000..84cd9d6ae2b5e --- /dev/null +++ b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts @@ -0,0 +1,84 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { TermFilter } from './query_clauses'; + +/** + * Terminology + * =========== + * The terms for the different clauses in an Elasticsearch query aggregation can be confusing, here are some + * clarifications that might help you understand the Typescript types we use here. + * + * Given the following Aggregation: + * { + * "size": 0, + * "aggs": { (1) + * "task": { + * "filter": { + * "term": { + * "type": "task" + * } + * }, + * "aggs": { (1) + * "taskType": { (2) + * "terms": { "field": "task.taskType" }, + * "aggs": { + * "status": { (2) + * "terms": { "field": "task.status" } + * } + * } + * } + * } + * } + * } + * } + * + * These are referred to as: + * (1). AggregationQuery + * (2). TermAggregation + * + */ + +export interface AggregationQuery { + [aggregationName: string]: (TermAggregation | { aggs: AggregationQuery }) & { + filter?: TermFilter; + }; +} + +interface TermAggregation { + terms: { + field: string; + }; +} + +/** + * Results of an Aggregation + */ +type ReservedNames = 'doc_count'; +type AggregationNames = Exclude; +export type Aggregation = { + doc_count: number; +} & { + [innerAggregation in Name]: AggregationBuckets; +}; + +export interface AggregationBucket { + key: string; + doc_count: number; +} + +export type AggregationBucketWithSubAgg = AggregationBucket & + { + [innerAggregation in Name]: AggregationBuckets; + }; + +export interface AggregationBuckets { + buckets: AggregationBucket[] | Array>; +} + +export type AggregationResult = { + [aggregationName in Name]: Aggregation; +}; diff --git a/x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts b/x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts new file mode 100644 index 0000000000000..c9f4de25afaf7 --- /dev/null +++ 
b/x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts @@ -0,0 +1,33 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { RequestHandlerContext, KibanaRequest, KibanaResponseFactory } from 'kibana/server'; +import { identity } from 'lodash'; +import { httpServerMock } from '../../../../../src/core/server/mocks'; + +export function mockHandlerArguments( + {}: {}, + req: unknown, + res?: Array> +): [RequestHandlerContext, KibanaRequest, KibanaResponseFactory] { + return [ + ({} as unknown) as RequestHandlerContext, + req as KibanaRequest, + mockResponseFactory(res), + ]; +} + +export const mockResponseFactory = (resToMock: Array> = []) => { + const factory: jest.Mocked = httpServerMock.createResponseFactory(); + resToMock.forEach((key: string) => { + if (key in factory) { + Object.defineProperty(factory, key, { + value: jest.fn(identity), + }); + } + }); + return (factory as unknown) as KibanaResponseFactory; +}; diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts new file mode 100644 index 0000000000000..4fc7b9d6b352c --- /dev/null +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -0,0 +1,188 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { healthRoute } from './health'; +import { httpServiceMock } from 'src/core/server/mocks'; +import { mockHandlerArguments } from './_mock_handler_arguments'; +import { TaskManagerConfig } from '../config'; +import { of, Subject } from 'rxjs'; +import { get } from 'lodash'; +import { sleep } from '../test_utils'; +import { AggregatedStat } from '../monitoring'; + +beforeEach(() => { + jest.resetAllMocks(); +}); + +const configuration: TaskManagerConfig = { + enabled: true, + max_workers: 10, + index: 'foo', + max_attempts: 9, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, +}; + +describe('healthRoute', () => { + it('registers the route', async () => { + const router = httpServiceMock.createRouter(); + + healthRoute(router, configuration, Promise.resolve(of()), 1000); + + const [config] = router.get.mock.calls[0]; + + expect(config.path).toMatchInlineSnapshot(`"/api/task_manager/_health"`); + }); + + it('returns the initial config used to configure Task Manager', async () => { + const router = httpServiceMock.createRouter(); + + healthRoute(router, configuration, Promise.resolve(of()), 1000); + + const [, handler] = router.get.mock.calls[0]; + + const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); + + expect(get(await handler(context, req, res), 'body.stats')).toMatchObject({ + configuration: { + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + }, + }, + }); + }); + + it('returns an error response if the stats are no longer fresh', async () => { + const router = httpServiceMock.createRouter(); + + healthRoute(router, configuration, Promise.resolve(of()), 1000); + + const [, handler] = router.get.mock.calls[0]; + + const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); + + await sleep(2000); + + 
expect(await handler(context, req, res)).toMatchObject({ + body: { + attributes: { + lastUpdate: expect.any(String), + stats: { + configuration: { + timestamp: expect.any(String), + value: { + max_poll_inactivity_cycles: 10, + max_workers: 10, + poll_interval: 6000000, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + }, + }, + }, + }, + message: new Error('Task Manager monitored stats are out of date'), + }, + }); + }); + + it('incrementally updates the stats returned by the endpoint', async () => { + const router = httpServiceMock.createRouter(); + + const aggregatedStats = Promise.resolve(new Subject()); + + healthRoute(router, configuration, Promise.resolve(aggregatedStats), 1000); + + const [, handler] = router.get.mock.calls[0]; + + const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); + + return aggregatedStats.then(async (aggregatedStats$) => { + aggregatedStats$.next({ + key: 'newAggregatedStat', + value: { + some: { + complex: { + value: 123, + }, + }, + }, + }); + + expect(await handler(context, req, res)).toMatchObject({ + body: { + lastUpdate: expect.any(String), + stats: { + newAggregatedStat: { + timestamp: expect.any(String), + value: { + some: { + complex: { + value: 123, + }, + }, + }, + }, + configuration: { + timestamp: expect.any(String), + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + }, + }, + }, + }, + }); + + aggregatedStats$.next({ + key: 'newAggregatedStat', + value: { + some: { + updated: { + value: 456, + }, + }, + }, + }); + + expect(await handler(context, req, res)).toMatchObject({ + body: { + lastUpdate: expect.any(String), + stats: { + newAggregatedStat: { + timestamp: expect.any(String), + value: { + some: { + updated: { + value: 456, + }, + }, + }, + }, + configuration: { + timestamp: expect.any(String), + value: { + max_workers: 10, + poll_interval: 6000000, 
+ max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + }, + }, + }, + }, + }); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts new file mode 100644 index 0000000000000..cf73c93143918 --- /dev/null +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -0,0 +1,108 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { + IRouter, + RequestHandlerContext, + KibanaRequest, + IKibanaResponse, + KibanaResponseFactory, +} from 'kibana/server'; +import { pick } from 'lodash'; +import { set } from '@elastic/safer-lodash-set'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { map } from 'rxjs/operators'; +import { TaskManagerConfig } from '../config'; +import { AggregatedStatProvider } from '../monitoring'; + +const CONFIG_FIELDS_TO_EXPOSE = [ + 'max_workers', + 'poll_interval', + 'request_capacity', + 'max_poll_inactivity_cycles', + 'monitored_aggregated_stats_refresh_rate', +]; + +interface MonitoredStat { + timestamp: string; + value: JsonObject; +} + +interface MonitoringStats { + lastUpdate: string; + stats: Record; +} + +export function healthRoute( + router: IRouter, + initialConfig: TaskManagerConfig, + aggregatedStats: Promise, + requiredFreshness: number +) { + const initialisationTimestamp = new Date().toISOString(); + const monitoringStats: MonitoringStats = { + lastUpdate: initialisationTimestamp, + stats: { + configuration: { + timestamp: initialisationTimestamp, + value: pick<{ + max_workers: number; + poll_interval: number; + request_capacity: number; + max_poll_inactivity_cycles: number; + monitored_aggregated_stats_refresh_rate: number; + }>(initialConfig, ...CONFIG_FIELDS_TO_EXPOSE) 
as JsonObject, + }, + }, + }; + + aggregatedStats.then((aggregatedStats$) => { + aggregatedStats$ + .pipe( + map(({ key, value }) => { + return { + value: { timestamp: new Date().toISOString(), value }, + key, + }; + }) + ) + .subscribe(({ key, value }) => { + set(monitoringStats.stats, key, value); + monitoringStats.lastUpdate = new Date().toISOString(); + }); + }); + + router.get( + { + path: '/api/task_manager/_health', + validate: false, + }, + async function ( + context: RequestHandlerContext, + req: KibanaRequest, + res: KibanaResponseFactory + ): Promise { + const lastUpdate = Date.parse(monitoringStats.lastUpdate); + + /** + * If the monitored stats aren't fresh, return an `500 internalError` with + * the stats in the body of the api call. This makes it easier for monitoring + * services to mark the service as broken + */ + if (Date.now() - lastUpdate > requiredFreshness) { + return res.internalError({ + body: { + message: new Error('Task Manager monitored stats are out of date'), + attributes: monitoringStats, + }, + }); + } + return res.ok({ + body: monitoringStats, + }); + } + ); +} diff --git a/x-pack/plugins/task_manager/server/routes/index.ts b/x-pack/plugins/task_manager/server/routes/index.ts new file mode 100644 index 0000000000000..4fa1aa6cb7a9b --- /dev/null +++ b/x-pack/plugins/task_manager/server/routes/index.ts @@ -0,0 +1,7 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +export { healthRoute } from './health'; diff --git a/x-pack/plugins/task_manager/server/task_manager.mock.ts b/x-pack/plugins/task_manager/server/task_manager.mock.ts index 1fc626e7d58d6..8afaa22515668 100644 --- a/x-pack/plugins/task_manager/server/task_manager.mock.ts +++ b/x-pack/plugins/task_manager/server/task_manager.mock.ts @@ -4,27 +4,24 @@ * you may not use this file except in compliance with the Elastic License. */ -import { TaskManagerSetupContract, TaskManagerStartContract } from './plugin'; +import { TaskManager } from './task_manager'; + +const createTaskManagerMock = () => { + return { + registerTaskDefinitions: jest.fn(), + addMiddleware: jest.fn(), + ensureScheduled: jest.fn(), + schedule: jest.fn(), + fetch: jest.fn(), + aggregate: jest.fn(), + get: jest.fn(), + runNow: jest.fn(), + remove: jest.fn(), + start: jest.fn(), + stop: jest.fn(), + } as jest.Mocked; +}; export const taskManagerMock = { - setup(overrides: Partial> = {}) { - const mocked: jest.Mocked = { - registerTaskDefinitions: jest.fn(), - addMiddleware: jest.fn(), - ...overrides, - }; - return mocked; - }, - start(overrides: Partial> = {}) { - const mocked: jest.Mocked = { - ensureScheduled: jest.fn(), - schedule: jest.fn(), - fetch: jest.fn(), - get: jest.fn(), - runNow: jest.fn(), - remove: jest.fn(), - ...overrides, - }; - return mocked; - }, + create: createTaskManagerMock, }; diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index fb2d5e07030a4..7df3186ca8ecf 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -60,10 +60,12 @@ import { OwnershipClaimingOpts, ClaimOwnershipResult, SearchOpts, + AggregationOpts, } from './task_store'; import { identifyEsError } from './lib/identify_es_error'; import { ensureDeprecatedFieldsAreCorrected } from './lib/correct_deprecated_fields'; import { BufferedTaskStore } from './buffered_task_store'; 
+import { AggregationResult } from './queries/aggregation_clauses'; const VERSION_CONFLICT_STATUS = 409; @@ -372,6 +374,19 @@ export class TaskManager { return this.store.fetch(opts); } + /** + * Fetches a list of scheduled tasks. + * + * @param opts - The query options used to filter tasks + * @returns {Promise} + */ + public async aggregate( + opts: AggregationOpts + ): Promise> { + await this.waitUntilStarted(); + return this.store.aggregate(opts); + } + /** * Get the current state of a specified task. * diff --git a/x-pack/plugins/task_manager/server/task_store.mock.ts b/x-pack/plugins/task_manager/server/task_store.mock.ts index 86db695bc5e2c..a960b52cf659a 100644 --- a/x-pack/plugins/task_manager/server/task_store.mock.ts +++ b/x-pack/plugins/task_manager/server/task_store.mock.ts @@ -22,6 +22,7 @@ export const taskStoreMock = { get: jest.fn(), getLifecycle: jest.fn(), fetch: jest.fn(), + aggregate: jest.fn(), maxAttempts, index, taskManagerId, diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index acd19bd75f7a3..17523ee9efb6e 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -60,6 +60,7 @@ import { SortByRunAtAndRetryAt, tasksClaimedByOwner, } from './queries/mark_available_tasks_as_claimed'; +import { AggregationQuery, AggregationResult } from './queries/aggregation_clauses'; export interface StoreOpts { callCluster: ElasticJs; @@ -79,6 +80,11 @@ export interface SearchOpts { search_after?: unknown[]; } +export interface AggregationOpts { + aggs: AggregationQuery; + size?: number; +} + export interface UpdateByQuerySearchOpts extends SearchOpts { script?: object; } @@ -458,6 +464,22 @@ export class TaskStore { }; } + public async aggregate({ + aggs, + size = 0, + }: AggregationOpts): Promise> { + const result = await this.callCluster('search', { + index: this.index, + ignoreUnavailable: true, + body: { + aggs: 
ensureAggregationOnlyReturnsTaskObjects(aggs), + size, + }, + }); + + return (result as { aggregations: AggregationResult }).aggregations; + } + private async updateByQuery( opts: UpdateByQuerySearchOpts = {}, // eslint-disable-next-line @typescript-eslint/naming-convention @@ -537,6 +559,22 @@ function ensureQueryOnlyReturnsTaskObjects(opts: SearchOpts): SearchOpts { }; } +function ensureAggregationOnlyReturnsTaskObjects( + aggs: AggregationOpts['aggs'] +): AggregationOpts['aggs'] { + const filteredAgg: AggregationQuery = { + task: { + filter: { + term: { + type: 'task', + }, + }, + aggs, + }, + }; + return filteredAgg; +} + function isSavedObjectsUpdateResponse( result: SavedObjectsUpdateResponse | Error ): result is SavedObjectsUpdateResponse { diff --git a/x-pack/test/plugin_api_integration/config.ts b/x-pack/test/plugin_api_integration/config.ts index b89ed6ad550a3..30a361ea2a379 100644 --- a/x-pack/test/plugin_api_integration/config.ts +++ b/x-pack/test/plugin_api_integration/config.ts @@ -43,6 +43,7 @@ export default async function ({ readConfigFile }: FtrConfigProviderContext) { '--xpack.eventLog.enabled=true', '--xpack.eventLog.logEntries=true', '--xpack.eventLog.indexEntries=true', + '--xpack.task_manager.monitored_aggregated_stats_refresh_rate=5000', ...plugins.map( (pluginDir) => `--plugin-path=${path.resolve(__dirname, 'plugins', pluginDir)}` ), diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts new file mode 100644 index 0000000000000..9cc7b61744432 --- /dev/null +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -0,0 +1,86 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import expect from '@kbn/expect'; +import url from 'url'; +import supertestAsPromised from 'supertest-as-promised'; +import { FtrProviderContext } from '../../ftr_provider_context'; +import { ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; + +interface MonitoringStats { + lastUpdate: string; + stats: { + configuration: { + timestamp: string; + value: Record; + }; + workload: { + timestamp: string; + value: Record; + }; + }; +} + +export default function ({ getService }: FtrProviderContext) { + const config = getService('config'); + const retry = getService('retry'); + const supertest = supertestAsPromised(url.format(config.get('servers.kibana'))); + + function getHealthRequest() { + return supertest.get('/api/task_manager/_health').set('kbn-xsrf', 'foo'); + } + + function getHealth(): Promise { + return getHealthRequest() + .expect(200) + .then((response) => response.body); + } + + function scheduleTask(task: Partial): Promise { + return supertest + .post('/api/sample_tasks/schedule') + .set('kbn-xsrf', 'xxx') + .send({ task }) + .expect(200) + .then((response: { body: ConcreteTaskInstance }) => response.body); + } + + const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); + + describe('health', () => { + it('should return basic configuration of task manager', async () => { + expect((await getHealth()).stats.configuration.value).to.eql({ + poll_interval: 3000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + max_workers: 10, + }); + }); + + it('should return the task manager workload', async () => { + const sumSampleTaskInWorkload = + ((await getHealth()).stats.workload.value.types as { + sampleTask?: { sum: number }; + }).sampleTask?.sum ?? 
0; + + await scheduleTask({ + taskType: 'sampleTask', + schedule: { interval: '1m' }, + }); + + await retry.try(async () => { + // workload is configured to refresh every 5s in FTs + await delay(5000); + + const workloadAfterScheduling = (await getHealth()).stats.workload.value; + + expect( + (workloadAfterScheduling.types as { sampleTask: { sum: number } }).sampleTask.sum + ).to.eql(sumSampleTaskInWorkload + 1); + }); + }); + }); +} diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts index c6d817119d415..5eb1353dd1291 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts @@ -10,5 +10,6 @@ export default function ({ loadTestFile }: FtrProviderContext) { describe('task_manager', function taskManagerSuite() { this.tags('ciGroup2'); loadTestFile(require.resolve('./task_management')); + loadTestFile(require.resolve('./health_route')); }); } diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts index fcf2d5b235123..2434f05b5403f 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts @@ -15,7 +15,6 @@ import { DEFAULT_POLL_INTERVAL, } from '../../../../plugins/task_manager/server/config'; import { ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; -import { SavedObjectsRawDoc } from '../../../../../src/core/server'; const { task: { properties: taskManagerIndexMapping }, @@ -34,6 +33,22 @@ export interface SearchResults { }; } +type DeprecatedConcreteTaskInstance = Omit & { + interval: string; +}; + +type SerializedConcreteTaskInstance = Omit< + ConcreteTaskInstance, + 'state' | 'params' | 'scheduledAt' | 
'startedAt' | 'retryAt' | 'runAt' +> & { + state: State; + params: Params; + scheduledAt: string; + startedAt: string | null; + retryAt: string | null; + runAt: string; +}; + export default function ({ getService }: FtrProviderContext) { const es = getService('legacyEs'); const log = getService('log'); @@ -67,8 +82,8 @@ export default function ({ getService }: FtrProviderContext) { } }); - function currentTasks(): Promise<{ - docs: ConcreteTaskInstance[]; + function currentTasks(): Promise<{ + docs: Array>; }> { return supertest .get('/api/sample_tasks') @@ -76,7 +91,9 @@ export default function ({ getService }: FtrProviderContext) { .then((response) => response.body); } - function currentTask(task: string): ConcreteTaskInstance { + function currentTask( + task: string + ): Promise> { return supertest .get(`/api/sample_tasks/task/${task}`) .send({ task }) @@ -84,11 +101,11 @@ export default function ({ getService }: FtrProviderContext) { .then((response) => response.body); } - function ensureTasksIndexRefreshed(): Promise { + function ensureTasksIndexRefreshed() { return supertest.get(`/api/ensure_tasks_index_refreshed`).send({}).expect(200); } - function historyDocs(taskId: string) { + function historyDocs(taskId?: string): Promise { return es .search({ index: testHistoryIndex, @@ -97,16 +114,18 @@ export default function ({ getService }: FtrProviderContext) { .then((result: SearchResults) => result.hits.hits); } - function scheduleTask(task: string): ConcreteTaskInstance { + function scheduleTask( + task: Partial + ): Promise { return supertest .post('/api/sample_tasks/schedule') .set('kbn-xsrf', 'xxx') .send({ task }) .expect(200) - .then((response: { body: ConcreteTaskInstance }) => response.body); + .then((response: { body: SerializedConcreteTaskInstance }) => response.body); } - function runTaskNow(task) { + function runTaskNow(task: { id: string }) { return supertest .post('/api/sample_tasks/run_now') .set('kbn-xsrf', 'xxx') @@ -115,7 +134,7 @@ export 
default function ({ getService }: FtrProviderContext) { .then((response) => response.body); } - function scheduleTaskIfNotExists(task) { + function scheduleTaskIfNotExists(task: Partial) { return supertest .post('/api/sample_tasks/ensure_scheduled') .set('kbn-xsrf', 'xxx') @@ -124,7 +143,7 @@ export default function ({ getService }: FtrProviderContext) { .then((response: { body: ConcreteTaskInstance }) => response.body); } - function releaseTasksWaitingForEventToComplete(event) { + function releaseTasksWaitingForEventToComplete(event: string) { return supertest .post('/api/sample_tasks/event') .set('kbn-xsrf', 'xxx') @@ -132,7 +151,10 @@ export default function ({ getService }: FtrProviderContext) { .expect(200); } - function getTaskById(tasks: ConcreteTaskInstance[], id: string) { + function getTaskById( + tasks: Array>, + id: string + ) { return tasks.filter((task) => task.id === id)[0]; } @@ -166,7 +188,7 @@ export default function ({ getService }: FtrProviderContext) { await retry.try(async () => { expect((await historyDocs()).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; log.debug(`Task found: ${task.id}`); log.debug(`Task status: ${task.status}`); log.debug(`Task state: ${JSON.stringify(task.state, null, 2)}`); @@ -251,7 +273,7 @@ export default function ({ getService }: FtrProviderContext) { await retry.try(async () => { expect((await historyDocs(originalTask.id)).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; expect(task.attempts).to.eql(0); expect(task.state.count).to.eql(count + 1); @@ -272,7 +294,7 @@ export default function ({ getService }: FtrProviderContext) { await retry.try(async () => { expect((await historyDocs()).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; expect(task.attempts).to.eql(0); 
expect(task.state.count).to.eql(1); @@ -293,7 +315,7 @@ export default function ({ getService }: FtrProviderContext) { await retry.try(async () => { expect((await historyDocs()).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; expect(task.attempts).to.eql(0); expect(task.state.count).to.eql(1); @@ -314,7 +336,7 @@ export default function ({ getService }: FtrProviderContext) { 1 ); - const [task] = (await currentTasks()).docs.filter( + const [task] = (await currentTasks<{ count: number }>()).docs.filter( (taskDoc) => taskDoc.id === originalTask.id ); @@ -337,7 +359,7 @@ export default function ({ getService }: FtrProviderContext) { .length ).to.eql(2); - const [task] = (await currentTasks()).docs.filter( + const [task] = (await currentTasks<{ count: number }>()).docs.filter( (taskDoc) => taskDoc.id === originalTask.id ); expect(task.state.count).to.eql(2); @@ -358,7 +380,7 @@ export default function ({ getService }: FtrProviderContext) { const docs = await historyDocs(originalTask.id); expect(docs.length).to.eql(1); - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(1); @@ -408,16 +430,16 @@ export default function ({ getService }: FtrProviderContext) { expect(await runNowResult).to.eql({ id: originalTask.id }); await retry.try(async () => { - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(2); }); // drain tasks, othrwise they'll keep Task Manager stalled await retry.try(async () => { await releaseTasksWaitingForEventToComplete('releaseTheOthers'); - const tasks = (await currentTasks()).docs.filter( - (task) => task.params.originalParams.waitForEvent === 'releaseTheOthers' - ); + const tasks = ( + await currentTasks<{}, { originalParams: { waitForEvent: string } }>() + 
).docs.filter((task) => task.params.originalParams.waitForEvent === 'releaseTheOthers'); expect(tasks.length).to.eql(0); }); }); @@ -435,7 +457,7 @@ export default function ({ getService }: FtrProviderContext) { 1 ); - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(1); expect(task.status).to.eql('idle'); @@ -452,7 +474,7 @@ export default function ({ getService }: FtrProviderContext) { expect(successfulRunNowResult).to.eql({ id: originalTask.id }); await retry.try(async () => { - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(2); expect(task.status).to.eql('idle'); }); @@ -530,7 +552,7 @@ export default function ({ getService }: FtrProviderContext) { // finish first run by emitting 'runNowHasBeenAttempted' event await releaseTasksWaitingForEventToComplete('runNowHasBeenAttempted'); await retry.try(async () => { - const tasks = (await currentTasks()).docs; + const tasks = (await currentTasks<{ count: number }>()).docs; expect(getTaskById(tasks, longRunningTask.id).state.count).to.eql(1); const task = await currentTask(longRunningTask.id); @@ -579,7 +601,11 @@ export default function ({ getService }: FtrProviderContext) { expect(await runNowResultWithExpectedFailure).to.eql({ id: taskThatFailsBeforeRunNow.id }); }); - async function expectReschedule(originalRunAt: number, task: Task, expectedDiff: number) { + async function expectReschedule( + originalRunAt: number, + task: SerializedConcreteTaskInstance, + expectedDiff: number + ) { const buffer = 10000; expect(Date.parse(task.runAt) - originalRunAt).to.be.greaterThan(expectedDiff - buffer); expect(Date.parse(task.runAt) - originalRunAt).to.be.lessThan(expectedDiff + buffer); @@ -607,14 +633,14 @@ export default function ({ getService }: FtrProviderContext) { }); await retry.try(async () => { - const tasks = 
(await currentTasks()).docs; + const tasks = (await currentTasks<{ count: number }>()).docs; expect(getTaskById(tasks, fastTask.id).state.count).to.eql(2); }); await releaseTasksWaitingForEventToComplete('rescheduleHasHappened'); await retry.try(async () => { - const tasks = (await currentTasks()).docs; + const tasks = (await currentTasks<{ count: number }>()).docs; expect(getTaskById(tasks, fastTask.id).state.count).to.greaterThan(2); expect(getTaskById(tasks, longRunningTask.id).state.count).to.eql(1); From 7e27d7b9645f3696bd045b52eaa1d2729e1cf80b Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Wed, 23 Sep 2020 13:53:23 +0100 Subject: [PATCH 05/42] added scheduling to health endpoint --- .../server/lib/bulk_operation_buffer.ts | 2 +- .../server/lib/correct_deprecated_fields.ts | 2 +- .../task_manager/server/lib/intervals.test.ts | 37 +-- .../task_manager/server/lib/intervals.ts | 94 +++----- .../task_manager/server/monitoring/index.ts | 10 +- .../monitoring/workload_statistics.test.ts | 210 +++++++++++++----- .../server/monitoring/workload_statistics.ts | 69 +++--- x-pack/plugins/task_manager/server/plugin.ts | 9 +- .../task_manager/server/task_manager.ts | 2 +- .../plugins/task_manager/server/task_pool.ts | 2 +- .../task_manager/server/task_runner.test.ts | 4 +- .../task_manager/server/task_runner.ts | 2 +- .../task_manager/server/test_utils/index.ts | 9 +- x-pack/plugins/task_manager/server/types.ts | 7 - .../test_suites/task_manager/health_route.ts | 37 ++- .../test_suites/task_manager/index.ts | 2 +- 16 files changed, 295 insertions(+), 203 deletions(-) diff --git a/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts b/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts index 57a14c2f8a56b..4de92ffc77030 100644 --- a/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts +++ b/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts @@ -4,11 +4,11 @@ * you may not use this file except in compliance with the
Elastic License. */ +import { Logger } from 'src/core/server'; import { map } from 'lodash'; import { Subject, race, from } from 'rxjs'; import { bufferWhen, filter, bufferCount, flatMap, mapTo, first } from 'rxjs/operators'; import { either, Result, asOk, asErr, Ok, Err } from './result_type'; -import { Logger } from '../types'; export interface BufferOptions { bufferMaxDuration?: number; diff --git a/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts b/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts index 2de95cbb8c2fa..a15682a9d3f38 100644 --- a/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts +++ b/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts @@ -4,8 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ +import { Logger } from 'src/core/server'; import { TaskInstance, TaskInstanceWithDeprecatedFields } from '../task'; -import { Logger } from '../types'; export function ensureDeprecatedFieldsAreCorrected( { id, taskType, interval, schedule, ...taskInstance }: TaskInstanceWithDeprecatedFields, diff --git a/x-pack/plugins/task_manager/server/lib/intervals.test.ts b/x-pack/plugins/task_manager/server/lib/intervals.test.ts index ac28b81eaf490..3554f8d8294f2 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.test.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.test.ts @@ -7,11 +7,9 @@ import _ from 'lodash'; import sinon from 'sinon'; import { - assertValidInterval, + parseIntervalAsSecond, intervalFromNow, intervalFromDate, - minutesFromNow, - minutesFromDate, secondsFromNow, secondsFromDate, } from './intervals'; @@ -25,29 +23,29 @@ beforeAll(() => { afterAll(() => fakeTimer.restore()); describe('taskIntervals', () => { - describe('assertValidInterval', () => { + describe('parseIntervalAsSecond', () => { test('it accepts intervals in the form `Nm`', () => { - expect(() => assertValidInterval(`${_.random(1, 1000)}m`)).not.toThrow(); + 
expect(() => parseIntervalAsSecond(`${_.random(1, 1000)}m`)).not.toThrow(); }); test('it accepts intervals in the form `Ns`', () => { - expect(() => assertValidInterval(`${_.random(1, 1000)}s`)).not.toThrow(); + expect(() => parseIntervalAsSecond(`${_.random(1, 1000)}s`)).not.toThrow(); }); test('it rejects 0 based intervals', () => { - expect(() => assertValidInterval('0m')).toThrow( + expect(() => parseIntervalAsSecond('0m')).toThrow( /Invalid interval "0m"\. Intervals must be of the form {number}m. Example: 5m/ ); - expect(() => assertValidInterval('0s')).toThrow( + expect(() => parseIntervalAsSecond('0s')).toThrow( /Invalid interval "0s"\. Intervals must be of the form {number}m. Example: 5m/ ); }); test('it rejects intervals are not of the form `Nm` or `Ns`', () => { - expect(() => assertValidInterval(`5m 2s`)).toThrow( + expect(() => parseIntervalAsSecond(`5m 2s`)).toThrow( /Invalid interval "5m 2s"\. Intervals must be of the form {number}m. Example: 5m/ ); - expect(() => assertValidInterval(`hello`)).toThrow( + expect(() => parseIntervalAsSecond(`hello`)).toThrow( /Invalid interval "hello"\. Intervals must be of the form {number}m. 
Example: 5m/ ); }); @@ -125,25 +123,6 @@ describe('taskIntervals', () => { }); }); - describe('minutesFromNow', () => { - test('it returns the current date plus a number of minutes', () => { - const mins = _.random(1, 100); - const expected = Date.now() + mins * 60 * 1000; - const nextRun = minutesFromNow(mins).getTime(); - expect(nextRun).toEqual(expected); - }); - }); - - describe('minutesFromDate', () => { - test('it returns the given date plus a number of minutes', () => { - const originalDate = new Date(2019, 1, 1); - const mins = _.random(1, 100); - const expected = originalDate.valueOf() + mins * 60 * 1000; - const nextRun = minutesFromDate(originalDate, mins).getTime(); - expect(expected).toEqual(nextRun); - }); - }); - describe('secondsFromNow', () => { test('it returns the current date plus a number of seconds', () => { const secs = _.random(1, 100); diff --git a/x-pack/plugins/task_manager/server/lib/intervals.ts b/x-pack/plugins/task_manager/server/lib/intervals.ts index 9009be5f78220..967251e6d717f 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.ts @@ -4,6 +4,22 @@ * you may not use this file except in compliance with the Elastic License. */ +import { memoize } from 'lodash'; + +export enum IntervalCadence { + Minute = 'm', + Second = 's', +} +const VALID_CADENCE = new Set(Object.values(IntervalCadence)); +const CADENCE_IN_SECONDS: Record = { + [IntervalCadence.Second]: 1, + [IntervalCadence.Minute]: 60, +}; + +function isCadence(cadence: IntervalCadence | string): cadence is IntervalCadence { + return VALID_CADENCE.has(cadence as IntervalCadence); +} + /** * Returns a date that is the specified interval from now. Currently, * only minute-intervals and second-intervals are supported. 
@@ -14,14 +30,7 @@ export function intervalFromNow(interval?: string): Date | undefined { if (interval === undefined) { return; } - - assertValidInterval(interval); - - if (isSeconds(interval)) { - return secondsFromNow(parseInterval(interval)); - } - - return minutesFromNow(parseInterval(interval)); + return secondsFromNow(parseIntervalAsSecond(interval)); } /** @@ -35,37 +44,7 @@ export function intervalFromDate(date: Date, interval?: string): Date | undefine if (interval === undefined) { return; } - - assertValidInterval(interval); - - if (isSeconds(interval)) { - return secondsFromDate(date, parseInterval(interval)); - } - - return minutesFromDate(date, parseInterval(interval)); -} - -/** - * Returns a date that is mins minutes from now. - * - * @param mins The number of mintues from now - */ -export function minutesFromNow(mins: number): Date { - return minutesFromDate(new Date(), mins); -} - -/** - * Returns a date that is mins minutes from given date. - * - * @param date The date to add minutes to - * @param mins The number of mintues from given date - */ -export function minutesFromDate(date: Date, mins: number): Date { - const result = new Date(date.valueOf()); - - result.setMinutes(result.getMinutes() + mins); - - return result; + return secondsFromDate(date, parseIntervalAsSecond(interval)); } /** @@ -85,9 +64,7 @@ export function secondsFromNow(secs: number): Date { */ export function secondsFromDate(date: Date, secs: number): Date { const result = new Date(date.valueOf()); - result.setSeconds(result.getSeconds() + secs); - return result; } @@ -95,29 +72,18 @@ export function secondsFromDate(date: Date, secs: number): Date { * Verifies that the specified interval matches our expected format. 
* * @param {string} interval - An interval such as `5m` or `10s` + * @returns {number} The interval as seconds */ -export function assertValidInterval(interval: string) { - if (isMinutes(interval)) { - return interval; +export const parseIntervalAsSecond = memoize((interval: string): number => { + const numericAsStr: string = interval.slice(0, -1); + const numeric: number = parseInt(numericAsStr, 10); + const cadence: IntervalCadence | string = interval.slice(-1); + if (!isCadence(cadence) || isNaN(numeric) || numeric <= 0 || !isNumeric(numericAsStr)) { + throw new Error( + `Invalid interval "${interval}". Intervals must be of the form {number}m. Example: 5m.` + ); } + return numeric * CADENCE_IN_SECONDS[cadence]; +}); - if (isSeconds(interval)) { - return interval; - } - - throw new Error( - `Invalid interval "${interval}". Intervals must be of the form {number}m. Example: 5m.` - ); -} - -function parseInterval(interval: string) { - return parseInt(interval, 10); -} - -function isMinutes(interval: string) { - return /^[1-9][0-9]*m$/.test(interval); -} - -function isSeconds(interval: string) { - return /^[1-9][0-9]*s$/.test(interval); -} +const isNumeric = (numAsStr: string) => /^\d+$/.test(numAsStr); diff --git a/x-pack/plugins/task_manager/server/monitoring/index.ts b/x-pack/plugins/task_manager/server/monitoring/index.ts index cf75294be1266..347731752d852 100644 --- a/x-pack/plugins/task_manager/server/monitoring/index.ts +++ b/x-pack/plugins/task_manager/server/monitoring/index.ts @@ -3,6 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ +import { Logger } from 'src/core/server'; import { TaskManager } from '../task_manager'; import { AggregatedStatProvider } from './runtime_statistics_aggregator'; import { createWorkloadAggregator } from './workload_statistics'; @@ -12,7 +13,12 @@ export { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_agg export function createAggregatedStatsStream( taskManager: TaskManager, - config: TaskManagerConfig + config: TaskManagerConfig, + logger: Logger ): AggregatedStatProvider { - return createWorkloadAggregator(taskManager, config.monitored_aggregated_stats_refresh_rate); + return createWorkloadAggregator( + taskManager, + config.monitored_aggregated_stats_refresh_rate, + logger + ); } diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index 32e8c21112398..f85a6571899ec 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -6,8 +6,9 @@ import { createWorkloadAggregator } from './workload_statistics'; import { taskManagerMock } from '../task_manager.mock'; -import { first } from 'rxjs/operators'; +import { first, take, bufferCount } from 'rxjs/operators'; import { AggregationResult } from '../queries/aggregation_clauses'; +import { mockLogger } from '../test_utils'; describe('Workload Statistics Aggregator', () => { test('queries the Task Store at a fixed interval for the current workload', async () => { @@ -18,10 +19,13 @@ describe('Workload Statistics Aggregator', () => { taskType: { buckets: [], }, + schedule: { + buckets: [], + }, }, } as unknown) as AggregationResult); - const workloadAggregator = createWorkloadAggregator(taskManager, 10); + const workloadAggregator = createWorkloadAggregator(taskManager, 10, mockLogger()); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe(() => { @@ 
-35,6 +39,11 @@ describe('Workload Statistics Aggregator', () => { }, }, }, + schedule: { + terms: { + field: 'task.schedule.interval', + }, + }, }, }); resolve(); @@ -42,70 +51,122 @@ describe('Workload Statistics Aggregator', () => { }); }); - test('returns a summary of the workload by task type', async () => { - const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(({ + const mockAggregatedResult = ({ + task: { + doc_count: 4, + schedule: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: '3600s', + doc_count: 1, + }, + { + key: '60s', + doc_count: 1, + }, + { + key: '720m', + doc_count: 1, + }, + ], + }, + taskType: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'actions_telemetry', + doc_count: 2, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'idle', + doc_count: 2, + }, + ], + }, + }, + { + key: 'alerting_telemetry', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + { + key: 'session_cleanup', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + ], + }, + }, + } as unknown) as AggregationResult; + + function setTaskTypeCount( + result: AggregationResult, + taskType: string, + status: Record + ) { + const buckets = [ + ...result.task.taskType.buckets.filter(({ key }) => key !== taskType), + { + key: taskType, + doc_count: Object.values(status).reduce((sum, count) => sum + count, 0), + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: Object.entries(status).map(([key, count]) => ({ + key, + doc_count: count, + })), + }, + }, + ]; + return ({ task: { - doc_count: 4, + doc_count: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), taskType: { 
doc_count_error_upper_bound: 0, sum_other_doc_count: 0, - buckets: [ - { - key: 'actions_telemetry', - doc_count: 2, - status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: 'idle', - doc_count: 2, - }, - ], - }, - }, - { - key: 'alerting_telemetry', - doc_count: 1, - status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: 'idle', - doc_count: 1, - }, - ], - }, - }, - { - key: 'session_cleanup', - doc_count: 1, - status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: 'idle', - doc_count: 1, - }, - ], - }, - }, - ], + buckets, }, }, - } as unknown) as AggregationResult); + } as unknown) as AggregationResult; + } - const workloadAggregator = createWorkloadAggregator(taskManager, 10); + test('returns a summary of the workload by task type', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + + const workloadAggregator = createWorkloadAggregator(taskManager, 10, mockLogger()); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { expect(result.key).toEqual('workload'); expect(result.value).toMatchObject({ sum: 4, - types: { + taskTypes: { actions_telemetry: { sum: 2, status: { idle: 2 } }, alerting_telemetry: { sum: 1, status: { idle: 1 } }, session_cleanup: { sum: 1, status: { idle: 1 } }, @@ -115,4 +176,47 @@ describe('Workload Statistics Aggregator', () => { }); }); }); + + test('recovers from errors fetching the workload', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate + .mockResolvedValueOnce( + setTaskTypeCount(mockAggregatedResult, 'alerting_telemetry', { + idle: 2, + }) + ) + .mockRejectedValueOnce(new Error('Elasticsearch has gone poof')) + .mockResolvedValueOnce( + setTaskTypeCount(mockAggregatedResult, 'alerting_telemetry', { + idle: 1, + failed: 1, + }) + ); + const logger = mockLogger(); + 
const workloadAggregator = createWorkloadAggregator(taskManager, 10, logger); + + return new Promise((resolve) => { + workloadAggregator.pipe(take(2), bufferCount(2)).subscribe((results) => { + expect(results[0].key).toEqual('workload'); + expect(results[0].value).toMatchObject({ + sum: 5, + taskTypes: { + actions_telemetry: { sum: 2, status: { idle: 2 } }, + alerting_telemetry: { sum: 2, status: { idle: 2 } }, + session_cleanup: { sum: 1, status: { idle: 1 } }, + }, + }); + expect(results[1].key).toEqual('workload'); + expect(results[1].value).toMatchObject({ + sum: 5, + taskTypes: { + actions_telemetry: { sum: 2, status: { idle: 2 } }, + alerting_telemetry: { sum: 2, status: { idle: 1, failed: 1 } }, + session_cleanup: { sum: 1, status: { idle: 1 } }, + }, + }); + resolve(); + }); + }); + }); }); diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 8e73d88bea25b..6cb6be9797807 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -5,7 +5,8 @@ */ import { interval } from 'rxjs'; -import { concatMap, map } from 'rxjs/operators'; +import { concatMap, map, catchError } from 'rxjs/operators'; +import { Logger } from 'src/core/server'; import { JsonObject } from 'src/plugins/kibana_utils/common'; import { keyBy, mapValues } from 'lodash'; import { AggregatedStatProvider } from './runtime_statistics_aggregator'; @@ -15,10 +16,12 @@ import { AggregationBucketWithSubAgg, AggregationBucket, } from '../queries/aggregation_clauses'; +import { parseIntervalAsSecond } from '../lib/intervals'; export function createWorkloadAggregator( taskManager: TaskManager, - refreshInterval: number + refreshInterval: number, + logger: Logger ): AggregatedStatProvider { return interval(refreshInterval).pipe( concatMap(() => @@ -32,34 +35,46 @@ export function 
createWorkloadAggregator( }, }, }, + schedule: { + terms: { field: 'task.schedule.interval' }, + }, }, }) ), - map( - ({ - task: { - doc_count: sum, - taskType: { buckets: types }, - }, - }: AggregationResult<'task' | 'taskType' | 'status'>) => { - const summary: JsonObject = { - sum, - types: mapValues( - keyBy>( - types as Array>, - 'key' - ), - ({ doc_count: docCount, status }) => ({ - sum: docCount, - status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'), - }) + map(({ task }: AggregationResult<'task' | 'taskType' | 'schedule' | 'status'>) => { + const { + doc_count: sum = 0, + taskType: { buckets: taskTypes = [] } = {}, + schedule: { buckets: schedules = [] } = {}, + } = task; + const summary: JsonObject = { + sum, + taskTypes: mapValues( + keyBy>( + taskTypes as Array>, + 'key' ), - }; - return { - key: 'workload', - value: summary, - }; - } - ) + ({ doc_count: docCount, status }) => ({ + sum: docCount, + status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'), + }) + ), + schedule: (schedules as AggregationBucket[]) + .sort( + ({ key: scheduleLeft }, { key: scheduleRight }) => + parseIntervalAsSecond(scheduleLeft) - parseIntervalAsSecond(scheduleRight) + ) + .map(({ key: schedule, doc_count: count }) => [schedule, count]), + }; + return { + key: 'workload', + value: summary, + }; + }), + catchError((ex: Error, caught) => { + logger.error(`[WorkloadAggregator]: ${ex}`); + // continue to pull values from the same observable + return caught; + }) ); } diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 715d8cf1b4d00..3a4577db01b49 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -import { PluginInitializerContext, Plugin, CoreSetup, CoreStart } from 'src/core/server'; +import { PluginInitializerContext, Plugin, CoreSetup, CoreStart, Logger } from 'src/core/server'; import { Subject } from 'rxjs'; import { first } from 'rxjs/operators'; import { TaskDictionary, TaskDefinition } from './task'; @@ -31,13 +31,16 @@ export class TaskManagerPlugin currentConfig: TaskManagerConfig; taskManagerId?: string; config?: TaskManagerConfig; + logger: Logger; constructor(private readonly initContext: PluginInitializerContext) { this.initContext = initContext; this.currentConfig = {} as TaskManagerConfig; + this.logger = initContext.logger.get('taskManager'); } public async setup(core: CoreSetup): Promise { + const { logger } = this; const config = (this.config = await this.initContext.config .create() .pipe(first()) @@ -51,7 +54,7 @@ export class TaskManagerPlugin healthRoute( router, config, - this.taskManager.then((tm) => createAggregatedStatsStream(tm, config)), + this.taskManager.then((tm) => createAggregatedStatsStream(tm, config, logger)), // if health is any more stale than the pollInterval (+1s buffer) consider the system unhealthy config.poll_interval + 1000 ); @@ -67,7 +70,7 @@ export class TaskManagerPlugin } public start({ savedObjects, elasticsearch }: CoreStart): TaskManagerStartContract { - const logger = this.initContext.logger.get('taskManager'); + const { logger } = this; const savedObjectsRepository = savedObjects.createInternalRepository(['task']); this.legacyTaskManager$.next( diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index 7df3186ca8ecf..44e409a2aec37 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -3,6 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ +import { Logger } from 'src/core/server'; import { Subject, Observable, Subscription } from 'rxjs'; import { filter } from 'rxjs/operators'; @@ -19,7 +20,6 @@ import { import { Result, asOk, asErr, either, map, mapErr, promiseResult } from './lib/result_type'; import { TaskManagerConfig } from './config'; -import { Logger } from './types'; import { TaskMarkRunning, TaskRun, diff --git a/x-pack/plugins/task_manager/server/task_pool.ts b/x-pack/plugins/task_manager/server/task_pool.ts index 92374908c60f7..ce7cd2bba92d6 100644 --- a/x-pack/plugins/task_manager/server/task_pool.ts +++ b/x-pack/plugins/task_manager/server/task_pool.ts @@ -11,7 +11,7 @@ import moment, { Duration } from 'moment'; import { performance } from 'perf_hooks'; import { padStart } from 'lodash'; -import { Logger } from './types'; +import { Logger } from 'src/core/server'; import { TaskRunner } from './task_runner'; import { isTaskSavedObjectNotFoundError } from './lib/is_task_not_found_error'; diff --git a/x-pack/plugins/task_manager/server/task_runner.test.ts b/x-pack/plugins/task_manager/server/task_runner.test.ts index c3191dbb349e6..81fe097f43690 100644 --- a/x-pack/plugins/task_manager/server/task_runner.test.ts +++ b/x-pack/plugins/task_manager/server/task_runner.test.ts @@ -6,7 +6,7 @@ import _ from 'lodash'; import sinon from 'sinon'; -import { minutesFromNow } from './lib/intervals'; +import { secondsFromNow } from './lib/intervals'; import { asOk, asErr } from './lib/result_type'; import { TaskEvent, asTaskRunEvent, asTaskMarkRunningEvent } from './task_events'; import { ConcreteTaskInstance, TaskStatus, TaskDictionary, TaskDefinition } from './task'; @@ -15,6 +15,8 @@ import { mockLogger } from './test_utils'; import { SavedObjectsErrorHelpers } from '../../../../src/core/server'; import moment from 'moment'; +const minutesFromNow = (mins: number): Date => secondsFromNow(mins * 60); + let fakeTimer: sinon.SinonFakeTimers; beforeAll(() => { diff --git 
a/x-pack/plugins/task_manager/server/task_runner.ts b/x-pack/plugins/task_manager/server/task_runner.ts index ebf13fac2f311..87d1938393f68 100644 --- a/x-pack/plugins/task_manager/server/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_runner.ts @@ -10,6 +10,7 @@ * rescheduling, middleware application, etc. */ +import { Logger } from 'src/core/server'; import apm from 'elastic-apm-node'; import { performance } from 'perf_hooks'; import Joi from 'joi'; @@ -18,7 +19,6 @@ import { identity, defaults, flow } from 'lodash'; import { asOk, asErr, mapErr, eitherAsync, unwrap, mapOk, Result } from './lib/result_type'; import { TaskRun, TaskMarkRunning, asTaskRunEvent, asTaskMarkRunningEvent } from './task_events'; import { intervalFromDate, intervalFromNow } from './lib/intervals'; -import { Logger } from './types'; import { BeforeRunFunction, BeforeMarkRunningFunction } from './lib/middleware'; import { CancelFunction, diff --git a/x-pack/plugins/task_manager/server/test_utils/index.ts b/x-pack/plugins/task_manager/server/test_utils/index.ts index 6f43a60ff42d2..a732aaf884668 100644 --- a/x-pack/plugins/task_manager/server/test_utils/index.ts +++ b/x-pack/plugins/task_manager/server/test_utils/index.ts @@ -3,6 +3,8 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ +import { Logger } from 'src/core/server'; +import { loggingSystemMock } from 'src/core/server/mocks'; /* * A handful of helper functions for testing the task manager. @@ -15,12 +17,7 @@ const nativeTimeout = setTimeout; * Creates a mock task manager Logger. 
*/ export function mockLogger() { - return { - info: jest.fn(), - debug: jest.fn(), - warn: jest.fn(), - error: jest.fn(), - }; + return loggingSystemMock.create().get() as jest.Mocked; } export interface Resolvable { diff --git a/x-pack/plugins/task_manager/server/types.ts b/x-pack/plugins/task_manager/server/types.ts index a38730ad7f768..c86ae1c3fd98d 100644 --- a/x-pack/plugins/task_manager/server/types.ts +++ b/x-pack/plugins/task_manager/server/types.ts @@ -7,10 +7,3 @@ import { TaskManager as TaskManagerClass } from './task_manager'; export type TaskManager = PublicMethodsOf; - -export interface Logger { - info(message: string): void; - debug(message: string): void; - warn(message: string): void; - error(message: string): void; -} diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 9cc7b61744432..c3c15c7ba4810 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -6,6 +6,7 @@ import expect from '@kbn/expect'; import url from 'url'; +import { keyBy, mapValues } from 'lodash'; import supertestAsPromised from 'supertest-as-promised'; import { FtrProviderContext } from '../../ftr_provider_context'; import { ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; @@ -50,36 +51,62 @@ export default function ({ getService }: FtrProviderContext) { const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); + const monitoredAggregatedStatsRefreshRate = 5000; + describe('health', () => { it('should return basic configuration of task manager', async () => { expect((await getHealth()).stats.configuration.value).to.eql({ poll_interval: 3000, max_poll_inactivity_cycles: 10, + monitored_aggregated_stats_refresh_rate: monitoredAggregatedStatsRefreshRate, request_capacity: 1000, max_workers: 10, }); }); 
it('should return the task manager workload', async () => { + const workload = (await getHealth()).stats.workload; const sumSampleTaskInWorkload = - ((await getHealth()).stats.workload.value.types as { + (workload.value.taskTypes as { sampleTask?: { sum: number }; }).sampleTask?.sum ?? 0; + const schedulesWorkload = (mapValues( + keyBy(workload.value.schedule as Array<[string, number]>, ([interval, count]) => interval), + ([, count]) => count + ) as unknown) as { '37m': number | undefined; '37s': number | undefined }; + + await scheduleTask({ + taskType: 'sampleTask', + schedule: { interval: '37s' }, + }); await scheduleTask({ taskType: 'sampleTask', - schedule: { interval: '1m' }, + schedule: { interval: '37m' }, }); await retry.try(async () => { // workload is configured to refresh every 5s in FTs - await delay(5000); + await delay(monitoredAggregatedStatsRefreshRate); const workloadAfterScheduling = (await getHealth()).stats.workload.value; expect( - (workloadAfterScheduling.types as { sampleTask: { sum: number } }).sampleTask.sum - ).to.eql(sumSampleTaskInWorkload + 1); + (workloadAfterScheduling.taskTypes as { sampleTask: { sum: number } }).sampleTask.sum + ).to.eql(sumSampleTaskInWorkload + 2); + + const schedulesWorkloadAfterScheduling = (mapValues( + keyBy( + workloadAfterScheduling.schedule as Array<[string, number]>, + ([interval]) => interval + ), + ([, count]) => count + ) as unknown) as { + '37m': number; + '37s': number; + }; + expect(schedulesWorkloadAfterScheduling['37s']).to.eql(schedulesWorkload['37s'] ?? 0 + 1); + expect(schedulesWorkloadAfterScheduling['37m']).to.eql(schedulesWorkload['37m'] ?? 
0 + 1); }); }); }); diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts index 5eb1353dd1291..b542bff3a4aa9 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts @@ -9,7 +9,7 @@ import { FtrProviderContext } from '../../ftr_provider_context'; export default function ({ loadTestFile }: FtrProviderContext) { describe('task_manager', function taskManagerSuite() { this.tags('ciGroup2'); - loadTestFile(require.resolve('./task_management')); loadTestFile(require.resolve('./health_route')); + loadTestFile(require.resolve('./task_management')); }); } From 0a22c432f28ab03b9aa17b62a8d33cca903e169c Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Wed, 23 Sep 2020 14:46:00 +0100 Subject: [PATCH 06/42] fixed tests --- x-pack/plugins/actions/server/create_execute_function.test.ts | 2 +- x-pack/plugins/task_manager/server/task_manager.test.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/x-pack/plugins/actions/server/create_execute_function.test.ts b/x-pack/plugins/actions/server/create_execute_function.test.ts index cfbc68879ae0e..d0500e37ceedf 100644 --- a/x-pack/plugins/actions/server/create_execute_function.test.ts +++ b/x-pack/plugins/actions/server/create_execute_function.test.ts @@ -15,7 +15,7 @@ import { asSavedObjectExecutionSource, } from './lib/action_execution_source'; -const mockTaskManager = taskManagerMock.start(); +const mockTaskManager = taskManagerMock.createStart(); const savedObjectsClient = savedObjectsClientMock.create(); const request = {} as KibanaRequest; diff --git a/x-pack/plugins/task_manager/server/task_manager.test.ts b/x-pack/plugins/task_manager/server/task_manager.test.ts index cf7f9e2a7cff3..017540a2dcc55 100644 --- a/x-pack/plugins/task_manager/server/task_manager.test.ts +++ 
b/x-pack/plugins/task_manager/server/task_manager.test.ts @@ -41,6 +41,7 @@ describe('TaskManager', () => { max_attempts: 9, poll_interval: 6000000, max_poll_inactivity_cycles: 10, + monitored_aggregated_stats_refresh_rate: 5000, request_capacity: 1000, }; const taskManagerOpts = { From 7c226e968808a7833e43a4889f8425209599e542 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Wed, 23 Sep 2020 16:17:49 +0100 Subject: [PATCH 07/42] fixed typing --- x-pack/plugins/task_manager/server/task_manager.mock.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/plugins/task_manager/server/task_manager.mock.ts b/x-pack/plugins/task_manager/server/task_manager.mock.ts index 8afaa22515668..ae71ea5c23793 100644 --- a/x-pack/plugins/task_manager/server/task_manager.mock.ts +++ b/x-pack/plugins/task_manager/server/task_manager.mock.ts @@ -18,6 +18,7 @@ const createTaskManagerMock = () => { runNow: jest.fn(), remove: jest.fn(), start: jest.fn(), + isStarted: jest.fn(() => true), stop: jest.fn(), } as jest.Mocked; }; From e1ee96774de6edbbb54b785f1d9020278614df3d Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Wed, 23 Sep 2020 17:08:47 +0100 Subject: [PATCH 08/42] fixed typing again --- .../plugins/task_manager/server/task_manager.mock.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/x-pack/plugins/task_manager/server/task_manager.mock.ts b/x-pack/plugins/task_manager/server/task_manager.mock.ts index ae71ea5c23793..e5325274024d8 100644 --- a/x-pack/plugins/task_manager/server/task_manager.mock.ts +++ b/x-pack/plugins/task_manager/server/task_manager.mock.ts @@ -6,8 +6,8 @@ import { TaskManager } from './task_manager'; -const createTaskManagerMock = () => { - return { +const createTaskManagerMock = (isStarted: boolean = true) => { + return ({ registerTaskDefinitions: jest.fn(), addMiddleware: jest.fn(), ensureScheduled: jest.fn(), @@ -18,9 +18,11 @@ const createTaskManagerMock = () => { runNow: jest.fn(), remove: jest.fn(), start: 
jest.fn(), - isStarted: jest.fn(() => true), + get isStarted() { + return isStarted; + }, stop: jest.fn(), - } as jest.Mocked; + } as unknown) as jest.Mocked; }; export const taskManagerMock = { From abec231aee9fdca0d543cbe10537d8eaca90bdda Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 28 Sep 2020 14:50:10 +0100 Subject: [PATCH 09/42] added task runtime stats to health endpoint --- .../task_manager/server/config.test.ts | 1 + x-pack/plugins/task_manager/server/config.ts | 7 + .../task_manager/server/monitoring/index.ts | 27 ++- .../monitoring_stats_stream.test.ts | 155 ++++++++++++++ .../monitoring/monitoring_stats_stream.ts | 127 ++++++++++++ .../runtime_statistics_aggregator.ts | 10 +- .../monitoring/task_run_statistics.test.ts | 193 ++++++++++++++++++ .../server/monitoring/task_run_statistics.ts | 166 +++++++++++++++ .../monitoring/workload_statistics.test.ts | 2 +- .../server/monitoring/workload_statistics.ts | 24 ++- x-pack/plugins/task_manager/server/plugin.ts | 6 +- .../task_manager/server/routes/health.test.ts | 144 ++----------- .../task_manager/server/routes/health.ts | 97 +++------ .../task_manager/server/task_events.ts | 20 +- .../task_manager/server/task_manager.mock.ts | 14 +- .../task_manager/server/task_manager.test.ts | 1 + .../task_manager/server/task_manager.ts | 51 +++-- .../test_suites/task_manager/health_route.ts | 35 +++- 18 files changed, 837 insertions(+), 243 deletions(-) create mode 100644 x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts create mode 100644 x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts create mode 100644 x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts create mode 100644 x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index 2eb132185ff70..f0c1937638991 100644 --- 
a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -16,6 +16,7 @@ describe('config validation', () => { "max_poll_inactivity_cycles": 10, "max_workers": 10, "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_running_average_window": 50, "poll_interval": 3000, "request_capacity": 1000, } diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index 1b79c17220f4e..a530cb2d44f4c 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -9,6 +9,7 @@ import { schema, TypeOf } from '@kbn/config-schema'; export const DEFAULT_MAX_WORKERS = 10; export const DEFAULT_POLL_INTERVAL = 3000; export const DEFAULT_MAX_POLL_INACTIVITY_CYCLES = 10; +export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50; // Refresh "pull based" monitored stats at a default rate of once a minute export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000; @@ -57,6 +58,12 @@ export const configSchema = schema.object({ /* don't run monitored stat aggregations any faster than once every 5 seconds */ min: 5000, }), + /* The size of the running average window for monitored stats. */ + monitored_stats_running_average_window: schema.number({ + defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW, + max: 100, + min: 10, + }), }); export type TaskManagerConfig = TypeOf; diff --git a/x-pack/plugins/task_manager/server/monitoring/index.ts b/x-pack/plugins/task_manager/server/monitoring/index.ts index 347731752d852..ef447d6ef0620 100644 --- a/x-pack/plugins/task_manager/server/monitoring/index.ts +++ b/x-pack/plugins/task_manager/server/monitoring/index.ts @@ -3,22 +3,29 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ + import { Logger } from 'src/core/server'; +import { Observable } from 'rxjs'; import { TaskManager } from '../task_manager'; -import { AggregatedStatProvider } from './runtime_statistics_aggregator'; -import { createWorkloadAggregator } from './workload_statistics'; import { TaskManagerConfig } from '../config'; +import { + MonitoringStats, + createAggregators, + createMonitoringStatsStream, +} from './monitoring_stats_stream'; -export { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; +export { + MonitoringStats, + RawMonitoringStats, + summarizeMonitoringStats, + createAggregators, + createMonitoringStatsStream, +} from './monitoring_stats_stream'; -export function createAggregatedStatsStream( +export function createMonitoringStats( taskManager: TaskManager, config: TaskManagerConfig, logger: Logger -): AggregatedStatProvider { - return createWorkloadAggregator( - taskManager, - config.monitored_aggregated_stats_refresh_rate, - logger - ); +): Observable { + return createMonitoringStatsStream(createAggregators(taskManager, config, logger), config); } diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts new file mode 100644 index 0000000000000..063947f2ecad7 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts @@ -0,0 +1,155 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { TaskManagerConfig } from '../config'; +import { of, Subject } from 'rxjs'; +import { take, bufferCount } from 'rxjs/operators'; +import { createMonitoringStatsStream, AggregatedStat } from './monitoring_stats_stream'; +import { JsonValue } from 'src/plugins/kibana_utils/common'; + +beforeEach(() => { + jest.resetAllMocks(); +}); + +describe('createMonitoringStatsStream', () => { + const configuration: TaskManagerConfig = { + enabled: true, + max_workers: 10, + index: 'foo', + max_attempts: 9, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + }; + + it('returns the initial config used to configure Task Manager', async () => { + return new Promise((resolve) => { + createMonitoringStatsStream(of(), configuration) + .pipe(take(1)) + .subscribe((firstValue) => { + expect(firstValue.stats).toMatchObject({ + configuration: { + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + }, + }, + }); + resolve(); + }); + }); + }); + + it('incrementally updates the stats returned by the endpoint', async () => { + const aggregatedStats$ = new Subject(); + + return new Promise((resolve) => { + createMonitoringStatsStream(aggregatedStats$, configuration) + .pipe(take(3), bufferCount(3)) + .subscribe(([initialValue, secondValue, thirdValue]) => { + expect(initialValue.stats).toMatchObject({ + lastUpdate: expect.any(String), + stats: { + configuration: { + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + }, + }, + }, + }); + + expect(secondValue).toMatchObject({ + lastUpdate: expect.any(String), + stats: { + 
newAggregatedStat: { + timestamp: expect.any(String), + value: { + some: { + complex: { + value: 123, + }, + }, + }, + }, + configuration: { + timestamp: expect.any(String), + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + }, + }, + }, + }); + + expect(thirdValue).toMatchObject({ + lastUpdate: expect.any(String), + stats: { + newAggregatedStat: { + timestamp: expect.any(String), + value: { + some: { + updated: { + value: 456, + }, + }, + }, + }, + configuration: { + timestamp: expect.any(String), + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + }, + }, + }, + }); + }); + + aggregatedStats$.next({ + key: 'newAggregatedStat', + value: { + some: { + complex: { + value: 123, + }, + }, + } as JsonValue, + }); + + aggregatedStats$.next({ + key: 'newAggregatedStat', + value: { + some: { + updated: { + value: 456, + }, + }, + } as JsonValue, + }); + + resolve(); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts new file mode 100644 index 0000000000000..03fa889fb732d --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -0,0 +1,127 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ +import { merge, of, Observable } from 'rxjs'; +import { map, scan } from 'rxjs/operators'; +import { set } from '@elastic/safer-lodash-set'; +import { pick } from 'lodash'; +import { Logger } from 'src/core/server'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { TaskManager } from '../task_manager'; +import { createWorkloadAggregator, WorkloadStat } from './workload_statistics'; +import { createTaskRunAggregator, summarizeTaskRunStat, TaskRunStat } from './task_run_statistics'; +import { TaskManagerConfig } from '../config'; +import { AggregatedStatProvider } from './runtime_statistics_aggregator'; + +export { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; + +const CONFIG_FIELDS_TO_EXPOSE = [ + 'max_workers', + 'poll_interval', + 'request_capacity', + 'max_poll_inactivity_cycles', + 'monitored_aggregated_stats_refresh_rate', + 'monitored_stats_running_average_window', +] as const; + +type ConfigStat = Pick; + +export interface MonitoringStats { + lastUpdate: string; + stats: { + configuration: { + timestamp: string; + value: ConfigStat; + }; + workload?: { + timestamp: string; + value: WorkloadStat; + }; + runtime?: { + timestamp: string; + value: TaskRunStat; + }; + }; +} + +interface MonitoredStat { + timestamp: string; + value: JsonObject; +} + +export interface RawMonitoringStats { + lastUpdate: string; + stats: Record; +} + +export function createAggregators( + taskManager: TaskManager, + config: TaskManagerConfig, + logger: Logger +): AggregatedStatProvider { + return merge( + createTaskRunAggregator(taskManager, config.monitored_stats_running_average_window, logger), + createWorkloadAggregator(taskManager, config.monitored_aggregated_stats_refresh_rate, logger) + ); +} + +export function createMonitoringStatsStream( + provider$: AggregatedStatProvider, + config: TaskManagerConfig +): Observable { + const initialStats = initializeStats(new Date().toISOString(), config); + return merge( + // 
emit the initial stats + of(initialStats), + // emit updated stats whenever a provider updates a specific key on the stats + provider$.pipe( + map(({ key, value }) => { + return { + value: { timestamp: new Date().toISOString(), value }, + key, + }; + }), + scan((monitoringStats: MonitoringStats, { key, value }) => { + // incrementally merge stats as they come in + set(monitoringStats.stats, key, value); + monitoringStats.lastUpdate = new Date().toISOString(); + return monitoringStats; + }, initialStats) + ) + ); +} + +export function summarizeMonitoringStats({ + lastUpdate, + stats: { runtime, ...otherStats }, +}: MonitoringStats): RawMonitoringStats { + return { + lastUpdate, + stats: { + ...((otherStats as unknown) as RawMonitoringStats['stats']), + ...(runtime + ? { + runtime: { + ...runtime, + value: summarizeTaskRunStat(runtime.value), + }, + } + : {}), + }, + }; +} + +const initializeStats = ( + initialisationTimestamp: string, + config: TaskManagerConfig +): MonitoringStats => ({ + lastUpdate: initialisationTimestamp, + stats: { + configuration: { + timestamp: initialisationTimestamp, + value: pick(config, ...CONFIG_FIELDS_TO_EXPOSE) as ConfigStat, + }, + }, +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts b/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts index f895bf2b02e6a..bd2b3845f2526 100644 --- a/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts +++ b/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts @@ -4,11 +4,13 @@ * you may not use this file except in compliance with the Elastic License. 
*/ import { Observable } from 'rxjs'; -import { JsonObject, JsonValue } from 'src/plugins/kibana_utils/common'; +import { JsonValue } from 'src/plugins/kibana_utils/common'; -export interface AggregatedStat { +export interface AggregatedStat { key: string; - value: JsonObject | JsonValue; + value: Stat; } -export type AggregatedStatProvider = Observable; +export type AggregatedStatProvider = Observable< + AggregatedStat +>; diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts new file mode 100644 index 0000000000000..365b8962146dc --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -0,0 +1,193 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import uuid from 'uuid'; +import { Subject } from 'rxjs'; +import stats from 'stats-lite'; +import sinon from 'sinon'; +import { take, tap, bufferCount, startWith, map } from 'rxjs/operators'; + +import { ConcreteTaskInstance, TaskStatus } from '../task'; +import { asTaskRunEvent, asTaskPollingCycleEvent } from '../task_events'; +import { asOk } from '../lib/result_type'; +import { TaskLifecycleEvent } from '../task_manager'; +import { + createTaskRunAggregator, + summarizeTaskRunStat, + TaskRunStat, + SummarizedTaskRunStat, +} from './task_run_statistics'; +import { taskManagerMock } from '../task_manager.mock'; +import { mockLogger } from '../test_utils'; +import { AggregatedStat } from './runtime_statistics_aggregator'; +import { FillPoolResult } from '../lib/fill_pool'; + +describe('Task Run Statistics', () => { + let fakeTimer: sinon.SinonFakeTimers; + + beforeAll(() => { + fakeTimer = sinon.useFakeTimers(); + }); + + afterAll(() => fakeTimer.restore()); + + 
test('returns a running average of task drift', async () => { + const runAtDrift = [1000, 2000, 500, 300, 400, 15000, 20000, 200]; + const taskManager = taskManagerMock.create({ + events: new Subject().pipe( + startWith( + ...runAtDrift.map((drift) => mockTaskRunEvent({ runAt: runAtMillisecondsAgo(drift) })) + ) + ), + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskManager, + runningAverageWindowSize, + mockLogger() + ); + + function expectWindowEqualsUpdate( + taskStat: AggregatedStat, + window: number[] + ) { + expect(taskStat.value.drift).toMatchObject({ + mean: stats.mean(window), + median: stats.median(window), + mode: stats.mode(window), + }); + } + + return new Promise((resolve) => { + taskRunAggregator + .pipe( + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value), + })), + take(runAtDrift.length), + bufferCount(runAtDrift.length) + ) + .subscribe((taskStats: Array>) => { + expectWindowEqualsUpdate(taskStats[0], runAtDrift.slice(0, 1)); + expectWindowEqualsUpdate(taskStats[1], runAtDrift.slice(0, 2)); + expectWindowEqualsUpdate(taskStats[2], runAtDrift.slice(0, 3)); + expectWindowEqualsUpdate(taskStats[3], runAtDrift.slice(0, 4)); + expectWindowEqualsUpdate(taskStats[4], runAtDrift.slice(0, 5)); + // from the 6th value, begin to drop old values as out window is 5 + expectWindowEqualsUpdate(taskStats[5], runAtDrift.slice(1, 6)); + expectWindowEqualsUpdate(taskStats[6], runAtDrift.slice(2, 7)); + expectWindowEqualsUpdate(taskStats[7], runAtDrift.slice(3, 8)); + resolve(); + }); + }); + }); + + test('returns polling stats', async () => { + const expectedTimestamp: string[] = []; + const taskManager = taskManagerMock.create({ + events: new Subject().pipe( + startWith( + asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), + asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), + asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), + 
asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled)), + asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled)), + asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled)), + asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity)), + asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity)), + asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), + asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)) + ) + ), + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskManager, + runningAverageWindowSize, + mockLogger() + ); + + return new Promise((resolve) => { + taskRunAggregator + .pipe( + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value), + })), + tap(() => { + expectedTimestamp.push(new Date().toISOString()); + // each event is a second after the previous one + fakeTimer.tick(1000); + }), + take(10), + bufferCount(10) + ) + .subscribe((taskStats: Array>) => { + expect(taskStats.map((taskStat) => taskStat.value.polling.lastSuccessfulPoll)).toEqual( + expectedTimestamp + ); + + /** + * At any given time we only keep track of the last X Polling Results + * In the tests this is ocnfiugured to a window size of 5 + */ + expect(taskStats.map((taskStat) => taskStat.value.polling.resultFrequency)).toEqual([ + // NoTasksClaimed + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled + { NoTasksClaimed: 75, RanOutOfCapacity: 0, PoolFilled: 25 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled + { NoTasksClaimed: 60, RanOutOfCapacity: 0, PoolFilled: 40 }, + // NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled + { NoTasksClaimed: 40, 
RanOutOfCapacity: 0, PoolFilled: 60 }, + // NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity + { NoTasksClaimed: 20, RanOutOfCapacity: 20, PoolFilled: 60 }, + // PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity + { NoTasksClaimed: 0, RanOutOfCapacity: 40, PoolFilled: 60 }, + // PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed + { NoTasksClaimed: 20, RanOutOfCapacity: 40, PoolFilled: 40 }, + // PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed, NoTasksClaimed + { NoTasksClaimed: 40, RanOutOfCapacity: 40, PoolFilled: 20 }, + ]); + resolve(); + }); + }); + }); +}); + +function runAtMillisecondsAgo(ms: number): Date { + return new Date(Date.now() - ms); +} + +const mockTaskRunEvent = (overrides: Partial = {}) => { + const task = mockTaskInstance(overrides); + return asTaskRunEvent(task.id, asOk(task)); +}; + +const mockTaskInstance = (overrides: Partial = {}): ConcreteTaskInstance => ({ + id: uuid.v4(), + attempts: 0, + status: TaskStatus.Running, + version: '123', + runAt: new Date(), + scheduledAt: new Date(), + startedAt: new Date(), + retryAt: new Date(Date.now() + 5 * 60 * 1000), + state: {}, + taskType: 'alerting:test', + params: { + alertId: '1', + }, + ownerId: null, + ...overrides, +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts new file mode 100644 index 0000000000000..ca224fc28199b --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -0,0 +1,166 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { Logger } from 'src/core/server'; +import { of, empty } from 'rxjs'; +import { filter, flatMap } from 'rxjs/operators'; +import { isUndefined, countBy, mapValues } from 'lodash'; +import stats from 'stats-lite'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; +import { TaskManager, TaskLifecycleEvent } from '../task_manager'; +import { isTaskRunEvent, isTaskPollingCycleEvent } from '../task_events'; +import { isOk } from '../lib/result_type'; +import { ConcreteTaskInstance } from '../task'; +import { FillPoolResult } from '../lib/fill_pool'; + +interface AveragedStat extends JsonObject { + mean: number; + median: number; + mode: number; +} + +interface FillPoolStat extends JsonObject { + lastSuccessfulPoll: string; + resultFrequency: FillPoolResult[]; +} + +export interface TaskRunStat extends JsonObject { + drift: number[]; + polling: FillPoolStat | Omit; +} + +interface FillPoolRawStat extends JsonObject { + lastSuccessfulPoll: string; + resultFrequency: { + [FillPoolResult.NoTasksClaimed]: number; + [FillPoolResult.RanOutOfCapacity]: number; + [FillPoolResult.PoolFilled]: number; + }; +} + +export interface SummarizedTaskRunStat extends JsonObject { + drift: AveragedStat; + polling: FillPoolRawStat | Omit; +} + +export function createTaskRunAggregator( + taskManager: TaskManager, + runningAverageWindowSize: number, + logger: Logger +): AggregatedStatProvider { + const runningStats: { + runtime: { + polling: { + lastSuccessfulPoll: (value?: string) => string | undefined; + resultFrequency: (value?: FillPoolResult) => FillPoolResult[]; + }; + drift: (value?: number) => number[]; + }; + } = { + runtime: { + polling: { + lastSuccessfulPoll: createLastValueStat(), + resultFrequency: createRunningAveragedStat(runningAverageWindowSize), + }, + drift: createRunningAveragedStat(runningAverageWindowSize), + }, + }; + return taskManager.events.pipe( + 
filter( + (taskEvent: TaskLifecycleEvent) => + (isTaskRunEvent(taskEvent) || isTaskPollingCycleEvent(taskEvent)) && + isOk(taskEvent.event) + ), + flatMap((taskEvent: TaskLifecycleEvent) => { + if (isTaskRunEvent(taskEvent) && isOk(taskEvent.event)) { + const task = taskEvent.event.value; + const now = Date.now(); + return of({ + key: 'runtime', + value: { + polling: { + lastSuccessfulPoll: runningStats.runtime.polling.lastSuccessfulPoll(), + resultFrequency: runningStats.runtime.polling.resultFrequency(), + }, + drift: runningStats.runtime.drift(now - task.runAt.getTime()), + }, + } as AggregatedStat); + } else if (isTaskPollingCycleEvent(taskEvent) && isOk(taskEvent.event)) { + return of({ + key: 'runtime', + value: { + polling: { + lastSuccessfulPoll: runningStats.runtime.polling.lastSuccessfulPoll( + new Date().toISOString() + ), + resultFrequency: runningStats.runtime.polling.resultFrequency(taskEvent.event.value), + }, + drift: runningStats.runtime.drift(), + }, + } as AggregatedStat); + } + return empty(); + }) + ); +} + +export function summarizeTaskRunStat({ + polling: { lastSuccessfulPoll, resultFrequency }, + drift, +}: TaskRunStat): SummarizedTaskRunStat { + return { + polling: { + ...(lastSuccessfulPoll ? 
{ lastSuccessfulPoll } : {}), + resultFrequency: { + [FillPoolResult.NoTasksClaimed]: 0, + [FillPoolResult.RanOutOfCapacity]: 0, + [FillPoolResult.PoolFilled]: 0, + ...calculateFrequency(resultFrequency as FillPoolResult[]), + }, + }, + drift: calculateRunningAverage(drift), + }; +} + +function calculateRunningAverage(values: number[]): AveragedStat { + return { + mean: stats.mean(values), + median: stats.median(values), + mode: stats.mode(values), + }; +} + +function calculateFrequency(values: T[]): JsonObject { + return mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)); +} + +function createLastValueStat() { + let lastValue: T; + return (value?: T) => { + if (isUndefined(value)) { + return lastValue; + } else { + lastValue = value; + return lastValue; + } + }; +} + +function createRunningAveragedStat(runningAverageWindowSize: number) { + const queue = new Array(); + return (value?: T) => { + if (isUndefined(value)) { + return queue; + } else { + if (queue.length === runningAverageWindowSize) { + queue.shift(); + } + queue.push(value); + return [...queue]; + } + }; +} diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index f85a6571899ec..0bcf3abfc7607 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -4,9 +4,9 @@ * you may not use this file except in compliance with the Elastic License. 
*/ +import { first, take, bufferCount } from 'rxjs/operators'; import { createWorkloadAggregator } from './workload_statistics'; import { taskManagerMock } from '../task_manager.mock'; -import { first, take, bufferCount } from 'rxjs/operators'; import { AggregationResult } from '../queries/aggregation_clauses'; import { mockLogger } from '../test_utils'; diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 6cb6be9797807..669e6af16ea0e 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -4,7 +4,7 @@ * you may not use this file except in compliance with the Elastic License. */ -import { interval } from 'rxjs'; +import { timer } from 'rxjs'; import { concatMap, map, catchError } from 'rxjs/operators'; import { Logger } from 'src/core/server'; import { JsonObject } from 'src/plugins/kibana_utils/common'; @@ -18,12 +18,28 @@ import { } from '../queries/aggregation_clauses'; import { parseIntervalAsSecond } from '../lib/intervals'; +interface StatusStat extends JsonObject { + [status: string]: number; +} +interface TaskTypeStat extends JsonObject { + [taskType: string]: { + sum: number; + status: StatusStat; + }; +} + +export interface WorkloadStat extends JsonObject { + sum: number; + taskTypes: TaskTypeStat; + schedule: Array<[string, number]>; +} + export function createWorkloadAggregator( taskManager: TaskManager, refreshInterval: number, logger: Logger -): AggregatedStatProvider { - return interval(refreshInterval).pipe( +): AggregatedStatProvider { + return timer(0, refreshInterval).pipe( concatMap(() => taskManager.aggregate({ aggs: { @@ -47,7 +63,7 @@ export function createWorkloadAggregator( taskType: { buckets: taskTypes = [] } = {}, schedule: { buckets: schedules = [] } = {}, } = task; - const summary: JsonObject = { + const summary: WorkloadStat = { 
sum, taskTypes: mapValues( keyBy>( diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 3a4577db01b49..f53418aec05ad 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -12,7 +12,7 @@ import { TaskManagerConfig } from './config'; import { Middleware } from './lib/middleware'; import { setupSavedObjects } from './saved_objects'; import { healthRoute } from './routes'; -import { createAggregatedStatsStream } from './monitoring'; +import { createMonitoringStats } from './monitoring'; export type TaskManagerSetupContract = Pick< TaskManager, @@ -53,8 +53,8 @@ export class TaskManagerPlugin const router = core.http.createRouter(); healthRoute( router, - config, - this.taskManager.then((tm) => createAggregatedStatsStream(tm, config, logger)), + this.taskManager.then((tm) => createMonitoringStats(tm, config, logger)), + logger, // if health is any more stale than the pollInterval (+1s buffer) consider the system unhealthy config.poll_interval + 1000 ); diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 4fc7b9d6b352c..1ea33794a2794 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -7,64 +7,39 @@ import { healthRoute } from './health'; import { httpServiceMock } from 'src/core/server/mocks'; import { mockHandlerArguments } from './_mock_handler_arguments'; -import { TaskManagerConfig } from '../config'; -import { of, Subject } from 'rxjs'; -import { get } from 'lodash'; -import { sleep } from '../test_utils'; -import { AggregatedStat } from '../monitoring'; - -beforeEach(() => { - jest.resetAllMocks(); -}); - -const configuration: TaskManagerConfig = { - enabled: true, - max_workers: 10, - index: 'foo', - max_attempts: 9, - poll_interval: 6000000, - max_poll_inactivity_cycles: 10, - 
request_capacity: 1000, - monitored_aggregated_stats_refresh_rate: 5000, -}; +import { of } from 'rxjs'; +import { sleep, mockLogger } from '../test_utils'; describe('healthRoute', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + it('registers the route', async () => { const router = httpServiceMock.createRouter(); - healthRoute(router, configuration, Promise.resolve(of()), 1000); + healthRoute(router, Promise.resolve(of()), mockLogger(), 1000); const [config] = router.get.mock.calls[0]; expect(config.path).toMatchInlineSnapshot(`"/api/task_manager/_health"`); }); - it('returns the initial config used to configure Task Manager', async () => { + it('logs the Task Manager stats at a fixed interval', async () => { const router = httpServiceMock.createRouter(); + const logger = mockLogger(); - healthRoute(router, configuration, Promise.resolve(of()), 1000); + healthRoute(router, Promise.resolve(of()), logger, 1000); - const [, handler] = router.get.mock.calls[0]; + await sleep(1000); - const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); - - expect(get(await handler(context, req, res), 'body.stats')).toMatchObject({ - configuration: { - value: { - max_workers: 10, - poll_interval: 6000000, - max_poll_inactivity_cycles: 10, - request_capacity: 1000, - monitored_aggregated_stats_refresh_rate: 5000, - }, - }, - }); + expect(logger.debug).toHaveBeenCalledWith(''); }); it('returns an error response if the stats are no longer fresh', async () => { const router = httpServiceMock.createRouter(); - healthRoute(router, configuration, Promise.resolve(of()), 1000); + healthRoute(router, Promise.resolve(of()), mockLogger(), 1000); const [, handler] = router.get.mock.calls[0]; @@ -85,6 +60,7 @@ describe('healthRoute', () => { poll_interval: 6000000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, }, }, }, @@ -93,96 +69,4 @@ describe('healthRoute', () => { }, }); }); - - 
it('incrementally updates the stats returned by the endpoint', async () => { - const router = httpServiceMock.createRouter(); - - const aggregatedStats = Promise.resolve(new Subject()); - - healthRoute(router, configuration, Promise.resolve(aggregatedStats), 1000); - - const [, handler] = router.get.mock.calls[0]; - - const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); - - return aggregatedStats.then(async (aggregatedStats$) => { - aggregatedStats$.next({ - key: 'newAggregatedStat', - value: { - some: { - complex: { - value: 123, - }, - }, - }, - }); - - expect(await handler(context, req, res)).toMatchObject({ - body: { - lastUpdate: expect.any(String), - stats: { - newAggregatedStat: { - timestamp: expect.any(String), - value: { - some: { - complex: { - value: 123, - }, - }, - }, - }, - configuration: { - timestamp: expect.any(String), - value: { - max_workers: 10, - poll_interval: 6000000, - max_poll_inactivity_cycles: 10, - request_capacity: 1000, - monitored_aggregated_stats_refresh_rate: 5000, - }, - }, - }, - }, - }); - - aggregatedStats$.next({ - key: 'newAggregatedStat', - value: { - some: { - updated: { - value: 456, - }, - }, - }, - }); - - expect(await handler(context, req, res)).toMatchObject({ - body: { - lastUpdate: expect.any(String), - stats: { - newAggregatedStat: { - timestamp: expect.any(String), - value: { - some: { - updated: { - value: 456, - }, - }, - }, - }, - configuration: { - timestamp: expect.any(String), - value: { - max_workers: 10, - poll_interval: 6000000, - max_poll_inactivity_cycles: 10, - request_capacity: 1000, - monitored_aggregated_stats_refresh_rate: 5000, - }, - }, - }, - }, - }); - }); - }); }); diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index cf73c93143918..e99c1298363a8 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -11,68 +11,23 @@ import 
{ IKibanaResponse, KibanaResponseFactory, } from 'kibana/server'; -import { pick } from 'lodash'; -import { set } from '@elastic/safer-lodash-set'; -import { JsonObject } from 'src/plugins/kibana_utils/common'; -import { map } from 'rxjs/operators'; -import { TaskManagerConfig } from '../config'; -import { AggregatedStatProvider } from '../monitoring'; - -const CONFIG_FIELDS_TO_EXPOSE = [ - 'max_workers', - 'poll_interval', - 'request_capacity', - 'max_poll_inactivity_cycles', - 'monitored_aggregated_stats_refresh_rate', -]; - -interface MonitoredStat { - timestamp: string; - value: JsonObject; -} - -interface MonitoringStats { - lastUpdate: string; - stats: Record; -} +import { Logger } from 'src/core/server'; +import { Observable } from 'rxjs'; +import { take } from 'rxjs/operators'; +import { debounceTime } from 'rxjs/operators'; +import { MonitoringStats, RawMonitoringStats, summarizeMonitoringStats } from '../monitoring'; export function healthRoute( router: IRouter, - initialConfig: TaskManagerConfig, - aggregatedStats: Promise, + monitoringStats: Promise>, + logger: Logger, requiredFreshness: number ) { - const initialisationTimestamp = new Date().toISOString(); - const monitoringStats: MonitoringStats = { - lastUpdate: initialisationTimestamp, - stats: { - configuration: { - timestamp: initialisationTimestamp, - value: pick<{ - max_workers: number; - poll_interval: number; - request_capacity: number; - max_poll_inactivity_cycles: number; - monitored_aggregated_stats_refresh_rate: number; - }>(initialConfig, ...CONFIG_FIELDS_TO_EXPOSE) as JsonObject, - }, - }, - }; - - aggregatedStats.then((aggregatedStats$) => { - aggregatedStats$ - .pipe( - map(({ key, value }) => { - return { - value: { timestamp: new Date().toISOString(), value }, - key, - }; - }) - ) - .subscribe(({ key, value }) => { - set(monitoringStats.stats, key, value); - monitoringStats.lastUpdate = new Date().toISOString(); - }); + /* Log Task Manager stats as a Debug log line at a fixed 
interval */ + monitoringStats.then((monitoringStats$) => { + monitoringStats$ + .pipe(debounceTime(requiredFreshness)) + .subscribe((stats) => logger.debug(JSON.stringify(summarizeMonitoringStats(stats)))); }); router.get( @@ -85,24 +40,32 @@ export function healthRoute( req: KibanaRequest, res: KibanaResponseFactory ): Promise { - const lastUpdate = Date.parse(monitoringStats.lastUpdate); + const { lastUpdate, stats } = await getLatestStats(await monitoringStats); + const now = Date.now(); + const timestamp = new Date(now).toISOString(); /** * If the monitored stats aren't fresh, return an `500 internalError` with * the stats in the body of the api call. This makes it easier for monitoring * services to mark the service as broken */ - if (Date.now() - lastUpdate > requiredFreshness) { - return res.internalError({ - body: { - message: new Error('Task Manager monitored stats are out of date'), - attributes: monitoringStats, - }, - }); - } + // if (now - Date.parse(lastUpdate) > requiredFreshness) { + // return res.internalError({ + // body: { + // message: new Error('Task Manager monitored stats are out of date'), + // attributes: { lastUpdate, timestamp, stats }, + // }, + // }); + // } return res.ok({ - body: monitoringStats, + body: { lastUpdate, timestamp, stats }, }); } ); } + +async function getLatestStats(monitoringStats$: Observable) { + return new Promise((resolve) => + monitoringStats$.pipe(take(1)).subscribe((stats) => resolve(summarizeMonitoringStats(stats))) + ); +} diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index e1dd85f868cdd..6dd0c1546733f 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -9,16 +9,19 @@ import { Option } from 'fp-ts/lib/Option'; import { ConcreteTaskInstance } from './task'; import { Result, Err } from './lib/result_type'; +import { FillPoolResult } from './lib/fill_pool'; +import { 
PollingError } from './polling'; export enum TaskEventType { TASK_CLAIM = 'TASK_CLAIM', TASK_MARK_RUNNING = 'TASK_MARK_RUNNING', TASK_RUN = 'TASK_RUN', TASK_RUN_REQUEST = 'TASK_RUN_REQUEST', + TASK_POLLING_CYCLE = 'TASK_POLLING_CYCLE', } export interface TaskEvent { - id: string; + id?: string; type: TaskEventType; event: Result; } @@ -26,6 +29,7 @@ export type TaskMarkRunning = TaskEvent; export type TaskRun = TaskEvent; export type TaskClaim = TaskEvent>; export type TaskRunRequest = TaskEvent; +export type TaskPollingCycle = TaskEvent>; export function asTaskMarkRunningEvent( id: string, @@ -69,6 +73,15 @@ export function asTaskRunRequestEvent( }; } +export function asTaskPollingCycleEvent( + event: Result> +): TaskPollingCycle { + return { + type: TaskEventType.TASK_POLLING_CYCLE, + event, + }; +} + export function isTaskMarkRunningEvent( taskEvent: TaskEvent ): taskEvent is TaskMarkRunning { @@ -85,3 +98,8 @@ export function isTaskRunRequestEvent( ): taskEvent is TaskRunRequest { return taskEvent.type === TaskEventType.TASK_RUN_REQUEST; } +export function isTaskPollingCycleEvent( + taskEvent: TaskEvent +): taskEvent is TaskPollingCycle { + return taskEvent.type === TaskEventType.TASK_POLLING_CYCLE; +} diff --git a/x-pack/plugins/task_manager/server/task_manager.mock.ts b/x-pack/plugins/task_manager/server/task_manager.mock.ts index e5325274024d8..edd56b63e4800 100644 --- a/x-pack/plugins/task_manager/server/task_manager.mock.ts +++ b/x-pack/plugins/task_manager/server/task_manager.mock.ts @@ -4,9 +4,16 @@ * you may not use this file except in compliance with the Elastic License. 
*/ -import { TaskManager } from './task_manager'; +import { TaskManager, TaskLifecycleEvent } from './task_manager'; +import { of, Observable } from 'rxjs'; -const createTaskManagerMock = (isStarted: boolean = true) => { +const createTaskManagerMock = ({ + isStarted = true, + events = of(), +}: { + isStarted?: boolean; + events?: Observable; +} = {}) => { return ({ registerTaskDefinitions: jest.fn(), addMiddleware: jest.fn(), @@ -21,6 +28,9 @@ const createTaskManagerMock = (isStarted: boolean = true) => { get isStarted() { return isStarted; }, + get events() { + return events; + }, stop: jest.fn(), } as unknown) as jest.Mocked; }; diff --git a/x-pack/plugins/task_manager/server/task_manager.test.ts b/x-pack/plugins/task_manager/server/task_manager.test.ts index 017540a2dcc55..decd7291bc0c8 100644 --- a/x-pack/plugins/task_manager/server/task_manager.test.ts +++ b/x-pack/plugins/task_manager/server/task_manager.test.ts @@ -42,6 +42,7 @@ describe('TaskManager', () => { poll_interval: 6000000, max_poll_inactivity_cycles: 10, monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, request_capacity: 1000, }; const taskManagerOpts = { diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index 44e409a2aec37..7fcf496e0d119 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -5,7 +5,7 @@ */ import { Logger } from 'src/core/server'; import { Subject, Observable, Subscription } from 'rxjs'; -import { filter } from 'rxjs/operators'; +import { filter, tap } from 'rxjs/operators'; import { performance } from 'perf_hooks'; @@ -25,10 +25,12 @@ import { TaskRun, TaskClaim, TaskRunRequest, + TaskPollingCycle, isTaskRunEvent, isTaskClaimEvent, isTaskRunRequestEvent, asTaskRunRequestEvent, + asTaskPollingCycleEvent, } from './task_events'; import { fillPool, FillPoolResult } from './lib/fill_pool'; import { 
addMiddlewareToChain, BeforeSaveMiddlewareParams, Middleware } from './lib/middleware'; @@ -52,7 +54,7 @@ import { PollingErrorType, createObservableMonitor, } from './polling'; -import { TaskPool } from './task_pool'; +import { TaskPool, TaskPoolRunResult } from './task_pool'; import { TaskManagerRunner, TaskRunner } from './task_runner'; import { FetchResult, @@ -82,7 +84,12 @@ interface RunNowResult { id: string; } -export type TaskLifecycleEvent = TaskMarkRunning | TaskRun | TaskClaim | TaskRunRequest; +export type TaskLifecycleEvent = + | TaskMarkRunning + | TaskRun + | TaskClaim + | TaskRunRequest + | TaskPollingCycle; /* * The TaskManager is the public interface into the task manager system. This glues together @@ -195,6 +202,10 @@ export class TaskManager { ); } + public get events(): Observable { + return this.events$; + } + private emitEvent = (event: TaskLifecycleEvent) => { this.events$.next(event); }; @@ -245,17 +256,23 @@ export class TaskManager { this.startQueue.forEach((fn) => fn()); this.startQueue = []; - this.pollingSubscription = this.poller$.subscribe( - mapErr((error: PollingError) => { - if (error.type === PollingErrorType.RequestCapacityReached) { - pipe( - error.data, - mapOptional((id) => this.emitEvent(asTaskRunRequestEvent(id, asErr(error)))) - ); - } - this.logger.error(error.message); - }) - ); + this.pollingSubscription = this.poller$ + .pipe( + tap( + mapErr((error: PollingError) => { + if (error.type === PollingErrorType.RequestCapacityReached) { + pipe( + error.data, + mapOptional((id) => this.emitEvent(asTaskRunRequestEvent(id, asErr(error)))) + ); + } + this.logger.error(error.message); + }) + ) + ) + .subscribe((event: Result>) => { + this.emitEvent(asTaskPollingCycleEvent(event)); + }); } } @@ -522,13 +539,13 @@ export async function awaitTaskRunResult( ); }, taskEvent.event); } else { - either>( + either>( taskEvent.event, - (taskInstance: ConcreteTaskInstance) => { + (taskInstance: ConcreteTaskInstance | FillPoolResult) => { 
// resolve if the task has run sucessfully if (isTaskRunEvent(taskEvent)) { subscription.unsubscribe(); - resolve({ id: taskInstance.id }); + resolve({ id: (taskInstance as ConcreteTaskInstance).id }); } }, async (error: Error | Option) => { diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index c3c15c7ba4810..3c792966e8681 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -22,6 +22,16 @@ interface MonitoringStats { timestamp: string; value: Record; }; + runtime: { + timestamp: string; + value: { + drift: Record; + polling: { + lastSuccessfulPoll: string; + resultFrequency: Record; + }; + }; + }; }; } @@ -59,18 +69,19 @@ export default function ({ getService }: FtrProviderContext) { poll_interval: 3000, max_poll_inactivity_cycles: 10, monitored_aggregated_stats_refresh_rate: monitoredAggregatedStatsRefreshRate, + monitored_stats_running_average_window: 50, request_capacity: 1000, max_workers: 10, }); }); it('should return the task manager workload', async () => { - const workload = (await getHealth()).stats.workload; + const { workload } = (await getHealth()).stats; const sumSampleTaskInWorkload = (workload.value.taskTypes as { sampleTask?: { sum: number }; }).sampleTask?.sum ?? 0; - const schedulesWorkload = (mapValues( + const scheduledWorkload = (mapValues( keyBy(workload.value.schedule as Array<[string, number]>, ([interval, count]) => interval), ([, count]) => count ) as unknown) as { '37m': number | undefined; '37s': number | undefined }; @@ -105,9 +116,25 @@ export default function ({ getService }: FtrProviderContext) { '37m': number; '37s': number; }; - expect(schedulesWorkloadAfterScheduling['37s']).to.eql(schedulesWorkload['37s'] ?? 
0 + 1); - expect(schedulesWorkloadAfterScheduling['37m']).to.eql(schedulesWorkload['37m'] ?? 0 + 1); + expect(schedulesWorkloadAfterScheduling['37s']).to.eql(1 + (scheduledWorkload['37s'] ?? 0)); + expect(schedulesWorkloadAfterScheduling['37m']).to.eql(1 + (scheduledWorkload['37m'] ?? 0)); }); }); + + it('should return the task manager runtime stats', async () => { + const { + runtime: { + value: { drift, polling }, + }, + } = (await getHealth()).stats; + + expect(isNaN(Date.parse(polling.lastSuccessfulPoll as string))).to.eql(false); + expect(typeof polling.resultFrequency.NoTasksClaimed).to.eql('number'); + expect(typeof polling.resultFrequency.RanOutOfCapacity).to.eql('number'); + expect(typeof polling.resultFrequency.PoolFilled).to.eql('number'); + + expect(typeof drift.mean).to.eql('number'); + expect(typeof drift.median).to.eql('number'); + }); }); } From bf0c3b443ac214733917a028f1b2b66b4f77add0 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 28 Sep 2020 15:11:40 +0100 Subject: [PATCH 10/42] removed unused import --- x-pack/plugins/task_manager/server/task_manager.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index 7fcf496e0d119..de182ea02a674 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -54,7 +54,7 @@ import { PollingErrorType, createObservableMonitor, } from './polling'; -import { TaskPool, TaskPoolRunResult } from './task_pool'; +import { TaskPool } from './task_pool'; import { TaskManagerRunner, TaskRunner } from './task_runner'; import { FetchResult, From acae863dc5e951eaee59f1cc356d096f9b517d55 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 28 Sep 2020 17:25:55 +0100 Subject: [PATCH 11/42] made task run stats reactive --- .../monitoring/monitoring_stats_stream.ts | 2 +- .../monitoring/task_run_statistics.test.ts | 61 +++++---- 
.../server/monitoring/task_run_statistics.ts | 117 ++++++++++-------- 3 files changed, 94 insertions(+), 86 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts index 03fa889fb732d..02ed298a047e6 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -62,7 +62,7 @@ export function createAggregators( logger: Logger ): AggregatedStatProvider { return merge( - createTaskRunAggregator(taskManager, config.monitored_stats_running_average_window, logger), + createTaskRunAggregator(taskManager, config.monitored_stats_running_average_window), createWorkloadAggregator(taskManager, config.monitored_aggregated_stats_refresh_rate, logger) ); } diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index 365b8962146dc..ee5940d84bb58 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -8,7 +8,7 @@ import uuid from 'uuid'; import { Subject } from 'rxjs'; import stats from 'stats-lite'; import sinon from 'sinon'; -import { take, tap, bufferCount, startWith, map } from 'rxjs/operators'; +import { take, tap, bufferCount, skip, map } from 'rxjs/operators'; import { ConcreteTaskInstance, TaskStatus } from '../task'; import { asTaskRunEvent, asTaskPollingCycleEvent } from '../task_events'; @@ -21,7 +21,6 @@ import { SummarizedTaskRunStat, } from './task_run_statistics'; import { taskManagerMock } from '../task_manager.mock'; -import { mockLogger } from '../test_utils'; import { AggregatedStat } from './runtime_statistics_aggregator'; import { FillPoolResult } from '../lib/fill_pool'; @@ -36,20 +35,13 @@ describe('Task Run Statistics', 
() => { test('returns a running average of task drift', async () => { const runAtDrift = [1000, 2000, 500, 300, 400, 15000, 20000, 200]; + const events = new Subject(); const taskManager = taskManagerMock.create({ - events: new Subject().pipe( - startWith( - ...runAtDrift.map((drift) => mockTaskRunEvent({ runAt: runAtMillisecondsAgo(drift) })) - ) - ), + events, }); const runningAverageWindowSize = 5; - const taskRunAggregator = createTaskRunAggregator( - taskManager, - runningAverageWindowSize, - mockLogger() - ); + const taskRunAggregator = createTaskRunAggregator(taskManager, runningAverageWindowSize); function expectWindowEqualsUpdate( taskStat: AggregatedStat, @@ -65,6 +57,10 @@ describe('Task Run Statistics', () => { return new Promise((resolve) => { taskRunAggregator .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, value: summarizeTaskRunStat(value), @@ -84,38 +80,30 @@ describe('Task Run Statistics', () => { expectWindowEqualsUpdate(taskStats[7], runAtDrift.slice(3, 8)); resolve(); }); + + for (const drift of runAtDrift) { + events.next(mockTaskRunEvent({ runAt: runAtMillisecondsAgo(drift) })); + } }); }); test('returns polling stats', async () => { const expectedTimestamp: string[] = []; + const events = new Subject(); const taskManager = taskManagerMock.create({ - events: new Subject().pipe( - startWith( - asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), - asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), - asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), - asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled)), - asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled)), - asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled)), - asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity)), - 
asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity)), - asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)), - asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed)) - ) - ), + events, }); const runningAverageWindowSize = 5; - const taskRunAggregator = createTaskRunAggregator( - taskManager, - runningAverageWindowSize, - mockLogger() - ); + const taskRunAggregator = createTaskRunAggregator(taskManager, runningAverageWindowSize); return new Promise((resolve) => { taskRunAggregator .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, value: summarizeTaskRunStat(value), @@ -161,6 +149,17 @@ describe('Task Run Statistics', () => { ]); resolve(); }); + + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); }); }); }); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index ca224fc28199b..1c07dfa8e39d9 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -4,16 +4,15 @@ * 
you may not use this file except in compliance with the Elastic License. */ -import { Logger } from 'src/core/server'; -import { of, empty } from 'rxjs'; -import { filter, flatMap } from 'rxjs/operators'; +import { combineLatest, Observable } from 'rxjs'; +import { filter, startWith, map } from 'rxjs/operators'; import { isUndefined, countBy, mapValues } from 'lodash'; import stats from 'stats-lite'; import { JsonObject } from 'src/plugins/kibana_utils/common'; import { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; import { TaskManager, TaskLifecycleEvent } from '../task_manager'; import { isTaskRunEvent, isTaskPollingCycleEvent } from '../task_events'; -import { isOk } from '../lib/result_type'; +import { isOk, Ok } from '../lib/result_type'; import { ConcreteTaskInstance } from '../task'; import { FillPoolResult } from '../lib/fill_pool'; @@ -49,61 +48,60 @@ export interface SummarizedTaskRunStat extends JsonObject { export function createTaskRunAggregator( taskManager: TaskManager, - runningAverageWindowSize: number, - logger: Logger + runningAverageWindowSize: number ): AggregatedStatProvider { - const runningStats: { - runtime: { - polling: { - lastSuccessfulPoll: (value?: string) => string | undefined; - resultFrequency: (value?: FillPoolResult) => FillPoolResult[]; - }; - drift: (value?: number) => number[]; - }; - } = { - runtime: { - polling: { - lastSuccessfulPoll: createLastValueStat(), - resultFrequency: createRunningAveragedStat(runningAverageWindowSize), - }, - drift: createRunningAveragedStat(runningAverageWindowSize), - }, + const driftQueue = createRunningAveragedStat(runningAverageWindowSize); + const taskRunEvents$: Observable = taskManager.events.pipe( + filter( + (taskEvent: TaskLifecycleEvent) => + isTaskRunEvent(taskEvent) && isOk(taskEvent.event) + ), + map((taskEvent: TaskLifecycleEvent) => { + const task = (taskEvent.event as Ok).value; + const now = Date.now(); + return driftQueue(now - 
task.runAt.getTime()); + }) + ); + + const pollingQueue = { + lastSuccessfulPoll: createLastValueStat(), + resultFrequency: createRunningAveragedStat(runningAverageWindowSize), }; - return taskManager.events.pipe( + const taskPollingEvents$: Observable = taskManager.events.pipe( filter( (taskEvent: TaskLifecycleEvent) => - (isTaskRunEvent(taskEvent) || isTaskPollingCycleEvent(taskEvent)) && - isOk(taskEvent.event) + isTaskPollingCycleEvent(taskEvent) && isOk(taskEvent.event) ), - flatMap((taskEvent: TaskLifecycleEvent) => { - if (isTaskRunEvent(taskEvent) && isOk(taskEvent.event)) { - const task = taskEvent.event.value; - const now = Date.now(); - return of({ - key: 'runtime', - value: { - polling: { - lastSuccessfulPoll: runningStats.runtime.polling.lastSuccessfulPoll(), - resultFrequency: runningStats.runtime.polling.resultFrequency(), - }, - drift: runningStats.runtime.drift(now - task.runAt.getTime()), - }, - } as AggregatedStat); - } else if (isTaskPollingCycleEvent(taskEvent) && isOk(taskEvent.event)) { - return of({ - key: 'runtime', - value: { - polling: { - lastSuccessfulPoll: runningStats.runtime.polling.lastSuccessfulPoll( - new Date().toISOString() - ), - resultFrequency: runningStats.runtime.polling.resultFrequency(taskEvent.event.value), - }, - drift: runningStats.runtime.drift(), - }, - } as AggregatedStat); - } - return empty(); + map((taskEvent: TaskLifecycleEvent) => { + return { + lastSuccessfulPoll: pollingQueue.lastSuccessfulPoll(new Date().toISOString()), + resultFrequency: pollingQueue.resultFrequency( + (taskEvent.event as Ok).value + ), + }; + }) + ); + + return combineLatest( + taskRunEvents$.pipe(startWith([])), + taskPollingEvents$.pipe( + startWith({ + resultFrequency: { + [FillPoolResult.NoTasksClaimed]: 0, + [FillPoolResult.RanOutOfCapacity]: 0, + [FillPoolResult.PoolFilled]: 0, + }, + }) + ) + ).pipe( + map(([drift, polling]) => { + return { + key: 'runtime', + value: { + drift, + polling, + }, + } as AggregatedStat; }) ); } @@ 
-134,10 +132,17 @@ function calculateRunningAverage(values: number[]): AveragedStat { }; } +/** + * Calculate the frequency of each term in a list of terms. + * @param values + */ function calculateFrequency(values: T[]): JsonObject { return mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)); } +/** + * Utility to keep track of one value which might change over time + */ function createLastValueStat() { let lastValue: T; return (value?: T) => { @@ -150,6 +155,10 @@ function createLastValueStat() { }; } +/** + * Utility to keep track of a limited queue of values which changes over time + * dropping older values as they slide out of the window we wish to track + */ function createRunningAveragedStat(runningAverageWindowSize: number) { const queue = new Array(); return (value?: T) => { From 4d34dac7528d8bef47462ef1a8c8c11c890a39fc Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 28 Sep 2020 20:13:30 +0100 Subject: [PATCH 12/42] fixed unit tests in health route --- .../task_manager/server/routes/health.test.ts | 159 ++++++++++++++++-- .../task_manager/server/routes/health.ts | 48 ++++-- 2 files changed, 173 insertions(+), 34 deletions(-) diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 1ea33794a2794..b0533e6a52ee4 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -4,11 +4,13 @@ * you may not use this file except in compliance with the Elastic License. 
*/ -import { healthRoute } from './health'; +import { of, Subject } from 'rxjs'; +import { merge } from 'lodash'; import { httpServiceMock } from 'src/core/server/mocks'; +import { healthRoute } from './health'; import { mockHandlerArguments } from './_mock_handler_arguments'; -import { of } from 'rxjs'; import { sleep, mockLogger } from '../test_utils'; +import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; describe('healthRoute', () => { beforeEach(() => { @@ -29,17 +31,39 @@ describe('healthRoute', () => { const router = httpServiceMock.createRouter(); const logger = mockLogger(); - healthRoute(router, Promise.resolve(of()), logger, 1000); + const mockStat = mockHealthStats(); + await sleep(10); + const skippedMockStat = mockHealthStats(); + await sleep(10); + const nextMockStat = mockHealthStats(); + + const stats = Promise.resolve(new Subject()); + + healthRoute(router, stats, logger, 1000); + + const stats$ = await stats; - await sleep(1000); + stats$.next(mockStat); + await sleep(500); + stats$.next(skippedMockStat); + await sleep(600); + stats$.next(nextMockStat); - expect(logger.debug).toHaveBeenCalledWith(''); + expect(logger.debug).toHaveBeenCalledWith(JSON.stringify(summarizeMonitoringStats(mockStat))); + expect(logger.debug).not.toHaveBeenCalledWith( + JSON.stringify(summarizeMonitoringStats(skippedMockStat)) + ); + expect(logger.debug).toHaveBeenCalledWith( + JSON.stringify(summarizeMonitoringStats(nextMockStat)) + ); + expect(logger.debug).toHaveBeenCalledTimes(2); }); it('returns an error response if the stats are no longer fresh', async () => { const router = httpServiceMock.createRouter(); - healthRoute(router, Promise.resolve(of()), mockLogger(), 1000); + const mockStat = mockHealthStats(); + healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000); const [, handler] = router.get.mock.calls[0]; @@ -49,24 +73,123 @@ describe('healthRoute', () => { expect(await handler(context, req, res)).toMatchObject({ body: { - 
attributes: { - lastUpdate: expect.any(String), - stats: { - configuration: { - timestamp: expect.any(String), - value: { - max_poll_inactivity_cycles: 10, - max_workers: 10, - poll_interval: 6000000, - request_capacity: 1000, - monitored_aggregated_stats_refresh_rate: 5000, - monitored_stats_running_average_window: 50, + attributes: summarizeMonitoringStats( + mockHealthStats({ + lastUpdate: expect.any(String), + stats: { + configuration: { + timestamp: expect.any(String), + }, + workload: { + timestamp: expect.any(String), + }, + runtime: { + timestamp: expect.any(String), + value: { + polling: { + lastSuccessfulPoll: expect.any(String), + }, + }, }, }, + }) + ), + message: new Error('Task Manager monitored stats are out of date'), + }, + }); + }); + + it('returns an error response if the poller hasnt polled within the required freshness', async () => { + const router = httpServiceMock.createRouter(); + + const lastSuccessfulPoll = new Date(Date.now() - 2000).toISOString(); + const mockStat = mockHealthStats({ + stats: { + runtime: { + value: { + polling: { + lastSuccessfulPoll, + }, }, }, + }, + }); + healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000); + + const [, handler] = router.get.mock.calls[0]; + + const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); + + expect(await handler(context, req, res)).toMatchObject({ + body: { + attributes: summarizeMonitoringStats( + mockHealthStats({ + lastUpdate: expect.any(String), + stats: { + configuration: { + timestamp: expect.any(String), + }, + workload: { + timestamp: expect.any(String), + }, + runtime: { + timestamp: expect.any(String), + value: { + polling: { + lastSuccessfulPoll, + }, + }, + }, + }, + }) + ), message: new Error('Task Manager monitored stats are out of date'), }, }); }); }); + +function mockHealthStats(overrides = {}) { + return (merge( + { + lastUpdate: new Date().toISOString(), + stats: { + configuration: { + timestamp: new Date().toISOString(), + 
value: { + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + }, + }, + }, + workload: { + timestamp: new Date().toISOString(), + value: { + sum: 4, + taskTypes: { + actions_telemetry: { sum: 2, status: { idle: 2 } }, + alerting_telemetry: { sum: 1, status: { idle: 1 } }, + session_cleanup: { sum: 1, status: { idle: 1 } }, + }, + }, + }, + runtime: { + timestamp: new Date().toISOString(), + value: { + drift: [1000, 1000], + polling: { + lastSuccessfulPoll: new Date().toISOString(), + resultFrequency: ['NoTasksClaimed', 'NoTasksClaimed', 'NoTasksClaimed'], + }, + }, + }, + }, + }, + overrides + ) as unknown) as MonitoringStats; +} diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index e99c1298363a8..46797b02740ca 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -14,7 +14,8 @@ import { import { Logger } from 'src/core/server'; import { Observable } from 'rxjs'; import { take } from 'rxjs/operators'; -import { debounceTime } from 'rxjs/operators'; +import { throttleTime } from 'rxjs/operators'; +import { isString } from 'lodash'; import { MonitoringStats, RawMonitoringStats, summarizeMonitoringStats } from '../monitoring'; export function healthRoute( @@ -25,9 +26,9 @@ export function healthRoute( ) { /* Log Task Manager stats as a Debug log line at a fixed interval */ monitoringStats.then((monitoringStats$) => { - monitoringStats$ - .pipe(debounceTime(requiredFreshness)) - .subscribe((stats) => logger.debug(JSON.stringify(summarizeMonitoringStats(stats)))); + monitoringStats$.pipe(throttleTime(requiredFreshness)).subscribe((stats) => { + logger.debug(JSON.stringify(summarizeMonitoringStats(stats))); + }); }); router.get( @@ -40,7 +41,7 @@ export function 
healthRoute( req: KibanaRequest, res: KibanaResponseFactory ): Promise { - const { lastUpdate, stats } = await getLatestStats(await monitoringStats); + const stats = await getLatestStats(await monitoringStats); const now = Date.now(); const timestamp = new Date(now).toISOString(); @@ -49,23 +50,38 @@ export function healthRoute( * the stats in the body of the api call. This makes it easier for monitoring * services to mark the service as broken */ - // if (now - Date.parse(lastUpdate) > requiredFreshness) { - // return res.internalError({ - // body: { - // message: new Error('Task Manager monitored stats are out of date'), - // attributes: { lastUpdate, timestamp, stats }, - // }, - // }); - // } + if ( + now - + getOldestTimestamp( + stats.lastUpdate, + stats.stats.runtime?.value.polling.lastSuccessfulPoll + ) > + requiredFreshness + ) { + return res.internalError({ + body: { + message: new Error('Task Manager monitored stats are out of date'), + attributes: { timestamp, ...summarizeMonitoringStats(stats) }, + }, + }); + } return res.ok({ - body: { lastUpdate, timestamp, stats }, + body: { timestamp, ...summarizeMonitoringStats(stats) }, }); } ); } +function getOldestTimestamp(...timestamps: unknown[]): number { + return Math.min( + ...timestamps + .map((timestamp) => (isString(timestamp) ? 
Date.parse(timestamp) : NaN)) + .filter((timestamp) => !isNaN(timestamp)) + ); +} + async function getLatestStats(monitoringStats$: Observable) { - return new Promise((resolve) => - monitoringStats$.pipe(take(1)).subscribe((stats) => resolve(summarizeMonitoringStats(stats))) + return new Promise((resolve) => + monitoringStats$.pipe(take(1)).subscribe((stats) => resolve(stats)) ); } From e3ba8ad782d82ca2505c2c0fe399805f748bb703 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 29 Sep 2020 10:10:07 +0100 Subject: [PATCH 13/42] removed unused import --- x-pack/plugins/task_manager/server/routes/health.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 46797b02740ca..d48775803c780 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -16,7 +16,7 @@ import { Observable } from 'rxjs'; import { take } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; import { isString } from 'lodash'; -import { MonitoringStats, RawMonitoringStats, summarizeMonitoringStats } from '../monitoring'; +import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; export function healthRoute( router: IRouter, From e7c5da2ff6164bd443a00bed52c2682387aea989 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 29 Sep 2020 19:51:11 +0100 Subject: [PATCH 14/42] added task run duration to health endpoint --- .../monitoring/task_run_calcultors.test.ts | 76 +++++++++++++ .../server/monitoring/task_run_calcultors.ts | 62 +++++++++++ .../monitoring/task_run_statistics.test.ts | 15 ++- .../server/monitoring/task_run_statistics.ts | 104 ++++++------------ .../task_manager/server/task_events.ts | 34 +++++- .../task_manager/server/task_runner.ts | 24 +++- .../test_suites/task_manager/health_route.ts | 11 +- 7 files changed, 237 insertions(+), 89 deletions(-) create 
mode 100644 x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts create mode 100644 x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts new file mode 100644 index 0000000000000..2ee18da9607a4 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts @@ -0,0 +1,76 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import uuid from 'uuid'; + +import { + calculateRunningAverage, + calculateFrequency, + createRunningAveragedStat, + createMapOfRunningAveragedStats, +} from './task_run_calcultors'; + +describe('calculateRunningAverage', () => { + test('calculates the running average, median and mode of a window of values', async () => { + expect(calculateRunningAverage([2, 2, 4, 6, 6])).toEqual({ + mean: 4, + median: 4, + mode: new Set([2, 6]), + }); + }); +}); + +describe('calculateFrequency', () => { + test('calculates the frequency of each terms in the list as a percentage', async () => { + const [term1, term2, term3] = [uuid.v4(), uuid.v4(), uuid.v4()]; + expect( + calculateFrequency([term1, term2, term2, term3, term1, term1, term2, term1, term3]) + ).toEqual({ + [term3]: 22, + [term1]: 44, + [term2]: 33, + }); + }); +}); + +describe('createRunningAveragedStat', () => { + test('create a function which tracks a window of values', async () => { + const queue = createRunningAveragedStat(3); + expect(queue(1)).toEqual([1]); + expect(queue(2)).toEqual([1, 2]); + expect(queue(3)).toEqual([1, 2, 3]); + expect(queue(4)).toEqual([2, 3, 4]); + expect(queue(5)).toEqual([3, 4, 5]); + }); +}); + +describe('createMapOfRunningAveragedStats', () => { + 
test('create a function which tracks multiple window of values by key', async () => { + const [term1, term2, term3] = [uuid.v4(), uuid.v4(), uuid.v4()]; + const mappedQueues = createMapOfRunningAveragedStats(3); + expect(mappedQueues(term1, 1)).toEqual({ [term1]: [1] }); + expect(mappedQueues(term1, 2)).toEqual({ [term1]: [1, 2] }); + expect(mappedQueues(term2, 3)).toEqual({ [term1]: [1, 2], [term2]: [3] }); + expect(mappedQueues(term3, 4)).toEqual({ [term1]: [1, 2], [term2]: [3], [term3]: [4] }); + expect(mappedQueues(term2, 5)).toEqual({ [term1]: [1, 2], [term2]: [3, 5], [term3]: [4] }); + expect(mappedQueues(term2, 6)).toEqual({ [term1]: [1, 2], [term2]: [3, 5, 6], [term3]: [4] }); + expect(mappedQueues(term1, 7)).toEqual({ + [term1]: [1, 2, 7], + [term2]: [3, 5, 6], + [term3]: [4], + }); + expect(mappedQueues(term1, 8)).toEqual({ + [term1]: [2, 7, 8], + [term2]: [3, 5, 6], + [term3]: [4], + }); + expect(mappedQueues(term1, 9)).toEqual({ + [term1]: [7, 8, 9], + [term2]: [3, 5, 6], + [term3]: [4], + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts new file mode 100644 index 0000000000000..ef3f98b752820 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts @@ -0,0 +1,62 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import stats from 'stats-lite'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { isUndefined, countBy, mapValues } from 'lodash'; + +export interface AveragedStat extends JsonObject { + mean: number; + median: number; + mode: number; +} + +export function calculateRunningAverage(values: number[]): AveragedStat { + return { + mean: stats.mean(values), + median: stats.median(values), + mode: stats.mode(values), + }; +} + +/** + * Calculate the frequency of each term in a list of terms. + * @param values + */ +export function calculateFrequency(values: T[]): JsonObject { + return mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)); +} + +/** + * Utility to keep track of a limited queue of values which changes over time + * dropping older values as they slide out of the window we wish to track + */ +export function createRunningAveragedStat(runningAverageWindowSize: number) { + const queue = new Array(); + return (value?: T) => { + if (isUndefined(value)) { + return queue; + } else { + if (queue.length === runningAverageWindowSize) { + queue.shift(); + } + queue.push(value); + return [...queue]; + } + }; +} + +export function createMapOfRunningAveragedStats(runningAverageWindowSize: number) { + const mappedQueue: Record T[]> = {}; + const asRecordOfValues = () => mapValues(mappedQueue, (queue) => queue()); + return (key?: string, value?: T) => { + if (!isUndefined(key)) { + mappedQueue[key] = mappedQueue[key] ?? 
createRunningAveragedStat(runningAverageWindowSize); + mappedQueue[key](value); + } + return asRecordOfValues(); + }; +} diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index ee5940d84bb58..8bd94685c4c9b 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -11,7 +11,7 @@ import sinon from 'sinon'; import { take, tap, bufferCount, skip, map } from 'rxjs/operators'; import { ConcreteTaskInstance, TaskStatus } from '../task'; -import { asTaskRunEvent, asTaskPollingCycleEvent } from '../task_events'; +import { asTaskRunEvent, asTaskPollingCycleEvent, TaskTiming } from '../task_events'; import { asOk } from '../lib/result_type'; import { TaskLifecycleEvent } from '../task_manager'; import { @@ -81,8 +81,15 @@ describe('Task Run Statistics', () => { resolve(); }); + const now = Date.now(); for (const drift of runAtDrift) { - events.next(mockTaskRunEvent({ runAt: runAtMillisecondsAgo(drift) })); + const start = Math.floor(Math.random() * 1000); + events.next( + mockTaskRunEvent( + { runAt: runAtMillisecondsAgo(drift + start) }, + { start: runAtMillisecondsAgo(start).getTime(), stop: now } + ) + ); } }); }); @@ -168,9 +175,9 @@ function runAtMillisecondsAgo(ms: number): Date { return new Date(Date.now() - ms); } -const mockTaskRunEvent = (overrides: Partial = {}) => { +const mockTaskRunEvent = (overrides: Partial = {}, timing: TaskTiming) => { const task = mockTaskInstance(overrides); - return asTaskRunEvent(task.id, asOk(task)); + return asTaskRunEvent(task.id, asOk(task), timing); }; const mockTaskInstance = (overrides: Partial = {}): ConcreteTaskInstance => ({ diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index 
1c07dfa8e39d9..cea38ad0b55b9 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -6,21 +6,21 @@ import { combineLatest, Observable } from 'rxjs'; import { filter, startWith, map } from 'rxjs/operators'; -import { isUndefined, countBy, mapValues } from 'lodash'; -import stats from 'stats-lite'; import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { mapValues } from 'lodash'; import { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; import { TaskManager, TaskLifecycleEvent } from '../task_manager'; import { isTaskRunEvent, isTaskPollingCycleEvent } from '../task_events'; import { isOk, Ok } from '../lib/result_type'; import { ConcreteTaskInstance } from '../task'; import { FillPoolResult } from '../lib/fill_pool'; - -interface AveragedStat extends JsonObject { - mean: number; - median: number; - mode: number; -} +import { + AveragedStat, + calculateRunningAverage, + calculateFrequency, + createRunningAveragedStat, + createMapOfRunningAveragedStats, +} from './task_run_calcultors'; interface FillPoolStat extends JsonObject { lastSuccessfulPoll: string; @@ -29,6 +29,7 @@ interface FillPoolStat extends JsonObject { export interface TaskRunStat extends JsonObject { drift: number[]; + duration: Record; polling: FillPoolStat | Omit; } @@ -43,6 +44,7 @@ interface FillPoolRawStat extends JsonObject { export interface SummarizedTaskRunStat extends JsonObject { drift: AveragedStat; + duration: Record; polling: FillPoolRawStat | Omit; } @@ -51,22 +53,28 @@ export function createTaskRunAggregator( runningAverageWindowSize: number ): AggregatedStatProvider { const driftQueue = createRunningAveragedStat(runningAverageWindowSize); - const taskRunEvents$: Observable = taskManager.events.pipe( + const taskRunDurationQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); + const taskRunEvents$: Observable> = 
taskManager.events.pipe( filter( (taskEvent: TaskLifecycleEvent) => - isTaskRunEvent(taskEvent) && isOk(taskEvent.event) + isTaskRunEvent(taskEvent) && + isOk(taskEvent.event) && + !!taskEvent?.timing?.start ), map((taskEvent: TaskLifecycleEvent) => { const task = (taskEvent.event as Ok).value; - const now = Date.now(); - return driftQueue(now - task.runAt.getTime()); + const { timing } = taskEvent; + return { + duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), + drift: driftQueue(timing!.start - task.runAt.getTime()), + }; }) ); - const pollingQueue = { - lastSuccessfulPoll: createLastValueStat(), - resultFrequency: createRunningAveragedStat(runningAverageWindowSize), - }; + const resultFrequencyQueue = createRunningAveragedStat(runningAverageWindowSize); const taskPollingEvents$: Observable = taskManager.events.pipe( filter( (taskEvent: TaskLifecycleEvent) => @@ -74,16 +82,14 @@ export function createTaskRunAggregator( ), map((taskEvent: TaskLifecycleEvent) => { return { - lastSuccessfulPoll: pollingQueue.lastSuccessfulPoll(new Date().toISOString()), - resultFrequency: pollingQueue.resultFrequency( - (taskEvent.event as Ok).value - ), + lastSuccessfulPoll: new Date().toISOString(), + resultFrequency: resultFrequencyQueue((taskEvent.event as Ok).value), }; }) ); return combineLatest( - taskRunEvents$.pipe(startWith([])), + taskRunEvents$.pipe(startWith({ duration: {}, drift: [] })), taskPollingEvents$.pipe( startWith({ resultFrequency: { @@ -94,11 +100,11 @@ export function createTaskRunAggregator( }) ) ).pipe( - map(([drift, polling]) => { + map(([taskRun, polling]) => { return { key: 'runtime', value: { - drift, + ...taskRun, polling, }, } as AggregatedStat; @@ -109,6 +115,7 @@ export function createTaskRunAggregator( export function summarizeTaskRunStat({ polling: { lastSuccessfulPoll, resultFrequency }, drift, + duration, }: TaskRunStat): SummarizedTaskRunStat { return { polling: { @@ -121,55 +128,6 @@ export function 
summarizeTaskRunStat({ }, }, drift: calculateRunningAverage(drift), - }; -} - -function calculateRunningAverage(values: number[]): AveragedStat { - return { - mean: stats.mean(values), - median: stats.median(values), - mode: stats.mode(values), - }; -} - -/** - * Calculate the frequency of each term in a list of terms. - * @param values - */ -function calculateFrequency(values: T[]): JsonObject { - return mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)); -} - -/** - * Utility to keep track of one value which might change over time - */ -function createLastValueStat() { - let lastValue: T; - return (value?: T) => { - if (isUndefined(value)) { - return lastValue; - } else { - lastValue = value; - return lastValue; - } - }; -} - -/** - * Utility to keep track of a limited queue of values which changes over time - * dropping older values as they slide out of the window we wish to track - */ -function createRunningAveragedStat(runningAverageWindowSize: number) { - const queue = new Array(); - return (value?: T) => { - if (isUndefined(value)) { - return queue; - } else { - if (queue.length === runningAverageWindowSize) { - queue.shift(); - } - queue.push(value); - return [...queue]; - } + duration: mapValues(duration, (typedDuration) => calculateRunningAverage(typedDuration)), }; } diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index 6dd0c1546733f..4982ca7235a75 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -20,8 +20,19 @@ export enum TaskEventType { TASK_POLLING_CYCLE = 'TASK_POLLING_CYCLE', } +export interface TaskTiming { + start: number; + stop: number; +} + +export function startTaskTimer(): () => TaskTiming { + const start = Date.now(); + return () => ({ start, stop: Date.now() }); +} + export interface TaskEvent { id?: string; + timing?: TaskTiming; type: TaskEventType; event: Result; } @@ 
-33,52 +44,65 @@ export type TaskPollingCycle = TaskEvent + event: Result, + timing?: TaskTiming ): TaskMarkRunning { return { id, type: TaskEventType.TASK_MARK_RUNNING, event, + timing, }; } -export function asTaskRunEvent(id: string, event: Result): TaskRun { +export function asTaskRunEvent( + id: string, + event: Result, + timing?: TaskTiming +): TaskRun { return { id, type: TaskEventType.TASK_RUN, event, + timing, }; } export function asTaskClaimEvent( id: string, - event: Result> + event: Result>, + timing?: TaskTiming ): TaskClaim { return { id, type: TaskEventType.TASK_CLAIM, event, + timing, }; } export function asTaskRunRequestEvent( id: string, // we only emit a TaskRunRequest event when it fails - event: Err + event: Err, + timing?: TaskTiming ): TaskRunRequest { return { id, type: TaskEventType.TASK_RUN_REQUEST, event, + timing, }; } export function asTaskPollingCycleEvent( - event: Result> + event: Result>, + timing?: TaskTiming ): TaskPollingCycle { return { type: TaskEventType.TASK_POLLING_CYCLE, event, + timing, }; } diff --git a/x-pack/plugins/task_manager/server/task_runner.ts b/x-pack/plugins/task_manager/server/task_runner.ts index 87d1938393f68..3716e57e928f5 100644 --- a/x-pack/plugins/task_manager/server/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_runner.ts @@ -17,7 +17,14 @@ import Joi from 'joi'; import { identity, defaults, flow } from 'lodash'; import { asOk, asErr, mapErr, eitherAsync, unwrap, mapOk, Result } from './lib/result_type'; -import { TaskRun, TaskMarkRunning, asTaskRunEvent, asTaskMarkRunningEvent } from './task_events'; +import { + TaskRun, + TaskMarkRunning, + asTaskRunEvent, + asTaskMarkRunningEvent, + startTaskTimer, + TaskTiming, +} from './task_events'; import { intervalFromDate, intervalFromNow } from './lib/intervals'; import { BeforeRunFunction, BeforeMarkRunningFunction } from './lib/middleware'; import { @@ -174,6 +181,7 @@ export class TaskManagerRunner implements TaskRunner { taskInstance: 
this.instance, }); + const stopTaskTimer = startTaskTimer(); const apmTrans = apm.startTransaction( `taskManager run ${this.instance.taskType}`, 'taskManager' @@ -183,13 +191,16 @@ export class TaskManagerRunner implements TaskRunner { const result = await this.task.run(); const validatedResult = this.validateResult(result); if (apmTrans) apmTrans.end('success'); - return this.processResult(validatedResult); + return this.processResult(validatedResult, stopTaskTimer()); } catch (err) { this.logger.error(`Task ${this} failed: ${err}`); // in error scenario, we can not get the RunResult // re-use modifiedContext's state, which is correct as of beforeRun if (apmTrans) apmTrans.end('error'); - return this.processResult(asErr({ error: err, state: modifiedContext.taskInstance.state })); + return this.processResult( + asErr({ error: err, state: modifiedContext.taskInstance.state }), + stopTaskTimer() + ); } } @@ -384,7 +395,8 @@ export class TaskManagerRunner implements TaskRunner { } private async processResult( - result: Result + result: Result, + taskTiming: TaskTiming ): Promise> { await eitherAsync( result, @@ -394,11 +406,11 @@ export class TaskManagerRunner implements TaskRunner { } else { await this.processResultWhenDone(); } - this.onTaskEvent(asTaskRunEvent(this.id, asOk(this.instance))); + this.onTaskEvent(asTaskRunEvent(this.id, asOk(this.instance), taskTiming)); }, async ({ error }: FailedRunResult) => { await this.processResultForRecurringTask(result); - this.onTaskEvent(asTaskRunEvent(this.id, asErr(error))); + this.onTaskEvent(asTaskRunEvent(this.id, asErr(error), taskTiming)); } ); return result; diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 3c792966e8681..d5fef2852eed3 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ 
b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -26,6 +26,7 @@ interface MonitoringStats { timestamp: string; value: { drift: Record; + duration: Record>; polling: { lastSuccessfulPoll: string; resultFrequency: Record; @@ -122,9 +123,14 @@ export default function ({ getService }: FtrProviderContext) { }); it('should return the task manager runtime stats', async () => { + await scheduleTask({ + taskType: 'sampleTask', + schedule: { interval: '5s' }, + }); + const { runtime: { - value: { drift, polling }, + value: { drift, polling, duration }, }, } = (await getHealth()).stats; @@ -135,6 +141,9 @@ export default function ({ getService }: FtrProviderContext) { expect(typeof drift.mean).to.eql('number'); expect(typeof drift.median).to.eql('number'); + + expect(typeof duration.sampleTask.mean).to.eql('number'); + expect(typeof duration.sampleTask.median).to.eql('number'); }); }); } From 192782a6580747e8228bcf87d539149dd1d731ca Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Wed, 30 Sep 2020 10:08:13 +0100 Subject: [PATCH 15/42] removed mode from TM stats --- x-pack/plugins/task_manager/server/config.ts | 4 +++- .../server/monitoring/task_run_calcultors.test.ts | 3 +-- .../task_manager/server/monitoring/task_run_calcultors.ts | 8 ++++---- .../server/monitoring/task_run_statistics.test.ts | 3 +-- .../task_manager/server/monitoring/task_run_statistics.ts | 6 +----- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index a530cb2d44f4c..f2de109273714 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -9,10 +9,12 @@ import { schema, TypeOf } from '@kbn/config-schema'; export const DEFAULT_MAX_WORKERS = 10; export const DEFAULT_POLL_INTERVAL = 3000; export const DEFAULT_MAX_POLL_INACTIVITY_CYCLES = 10; -export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50; +// 
Monitoring Constants +// =================== // Refresh "pull based" monitored stats at a default rate of once a minute export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000; +export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50; export const configSchema = schema.object({ enabled: schema.boolean({ defaultValue: true }), diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts index 2ee18da9607a4..e2994dd1098f8 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts @@ -14,11 +14,10 @@ import { } from './task_run_calcultors'; describe('calculateRunningAverage', () => { - test('calculates the running average, median and mode of a window of values', async () => { + test('calculates the running average and median of a window of values', async () => { expect(calculateRunningAverage([2, 2, 4, 6, 6])).toEqual({ mean: 4, median: 4, - mode: new Set([2, 6]), }); }); }); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts index ef3f98b752820..bda99e8735eb4 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts @@ -11,14 +11,12 @@ import { isUndefined, countBy, mapValues } from 'lodash'; export interface AveragedStat extends JsonObject { mean: number; median: number; - mode: number; } export function calculateRunningAverage(values: number[]): AveragedStat { return { - mean: stats.mean(values), + mean: Math.round(stats.mean(values)), median: stats.median(values), - mode: stats.mode(values), }; } @@ -27,7 +25,9 @@ export function calculateRunningAverage(values: number[]): AveragedStat { * @param values */ export function calculateFrequency(values: 
T[]): JsonObject { - return mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)); + return values.length + ? mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)) + : {}; } /** diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index 8bd94685c4c9b..dc536897490c5 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -48,9 +48,8 @@ describe('Task Run Statistics', () => { window: number[] ) { expect(taskStat.value.drift).toMatchObject({ - mean: stats.mean(window), + mean: Math.round(stats.mean(window)), median: stats.median(window), - mode: stats.mode(window), }); } diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index cea38ad0b55b9..b6db4f40d5ffd 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -92,11 +92,7 @@ export function createTaskRunAggregator( taskRunEvents$.pipe(startWith({ duration: {}, drift: [] })), taskPollingEvents$.pipe( startWith({ - resultFrequency: { - [FillPoolResult.NoTasksClaimed]: 0, - [FillPoolResult.RanOutOfCapacity]: 0, - [FillPoolResult.PoolFilled]: 0, - }, + resultFrequency: [], }) ) ).pipe( From 09bf68e965d27759392a475174c25365a674980f Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Wed, 30 Sep 2020 12:29:53 +0100 Subject: [PATCH 16/42] report run result frequency in tm stats --- .../monitoring/task_run_statistics.test.ts | 254 +++++++++++++++--- .../server/monitoring/task_run_statistics.ts | 72 +++-- .../task_manager/server/routes/health.test.ts | 2 + .../task_manager/server/task_events.ts | 13 +- .../task_manager/server/task_manager.test.ts | 
31 ++- .../task_manager/server/task_manager.ts | 32 ++- .../task_manager/server/task_runner.test.ts | 50 +++- .../task_manager/server/task_runner.ts | 61 ++++- 8 files changed, 417 insertions(+), 98 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index dc536897490c5..b0a67216927f8 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -14,6 +14,7 @@ import { ConcreteTaskInstance, TaskStatus } from '../task'; import { asTaskRunEvent, asTaskPollingCycleEvent, TaskTiming } from '../task_events'; import { asOk } from '../lib/result_type'; import { TaskLifecycleEvent } from '../task_manager'; +import { TaskRunResult } from '../task_runner'; import { createTaskRunAggregator, summarizeTaskRunStat, @@ -93,6 +94,183 @@ describe('Task Run Statistics', () => { }); }); + test('returns a running average of task run duration', async () => { + const runDurations = [1000, 2000, 500, 300, 400, 15000, 20000, 200]; + const runDurationsInReverse = runDurations.reverse(); + const events = new Subject(); + const taskManager = taskManagerMock.create({ + events, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator(taskManager, runningAverageWindowSize); + + function expectWindowEqualsUpdate( + taskStat: AggregatedStat, + windows: Record + ) { + for (const [type, window] of Object.entries(windows)) { + expect(taskStat.value.duration[type]).toMatchObject({ + mean: Math.round(stats.mean(window)), + median: stats.median(window), + }); + } + } + + return new Promise((resolve, reject) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarize stats + map(({ key, value }: 
AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value), + })), + take(runDurations.length * 2), + bufferCount(runDurations.length * 2) + ) + .subscribe((taskStats: Array>) => { + try { + expectWindowEqualsUpdate(taskStats[0], { 'alerting:test': runDurations.slice(0, 1) }); + expectWindowEqualsUpdate(taskStats[1], { 'alerting:test': runDurations.slice(0, 2) }); + expectWindowEqualsUpdate(taskStats[2], { 'alerting:test': runDurations.slice(0, 3) }); + expectWindowEqualsUpdate(taskStats[3], { 'alerting:test': runDurations.slice(0, 4) }); + expectWindowEqualsUpdate(taskStats[4], { 'alerting:test': runDurations.slice(0, 5) }); + // from the 6th value, begin to drop old values as out window is 5 + expectWindowEqualsUpdate(taskStats[5], { 'alerting:test': runDurations.slice(1, 6) }); + expectWindowEqualsUpdate(taskStats[6], { 'alerting:test': runDurations.slice(2, 7) }); + expectWindowEqualsUpdate(taskStats[7], { 'alerting:test': runDurations.slice(3, 8) }); + expectWindowEqualsUpdate(taskStats[8], { + 'actions:test': runDurations.slice(0, 1), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[9], { + 'actions:test': runDurations.slice(0, 2), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[10], { + 'actions:test': runDurations.slice(0, 3), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[11], { + 'actions:test': runDurations.slice(0, 4), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[12], { + 'actions:test': runDurations.slice(0, 5), + 'alerting:test': runDurations.slice(3, 8), + }); + // from the 6th value, begin to drop old values as out window is 5 + expectWindowEqualsUpdate(taskStats[13], { + 'actions:test': runDurations.slice(1, 6), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[14], { + 'actions:test': runDurations.slice(2, 7), + 'alerting:test': 
runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[15], { + 'actions:test': runDurations.slice(3, 8), + 'alerting:test': runDurations.slice(3, 8), + }); + resolve(); + } catch (e) { + reject(e); + } + }); + + const now = Date.now(); + for (const runDuration of runDurations) { + events.next( + mockTaskRunEvent( + { taskType: 'alerting:test' }, + { start: runAtMillisecondsAgo(runDuration).getTime(), stop: now } + ) + ); + } + for (const runDuration of runDurationsInReverse) { + events.next( + mockTaskRunEvent( + { taskType: 'actions:test' }, + { start: runAtMillisecondsAgo(runDuration).getTime(), stop: now } + ) + ); + } + }); + }); + + test('returns the frequency of task run results', async () => { + const events = new Subject(); + const taskManager = taskManagerMock.create({ + events, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator(taskManager, runningAverageWindowSize); + + return new Promise((resolve, reject) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarize stats + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value), + })), + take(10), + bufferCount(10) + ) + .subscribe((taskStats: Array>) => { + try { + /** + * At any given time we only keep track of the last X Polling Results + * In the tests this is ocnfiugured to a window size of 5 + */ + expect(taskStats.map((taskStat) => taskStat.value.taskRunResultFrequency)).toEqual([ + // Success + { Success: 100, RetryScheduled: 0, Failed: 0 }, + // Success, Success, + { Success: 100, RetryScheduled: 0, Failed: 0 }, + // Success, Success, Success + { Success: 100, RetryScheduled: 0, Failed: 0 }, + // Success, Success, Success, Failed + { Success: 75, RetryScheduled: 0, Failed: 25 }, + // Success, Success, Success, Failed, Failed + { Success: 60, RetryScheduled: 0, 
Failed: 40 }, + // Success, Success, Failed, Failed, Failed + { Success: 40, RetryScheduled: 0, Failed: 60 }, + // Success, Failed, Failed, Failed, RetryScheduled + { Success: 20, RetryScheduled: 20, Failed: 60 }, + // Failed, Failed, Failed, RetryScheduled, RetryScheduled + { Success: 0, RetryScheduled: 40, Failed: 60 }, + // Failed, Failed, RetryScheduled, RetryScheduled, Success + { Success: 20, RetryScheduled: 40, Failed: 40 }, + // Failed, RetryScheduled, RetryScheduled, Success, Success + { Success: 40, RetryScheduled: 40, Failed: 20 }, + ]); + resolve(); + } catch (e) { + reject(e); + } + }); + + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.RetryScheduled)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.RetryScheduled)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + }); + }); + test('returns polling stats', async () => { const expectedTimestamp: string[] = []; const events = new Subject(); @@ -103,7 +281,7 @@ describe('Task Run Statistics', () => { const runningAverageWindowSize = 5; const taskRunAggregator = createTaskRunAggregator(taskManager, runningAverageWindowSize); - return new Promise((resolve) => { + return new Promise((resolve, reject) => { taskRunAggregator .pipe( // skip initial stat which is just initialized data which @@ -123,37 +301,41 @@ describe('Task Run Statistics', () => { 
bufferCount(10) ) .subscribe((taskStats: Array>) => { - expect(taskStats.map((taskStat) => taskStat.value.polling.lastSuccessfulPoll)).toEqual( - expectedTimestamp - ); - - /** - * At any given time we only keep track of the last X Polling Results - * In the tests this is ocnfiugured to a window size of 5 - */ - expect(taskStats.map((taskStat) => taskStat.value.polling.resultFrequency)).toEqual([ - // NoTasksClaimed - { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, - // NoTasksClaimed, NoTasksClaimed, - { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, - // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed - { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, - // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled - { NoTasksClaimed: 75, RanOutOfCapacity: 0, PoolFilled: 25 }, - // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled - { NoTasksClaimed: 60, RanOutOfCapacity: 0, PoolFilled: 40 }, - // NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled - { NoTasksClaimed: 40, RanOutOfCapacity: 0, PoolFilled: 60 }, - // NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity - { NoTasksClaimed: 20, RanOutOfCapacity: 20, PoolFilled: 60 }, - // PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity - { NoTasksClaimed: 0, RanOutOfCapacity: 40, PoolFilled: 60 }, - // PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed - { NoTasksClaimed: 20, RanOutOfCapacity: 40, PoolFilled: 40 }, - // PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed, NoTasksClaimed - { NoTasksClaimed: 40, RanOutOfCapacity: 40, PoolFilled: 20 }, - ]); - resolve(); + try { + expect(taskStats.map((taskStat) => taskStat.value.polling.lastSuccessfulPoll)).toEqual( + expectedTimestamp + ); + + /** + * At any given time we only keep track of the last X Polling Results + * In the tests this is ocnfiugured to a window size of 5 + */ + expect(taskStats.map((taskStat) => 
taskStat.value.polling.resultFrequency)).toEqual([ + // NoTasksClaimed + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled + { NoTasksClaimed: 75, RanOutOfCapacity: 0, PoolFilled: 25 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled + { NoTasksClaimed: 60, RanOutOfCapacity: 0, PoolFilled: 40 }, + // NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled + { NoTasksClaimed: 40, RanOutOfCapacity: 0, PoolFilled: 60 }, + // NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity + { NoTasksClaimed: 20, RanOutOfCapacity: 20, PoolFilled: 60 }, + // PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity + { NoTasksClaimed: 0, RanOutOfCapacity: 40, PoolFilled: 60 }, + // PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed + { NoTasksClaimed: 20, RanOutOfCapacity: 40, PoolFilled: 40 }, + // PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed, NoTasksClaimed + { NoTasksClaimed: 40, RanOutOfCapacity: 40, PoolFilled: 20 }, + ]); + resolve(); + } catch (e) { + reject(e); + } }); events.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); @@ -174,9 +356,13 @@ function runAtMillisecondsAgo(ms: number): Date { return new Date(Date.now() - ms); } -const mockTaskRunEvent = (overrides: Partial = {}, timing: TaskTiming) => { +const mockTaskRunEvent = ( + overrides: Partial = {}, + timing: TaskTiming, + result: TaskRunResult = TaskRunResult.Success +) => { const task = mockTaskInstance(overrides); - return asTaskRunEvent(task.id, asOk(task), timing); + return asTaskRunEvent(task.id, asOk({ task, result }), timing); }; const mockTaskInstance = (overrides: Partial = {}): 
ConcreteTaskInstance => ({ diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index b6db4f40d5ffd..1708542ed8587 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -10,9 +10,17 @@ import { JsonObject } from 'src/plugins/kibana_utils/common'; import { mapValues } from 'lodash'; import { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; import { TaskManager, TaskLifecycleEvent } from '../task_manager'; -import { isTaskRunEvent, isTaskPollingCycleEvent } from '../task_events'; -import { isOk, Ok } from '../lib/result_type'; +import { + isTaskRunEvent, + isTaskPollingCycleEvent, + TaskRun, + ErroredTask, + RanTask, + TaskTiming, +} from '../task_events'; +import { isOk, Ok, unwrap } from '../lib/result_type'; import { ConcreteTaskInstance } from '../task'; +import { TaskRunResult } from '../task_runner'; import { FillPoolResult } from '../lib/fill_pool'; import { AveragedStat, @@ -30,9 +38,9 @@ interface FillPoolStat extends JsonObject { export interface TaskRunStat extends JsonObject { drift: number[]; duration: Record; + taskRunResultFrequency: TaskRunResult[]; polling: FillPoolStat | Omit; } - interface FillPoolRawStat extends JsonObject { lastSuccessfulPoll: string; resultFrequency: { @@ -45,6 +53,12 @@ interface FillPoolRawStat extends JsonObject { export interface SummarizedTaskRunStat extends JsonObject { drift: AveragedStat; duration: Record; + taskRunResultFrequency: { + [TaskRunResult.Success]: number; + [TaskRunResult.SuccessRescheduled]: number; + [TaskRunResult.RetryScheduled]: number; + [TaskRunResult.Failed]: number; + }; polling: FillPoolRawStat | Omit; } @@ -52,25 +66,12 @@ export function createTaskRunAggregator( taskManager: TaskManager, runningAverageWindowSize: number ): AggregatedStatProvider { - const 
driftQueue = createRunningAveragedStat(runningAverageWindowSize); - const taskRunDurationQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); - const taskRunEvents$: Observable> = taskManager.events.pipe( - filter( - (taskEvent: TaskLifecycleEvent) => - isTaskRunEvent(taskEvent) && - isOk(taskEvent.event) && - !!taskEvent?.timing?.start - ), + const taskRunEventToStat = createTaskRunEventToStat(runningAverageWindowSize); + const taskRunEvents$: Observable> = taskManager.events.pipe( + filter((taskEvent: TaskLifecycleEvent) => isTaskRunEvent(taskEvent) && hasTiming(taskEvent)), map((taskEvent: TaskLifecycleEvent) => { - const task = (taskEvent.event as Ok).value; - const { timing } = taskEvent; - return { - duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), - drift: driftQueue(timing!.start - task.runAt.getTime()), - }; + const { task, result }: RanTask | ErroredTask = unwrap((taskEvent as TaskRun).event); + return taskRunEventToStat(task, taskEvent.timing!, result); }) ); @@ -89,7 +90,7 @@ export function createTaskRunAggregator( ); return combineLatest( - taskRunEvents$.pipe(startWith({ duration: {}, drift: [] })), + taskRunEvents$.pipe(startWith({ duration: {}, drift: [], taskRunResultFrequency: [] })), taskPollingEvents$.pipe( startWith({ resultFrequency: [], @@ -108,10 +109,30 @@ export function createTaskRunAggregator( ); } +function hasTiming(taskEvent: TaskLifecycleEvent) { + return !!taskEvent?.timing; +} + +function createTaskRunEventToStat(runningAverageWindowSize: number) { + const driftQueue = createRunningAveragedStat(runningAverageWindowSize); + const taskRunDurationQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); + const resultFrequencyQueue = createRunningAveragedStat(runningAverageWindowSize); + return ( + task: ConcreteTaskInstance, + timing: TaskTiming, + result: TaskRunResult + ): Omit => ({ + duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), + drift: 
driftQueue(timing!.start - task.runAt.getTime()), + taskRunResultFrequency: resultFrequencyQueue(result), + }); +} + export function summarizeTaskRunStat({ polling: { lastSuccessfulPoll, resultFrequency }, drift, duration, + taskRunResultFrequency, }: TaskRunStat): SummarizedTaskRunStat { return { polling: { @@ -125,5 +146,12 @@ export function summarizeTaskRunStat({ }, drift: calculateRunningAverage(drift), duration: mapValues(duration, (typedDuration) => calculateRunningAverage(typedDuration)), + taskRunResultFrequency: { + [TaskRunResult.Success]: 0, + [TaskRunResult.SuccessRescheduled]: 0, + [TaskRunResult.RetryScheduled]: 0, + [TaskRunResult.Failed]: 0, + ...calculateFrequency(taskRunResultFrequency), + }, }; } diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index b0533e6a52ee4..82f1717092dfc 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -182,6 +182,8 @@ function mockHealthStats(overrides = {}) { timestamp: new Date().toISOString(), value: { drift: [1000, 1000], + duration: [], + taskRunResultFrequency: [], polling: { lastSuccessfulPoll: new Date().toISOString(), resultFrequency: ['NoTasksClaimed', 'NoTasksClaimed', 'NoTasksClaimed'], diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index 4982ca7235a75..b011d435e28dc 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -11,6 +11,7 @@ import { ConcreteTaskInstance } from './task'; import { Result, Err } from './lib/result_type'; import { FillPoolResult } from './lib/fill_pool'; import { PollingError } from './polling'; +import { TaskRunResult } from './task_runner'; export enum TaskEventType { TASK_CLAIM = 'TASK_CLAIM', @@ -36,8 +37,16 @@ export interface TaskEvent { type: TaskEventType; event: Result; } 
+export interface RanTask { + task: ConcreteTaskInstance; + result: TaskRunResult; +} +export type ErroredTask = RanTask & { + error: Error; +}; + export type TaskMarkRunning = TaskEvent; -export type TaskRun = TaskEvent; +export type TaskRun = TaskEvent; export type TaskClaim = TaskEvent>; export type TaskRunRequest = TaskEvent; export type TaskPollingCycle = TaskEvent>; @@ -57,7 +66,7 @@ export function asTaskMarkRunningEvent( export function asTaskRunEvent( id: string, - event: Result, + event: Result, timing?: TaskTiming ): TaskRun { return { diff --git a/x-pack/plugins/task_manager/server/task_manager.test.ts b/x-pack/plugins/task_manager/server/task_manager.test.ts index decd7291bc0c8..52a3beaf174d1 100644 --- a/x-pack/plugins/task_manager/server/task_manager.test.ts +++ b/x-pack/plugins/task_manager/server/task_manager.test.ts @@ -26,6 +26,7 @@ import { SavedObjectsSerializer, SavedObjectTypeRegistry } from '../../../../src import { mockLogger } from './test_utils'; import { asErr, asOk } from './lib/result_type'; import { ConcreteTaskInstance, TaskLifecycleResult, TaskStatus } from './task'; +import { TaskRunResult } from './task_runner'; import { Middleware } from './lib/middleware'; const savedObjectsClient = savedObjectsRepositoryMock.create(); @@ -284,7 +285,7 @@ describe('TaskManager', () => { const result = awaitTaskRunResult(id, events$, getLifecycle); const task = { id } as ConcreteTaskInstance; - events$.next(asTaskRunEvent(id, asOk(task))); + events$.next(asTaskRunEvent(id, asOk({ task, result: TaskRunResult.Success }))); return expect(result).resolves.toEqual({ id }); }); @@ -299,7 +300,16 @@ describe('TaskManager', () => { const task = { id } as ConcreteTaskInstance; events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskMarkRunningEvent(id, asOk(task))); - events$.next(asTaskRunEvent(id, asErr(new Error('some thing gone wrong')))); + events$.next( + asTaskRunEvent( + id, + asErr({ + error: new Error('some thing gone wrong'), + task, 
+ result: TaskRunResult.Failed, + }) + ) + ); return expect(result).rejects.toMatchInlineSnapshot( `[Error: Failed to run task "01ddff11-e88a-4d13-bc4e-256164e755e2": Error: some thing gone wrong]` @@ -381,7 +391,7 @@ describe('TaskManager', () => { await expect(result).rejects.toEqual( new Error( - `Failed to run task "${id}" as Task Manager is at capacity, please try again later` + `Failed to run task "${id}": Task Manager is at capacity, please try again later` ) ); expect(getLifecycle).not.toHaveBeenCalled(); @@ -432,9 +442,20 @@ describe('TaskManager', () => { events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskClaimEvent(differentTask, asOk(otherTask))); - events$.next(asTaskRunEvent(differentTask, asOk(task))); + events$.next( + asTaskRunEvent(differentTask, asOk({ task: otherTask, result: TaskRunResult.Success })) + ); - events$.next(asTaskRunEvent(id, asErr(new Error('some thing gone wrong')))); + events$.next( + asTaskRunEvent( + id, + asErr({ + task, + error: new Error('some thing gone wrong'), + result: TaskRunResult.Failed, + }) + ) + ); return expect(result).rejects.toMatchInlineSnapshot( `[Error: Failed to run task "01ddff11-e88a-4d13-bc4e-256164e755e2": Error: some thing gone wrong]` diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index de182ea02a674..f9fcb2d567393 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -26,11 +26,13 @@ import { TaskClaim, TaskRunRequest, TaskPollingCycle, + ErroredTask, isTaskRunEvent, isTaskClaimEvent, isTaskRunRequestEvent, asTaskRunRequestEvent, asTaskPollingCycleEvent, + RanTask, } from './task_events'; import { fillPool, FillPoolResult } from './lib/fill_pool'; import { addMiddlewareToChain, BeforeSaveMiddlewareParams, Middleware } from './lib/middleware'; @@ -539,26 +541,32 @@ export async function awaitTaskRunResult( ); }, taskEvent.event); } else { - 
either>( + either< + RanTask | ConcreteTaskInstance | FillPoolResult, + Error | ErroredTask | Option + >( taskEvent.event, - (taskInstance: ConcreteTaskInstance | FillPoolResult) => { + (taskInstance: RanTask | ConcreteTaskInstance | FillPoolResult) => { // resolve if the task has run sucessfully if (isTaskRunEvent(taskEvent)) { subscription.unsubscribe(); - resolve({ id: (taskInstance as ConcreteTaskInstance).id }); + resolve({ id: (taskInstance as RanTask).task.id }); } }, - async (error: Error | Option) => { + async (errorResult: Error | ErroredTask | Option) => { // reject if any error event takes place for the requested task subscription.unsubscribe(); - if (isTaskRunRequestEvent(taskEvent)) { - return reject( - new Error( - `Failed to run task "${taskId}" as Task Manager is at capacity, please try again later` - ) - ); - } - return reject(new Error(`Failed to run task "${taskId}": ${error}`)); + return reject( + new Error( + `Failed to run task "${taskId}"${ + isTaskRunRequestEvent(taskEvent) + ? `: Task Manager is at capacity, please try again later` + : isTaskRunEvent(taskEvent) + ? 
`: ${(errorResult as ErroredTask).error}` + : `: ${errorResult}` + }` + ) + ); } ); } diff --git a/x-pack/plugins/task_manager/server/task_runner.test.ts b/x-pack/plugins/task_manager/server/task_runner.test.ts index 81fe097f43690..4a1a1bf73d1d6 100644 --- a/x-pack/plugins/task_manager/server/task_runner.test.ts +++ b/x-pack/plugins/task_manager/server/task_runner.test.ts @@ -8,9 +8,9 @@ import _ from 'lodash'; import sinon from 'sinon'; import { secondsFromNow } from './lib/intervals'; import { asOk, asErr } from './lib/result_type'; -import { TaskEvent, asTaskRunEvent, asTaskMarkRunningEvent } from './task_events'; +import { TaskEvent, asTaskRunEvent, asTaskMarkRunningEvent, TaskRun } from './task_events'; import { ConcreteTaskInstance, TaskStatus, TaskDictionary, TaskDefinition } from './task'; -import { TaskManagerRunner } from './task_runner'; +import { TaskManagerRunner, TaskRunResult } from './task_runner'; import { mockLogger } from './test_utils'; import { SavedObjectsErrorHelpers } from '../../../../src/core/server'; import moment from 'moment'; @@ -790,7 +790,9 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asOk(instance))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) + ); }); test('emits TaskEvent when a recurring task is run successfully', async () => { @@ -816,14 +818,16 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asOk(instance))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) + ); }); test('emits TaskEvent when a task run throws an error', async () => { const id = _.random(1, 20).toString(); const error = new Error('Dangit!'); const onTaskEvent = jest.fn(); - const { runner } = testOpts({ + const { runner, instance } 
= testOpts({ onTaskEvent, instance: { id, @@ -840,7 +844,11 @@ describe('TaskManagerRunner', () => { }); await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asErr(error))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent(id, asErr({ error, task: instance, result: TaskRunResult.RetryScheduled })) + ) + ); expect(onTaskEvent).toHaveBeenCalledTimes(1); }); @@ -848,7 +856,7 @@ describe('TaskManagerRunner', () => { const id = _.random(1, 20).toString(); const error = new Error('Dangit!'); const onTaskEvent = jest.fn(); - const { runner } = testOpts({ + const { runner, instance } = testOpts({ onTaskEvent, instance: { id, @@ -868,7 +876,11 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asErr(error))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent(id, asErr({ error, task: instance, result: TaskRunResult.RetryScheduled })) + ) + ); expect(onTaskEvent).toHaveBeenCalledTimes(1); }); @@ -876,7 +888,7 @@ describe('TaskManagerRunner', () => { const id = _.random(1, 20).toString(); const error = new Error('Dangit!'); const onTaskEvent = jest.fn(); - const { runner, store } = testOpts({ + const { runner, store, instance: originalInstance } = testOpts({ onTaskEvent, instance: { id, @@ -899,7 +911,18 @@ describe('TaskManagerRunner', () => { const instance = store.update.args[0][0]; expect(instance.status).toBe('failed'); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asErr(error))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent( + id, + asErr({ + error, + task: originalInstance, + result: TaskRunResult.Failed, + }) + ) + ) + ); expect(onTaskEvent).toHaveBeenCalledTimes(1); }); }); @@ -910,6 +933,13 @@ describe('TaskManagerRunner', () => { onTaskEvent?: (event: TaskEvent) => void; } + function withAnyTiming(taskRun: TaskRun) { + return { + ...taskRun, + timing: { start: 
expect.any(Number), stop: expect.any(Number) }, + }; + } + function testOpts(opts: TestOpts) { const callCluster = sinon.stub(); const createTaskRunner = sinon.stub(); diff --git a/x-pack/plugins/task_manager/server/task_runner.ts b/x-pack/plugins/task_manager/server/task_runner.ts index 3716e57e928f5..c14c1266c4ad0 100644 --- a/x-pack/plugins/task_manager/server/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_runner.ts @@ -16,7 +16,7 @@ import { performance } from 'perf_hooks'; import Joi from 'joi'; import { identity, defaults, flow } from 'lodash'; -import { asOk, asErr, mapErr, eitherAsync, unwrap, mapOk, Result } from './lib/result_type'; +import { asOk, asErr, mapErr, eitherAsync, unwrap, isOk, mapOk, Result } from './lib/result_type'; import { TaskRun, TaskMarkRunning, @@ -72,6 +72,21 @@ interface Opts { onTaskEvent?: (event: TaskRun | TaskMarkRunning) => void; } +export enum TaskRunResult { + // Task completed successfully + Success = 'Success', + // Recurring Task completed successfully + SuccessRescheduled = 'Success', + // // Task completed successfully after a retry + // SuccessfulRetry = 'SuccessfulRetry', + // // Recurring Task completed successfully after a retry + // SuccessfulRetryRescheduled = 'SuccessfulRetry', + // Task has failed and a retry has been scheduled + RetryScheduled = 'RetryScheduled', + // Task has failed + Failed = 'Failed', +} + /** * Runs a background task, ensures that errors are properly handled, * allows for cancellation. 
@@ -350,8 +365,9 @@ export class TaskManagerRunner implements TaskRunner { private async processResultForRecurringTask( result: Result - ): Promise { - const fieldUpdates = flow( + ): Promise { + const hasTaskRunFailed = isOk(result); + const fieldUpdates: Partial & Pick = flow( // if running the task has failed ,try to correct by scheduling a retry in the near future mapErr(this.rescheduleFailedRun), // if retrying is possible (new runAt) or this is an recurring task - reschedule @@ -370,7 +386,7 @@ export class TaskManagerRunner implements TaskRunner { await this.bufferedTaskStore.update( defaults( { - ...(fieldUpdates as Partial), + ...fieldUpdates, // reset fields that track the lifecycle of the concluded `task run` startedAt: null, retryAt: null, @@ -379,9 +395,15 @@ export class TaskManagerRunner implements TaskRunner { this.instance ) ); + + return fieldUpdates.status === TaskStatus.Failed + ? TaskRunResult.Failed + : hasTaskRunFailed + ? TaskRunResult.SuccessRescheduled + : TaskRunResult.RetryScheduled; } - private async processResultWhenDone(): Promise { + private async processResultWhenDone(): Promise { // not a recurring task: clean up by removing the task instance from store try { await this.bufferedTaskStore.remove(this.instance.id); @@ -392,25 +414,38 @@ export class TaskManagerRunner implements TaskRunner { throw err; } } + return TaskRunResult.Success; } private async processResult( result: Result, taskTiming: TaskTiming ): Promise> { + const task = this.instance; await eitherAsync( result, async ({ runAt }: SuccessfulRunResult) => { - if (runAt || this.instance.schedule) { - await this.processResultForRecurringTask(result); - } else { - await this.processResultWhenDone(); - } - this.onTaskEvent(asTaskRunEvent(this.id, asOk(this.instance), taskTiming)); + this.onTaskEvent( + asTaskRunEvent( + this.id, + asOk({ + task, + result: await (runAt || task.schedule + ? 
this.processResultForRecurringTask(result) + : this.processResultWhenDone()), + }), + taskTiming + ) + ); }, async ({ error }: FailedRunResult) => { - await this.processResultForRecurringTask(result); - this.onTaskEvent(asTaskRunEvent(this.id, asErr(error), taskTiming)); + this.onTaskEvent( + asTaskRunEvent( + this.id, + asErr({ task, result: await this.processResultForRecurringTask(result), error }), + taskTiming + ) + ); } ); return result; From aa787b6ddc2036f9cdd625bb92ce3263b65f0b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20C=C3=B4t=C3=A9?= Date: Wed, 30 Sep 2020 19:54:18 -0400 Subject: [PATCH 17/42] Temporarily apply back pressure to maxWorkers and pollInterval when 429 errors occur (#77096) * WIP * Cleanup * Add error count to message * Reset observable values on stop * Add comments * Fix issues when changing configurations * Cleanup code * Cleanup pt2 * Some renames * Fix typecheck * Use observables to manage throughput * Rename class * Switch to createManagedConfiguration * Add some comments * Start unit tests * Add logs * Fix log level * Attempt at adding integration tests * Fix test failures * Fix timer * Revert "Fix timer" This reverts commit 0817e5e6a5ef9bdfe9329a559f4a5674ebbbef24. 
* Use Symbol * Fix merge scan --- .../managed_configuration.test.ts | 102 +++++++++ .../lib/create_managed_configuration.test.ts | 213 ++++++++++++++++++ .../lib/create_managed_configuration.ts | 160 +++++++++++++ .../server/polling/observable_monitor.ts | 11 +- .../server/polling/task_poller.test.ts | 13 +- .../server/polling/task_poller.ts | 8 +- .../task_manager/server/task_manager.ts | 16 +- .../task_manager/server/task_pool.test.ts | 8 +- .../plugins/task_manager/server/task_pool.ts | 1 + 9 files changed, 519 insertions(+), 13 deletions(-) create mode 100644 x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts create mode 100644 x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts create mode 100644 x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts new file mode 100644 index 0000000000000..4fc8ae899518c --- /dev/null +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -0,0 +1,102 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import sinon from 'sinon'; +import { mockLogger } from '../test_utils'; +import { TaskManager } from '../task_manager'; +import { savedObjectsRepositoryMock } from '../../../../../src/core/server/mocks'; +import { + SavedObjectsSerializer, + SavedObjectTypeRegistry, + SavedObjectsErrorHelpers, +} from '../../../../../src/core/server'; +import { ADJUST_THROUGHPUT_INTERVAL } from '../lib/create_managed_configuration'; + +describe('managed configuration', () => { + let taskManager: TaskManager; + let clock: sinon.SinonFakeTimers; + const callAsInternalUser = jest.fn(); + const logger = mockLogger(); + const serializer = new SavedObjectsSerializer(new SavedObjectTypeRegistry()); + const savedObjectsClient = savedObjectsRepositoryMock.create(); + const config = { + enabled: true, + max_workers: 10, + index: 'foo', + max_attempts: 9, + poll_interval: 3000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + }; + + beforeEach(() => { + jest.resetAllMocks(); + callAsInternalUser.mockResolvedValue({ total: 0, updated: 0, version_conflicts: 0 }); + clock = sinon.useFakeTimers(); + taskManager = new TaskManager({ + config, + logger, + serializer, + callAsInternalUser, + taskManagerId: 'some-uuid', + savedObjectsRepository: savedObjectsClient, + }); + taskManager.registerTaskDefinitions({ + foo: { + type: 'foo', + title: 'Foo', + createTaskRunner: jest.fn(), + }, + }); + taskManager.start(); + }); + + afterEach(() => clock.restore()); + + test('should lower max workers when Elasticsearch returns 429 error', async () => { + savedObjectsClient.create.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') + ); + // Cause "too many requests" error to be thrown + await expect( + taskManager.schedule({ + taskType: 'foo', + state: {}, + params: {}, + }) + ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + expect(logger.warn).toHaveBeenCalledWith( + 'Max workers 
configuration is temporarily reduced after Elasticsearch returned 1 "too many request" error(s).' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Max workers configuration changing from 10 to 8 after seeing 1 error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith('Task pool now using 10 as the max worker value'); + }); + + test('should increase poll interval when Elasticsearch returns 429 error', async () => { + savedObjectsClient.create.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') + ); + // Cause "too many requests" error to be thrown + await expect( + taskManager.schedule({ + taskType: 'foo', + state: {}, + params: {}, + }) + ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + expect(logger.warn).toHaveBeenCalledWith( + 'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" error(s).' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Poll interval configuration changing from 3000 to 3600 after seeing 1 error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms'); + }); +}); diff --git a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts new file mode 100644 index 0000000000000..b6b5cd003c5d4 --- /dev/null +++ b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts @@ -0,0 +1,213 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import sinon from 'sinon'; +import { Subject } from 'rxjs'; +import { mockLogger } from '../test_utils'; +import { SavedObjectsErrorHelpers } from '../../../../../src/core/server'; +import { + createManagedConfiguration, + ADJUST_THROUGHPUT_INTERVAL, +} from './create_managed_configuration'; + +describe('createManagedConfiguration()', () => { + let clock: sinon.SinonFakeTimers; + const logger = mockLogger(); + + beforeEach(() => { + jest.resetAllMocks(); + clock = sinon.useFakeTimers(); + }); + + afterEach(() => clock.restore()); + + test('returns observables with initialized values', async () => { + const maxWorkersSubscription = jest.fn(); + const pollIntervalSubscription = jest.fn(); + const { maxWorkersConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + logger, + errors$: new Subject(), + startingMaxWorkers: 1, + startingPollInterval: 2, + }); + maxWorkersConfiguration$.subscribe(maxWorkersSubscription); + pollIntervalConfiguration$.subscribe(pollIntervalSubscription); + expect(maxWorkersSubscription).toHaveBeenCalledTimes(1); + expect(maxWorkersSubscription).toHaveBeenNthCalledWith(1, 1); + expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); + expect(pollIntervalSubscription).toHaveBeenNthCalledWith(1, 2); + }); + + test(`skips errors that aren't about too many requests`, async () => { + const maxWorkersSubscription = jest.fn(); + const pollIntervalSubscription = jest.fn(); + const errors$ = new Subject(); + const { maxWorkersConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + errors$, + logger, + startingMaxWorkers: 100, + startingPollInterval: 100, + }); + maxWorkersConfiguration$.subscribe(maxWorkersSubscription); + pollIntervalConfiguration$.subscribe(pollIntervalSubscription); + errors$.next(new Error('foo')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + expect(maxWorkersSubscription).toHaveBeenCalledTimes(1); + expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); + }); + + 
describe('maxWorker configuration', () => { + function setupScenario(startingMaxWorkers: number) { + const errors$ = new Subject(); + const subscription = jest.fn(); + const { maxWorkersConfiguration$ } = createManagedConfiguration({ + errors$, + startingMaxWorkers, + logger, + startingPollInterval: 1, + }); + maxWorkersConfiguration$.subscribe(subscription); + return { subscription, errors$ }; + } + + beforeEach(() => { + jest.resetAllMocks(); + clock = sinon.useFakeTimers(); + }); + + afterEach(() => clock.restore()); + + test('should decrease configuration at the next interval when an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(100); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL - 1); + expect(subscription).toHaveBeenCalledTimes(1); + clock.tick(1); + expect(subscription).toHaveBeenCalledTimes(2); + expect(subscription).toHaveBeenNthCalledWith(2, 80); + }); + + test('should log a warning when the configuration changes from the starting value', async () => { + const { errors$ } = setupScenario(100); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + expect(logger.warn).toHaveBeenCalledWith( + 'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" error(s).' 
+ ); + }); + + test('should increase configuration back to normal incrementally after an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(100); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL * 10); + expect(subscription).toHaveBeenNthCalledWith(2, 80); + expect(subscription).toHaveBeenNthCalledWith(3, 84); + // 88.2- > 89 from Math.ceil + expect(subscription).toHaveBeenNthCalledWith(4, 89); + expect(subscription).toHaveBeenNthCalledWith(5, 94); + expect(subscription).toHaveBeenNthCalledWith(6, 99); + // 103.95 -> 100 from Math.min with starting value + expect(subscription).toHaveBeenNthCalledWith(7, 100); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(7); + }); + + test('should keep reducing configuration when errors keep emitting', async () => { + const { subscription, errors$ } = setupScenario(100); + for (let i = 0; i < 20; i++) { + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + } + expect(subscription).toHaveBeenNthCalledWith(2, 80); + expect(subscription).toHaveBeenNthCalledWith(3, 64); + // 51.2 -> 51 from Math.floor + expect(subscription).toHaveBeenNthCalledWith(4, 51); + expect(subscription).toHaveBeenNthCalledWith(5, 40); + expect(subscription).toHaveBeenNthCalledWith(6, 32); + expect(subscription).toHaveBeenNthCalledWith(7, 25); + expect(subscription).toHaveBeenNthCalledWith(8, 20); + expect(subscription).toHaveBeenNthCalledWith(9, 16); + expect(subscription).toHaveBeenNthCalledWith(10, 12); + expect(subscription).toHaveBeenNthCalledWith(11, 9); + expect(subscription).toHaveBeenNthCalledWith(12, 7); + expect(subscription).toHaveBeenNthCalledWith(13, 5); + expect(subscription).toHaveBeenNthCalledWith(14, 4); + expect(subscription).toHaveBeenNthCalledWith(15, 3); + 
expect(subscription).toHaveBeenNthCalledWith(16, 2); + expect(subscription).toHaveBeenNthCalledWith(17, 1); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(17); + }); + }); + + describe('pollInterval configuration', () => { + function setupScenario(startingPollInterval: number) { + const errors$ = new Subject(); + const subscription = jest.fn(); + const { pollIntervalConfiguration$ } = createManagedConfiguration({ + logger, + errors$, + startingPollInterval, + startingMaxWorkers: 1, + }); + pollIntervalConfiguration$.subscribe(subscription); + return { subscription, errors$ }; + } + + beforeEach(() => { + jest.resetAllMocks(); + clock = sinon.useFakeTimers(); + }); + + afterEach(() => clock.restore()); + + test('should increase configuration at the next interval when an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(100); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL - 1); + expect(subscription).toHaveBeenCalledTimes(1); + clock.tick(1); + expect(subscription).toHaveBeenCalledTimes(2); + expect(subscription).toHaveBeenNthCalledWith(2, 120); + }); + + test('should log a warning when the configuration changes from the starting value', async () => { + const { errors$ } = setupScenario(100); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + expect(logger.warn).toHaveBeenCalledWith( + 'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" error(s).' 
+ ); + }); + + test('should decrease configuration back to normal incrementally after an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(100); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL * 10); + expect(subscription).toHaveBeenNthCalledWith(2, 120); + expect(subscription).toHaveBeenNthCalledWith(3, 114); + // 108.3 -> 108 from Math.floor + expect(subscription).toHaveBeenNthCalledWith(4, 108); + expect(subscription).toHaveBeenNthCalledWith(5, 102); + // 96.9 -> 100 from Math.max with the starting value + expect(subscription).toHaveBeenNthCalledWith(6, 100); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(6); + }); + + test('should increase configuration when errors keep emitting', async () => { + const { subscription, errors$ } = setupScenario(100); + for (let i = 0; i < 3; i++) { + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + } + expect(subscription).toHaveBeenNthCalledWith(2, 120); + expect(subscription).toHaveBeenNthCalledWith(3, 144); + // 172.8 -> 173 from Math.ceil + expect(subscription).toHaveBeenNthCalledWith(4, 173); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts new file mode 100644 index 0000000000000..3dc5fd50d3ca4 --- /dev/null +++ b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts @@ -0,0 +1,160 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { interval, merge, of, Observable } from 'rxjs'; +import { filter, mergeScan, map, scan, distinctUntilChanged, startWith } from 'rxjs/operators'; +import { SavedObjectsErrorHelpers } from '../../../../../src/core/server'; +import { Logger } from '../types'; + +const FLUSH_MARKER = Symbol('flush'); +export const ADJUST_THROUGHPUT_INTERVAL = 10 * 1000; + +// When errors occur, reduce maxWorkers by MAX_WORKERS_DECREASE_PERCENTAGE +// When errors no longer occur, start increasing maxWorkers by MAX_WORKERS_INCREASE_PERCENTAGE +// until starting value is reached +const MAX_WORKERS_DECREASE_PERCENTAGE = 0.8; +const MAX_WORKERS_INCREASE_PERCENTAGE = 1.05; + +// When errors occur, increase pollInterval by POLL_INTERVAL_INCREASE_PERCENTAGE +// When errors no longer occur, start decreasing pollInterval by POLL_INTERVAL_DECREASE_PERCENTAGE +// until starting value is reached +const POLL_INTERVAL_DECREASE_PERCENTAGE = 0.95; +const POLL_INTERVAL_INCREASE_PERCENTAGE = 1.2; + +interface ManagedConfigurationOpts { + logger: Logger; + startingMaxWorkers: number; + startingPollInterval: number; + errors$: Observable; +} + +interface ManagedConfiguration { + maxWorkersConfiguration$: Observable; + pollIntervalConfiguration$: Observable; +} + +export function createManagedConfiguration({ + logger, + startingMaxWorkers, + startingPollInterval, + errors$, +}: ManagedConfigurationOpts): ManagedConfiguration { + const errorCheck$ = countErrors(errors$, ADJUST_THROUGHPUT_INTERVAL); + return { + maxWorkersConfiguration$: errorCheck$.pipe( + createMaxWorkersScan(logger, startingMaxWorkers), + startWith(startingMaxWorkers), + distinctUntilChanged() + ), + pollIntervalConfiguration$: errorCheck$.pipe( + createPollIntervalScan(logger, startingPollInterval), + startWith(startingPollInterval), + distinctUntilChanged() + ), + }; +} + +function createMaxWorkersScan(logger: Logger, startingMaxWorkers: number) { + return scan((previousMaxWorkers: number, errorCount: number) => { + let 
newMaxWorkers: number; + if (errorCount > 0) { + // Decrease max workers by MAX_WORKERS_DECREASE_PERCENTAGE while making sure it doesn't go lower than 1. + // Using Math.floor to make sure the number is different than previous while not being a decimal value. + newMaxWorkers = Math.max(Math.floor(previousMaxWorkers * MAX_WORKERS_DECREASE_PERCENTAGE), 1); + } else { + // Increase max workers by MAX_WORKERS_INCREASE_PERCENTAGE while making sure it doesn't go + // higher than the starting value. Using Math.ceil to make sure the number is different than + // previous while not being a decimal value + newMaxWorkers = Math.min( + startingMaxWorkers, + Math.ceil(previousMaxWorkers * MAX_WORKERS_INCREASE_PERCENTAGE) + ); + } + if (newMaxWorkers !== previousMaxWorkers) { + logger.debug( + `Max workers configuration changing from ${previousMaxWorkers} to ${newMaxWorkers} after seeing ${errorCount} error(s)` + ); + if (previousMaxWorkers === startingMaxWorkers) { + logger.warn( + `Max workers configuration is temporarily reduced after Elasticsearch returned ${errorCount} "too many request" error(s).` + ); + } + } + return newMaxWorkers; + }, startingMaxWorkers); +} + +function createPollIntervalScan(logger: Logger, startingPollInterval: number) { + return scan((previousPollInterval: number, errorCount: number) => { + let newPollInterval: number; + if (errorCount > 0) { + // Increase poll interval by POLL_INTERVAL_INCREASE_PERCENTAGE and use Math.ceil to + // make sure the number is different than previous while not being a decimal value. + newPollInterval = Math.ceil(previousPollInterval * POLL_INTERVAL_INCREASE_PERCENTAGE); + } else { + // Decrease poll interval by POLL_INTERVAL_DECREASE_PERCENTAGE and use Math.floor to + // make sure the number is different than previous while not being a decimal value. 
+ newPollInterval = Math.max( + startingPollInterval, + Math.floor(previousPollInterval * POLL_INTERVAL_DECREASE_PERCENTAGE) + ); + } + if (newPollInterval !== previousPollInterval) { + logger.debug( + `Poll interval configuration changing from ${previousPollInterval} to ${newPollInterval} after seeing ${errorCount} error(s)` + ); + if (previousPollInterval === startingPollInterval) { + logger.warn( + `Poll interval configuration is temporarily increased after Elasticsearch returned ${errorCount} "too many request" error(s).` + ); + } + } + return newPollInterval; + }, startingPollInterval); +} + +function countErrors(errors$: Observable, countInterval: number): Observable { + return merge( + // Flush error count at fixed interval + interval(countInterval).pipe(map(() => FLUSH_MARKER)), + errors$.pipe(filter((e) => SavedObjectsErrorHelpers.isTooManyRequestsError(e))) + ).pipe( + // When tag is "flush", reset the error counter + // Otherwise increment the error counter + mergeScan(({ count }, next) => { + return next === FLUSH_MARKER + ? 
of(emitErrorCount(count), resetErrorCount()) + : of(incementErrorCount(count)); + }, emitErrorCount(0)), + filter(isEmitEvent), + map(({ count }) => count) + ); +} + +function emitErrorCount(count: number) { + return { + tag: 'emit', + count, + }; +} + +function isEmitEvent(event: { tag: string; count: number }) { + return event.tag === 'emit'; +} + +function incementErrorCount(count: number) { + return { + tag: 'inc', + count: count + 1, + }; +} + +function resetErrorCount() { + return { + tag: 'initial', + count: 0, + }; +} diff --git a/x-pack/plugins/task_manager/server/polling/observable_monitor.ts b/x-pack/plugins/task_manager/server/polling/observable_monitor.ts index 7b06117ef59d1..e0c31f7014a6a 100644 --- a/x-pack/plugins/task_manager/server/polling/observable_monitor.ts +++ b/x-pack/plugins/task_manager/server/polling/observable_monitor.ts @@ -5,8 +5,16 @@ */ import { Subject, Observable, throwError, interval, timer, Subscription } from 'rxjs'; -import { exhaustMap, tap, takeUntil, switchMap, switchMapTo, catchError } from 'rxjs/operators'; import { noop } from 'lodash'; +import { + exhaustMap, + tap, + takeUntil, + switchMap, + switchMapTo, + catchError, + startWith, +} from 'rxjs/operators'; const DEFAULT_HEARTBEAT_INTERVAL = 1000; @@ -31,6 +39,7 @@ export function createObservableMonitor( return new Observable((subscriber) => { const subscription: Subscription = interval(heartbeatInterval) .pipe( + startWith(0), // switch from the heartbeat interval to the instantiated observable until it completes / errors exhaustMap(() => takeUntilDurationOfInactivity(observableFactory(), inactivityTimeout)), // if an error is thrown, catch it, notify and try to recover diff --git a/x-pack/plugins/task_manager/server/polling/task_poller.test.ts b/x-pack/plugins/task_manager/server/polling/task_poller.test.ts index 1c6aff2ad58b9..956c8b05f3860 100644 --- a/x-pack/plugins/task_manager/server/polling/task_poller.test.ts +++ 
b/x-pack/plugins/task_manager/server/polling/task_poller.test.ts @@ -9,7 +9,7 @@ import { Subject, of, BehaviorSubject } from 'rxjs'; import { Option, none, some } from 'fp-ts/lib/Option'; import { createTaskPoller, PollingError, PollingErrorType } from './task_poller'; import { fakeSchedulers } from 'rxjs-marbles/jest'; -import { sleep, resolvable, Resolvable } from '../test_utils'; +import { sleep, resolvable, Resolvable, mockLogger } from '../test_utils'; import { asOk, asErr } from '../lib/result_type'; describe('TaskPoller', () => { @@ -24,6 +24,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, getCapacity: () => 1, @@ -58,6 +59,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); createTaskPoller({ + logger: mockLogger(), pollInterval$, bufferCapacity, getCapacity: () => 1, @@ -99,6 +101,7 @@ describe('TaskPoller', () => { let hasCapacity = true; createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -157,6 +160,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -202,6 +206,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -246,6 +251,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -282,6 +288,7 @@ describe('TaskPoller', () => { const handler = jest.fn(); const pollRequests$ = new Subject>(); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work: 
async (...args) => { @@ -332,6 +339,7 @@ describe('TaskPoller', () => { type ResolvableTupple = [string, PromiseLike & Resolvable]; const pollRequests$ = new Subject>(); createTaskPoller<[string, Resolvable], string[]>({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work: async (...resolvables) => { @@ -391,6 +399,7 @@ describe('TaskPoller', () => { const handler = jest.fn(); const pollRequests$ = new Subject>(); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work: async (...args) => { @@ -431,6 +440,7 @@ describe('TaskPoller', () => { return callCount; }); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -473,6 +483,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => {}); const pollRequests$ = new Subject>(); createTaskPoller({ + logger: mockLogger(), pollInterval$: of(pollInterval), bufferCapacity, work, diff --git a/x-pack/plugins/task_manager/server/polling/task_poller.ts b/x-pack/plugins/task_manager/server/polling/task_poller.ts index 867d01691c41d..7515668a19d40 100644 --- a/x-pack/plugins/task_manager/server/polling/task_poller.ts +++ b/x-pack/plugins/task_manager/server/polling/task_poller.ts @@ -15,6 +15,7 @@ import { mapTo, filter, scan, concatMap, tap, catchError, switchMap } from 'rxjs import { pipe } from 'fp-ts/lib/pipeable'; import { Option, none, map as mapOptional, getOrElse } from 'fp-ts/lib/Option'; +import { Logger } from '../types'; import { pullFromSet } from '../lib/pull_from_set'; import { Result, @@ -30,6 +31,7 @@ import { timeoutPromiseAfter } from './timeout_promise_after'; type WorkFn = (...params: T[]) => Promise; interface Opts { + logger: Logger; pollInterval$: Observable; bufferCapacity: number; getCapacity: () => number; @@ -52,6 +54,7 @@ interface Opts { * of unique request argumets of type T. 
The queue holds all the buffered request arguments streamed in via pollRequests$ */ export function createTaskPoller({ + logger, pollInterval$, getCapacity, pollRequests$, @@ -68,7 +71,10 @@ export function createTaskPoller({ pollRequests$, // emit a polling event on a fixed interval pollInterval$.pipe( - switchMap((period) => interval(period)), + switchMap((period) => { + logger.debug(`Task poller now using interval of ${period}ms`); + return interval(period); + }), mapTo(none) ) ).pipe( diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index 6a39f2a762e75..cc611e124ea7b 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -import { BehaviorSubject, Subject, Observable, Subscription } from 'rxjs'; +import { Subject, Observable, Subscription } from 'rxjs'; import { filter } from 'rxjs/operators'; import { performance } from 'perf_hooks'; @@ -17,6 +17,7 @@ import { ISavedObjectsRepository, } from '../../../../src/core/server'; import { Result, asOk, asErr, either, map, mapErr, promiseResult } from './lib/result_type'; +import { createManagedConfiguration } from './lib/create_managed_configuration'; import { TaskManagerConfig } from './config'; import { Logger } from './types'; @@ -149,8 +150,12 @@ export class TaskManager { // pipe store events into the TaskManager's event stream this.store.events.subscribe((event) => this.events$.next(event)); - const maxWorkers$ = new BehaviorSubject(opts.config.max_workers); - const pollInterval$ = new BehaviorSubject(opts.config.poll_interval); + const { maxWorkersConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + logger: this.logger, + errors$: this.store.errors$, + startingMaxWorkers: 
opts.config.max_workers, + startingPollInterval: opts.config.poll_interval, + }); this.bufferedStore = new BufferedTaskStore(this.store, { bufferMaxOperations: opts.config.max_workers, @@ -159,7 +164,7 @@ export class TaskManager { this.pool = new TaskPool({ logger: this.logger, - maxWorkers$, + maxWorkers$: maxWorkersConfiguration$, }); const { @@ -169,7 +174,8 @@ export class TaskManager { this.poller$ = createObservableMonitor>, Error>( () => createTaskPoller({ - pollInterval$, + logger: this.logger, + pollInterval$: pollIntervalConfiguration$, bufferCapacity: opts.config.request_capacity, getCapacity: () => this.pool.availableWorkers, pollRequests$: this.claimRequests$, diff --git a/x-pack/plugins/task_manager/server/task_pool.test.ts b/x-pack/plugins/task_manager/server/task_pool.test.ts index ec6613ece4eed..12b731b2b78ae 100644 --- a/x-pack/plugins/task_manager/server/task_pool.test.ts +++ b/x-pack/plugins/task_manager/server/task_pool.test.ts @@ -130,11 +130,9 @@ describe('TaskPool', () => { const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); - expect(logger.debug.mock.calls[0]).toMatchInlineSnapshot(` - Array [ - "Task TaskType \\"shooooo\\" failed in attempt to run: Saved object [task/foo] not found", - ] - `); + expect(logger.debug).toHaveBeenCalledWith( + 'Task TaskType "shooooo" failed in attempt to run: Saved object [task/foo] not found' + ); expect(logger.warn).not.toHaveBeenCalled(); expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); diff --git a/x-pack/plugins/task_manager/server/task_pool.ts b/x-pack/plugins/task_manager/server/task_pool.ts index c029349c13b77..44f5f5648c2ac 100644 --- a/x-pack/plugins/task_manager/server/task_pool.ts +++ b/x-pack/plugins/task_manager/server/task_pool.ts @@ -47,6 +47,7 @@ export class TaskPool { constructor(opts: Opts) { this.logger = opts.logger; opts.maxWorkers$.subscribe((maxWorkers) => { + this.logger.debug(`Task pool now using ${maxWorkers} as the max worker value`); 
this.maxWorkers = maxWorkers; }); } From 4a5d652a2c2b423cea5adb811db94808723268e9 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Thu, 1 Oct 2020 14:30:56 +0100 Subject: [PATCH 18/42] added success and failure ratio --- .../monitoring/task_run_statistics.test.ts | 6 +- .../server/monitoring/task_run_statistics.ts | 88 ++++++++++++------- .../test_suites/task_manager/health_route.ts | 15 +++- 3 files changed, 71 insertions(+), 38 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index b0a67216927f8..fefe0fd62b874 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -110,7 +110,7 @@ describe('Task Run Statistics', () => { windows: Record ) { for (const [type, window] of Object.entries(windows)) { - expect(taskStat.value.duration[type]).toMatchObject({ + expect(taskStat.value.execution.duration[type]).toMatchObject({ mean: Math.round(stats.mean(window)), median: stats.median(window), }); @@ -230,7 +230,9 @@ describe('Task Run Statistics', () => { * At any given time we only keep track of the last X Polling Results * In the tests this is ocnfiugured to a window size of 5 */ - expect(taskStats.map((taskStat) => taskStat.value.taskRunResultFrequency)).toEqual([ + expect( + taskStats.map((taskStat) => taskStat.value.execution.resultFrequency['alerting:test']) + ).toEqual([ // Success { Success: 100, RetryScheduled: 0, Failed: 0 }, // Success, Success, diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index 1708542ed8587..5c3c3d12972aa 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -35,12 +35,17 @@ interface 
FillPoolStat extends JsonObject { resultFrequency: FillPoolResult[]; } +interface ExecutionStat extends JsonObject { + duration: Record; + resultFrequency: Record; +} + export interface TaskRunStat extends JsonObject { drift: number[]; - duration: Record; - taskRunResultFrequency: TaskRunResult[]; + execution: ExecutionStat; polling: FillPoolStat | Omit; } + interface FillPoolRawStat extends JsonObject { lastSuccessfulPoll: string; resultFrequency: { @@ -52,12 +57,17 @@ interface FillPoolRawStat extends JsonObject { export interface SummarizedTaskRunStat extends JsonObject { drift: AveragedStat; - duration: Record; - taskRunResultFrequency: { - [TaskRunResult.Success]: number; - [TaskRunResult.SuccessRescheduled]: number; - [TaskRunResult.RetryScheduled]: number; - [TaskRunResult.Failed]: number; + execution: { + duration: Record; + resultFrequency: Record< + string, + { + [TaskRunResult.Success]: number; + [TaskRunResult.SuccessRescheduled]: number; + [TaskRunResult.RetryScheduled]: number; + [TaskRunResult.Failed]: number; + } + >; }; polling: FillPoolRawStat | Omit; } @@ -76,33 +86,35 @@ export function createTaskRunAggregator( ); const resultFrequencyQueue = createRunningAveragedStat(runningAverageWindowSize); - const taskPollingEvents$: Observable = taskManager.events.pipe( + const taskPollingEvents$: Observable> = taskManager.events.pipe( filter( (taskEvent: TaskLifecycleEvent) => isTaskPollingCycleEvent(taskEvent) && isOk(taskEvent.event) ), map((taskEvent: TaskLifecycleEvent) => { return { - lastSuccessfulPoll: new Date().toISOString(), - resultFrequency: resultFrequencyQueue((taskEvent.event as Ok).value), + polling: { + lastSuccessfulPoll: new Date().toISOString(), + resultFrequency: resultFrequencyQueue((taskEvent.event as Ok).value), + }, }; }) ); return combineLatest( - taskRunEvents$.pipe(startWith({ duration: {}, drift: [], taskRunResultFrequency: [] })), + taskRunEvents$.pipe(startWith({ drift: [], execution: { duration: {}, resultFrequency: {} } 
})), taskPollingEvents$.pipe( startWith({ - resultFrequency: [], + polling: { resultFrequency: [] }, }) ) ).pipe( - map(([taskRun, polling]) => { + map(([taskRun, polling]: [Omit, Pick]) => { return { key: 'runtime', value: { ...taskRun, - polling, + ...polling, }, } as AggregatedStat; }) @@ -116,42 +128,54 @@ function hasTiming(taskEvent: TaskLifecycleEvent) { function createTaskRunEventToStat(runningAverageWindowSize: number) { const driftQueue = createRunningAveragedStat(runningAverageWindowSize); const taskRunDurationQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); - const resultFrequencyQueue = createRunningAveragedStat(runningAverageWindowSize); + const resultFrequencyQueue = createMapOfRunningAveragedStats( + runningAverageWindowSize + ); return ( task: ConcreteTaskInstance, timing: TaskTiming, result: TaskRunResult ): Omit => ({ - duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), drift: driftQueue(timing!.start - task.runAt.getTime()), - taskRunResultFrequency: resultFrequencyQueue(result), + execution: { + duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), + resultFrequency: resultFrequencyQueue(task.taskType, result), + }, }); } +const DEFAULT_TASK_RUN_FREQUENCIES = { + [TaskRunResult.Success]: 0, + [TaskRunResult.SuccessRescheduled]: 0, + [TaskRunResult.RetryScheduled]: 0, + [TaskRunResult.Failed]: 0, +}; +const DEFAULT_POLLING_FREQUENCIES = { + [FillPoolResult.NoTasksClaimed]: 0, + [FillPoolResult.RanOutOfCapacity]: 0, + [FillPoolResult.PoolFilled]: 0, +}; + export function summarizeTaskRunStat({ - polling: { lastSuccessfulPoll, resultFrequency }, + polling: { lastSuccessfulPoll, resultFrequency: pollingResultFrequency }, drift, - duration, - taskRunResultFrequency, + execution: { duration, resultFrequency: executionResultFrequency }, }: TaskRunStat): SummarizedTaskRunStat { return { polling: { ...(lastSuccessfulPoll ? 
{ lastSuccessfulPoll } : {}), resultFrequency: { - [FillPoolResult.NoTasksClaimed]: 0, - [FillPoolResult.RanOutOfCapacity]: 0, - [FillPoolResult.PoolFilled]: 0, - ...calculateFrequency(resultFrequency as FillPoolResult[]), + ...DEFAULT_POLLING_FREQUENCIES, + ...calculateFrequency(pollingResultFrequency as FillPoolResult[]), }, }, drift: calculateRunningAverage(drift), - duration: mapValues(duration, (typedDuration) => calculateRunningAverage(typedDuration)), - taskRunResultFrequency: { - [TaskRunResult.Success]: 0, - [TaskRunResult.SuccessRescheduled]: 0, - [TaskRunResult.RetryScheduled]: 0, - [TaskRunResult.Failed]: 0, - ...calculateFrequency(taskRunResultFrequency), + execution: { + duration: mapValues(duration, (typedDurations) => calculateRunningAverage(typedDurations)), + resultFrequency: mapValues(executionResultFrequency, (typedResultFrequencies) => ({ + ...DEFAULT_TASK_RUN_FREQUENCIES, + ...calculateFrequency(typedResultFrequencies), + })), }, }; } diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index d5fef2852eed3..eb00f81bea629 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -26,7 +26,10 @@ interface MonitoringStats { timestamp: string; value: { drift: Record; - duration: Record>; + execution: { + duration: Record>; + resultFrequency: Record>; + }; polling: { lastSuccessfulPoll: string; resultFrequency: Record; @@ -130,7 +133,7 @@ export default function ({ getService }: FtrProviderContext) { const { runtime: { - value: { drift, polling, duration }, + value: { drift, polling, execution }, }, } = (await getHealth()).stats; @@ -142,8 +145,12 @@ export default function ({ getService }: FtrProviderContext) { expect(typeof drift.mean).to.eql('number'); expect(typeof drift.median).to.eql('number'); - expect(typeof 
duration.sampleTask.mean).to.eql('number'); - expect(typeof duration.sampleTask.median).to.eql('number'); + expect(typeof execution.duration.sampleTask.mean).to.eql('number'); + expect(typeof execution.duration.sampleTask.median).to.eql('number'); + + expect(typeof execution.resultFrequency.sampleTask.Success).to.eql('number'); + expect(typeof execution.resultFrequency.sampleTask.RetryScheduled).to.eql('number'); + expect(typeof execution.resultFrequency.sampleTask.Failed).to.eql('number'); }); }); } From de5a7ac4e10af82ec4731d1c39a21720f3462dbd Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Fri, 2 Oct 2020 21:56:08 +0100 Subject: [PATCH 19/42] added schedule density to Task Manager health --- .../task_manager/server/lib/intervals.test.ts | 25 ++ .../task_manager/server/lib/intervals.ts | 6 + .../monitoring/monitoring_stats_stream.ts | 7 +- .../monitoring/workload_statistics.test.ts | 358 +++++++++++++++--- .../server/monitoring/workload_statistics.ts | 183 +++++++-- .../server/queries/aggregation_clauses.ts | 153 +++++++- .../task_manager/server/task_manager.ts | 4 +- .../plugins/task_manager/server/task_store.ts | 53 ++- 8 files changed, 683 insertions(+), 106 deletions(-) diff --git a/x-pack/plugins/task_manager/server/lib/intervals.test.ts b/x-pack/plugins/task_manager/server/lib/intervals.test.ts index 3554f8d8294f2..5ce6c33c57973 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.test.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.test.ts @@ -12,6 +12,7 @@ import { intervalFromDate, secondsFromNow, secondsFromDate, + asInterval, } from './intervals'; let fakeTimer: sinon.SinonFakeTimers; @@ -51,6 +52,30 @@ describe('taskIntervals', () => { }); }); + describe('asInterval', () => { + test('returns a ms interval when ms duration can only divide by ms', () => { + expect(asInterval(500)).toEqual('500ms'); + expect(asInterval(1500)).toEqual('1500ms'); + expect(asInterval(1001)).toEqual('1001ms'); + 
expect(asInterval(2001)).toEqual('2001ms'); + expect(asInterval(61001)).toEqual('61001ms'); + expect(asInterval(90001)).toEqual('90001ms'); + }); + + test('returns a seconds interval when ms duration divides by seconds', () => { + expect(asInterval(1000)).toEqual('1s'); + expect(asInterval(2000)).toEqual('2s'); + expect(asInterval(61000)).toEqual('61s'); + expect(asInterval(99000)).toEqual('99s'); + expect(asInterval(90000)).toEqual('90s'); + }); + + test('returns a minutes interval when ms duration divides by minutes', () => { + expect(asInterval(60000)).toEqual('1m'); + expect(asInterval(120000)).toEqual('2m'); + }); + }); + describe('intervalFromNow', () => { test('it returns the current date plus n minutes', () => { const mins = _.random(1, 100); diff --git a/x-pack/plugins/task_manager/server/lib/intervals.ts b/x-pack/plugins/task_manager/server/lib/intervals.ts index 967251e6d717f..914bc35bb526f 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.ts @@ -20,6 +20,12 @@ function isCadence(cadence: IntervalCadence | string): cadence is IntervalCadenc return VALID_CADENCE.has(cadence as IntervalCadence); } +export function asInterval(ms: number): string { + const secondsRemainder = ms % 1000; + const minutesRemainder = ms % 60000; + return secondsRemainder ? `${ms}ms` : minutesRemainder ? `${ms / 1000}s` : `${ms / 60000}m`; +} + /** * Returns a date that is the specified interval from now. Currently, * only minute-intervals and second-intervals are supported. 
diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts index 02ed298a047e6..edb22b6d79ae5 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -63,7 +63,12 @@ export function createAggregators( ): AggregatedStatProvider { return merge( createTaskRunAggregator(taskManager, config.monitored_stats_running_average_window), - createWorkloadAggregator(taskManager, config.monitored_aggregated_stats_refresh_rate, logger) + createWorkloadAggregator( + taskManager, + config.monitored_aggregated_stats_refresh_rate, + config.poll_interval, + logger + ) ); } diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index 0bcf3abfc7607..a95b8d96117da 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -5,27 +5,48 @@ */ import { first, take, bufferCount } from 'rxjs/operators'; -import { createWorkloadAggregator } from './workload_statistics'; +import { createWorkloadAggregator, padBuckets } from './workload_statistics'; import { taskManagerMock } from '../task_manager.mock'; -import { AggregationResult } from '../queries/aggregation_clauses'; +import { AggregationSearchResult, KeyedAggregationBucket } from '../queries/aggregation_clauses'; import { mockLogger } from '../test_utils'; describe('Workload Statistics Aggregator', () => { test('queries the Task Store at a fixed interval for the current workload', async () => { const taskManager = taskManagerMock.create(); taskManager.aggregate.mockResolvedValue(({ - task: { - doc_count: 0, + sum: 0, + aggregations: { taskType: { buckets: [], }, schedule: { buckets: [], }, + idleTasks: { + doc_count: 
0, + overdue: { + doc_count: 0, + }, + scheduleDensity: { + buckets: [ + { + key: '2020-10-02T15:18:37.274Z-2020-10-02T15:19:36.274Z', + from: 1.601651917274e12, + from_as_string: '2020-10-02T15:18:37.274Z', + to: 1.601651976274e12, + to_as_string: '2020-10-02T15:19:36.274Z', + doc_count: 0, + histogram: { + buckets: [], + }, + }, + ], + }, + }, }, - } as unknown) as AggregationResult); + } as unknown) as AggregationSearchResult); - const workloadAggregator = createWorkloadAggregator(taskManager, 10, mockLogger()); + const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe(() => { @@ -44,6 +65,34 @@ describe('Workload Statistics Aggregator', () => { field: 'task.schedule.interval', }, }, + idleTasks: { + filter: { + term: { 'task.status': 'idle' }, + }, + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [{ from: 'now', to: 'now+1m' }], + }, + aggs: { + histogram: { + date_histogram: { + field: 'task.runAt', + fixed_interval: '3s', + }, + }, + }, + }, + overdue: { + filter: { + range: { + 'task.runAt': { lt: 'now' }, + }, + }, + }, + }, + }, }, }); resolve(); @@ -52,8 +101,8 @@ describe('Workload Statistics Aggregator', () => { }); const mockAggregatedResult = ({ - task: { - doc_count: 4, + sum: 4, + aggregations: { schedule: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, @@ -120,46 +169,36 @@ describe('Workload Statistics Aggregator', () => { }, ], }, - }, - } as unknown) as AggregationResult; - - function setTaskTypeCount( - result: AggregationResult, - taskType: string, - status: Record - ) { - const buckets = [ - ...result.task.taskType.buckets.filter(({ key }) => key !== taskType), - { - key: taskType, - doc_count: Object.values(status).reduce((sum, count) => sum + count, 0), - status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: Object.entries(status).map(([key, count]) => ({ - key, - 
doc_count: count, - })), + idleTasks: { + doc_count: 13, + overdue: { + doc_count: 6, }, - }, - ]; - return ({ - task: { - doc_count: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), - taskType: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets, + scheduleDensity: { + buckets: [ + mockHistogram(Date.now(), Date.now() + 7 * 3000, Date.now() + 60000, 3000, [ + 2, + 2, + 5, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + ]), + ], }, }, - } as unknown) as AggregationResult; - } + }, + } as unknown) as AggregationSearchResult; test('returns a summary of the workload by task type', async () => { const taskManager = taskManagerMock.create(); taskManager.aggregate.mockResolvedValue(mockAggregatedResult); - const workloadAggregator = createWorkloadAggregator(taskManager, 10, mockLogger()); + const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { @@ -177,6 +216,45 @@ describe('Workload Statistics Aggregator', () => { }); }); + test('returns a count of the overdue workload', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + + const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + overdue: 6, + }); + resolve(); + }); + }); + }); + + test('returns a histogram of the upcoming workload', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + + const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + 
expect(result.value).toMatchObject({ + // we have intervals every 3s, so we aggregate buckets 3s apart + // in this mock, Elasticsearch found tasks scheduled in 21 (8th bucket), 24, 27 and 48s seconds from now + // 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57 + // [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0 ] + // Above you see each bucket and the number of scheduled tasks we expect to have in them + scheduleDensity: [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0], + }); + resolve(); + }); + }); + }); + test('recovers from errors fetching the workload', async () => { const taskManager = taskManagerMock.create(); taskManager.aggregate @@ -193,9 +271,9 @@ describe('Workload Statistics Aggregator', () => { }) ); const logger = mockLogger(); - const workloadAggregator = createWorkloadAggregator(taskManager, 10, logger); + const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, logger); - return new Promise((resolve) => { + return new Promise((resolve, reject) => { workloadAggregator.pipe(take(2), bufferCount(2)).subscribe((results) => { expect(results[0].key).toEqual('workload'); expect(results[0].value).toMatchObject({ @@ -216,7 +294,201 @@ describe('Workload Statistics Aggregator', () => { }, }); resolve(); - }); + }, reject); }); }); }); + +describe('padBuckets', () => { + test('returns zeroed out bucklets when there are no buckets in the histogram', async () => { + expect( + padBuckets(10, 3000, { + key: '2020-10-02T19:47:28.128Z-2020-10-02T19:48:28.128Z', + from: 1601668048128, + from_as_string: '2020-10-02T19:47:28.128Z', + to: 1601668108128, + to_as_string: '2020-10-02T19:48:28.128Z', + doc_count: 0, + histogram: { + buckets: [], + }, + }) + ).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + }); + + test('pads buckets with zeros to fill out the entire period of time after detected buckets', async () => { + expect( + padBuckets(10, 3000, { + key: 
'2020-10-02T19:47:28.128Z-2020-10-02T19:48:28.128Z', + from: 1601668048128, + from_as_string: '2020-10-02T19:47:28.128Z', + to: 1601668077128, + to_as_string: '2020-10-02T19:47:57.128Z', + doc_count: 3, + histogram: { + buckets: [ + { + key_as_string: '2020-10-02T19:47:27.000Z', + key: 1601668047000, + doc_count: 1, + }, + { + key_as_string: '2020-10-02T19:47:30.000Z', + key: 1601668050000, + doc_count: 1, + }, + { + key_as_string: '2020-10-02T19:47:33.000Z', + key: 1601668053000, + doc_count: 0, + }, + { + key_as_string: '2020-10-02T19:47:36.000Z', + key: 1601668056000, + doc_count: 0, + }, + { + key_as_string: '2020-10-02T19:47:39.000Z', + key: 1601668059000, + doc_count: 0, + }, + { + key_as_string: '2020-10-02T19:47:42.000Z', + key: 1601668062000, + doc_count: 1, + }, + ], + }, + }) + ).toEqual([1, 1, 0, 0, 0, 1, 0, 0, 0, 0]); + }); + + test('pads buckets with zeros to fill out the entire period of time before detected buckets', async () => { + expect( + padBuckets(10, 3000, { + key: '2020-10-02T20:39:45.793Z-2020-10-02T20:40:14.793Z', + from: 1.601671185793e12, + from_as_string: '2020-10-02T20:39:45.793Z', + to: 1.601671214793e12, + to_as_string: '2020-10-02T20:40:14.793Z', + doc_count: 2, + histogram: { + buckets: [ + { + key_as_string: '2020-10-02T20:40:09.000Z', + key: 1601671209000, + doc_count: 1, + }, + { + key_as_string: '2020-10-02T20:40:12.000Z', + key: 1601671212000, + doc_count: 1, + }, + ], + }, + }) + ).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1]); + }); + + test('pads buckets with zeros to fill out the entire period surounding the detected buckets', async () => { + expect( + padBuckets(20, 3000, { + key: '2020-10-02T20:39:45.793Z-2020-10-02T20:40:14.793Z', + from: 1.601671185793e12, + from_as_string: '2020-10-02T20:39:45.793Z', + to: 1.1601671244793, + to_as_string: '2020-10-02T20:40:44.793Z', + doc_count: 2, + histogram: { + buckets: [ + { + key_as_string: '2020-10-02T20:40:09.000Z', + key: 1601671209000, + doc_count: 1, + }, + { + key_as_string: 
'2020-10-02T20:40:12.000Z', + key: 1601671212000, + doc_count: 1, + }, + ], + }, + }) + ).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + }); +}); + +function setTaskTypeCount( + { aggregations: { taskType: taskTypeAgg, ...otherAggs } }: AggregationSearchResult, + taskType: string, + status: Record +) { + const buckets = [ + ...(taskTypeAgg.buckets as KeyedAggregationBucket[]).filter(({ key }) => key !== taskType), + { + key: taskType, + doc_count: Object.values(status).reduce((sum, count) => sum + count, 0), + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: Object.entries(status).map(([key, count]) => ({ + key, + doc_count: count, + })), + }, + }, + ]; + return ({ + sum: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), + aggregations: { + taskType: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets, + }, + ...otherAggs, + }, + } as unknown) as AggregationSearchResult; +} + +/** * + * This creates a mock histogram as returned by Elasticsearch + * + * @param from lower bound of query + * @param findFrom the timestamp (key) of the first bucket returned + * @param to upper bound of query + * @param interval the duration that each bucket corresponds to + * @param foundBuckets the buckets identified by ES, any buckets missing before or after which + * are still in the date range are assumed to have 0 results, ES only returns 0 for + * buckets that sit in between buckets which do have results + */ +function mockHistogram( + from: number, + findFrom: number, + to: number, + interval: number, + foundBuckets: Array +) { + const fromDate = new Date(from); + const toDate = new Date(to); + return { + from, + from_as_string: fromDate.toISOString(), + to, + to_as_string: toDate.toISOString(), + doc_count: foundBuckets.reduce((sum: number, count) => sum + (count ?? 
0), 0), + histogram: { + buckets: foundBuckets.reduce((histogramBuckets, count, index) => { + if (typeof count === 'number') { + const key = new Date(findFrom + index * interval); + histogramBuckets.push({ + key_as_string: key.toISOString(), + key: key.getTime(), + doc_count: count, + }); + } + return histogramBuckets; + }, [] as KeyedAggregationBucket[]), + }, + }; +} diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 669e6af16ea0e..061f1a9399bce 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -12,11 +12,18 @@ import { keyBy, mapValues } from 'lodash'; import { AggregatedStatProvider } from './runtime_statistics_aggregator'; import { TaskManager } from '../task_manager'; import { - AggregationResult, + AggregationSearchResult, AggregationBucketWithSubAgg, - AggregationBucket, + isBucketedAggregation, + isAggregationBucket, + isKeyedBuckets, + isBucketsWithNumericKey, + aggregationBucketsByKey, + KeyedAggregationBucket, + getStringKeyOfBucket, + RangeAggregationBucket, } from '../queries/aggregation_clauses'; -import { parseIntervalAsSecond } from '../lib/intervals'; +import { parseIntervalAsSecond, asInterval } from '../lib/intervals'; interface StatusStat extends JsonObject { [status: string]: number; @@ -37,8 +44,16 @@ export interface WorkloadStat extends JsonObject { export function createWorkloadAggregator( taskManager: TaskManager, refreshInterval: number, + pollInterval: number, logger: Logger ): AggregatedStatProvider { + // calculate scheduleDensity going two refreshIntervals or 1 minute into the future + // (the longer of the two) + const scheduleDensityBuckets = Math.max( + Math.round(60000 / pollInterval), + Math.round((refreshInterval * 2) / pollInterval) + ); + return timer(0, refreshInterval).pipe( concatMap(() => 
taskManager.aggregate({ @@ -54,39 +69,104 @@ export function createWorkloadAggregator( schedule: { terms: { field: 'task.schedule.interval' }, }, + idleTasks: { + filter: { + term: { 'task.status': 'idle' }, + }, + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [ + { from: `now`, to: `now+${asInterval(scheduleDensityBuckets * pollInterval)}` }, + ], + }, + aggs: { + histogram: { + date_histogram: { + field: 'task.runAt', + fixed_interval: asInterval(pollInterval), + }, + }, + }, + }, + overdue: { + filter: { + range: { + 'task.runAt': { lt: 'now' }, + }, + }, + }, + }, + }, }, }) ), - map(({ task }: AggregationResult<'task' | 'taskType' | 'schedule' | 'status'>) => { - const { - doc_count: sum = 0, - taskType: { buckets: taskTypes = [] } = {}, - schedule: { buckets: schedules = [] } = {}, - } = task; - const summary: WorkloadStat = { + map( + ({ + aggregations, sum, - taskTypes: mapValues( - keyBy>( - taskTypes as Array>, - 'key' - ), - ({ doc_count: docCount, status }) => ({ - sum: docCount, - status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'), - }) - ), - schedule: (schedules as AggregationBucket[]) - .sort( - ({ key: scheduleLeft }, { key: scheduleRight }) => - parseIntervalAsSecond(scheduleLeft) - parseIntervalAsSecond(scheduleRight) + }: AggregationSearchResult< + | 'taskType' + | 'schedule' + | 'status' + | 'scheduleDensity' + | 'histogram' + | 'overdue' + | 'idleTasks' + >) => { + if ( + !isBucketedAggregation(aggregations.taskType) || + !isBucketedAggregation(aggregations.schedule) || + !( + !isBucketedAggregation(aggregations.idleTasks) && + isAggregationBucket(aggregations.idleTasks.overdue) && + isBucketedAggregation(aggregations.idleTasks.scheduleDensity) && + !isKeyedBuckets(aggregations.idleTasks.scheduleDensity.buckets) ) - .map(({ key: schedule, doc_count: count }) => [schedule, count]), - }; - return { - key: 'workload', - value: summary, - }; - }), + ) { + throw new Error(`Invalid workload: ${JSON.stringify({ 
aggregations, sum })}`); + } + + const { + taskType: { buckets: taskTypes = [] } = {}, + schedule: { buckets: schedules = [] } = {}, + idleTasks: { + overdue: { doc_count: overdue } = { doc_count: 0 }, + scheduleDensity: { buckets: [scheduleDensity] = [] } = {}, + } = {}, + } = aggregations; + + const summary: WorkloadStat = { + sum, + taskTypes: mapValues( + keyBy>( + taskTypes as Array>, + 'key' + ), + ({ doc_count: docCount, status }) => { + return { + sum: docCount, + status: mapValues(aggregationBucketsByKey(status), 'doc_count'), + }; + } + ), + schedule: (schedules as KeyedAggregationBucket[]) + .sort( + (scheduleLeft, scheduleRight) => + parseIntervalAsSecond(getStringKeyOfBucket(scheduleLeft)) - + parseIntervalAsSecond(getStringKeyOfBucket(scheduleRight)) + ) + .map((schedule) => [getStringKeyOfBucket(schedule), schedule.doc_count]), + overdue, + scheduleDensity: padBuckets(scheduleDensityBuckets, pollInterval, scheduleDensity), + }; + return { + key: 'workload', + value: summary, + }; + } + ), catchError((ex: Error, caught) => { logger.error(`[WorkloadAggregator]: ${ex}`); // continue to pull values from the same observable @@ -94,3 +174,44 @@ export function createWorkloadAggregator( }) ); } + +export function padBuckets( + scheduleDensityBuckets: number, + pollInterval: number, + scheduleDensity: unknown +): number[] { + const { histogram, doc_count: docCount, from } = scheduleDensity as AggregationBucketWithSubAgg< + 'histogram', + RangeAggregationBucket + >; + + if ( + docCount && + histogram && + !isKeyedBuckets(histogram.buckets) && + isBucketsWithNumericKey(histogram.buckets) + ) { + const firstBucket = histogram.buckets[0].key; + const bucketsToPadBeforeFirstBucket = bucketsBetween(from, firstBucket, pollInterval); + const bucketsToPadAfterLast = + scheduleDensityBuckets - (bucketsToPadBeforeFirstBucket + histogram.buckets.length); + return [ + ...(bucketsToPadBeforeFirstBucket > 0 + ? 
new Array(bucketsToPadBeforeFirstBucket).fill(0) + : []), + ...histogram.buckets.map((bucket, index) => bucket.doc_count), + ...(bucketsToPadAfterLast > 0 ? new Array(bucketsToPadAfterLast).fill(0) : []), + ]; + } + return new Array(scheduleDensityBuckets).fill(0); +} + +function bucketsBetween(from: number, to: number, interval: number) { + let fromBound = from; + let count = 0; + while (fromBound <= to) { + fromBound += interval; + count++; + } + return count; +} diff --git a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts index 84cd9d6ae2b5e..04e5bd9f89eed 100644 --- a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts +++ b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts @@ -4,7 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ -import { TermFilter } from './query_clauses'; +import { keyBy } from 'lodash'; +import { TermFilter, RangeFilter } from './query_clauses'; /** * Terminology @@ -30,7 +31,25 @@ import { TermFilter } from './query_clauses'; * "terms": { "field": "task.status" } * } * } - * } + * }, + * "scheduleDensity": { + * "range": { (3) + * "field": "task.runAt", + * "keyed": true, + * "ranges": [ + * { "key": "overdue", "from": "now-1m", "to": "now" }, + * { "key": "upcoming", "from": "now+1s", "to": "now+1m" } + * ] + * }, + * "aggs": { + * "histogram": { (4) + * "date_histogram": { + * "field": "task.runAt", + * "fixed_interval": "3s" + * } + * } + * } + * } * } * } * } @@ -39,21 +58,48 @@ import { TermFilter } from './query_clauses'; * These are referred to as: * (1). AggregationQuery * (2). TermAggregation + * (3). RangeAggregation + * (4). 
HistogramAggregation * */ export interface AggregationQuery { - [aggregationName: string]: (TermAggregation | { aggs: AggregationQuery }) & { - filter?: TermFilter; - }; + [aggregationName: string]: TypedAggregation & { aggs?: AggregationQuery }; } +type TypedAggregation = + | TermAggregation + | FilterAggregation + | RangeAggregation + | RangeAggregation + | HistogramAggregation; + interface TermAggregation { terms: { field: string; }; } +interface FilterAggregation { + filter: TermFilter | RangeFilter; +} + +interface RangeAggregation { + range: { + field: string; + keyed?: boolean; + ranges: Array<{ key?: string; from?: string; to?: string }>; + }; +} + +interface HistogramAggregation { + date_histogram: { + field: string; + fixed_interval: string; + keyed?: boolean; + }; +} + /** * Results of an Aggregation */ @@ -66,19 +112,108 @@ export type Aggregation = { }; export interface AggregationBucket { - key: string; doc_count: number; } -export type AggregationBucketWithSubAgg = AggregationBucket & +export function isAggregationBucket(bucket: unknown): bucket is AggregationBucket { + return typeof (bucket as AggregationBucket)?.doc_count === 'number'; +} + +export function isBucketsWithNumericKey( + buckets: AggregationBuckets['buckets'] +): buckets is Array< + AggregationBucket & { + key_as_string: string; + key: number; + } +> { + return ( + !isKeyedBuckets(buckets) && typeof (buckets[0] as KeyedAggregationBucket)?.key === 'number' + ); +} + +export type KeyedAggregationBucket = AggregationBucket & + ( + | { + key: string; + } + | { + key_as_string: string; + key: number; + } + ); + +export function getStringKeyOfBucket(bucket: KeyedAggregationBucket) { + return typeof bucket.key === 'string' + ? 
bucket.key + : (bucket as { + key_as_string: string; + }).key_as_string; +} + +export interface RangeAggregationBucket { + from: number; + to: number; + doc_count: number; +} + +export type KeyedRangeAggregationBucket = RangeAggregationBucket & { + key: string; +}; + +export function isRangeAggregationBucket(bucket: TypedBucket): bucket is RangeAggregationBucket { + return ( + typeof (bucket as RangeAggregationBucket).to !== 'number' || + typeof (bucket as RangeAggregationBucket).from !== 'number' + ); +} + +type TypedBucket = AggregationBucket | RangeAggregationBucket; +type KeyedTypedBucket = KeyedAggregationBucket | KeyedRangeAggregationBucket; + +export type AggregationBucketWithSubAgg< + Name extends AggregationNames, + AggType extends TypedBucket = TypedBucket +> = AggType & { [innerAggregation in Name]: AggregationBuckets; }; +export type KeyedBuckets = Record< + Name, + TypedBucket | AggregationBucketWithSubAgg +>; + export interface AggregationBuckets { - buckets: AggregationBucket[] | Array>; + buckets: KeyedTypedBucket[] | Array> | KeyedBuckets; +} + +export function isKeyedBuckets( + buckets: AggregationBuckets['buckets'] +): buckets is KeyedBuckets { + return !Array.isArray(buckets); +} + +export function aggregationBucketsByKey({ + buckets, +}: AggregationBuckets): KeyedBuckets { + if (isKeyedBuckets(buckets)) { + return buckets; + } + return keyBy(buckets, 'key') as KeyedBuckets; } export type AggregationResult = { - [aggregationName in Name]: Aggregation; + [aggregationName in Name]: Aggregation | AggregationBuckets; }; + +export function isBucketedAggregation( + aggregation: Aggregation | AggregationBuckets +): aggregation is AggregationBuckets { + return aggregation && Array.isArray((aggregation as AggregationBuckets).buckets); +} + +export interface AggregationSearchResult { + sum: number; + aggregations: AggregationResult; +} diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts 
index f9fcb2d567393..af6c02a60576f 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -69,7 +69,7 @@ import { import { identifyEsError } from './lib/identify_es_error'; import { ensureDeprecatedFieldsAreCorrected } from './lib/correct_deprecated_fields'; import { BufferedTaskStore } from './buffered_task_store'; -import { AggregationResult } from './queries/aggregation_clauses'; +import { AggregationSearchResult } from './queries/aggregation_clauses'; const VERSION_CONFLICT_STATUS = 409; @@ -401,7 +401,7 @@ export class TaskManager { */ public async aggregate( opts: AggregationOpts - ): Promise> { + ): Promise> { await this.waitUntilStarted(); return this.store.aggregate(opts); } diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index 17523ee9efb6e..804ed16569694 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -60,7 +60,11 @@ import { SortByRunAtAndRetryAt, tasksClaimedByOwner, } from './queries/mark_available_tasks_as_claimed'; -import { AggregationQuery, AggregationResult } from './queries/aggregation_clauses'; +import { + AggregationQuery, + AggregationSearchResult, + AggregationResult, +} from './queries/aggregation_clauses'; export interface StoreOpts { callCluster: ElasticJs; @@ -82,6 +86,7 @@ export interface SearchOpts { export interface AggregationOpts { aggs: AggregationQuery; + query?: object; size?: number; } @@ -467,17 +472,25 @@ export class TaskStore { public async aggregate({ aggs, size = 0, - }: AggregationOpts): Promise> { - const result = await this.callCluster('search', { + }: AggregationOpts): Promise> { + const { + aggregations, + hits: { + total: { value: sum }, + }, + } = (await this.callCluster('search', { index: this.index, ignoreUnavailable: true, - body: { - aggs: ensureAggregationOnlyReturnsTaskObjects(aggs), + body: 
ensureAggregationOnlyReturnsTaskObjects({ + aggs, size, - }, - }); + }), + })) as { + aggregations: AggregationResult; + hits: { total: { value: number } }; + }; - return (result as { aggregations: AggregationResult }).aggregations; + return { aggregations, sum }; } private async updateByQuery( @@ -559,20 +572,20 @@ function ensureQueryOnlyReturnsTaskObjects(opts: SearchOpts): SearchOpts { }; } -function ensureAggregationOnlyReturnsTaskObjects( - aggs: AggregationOpts['aggs'] -): AggregationOpts['aggs'] { - const filteredAgg: AggregationQuery = { - task: { - filter: { - term: { - type: 'task', - }, - }, - aggs, +function ensureAggregationOnlyReturnsTaskObjects(opts: AggregationOpts): AggregationOpts { + const originalQuery = opts.query; + const filterToOnlyTasks = { + bool: { + filter: [{ term: { type: 'task' } }], }, }; - return filteredAgg; + const query = originalQuery + ? { bool: { must: [filterToOnlyTasks, originalQuery] } } + : filterToOnlyTasks; + return { + ...opts, + query, + }; } function isSavedObjectsUpdateResponse( From 62ceba327ecbb1dcf80008ebef4330189506bfa3 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 09:37:26 +0100 Subject: [PATCH 20/42] added schedule density to test --- .../monitoring/workload_statistics.test.ts | 30 +++++++++---------- .../server/monitoring/workload_statistics.ts | 14 +++++---- .../server/queries/aggregation_clauses.ts | 2 +- .../plugins/task_manager/server/task_store.ts | 4 +-- .../test_suites/task_manager/health_route.ts | 28 ++++++++++++++--- 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index a95b8d96117da..afc9d200d1147 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -14,7 +14,7 @@ describe('Workload Statistics 
Aggregator', () => { test('queries the Task Store at a fixed interval for the current workload', async () => { const taskManager = taskManagerMock.create(); taskManager.aggregate.mockResolvedValue(({ - sum: 0, + count: 0, aggregations: { taskType: { buckets: [], @@ -101,7 +101,7 @@ describe('Workload Statistics Aggregator', () => { }); const mockAggregatedResult = ({ - sum: 4, + count: 4, aggregations: { schedule: { doc_count_error_upper_bound: 0, @@ -204,11 +204,11 @@ describe('Workload Statistics Aggregator', () => { workloadAggregator.pipe(first()).subscribe((result) => { expect(result.key).toEqual('workload'); expect(result.value).toMatchObject({ - sum: 4, + count: 4, taskTypes: { - actions_telemetry: { sum: 2, status: { idle: 2 } }, - alerting_telemetry: { sum: 1, status: { idle: 1 } }, - session_cleanup: { sum: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, status: { idle: 2 } }, + alerting_telemetry: { count: 1, status: { idle: 1 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, }, }); resolve(); @@ -277,20 +277,20 @@ describe('Workload Statistics Aggregator', () => { workloadAggregator.pipe(take(2), bufferCount(2)).subscribe((results) => { expect(results[0].key).toEqual('workload'); expect(results[0].value).toMatchObject({ - sum: 5, + count: 5, taskTypes: { - actions_telemetry: { sum: 2, status: { idle: 2 } }, - alerting_telemetry: { sum: 2, status: { idle: 2 } }, - session_cleanup: { sum: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, status: { idle: 2 } }, + alerting_telemetry: { count: 2, status: { idle: 2 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, }, }); expect(results[1].key).toEqual('workload'); expect(results[1].value).toMatchObject({ - sum: 5, + count: 5, taskTypes: { - actions_telemetry: { sum: 2, status: { idle: 2 } }, - alerting_telemetry: { sum: 2, status: { idle: 1, failed: 1 } }, - session_cleanup: { sum: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, status: { idle: 2 } }, + 
alerting_telemetry: { count: 2, status: { idle: 1, failed: 1 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, }, }); resolve(); @@ -439,7 +439,7 @@ function setTaskTypeCount( }, ]; return ({ - sum: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), + count: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), aggregations: { taskType: { doc_count_error_upper_bound: 0, diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 061f1a9399bce..f050ab94b8fec 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -30,15 +30,17 @@ interface StatusStat extends JsonObject { } interface TaskTypeStat extends JsonObject { [taskType: string]: { - sum: number; + count: number; status: StatusStat; }; } export interface WorkloadStat extends JsonObject { - sum: number; + count: number; taskTypes: TaskTypeStat; schedule: Array<[string, number]>; + overdue: number; + scheduleDensity: number[]; } export function createWorkloadAggregator( @@ -105,7 +107,7 @@ export function createWorkloadAggregator( map( ({ aggregations, - sum, + count, }: AggregationSearchResult< | 'taskType' | 'schedule' @@ -125,7 +127,7 @@ export function createWorkloadAggregator( !isKeyedBuckets(aggregations.idleTasks.scheduleDensity.buckets) ) ) { - throw new Error(`Invalid workload: ${JSON.stringify({ aggregations, sum })}`); + throw new Error(`Invalid workload: ${JSON.stringify({ aggregations, count })}`); } const { @@ -138,7 +140,7 @@ export function createWorkloadAggregator( } = aggregations; const summary: WorkloadStat = { - sum, + count, taskTypes: mapValues( keyBy>( taskTypes as Array>, @@ -146,7 +148,7 @@ export function createWorkloadAggregator( ), ({ doc_count: docCount, status }) => { return { - sum: docCount, + count: docCount, status: 
mapValues(aggregationBucketsByKey(status), 'doc_count'), }; } diff --git a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts index 04e5bd9f89eed..805be0b148b7f 100644 --- a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts +++ b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts @@ -214,6 +214,6 @@ export function isBucketedAggregation( } export interface AggregationSearchResult { - sum: number; + count: number; aggregations: AggregationResult; } diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index 804ed16569694..ea3aa7170c86c 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -476,7 +476,7 @@ export class TaskStore { const { aggregations, hits: { - total: { value: sum }, + total: { value: count }, }, } = (await this.callCluster('search', { index: this.index, @@ -490,7 +490,7 @@ export class TaskStore { hits: { total: { value: number } }; }; - return { aggregations, sum }; + return { aggregations, count }; } private async updateByQuery( diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index eb00f81bea629..188cce9e0cc6c 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -20,7 +20,13 @@ interface MonitoringStats { }; workload: { timestamp: string; - value: Record; + value: { + count: number; + taskTypes: Record; + schedule: Array<[string, number]>; + overdue: number; + scheduleDensity: number[]; + }; }; runtime: { timestamp: string; @@ -83,8 +89,8 @@ export default function ({ getService }: FtrProviderContext) { const { workload } = (await getHealth()).stats; const 
sumSampleTaskInWorkload = (workload.value.taskTypes as { - sampleTask?: { sum: number }; - }).sampleTask?.sum ?? 0; + sampleTask?: { count: number }; + }).sampleTask?.count ?? 0; const scheduledWorkload = (mapValues( keyBy(workload.value.schedule as Array<[string, number]>, ([interval, count]) => interval), ([, count]) => count @@ -107,7 +113,7 @@ export default function ({ getService }: FtrProviderContext) { const workloadAfterScheduling = (await getHealth()).stats.workload.value; expect( - (workloadAfterScheduling.taskTypes as { sampleTask: { sum: number } }).sampleTask.sum + (workloadAfterScheduling.taskTypes as { sampleTask: { count: number } }).sampleTask.count ).to.eql(sumSampleTaskInWorkload + 2); const schedulesWorkloadAfterScheduling = (mapValues( @@ -125,6 +131,20 @@ export default function ({ getService }: FtrProviderContext) { }); }); + it('should return a breakdown of idleTasks in the task manager workload', async () => { + const { + workload: { value: workload }, + } = (await getHealth()).stats; + + expect(typeof workload.overdue).to.eql('number'); + + expect(Array.isArray(workload.scheduleDensity)).to.eql(true); + + // test run with the default poll_interval of 3s and a monitored_aggregated_stats_refresh_rate of 5s, + // so we expect the scheduleDensity to span a minute (which means 20 buckets, as 60s / 3s = 20) + expect(workload.scheduleDensity.length).to.eql(20); + }); + it('should return the task manager runtime stats', async () => { await scheduleTask({ taskType: 'sampleTask', From 1cc826041ba13355404bbb3c4e3338ab9b970ed4 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 09:54:14 +0100 Subject: [PATCH 21/42] added upper bound to schedule density --- .../monitoring/workload_statistics.test.ts | 87 ++++++++++++++++++- .../server/monitoring/workload_statistics.ts | 9 +- 2 files changed, 92 insertions(+), 4 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts 
b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index afc9d200d1147..a7d22c3f5c9db 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -233,7 +233,7 @@ describe('Workload Statistics Aggregator', () => { }); }); - test('returns a histogram of the upcoming workload', async () => { + test('returns a histogram of the upcoming workload for the upcoming minute when refresh rate is high', async () => { const taskManager = taskManagerMock.create(); taskManager.aggregate.mockResolvedValue(mockAggregatedResult); @@ -255,6 +255,91 @@ describe('Workload Statistics Aggregator', () => { }); }); + test('returns a histogram of the upcoming workload for twice refresh rate when rate is low', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + + const workloadAggregator = createWorkloadAggregator(taskManager, 60 * 1000, 3000, mockLogger()); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + // same schedule density as in previous test, but window of 40 buckets ((60s refresh * 2) / 3s = 40) + scheduleDensity: [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2, + 2, + 5, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + ...new Array(20).fill(0), + ], + }); + resolve(); + }); + }); + }); + + test('returns a histogram of the upcoming workload maxed out at 50 buckets when rate is too low', async () => { + const taskManager = taskManagerMock.create(); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + + const workloadAggregator = createWorkloadAggregator( + taskManager, + 15 * 60 * 1000, + 3000, + mockLogger() + ); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + 
expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + // same schedule density as in previous test, but window of 40 buckets ((60s refresh * 2) / 3s = 40) + scheduleDensity: [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2, + 2, + 5, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + ...new Array(30).fill(0), + ], + }); + resolve(); + }); + }); + }); + test('recovers from errors fetching the workload', async () => { const taskManager = taskManagerMock.create(); taskManager.aggregate diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index f050ab94b8fec..4840ca17f1462 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -43,6 +43,9 @@ export interface WorkloadStat extends JsonObject { scheduleDensity: number[]; } +// Set an upper bound just in case a customer sets a really high refresh rate +const MAX_SHCEDULE_DENSITY_BUCKETS = 50; + export function createWorkloadAggregator( taskManager: TaskManager, refreshInterval: number, @@ -51,9 +54,9 @@ export function createWorkloadAggregator( ): AggregatedStatProvider { // calculate scheduleDensity going two refreshIntervals or 1 minute into into the future // (the longer of the two) - const scheduleDensityBuckets = Math.max( - Math.round(60000 / pollInterval), - Math.round((refreshInterval * 2) / pollInterval) + const scheduleDensityBuckets = Math.min( + Math.max(Math.round(60000 / pollInterval), Math.round((refreshInterval * 2) / pollInterval)), + MAX_SHCEDULE_DENSITY_BUCKETS ); return timer(0, refreshInterval).pipe( From 2dca67f4ca44bd0c90507ee6e96d14c40307d06d Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 12:47:53 +0100 Subject: [PATCH 22/42] use APM agg types --- .../apm/typings/elasticsearch/aggregations.ts | 21 ++ .../monitoring/workload_statistics.test.ts | 111 
++++++--- .../server/monitoring/workload_statistics.ts | 202 +++++++++------- .../server/queries/aggregation_clauses.ts | 219 ------------------ .../task_manager/server/task_manager.ts | 8 +- .../plugins/task_manager/server/task_store.ts | 34 +-- 6 files changed, 222 insertions(+), 373 deletions(-) delete mode 100644 x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts diff --git a/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts b/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts index 534321201938d..a25782cedc3c0 100644 --- a/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts +++ b/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts @@ -145,6 +145,15 @@ export interface AggregationOptionsByType { >; keyed?: boolean; } & AggregationSourceOptions; + range: { + field: string; + ranges: Array< + | { key?: string; from: string | number } + | { key?: string; to: string | number } + | { key?: string; from: string | number; to: string | number } + >; + keyed?: boolean; + }; auto_date_histogram: { buckets: number; } & AggregationSourceOptions; @@ -319,6 +328,18 @@ interface AggregationResponsePart< ? Record : { buckets: DateRangeBucket[] }; }; + range: { + buckets: TAggregationOptionsMap extends { range: { keyed: true } } + ? 
Record< + string, + DateRangeBucket & + SubAggregationResponseOf + > + : Array< + DateRangeBucket & + SubAggregationResponseOf + >; + }; auto_date_histogram: { buckets: Array< DateHistogramBucket & diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index a7d22c3f5c9db..2bf4acad25d6c 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -5,16 +5,37 @@ */ import { first, take, bufferCount } from 'rxjs/operators'; -import { createWorkloadAggregator, padBuckets } from './workload_statistics'; +import { WorkloadAggregation, createWorkloadAggregator, padBuckets } from './workload_statistics'; import { taskManagerMock } from '../task_manager.mock'; -import { AggregationSearchResult, KeyedAggregationBucket } from '../queries/aggregation_clauses'; import { mockLogger } from '../test_utils'; +import { ConcreteTaskInstance } from '../task'; +import { ESSearchResponse } from '../../../apm/typings/elasticsearch'; +import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations'; + +type MockESResult = ESSearchResponse< + ConcreteTaskInstance, + { + body: WorkloadAggregation; + } +>; describe('Workload Statistics Aggregator', () => { test('queries the Task Store at a fixed interval for the current workload', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(({ - count: 0, + taskManager.aggregate.mockResolvedValue({ + hits: { + hits: [], + max_score: 0, + total: { value: 0, relation: 'eq' }, + }, + took: 1, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 1, + failed: 0, + }, aggregations: { taskType: { buckets: [], @@ -44,7 +65,7 @@ describe('Workload Statistics Aggregator', () => { }, }, }, - } as unknown) as AggregationSearchResult); + } as MockESResult); 
const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -100,12 +121,22 @@ describe('Workload Statistics Aggregator', () => { }); }); - const mockAggregatedResult = ({ - count: 4, + const mockAggregatedResult: MockESResult = { + hits: { + hits: [], + max_score: 0, + total: { value: 4, relation: 'eq' }, + }, + took: 1, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 1, + failed: 0, + }, aggregations: { schedule: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, buckets: [ { key: '3600s', @@ -122,15 +153,11 @@ describe('Workload Statistics Aggregator', () => { ], }, taskType: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, buckets: [ { key: 'actions_telemetry', doc_count: 2, status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, buckets: [ { key: 'idle', @@ -143,8 +170,6 @@ describe('Workload Statistics Aggregator', () => { key: 'alerting_telemetry', doc_count: 1, status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, buckets: [ { key: 'idle', @@ -157,8 +182,6 @@ describe('Workload Statistics Aggregator', () => { key: 'session_cleanup', doc_count: 1, status: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, buckets: [ { key: 'idle', @@ -192,11 +215,11 @@ describe('Workload Statistics Aggregator', () => { }, }, }, - } as unknown) as AggregationSearchResult; + }; test('returns a summary of the workload by task type', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -218,7 +241,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a count of the overdue workload', async () => { const taskManager = taskManagerMock.create(); - 
taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -235,7 +258,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a histogram of the upcoming workload for the upcoming minute when refresh rate is high', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -257,7 +280,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a histogram of the upcoming workload for twice refresh rate when rate is low', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); const workloadAggregator = createWorkloadAggregator(taskManager, 60 * 1000, 3000, mockLogger()); @@ -297,7 +320,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a histogram of the upcoming workload maxed out at 50 buckets when rate is too low', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); const workloadAggregator = createWorkloadAggregator( taskManager, @@ -504,12 +527,16 @@ describe('padBuckets', () => { }); function setTaskTypeCount( - { aggregations: { taskType: taskTypeAgg, ...otherAggs } }: AggregationSearchResult, + { aggregations }: MockESResult, taskType: string, status: Record ) { + const taskTypes = aggregations!.taskType as AggregationResultOf< + WorkloadAggregation['aggs']['taskType'], + {} + >; const buckets = [ - 
...(taskTypeAgg.buckets as KeyedAggregationBucket[]).filter(({ key }) => key !== taskType), + ...taskTypes.buckets.filter(({ key }) => key !== taskType), { key: taskType, doc_count: Object.values(status).reduce((sum, count) => sum + count, 0), @@ -526,14 +553,14 @@ function setTaskTypeCount( return ({ count: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), aggregations: { + ...aggregations, taskType: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets, }, - ...otherAggs, }, - } as unknown) as AggregationSearchResult; + } as {}) as MockESResult; } /** * @@ -557,23 +584,31 @@ function mockHistogram( const fromDate = new Date(from); const toDate = new Date(to); return { + key: `${fromDate.toISOString()}-${toDate.toISOString()}`, from, from_as_string: fromDate.toISOString(), to, to_as_string: toDate.toISOString(), doc_count: foundBuckets.reduce((sum: number, count) => sum + (count ?? 0), 0), histogram: { - buckets: foundBuckets.reduce((histogramBuckets, count, index) => { - if (typeof count === 'number') { - const key = new Date(findFrom + index * interval); - histogramBuckets.push({ - key_as_string: key.toISOString(), - key: key.getTime(), - doc_count: count, - }); - } - return histogramBuckets; - }, [] as KeyedAggregationBucket[]), + buckets: foundBuckets.reduce( + (histogramBuckets, count, index) => { + if (typeof count === 'number') { + const key = new Date(findFrom + index * interval); + histogramBuckets.push({ + key_as_string: key.toISOString(), + key: key.getTime(), + doc_count: count, + }); + } + return histogramBuckets; + }, + [] as Array<{ + key_as_string: string; + key: number; + doc_count: number; + }> + ), }, }; } diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 4840ca17f1462..5fa2ef11c7a96 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ 
b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -9,21 +9,12 @@ import { concatMap, map, catchError } from 'rxjs/operators'; import { Logger } from 'src/core/server'; import { JsonObject } from 'src/plugins/kibana_utils/common'; import { keyBy, mapValues } from 'lodash'; +import { ESSearchResponse } from '../../../apm/typings/elasticsearch'; import { AggregatedStatProvider } from './runtime_statistics_aggregator'; import { TaskManager } from '../task_manager'; -import { - AggregationSearchResult, - AggregationBucketWithSubAgg, - isBucketedAggregation, - isAggregationBucket, - isKeyedBuckets, - isBucketsWithNumericKey, - aggregationBucketsByKey, - KeyedAggregationBucket, - getStringKeyOfBucket, - RangeAggregationBucket, -} from '../queries/aggregation_clauses'; +import { ConcreteTaskInstance } from '../task'; import { parseIntervalAsSecond, asInterval } from '../lib/intervals'; +import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations'; interface StatusStat extends JsonObject { [status: string]: number; @@ -43,6 +34,56 @@ export interface WorkloadStat extends JsonObject { scheduleDensity: number[]; } +export interface WorkloadAggregation { + aggs: { + taskType: { + terms: { field: string }; + aggs: { + status: { + terms: { field: string }; + }; + }; + }; + schedule: { + terms: { field: string }; + }; + idleTasks: { + filter: { + term: { 'task.status': string }; + }; + aggs: { + scheduleDensity: { + range: { + field: string; + ranges: [{ from: string; to: string }]; + }; + aggs: { + histogram: { + date_histogram: { + field: string; + fixed_interval: string; + }; + }; + }; + }; + overdue: { + filter: { + range: { + 'task.runAt': { lt: string }; + }; + }; + }; + }; + }; + }; +} + +// The type of a bucket in the scheduleDensity range aggregation +type ScheduleDensityResult = AggregationResultOf< + WorkloadAggregation['aggs']['idleTasks']['aggs']['scheduleDensity'], + {} +>['buckets'][0]; + // Set an upper bound just 
in case a customer sets a really high refresh rate const MAX_SHCEDULE_DENSITY_BUCKETS = 50; @@ -61,7 +102,7 @@ export function createWorkloadAggregator( return timer(0, refreshInterval).pipe( concatMap(() => - taskManager.aggregate({ + taskManager.aggregate({ aggs: { taskType: { terms: { field: 'task.taskType' }, @@ -107,71 +148,65 @@ export function createWorkloadAggregator( }, }) ), - map( - ({ + map((result: ESSearchResponse) => { + const { aggregations, - count, - }: AggregationSearchResult< - | 'taskType' - | 'schedule' - | 'status' - | 'scheduleDensity' - | 'histogram' - | 'overdue' - | 'idleTasks' - >) => { - if ( - !isBucketedAggregation(aggregations.taskType) || - !isBucketedAggregation(aggregations.schedule) || - !( - !isBucketedAggregation(aggregations.idleTasks) && - isAggregationBucket(aggregations.idleTasks.overdue) && - isBucketedAggregation(aggregations.idleTasks.scheduleDensity) && - !isKeyedBuckets(aggregations.idleTasks.scheduleDensity.buckets) - ) - ) { - throw new Error(`Invalid workload: ${JSON.stringify({ aggregations, count })}`); - } - - const { - taskType: { buckets: taskTypes = [] } = {}, - schedule: { buckets: schedules = [] } = {}, - idleTasks: { - overdue: { doc_count: overdue } = { doc_count: 0 }, - scheduleDensity: { buckets: [scheduleDensity] = [] } = {}, - } = {}, - } = aggregations; + hits: { + total: { value: count }, + }, + } = result; - const summary: WorkloadStat = { - count, - taskTypes: mapValues( - keyBy>( - taskTypes as Array>, - 'key' - ), - ({ doc_count: docCount, status }) => { - return { - count: docCount, - status: mapValues(aggregationBucketsByKey(status), 'doc_count'), - }; - } - ), - schedule: (schedules as KeyedAggregationBucket[]) - .sort( - (scheduleLeft, scheduleRight) => - parseIntervalAsSecond(getStringKeyOfBucket(scheduleLeft)) - - parseIntervalAsSecond(getStringKeyOfBucket(scheduleRight)) - ) - .map((schedule) => [getStringKeyOfBucket(schedule), schedule.doc_count]), - overdue, - scheduleDensity: 
padBuckets(scheduleDensityBuckets, pollInterval, scheduleDensity), - }; - return { - key: 'workload', - value: summary, - }; + if ( + !( + aggregations?.taskType && + aggregations?.schedule && + aggregations?.idleTasks?.overdue && + aggregations?.idleTasks?.scheduleDensity + ) + ) { + throw new Error(`Invalid workload: ${JSON.stringify({ aggregations, count })}`); } - ), + + const taskTypes = (aggregations.taskType as AggregationResultOf< + WorkloadAggregation['aggs']['taskType'], + {} + >).buckets; + const schedules = (aggregations.schedule as AggregationResultOf< + WorkloadAggregation['aggs']['schedule'], + {} + >).buckets; + + const { + overdue: { doc_count: overdue }, + scheduleDensity: { buckets: [scheduleDensity] = [] } = {}, + } = aggregations.idleTasks as AggregationResultOf< + WorkloadAggregation['aggs']['idleTasks'], + {} + >; + + const summary: WorkloadStat = { + count, + taskTypes: mapValues(keyBy(taskTypes, 'key'), ({ doc_count: docCount, status }) => { + return { + count: docCount, + status: mapValues(keyBy(status, 'key'), 'doc_count'), + }; + }), + schedule: schedules + .sort( + (scheduleLeft, scheduleRight) => + parseIntervalAsSecond(scheduleLeft.key as string) - + parseIntervalAsSecond(scheduleRight.key as string) + ) + .map((schedule) => [schedule.key as string, schedule.doc_count]), + overdue, + scheduleDensity: padBuckets(scheduleDensityBuckets, pollInterval, scheduleDensity), + }; + return { + key: 'workload', + value: summary, + }; + }), catchError((ex: Error, caught) => { logger.error(`[WorkloadAggregator]: ${ex}`); // continue to pull values from the same observable @@ -183,19 +218,10 @@ export function createWorkloadAggregator( export function padBuckets( scheduleDensityBuckets: number, pollInterval: number, - scheduleDensity: unknown + scheduleDensity: ScheduleDensityResult ): number[] { - const { histogram, doc_count: docCount, from } = scheduleDensity as AggregationBucketWithSubAgg< - 'histogram', - RangeAggregationBucket - >; - - if ( - 
docCount && - histogram && - !isKeyedBuckets(histogram.buckets) && - isBucketsWithNumericKey(histogram.buckets) - ) { + if (scheduleDensity.from && scheduleDensity.histogram?.buckets?.length) { + const { histogram, from } = scheduleDensity; const firstBucket = histogram.buckets[0].key; const bucketsToPadBeforeFirstBucket = bucketsBetween(from, firstBucket, pollInterval); const bucketsToPadAfterLast = @@ -204,7 +230,7 @@ export function padBuckets( ...(bucketsToPadBeforeFirstBucket > 0 ? new Array(bucketsToPadBeforeFirstBucket).fill(0) : []), - ...histogram.buckets.map((bucket, index) => bucket.doc_count), + ...histogram.buckets.map((bucket) => bucket.doc_count), ...(bucketsToPadAfterLast > 0 ? new Array(bucketsToPadAfterLast).fill(0) : []), ]; } diff --git a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts b/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts deleted file mode 100644 index 805be0b148b7f..0000000000000 --- a/x-pack/plugins/task_manager/server/queries/aggregation_clauses.ts +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -import { keyBy } from 'lodash'; -import { TermFilter, RangeFilter } from './query_clauses'; - -/** - * Terminology - * =========== - * The terms for the different clauses in an Elasticsearch query aggregation can be confusing, here are some - * clarifications that might help you understand the Typescript types we use here. 
- * - * Given the following Aggregation: - * { - * "size": 0, - * "aggs": { (1) - * "task": { - * "filter": { - * "term": { - * "type": "task" - * } - * }, - * "aggs": { (1) - * "taskType": { (2) - * "terms": { "field": "task.taskType" }, - * "aggs": { - * "status": { (2) - * "terms": { "field": "task.status" } - * } - * } - * }, - * "scheduleDensity": { - * "range": { (3) - * "field": "task.runAt", - * "keyed": true, - * "ranges": [ - * { "key": "overdue", "from": "now-1m", "to": "now" }, - * { "key": "upcoming", "from": "now+1s", "to": "now+1m" } - * ] - * }, - * "aggs": { - * "histogram": { (4) - * "date_histogram": { - * "field": "task.runAt", - * "fixed_interval": "3s" - * } - * } - * } - * } - * } - * } - * } - * } - * - * These are referred to as: - * (1). AggregationQuery - * (2). TermAggregation - * (3). RangeAggregation - * (4). HistogramAggregation - * - */ - -export interface AggregationQuery { - [aggregationName: string]: TypedAggregation & { aggs?: AggregationQuery }; -} - -type TypedAggregation = - | TermAggregation - | FilterAggregation - | RangeAggregation - | RangeAggregation - | HistogramAggregation; - -interface TermAggregation { - terms: { - field: string; - }; -} - -interface FilterAggregation { - filter: TermFilter | RangeFilter; -} - -interface RangeAggregation { - range: { - field: string; - keyed?: boolean; - ranges: Array<{ key?: string; from?: string; to?: string }>; - }; -} - -interface HistogramAggregation { - date_histogram: { - field: string; - fixed_interval: string; - keyed?: boolean; - }; -} - -/** - * Results of an Aggregation - */ -type ReservedNames = 'doc_count'; -type AggregationNames = Exclude; -export type Aggregation = { - doc_count: number; -} & { - [innerAggregation in Name]: AggregationBuckets; -}; - -export interface AggregationBucket { - doc_count: number; -} - -export function isAggregationBucket(bucket: unknown): bucket is AggregationBucket { - return typeof (bucket as AggregationBucket)?.doc_count === 'number'; -} 
- -export function isBucketsWithNumericKey( - buckets: AggregationBuckets['buckets'] -): buckets is Array< - AggregationBucket & { - key_as_string: string; - key: number; - } -> { - return ( - !isKeyedBuckets(buckets) && typeof (buckets[0] as KeyedAggregationBucket)?.key === 'number' - ); -} - -export type KeyedAggregationBucket = AggregationBucket & - ( - | { - key: string; - } - | { - key_as_string: string; - key: number; - } - ); - -export function getStringKeyOfBucket(bucket: KeyedAggregationBucket) { - return typeof bucket.key === 'string' - ? bucket.key - : (bucket as { - key_as_string: string; - }).key_as_string; -} - -export interface RangeAggregationBucket { - from: number; - to: number; - doc_count: number; -} - -export type KeyedRangeAggregationBucket = RangeAggregationBucket & { - key: string; -}; - -export function isRangeAggregationBucket(bucket: TypedBucket): bucket is RangeAggregationBucket { - return ( - typeof (bucket as RangeAggregationBucket).to !== 'number' || - typeof (bucket as RangeAggregationBucket).from !== 'number' - ); -} - -type TypedBucket = AggregationBucket | RangeAggregationBucket; -type KeyedTypedBucket = KeyedAggregationBucket | KeyedRangeAggregationBucket; - -export type AggregationBucketWithSubAgg< - Name extends AggregationNames, - AggType extends TypedBucket = TypedBucket -> = AggType & - { - [innerAggregation in Name]: AggregationBuckets; - }; - -export type KeyedBuckets = Record< - Name, - TypedBucket | AggregationBucketWithSubAgg ->; - -export interface AggregationBuckets { - buckets: KeyedTypedBucket[] | Array> | KeyedBuckets; -} - -export function isKeyedBuckets( - buckets: AggregationBuckets['buckets'] -): buckets is KeyedBuckets { - return !Array.isArray(buckets); -} - -export function aggregationBucketsByKey({ - buckets, -}: AggregationBuckets): KeyedBuckets { - if (isKeyedBuckets(buckets)) { - return buckets; - } - return keyBy(buckets, 'key') as KeyedBuckets; -} - -export type AggregationResult = { - [aggregationName 
in Name]: Aggregation | AggregationBuckets; -}; - -export function isBucketedAggregation( - aggregation: Aggregation | AggregationBuckets -): aggregation is AggregationBuckets { - return aggregation && Array.isArray((aggregation as AggregationBuckets).buckets); -} - -export interface AggregationSearchResult { - count: number; - aggregations: AggregationResult; -} diff --git a/x-pack/plugins/task_manager/server/task_manager.ts b/x-pack/plugins/task_manager/server/task_manager.ts index af6c02a60576f..af1d7cbe22d6e 100644 --- a/x-pack/plugins/task_manager/server/task_manager.ts +++ b/x-pack/plugins/task_manager/server/task_manager.ts @@ -12,6 +12,7 @@ import { performance } from 'perf_hooks'; import { pipe } from 'fp-ts/lib/pipeable'; import { Option, some, map as mapOptional, getOrElse } from 'fp-ts/lib/Option'; +import { ESSearchResponse } from '../../apm/typings/elasticsearch'; import { SavedObjectsSerializer, ILegacyScopedClusterClient, @@ -69,7 +70,6 @@ import { import { identifyEsError } from './lib/identify_es_error'; import { ensureDeprecatedFieldsAreCorrected } from './lib/correct_deprecated_fields'; import { BufferedTaskStore } from './buffered_task_store'; -import { AggregationSearchResult } from './queries/aggregation_clauses'; const VERSION_CONFLICT_STATUS = 409; @@ -399,11 +399,11 @@ export class TaskManager { * @param opts - The query options used to filter tasks * @returns {Promise} */ - public async aggregate( + public async aggregate( opts: AggregationOpts - ): Promise> { + ): Promise> { await this.waitUntilStarted(); - return this.store.aggregate(opts); + return this.store.aggregate(opts); } /** diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index ea3aa7170c86c..c2fe44625ee8b 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -60,11 +60,8 @@ import { SortByRunAtAndRetryAt, tasksClaimedByOwner, } from 
'./queries/mark_available_tasks_as_claimed'; -import { - AggregationQuery, - AggregationSearchResult, - AggregationResult, -} from './queries/aggregation_clauses'; + +import { ESSearchResponse, ESSearchBody, ESSearchRequest } from '../../apm/typings/elasticsearch'; export interface StoreOpts { callCluster: ElasticJs; @@ -84,11 +81,8 @@ export interface SearchOpts { search_after?: unknown[]; } -export interface AggregationOpts { - aggs: AggregationQuery; - query?: object; - size?: number; -} +export type AggregationOpts = Pick, 'aggs'> & + Pick; export interface UpdateByQuerySearchOpts extends SearchOpts { script?: object; @@ -469,28 +463,20 @@ export class TaskStore { }; } - public async aggregate({ + public async aggregate({ aggs, + query, size = 0, - }: AggregationOpts): Promise> { - const { - aggregations, - hits: { - total: { value: count }, - }, - } = (await this.callCluster('search', { + }: AggregationOpts) { + return this.callCluster('search', { index: this.index, ignoreUnavailable: true, body: ensureAggregationOnlyReturnsTaskObjects({ + query, aggs, size, }), - })) as { - aggregations: AggregationResult; - hits: { total: { value: number } }; - }; - - return { aggregations, count }; + }) as Promise>; } private async updateByQuery( From ae15dc62969b135f05fdbc552a4e5a7295532049 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 13:05:32 +0100 Subject: [PATCH 23/42] fixed tests --- .../monitoring/workload_statistics.test.ts | 4 +++- .../server/monitoring/workload_statistics.ts | 4 ++-- .../task_manager/server/routes/health.test.ts | 17 +++++++++++------ .../plugins/task_manager/server/task_store.ts | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index 2bf4acad25d6c..0714401bdf4ec 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ 
b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -551,7 +551,9 @@ function setTaskTypeCount( }, ]; return ({ - count: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0), + hits: { + total: { value: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0) }, + }, aggregations: { ...aggregations, taskType: { diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 5fa2ef11c7a96..28f412ce7cf4d 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -164,7 +164,7 @@ export function createWorkloadAggregator( aggregations?.idleTasks?.scheduleDensity ) ) { - throw new Error(`Invalid workload: ${JSON.stringify({ aggregations, count })}`); + throw new Error(`Invalid workload: ${JSON.stringify(result)}`); } const taskTypes = (aggregations.taskType as AggregationResultOf< @@ -189,7 +189,7 @@ export function createWorkloadAggregator( taskTypes: mapValues(keyBy(taskTypes, 'key'), ({ doc_count: docCount, status }) => { return { count: docCount, - status: mapValues(keyBy(status, 'key'), 'doc_count'), + status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'), }; }), schedule: schedules diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 82f1717092dfc..52efa97ea4000 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -170,20 +170,25 @@ function mockHealthStats(overrides = {}) { workload: { timestamp: new Date().toISOString(), value: { - sum: 4, + count: 4, taskTypes: { - actions_telemetry: { sum: 2, status: { idle: 2 } }, - alerting_telemetry: { sum: 1, status: { idle: 1 } }, - session_cleanup: { sum: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, status: { 
idle: 2 } }, + alerting_telemetry: { count: 1, status: { idle: 1 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, }, + schedule: {}, + overdue: 0, + scheduleDensity: [], }, }, runtime: { timestamp: new Date().toISOString(), value: { drift: [1000, 1000], - duration: [], - taskRunResultFrequency: [], + execution: { + duration: [], + resultFrequency: [], + }, polling: { lastSuccessfulPoll: new Date().toISOString(), resultFrequency: ['NoTasksClaimed', 'NoTasksClaimed', 'NoTasksClaimed'], diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index c2fe44625ee8b..af93970937748 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -61,7 +61,7 @@ import { tasksClaimedByOwner, } from './queries/mark_available_tasks_as_claimed'; -import { ESSearchResponse, ESSearchBody, ESSearchRequest } from '../../apm/typings/elasticsearch'; +import { ESSearchResponse, ESSearchBody } from '../../apm/typings/elasticsearch'; export interface StoreOpts { callCluster: ElasticJs; From 734cb12c86cbbfadd036cf5670ef9315f7c4ffab Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 13:58:18 +0100 Subject: [PATCH 24/42] fixed mock import --- .../alerts/server/alerts_client_conflict_retries.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugins/alerts/server/alerts_client_conflict_retries.test.ts b/x-pack/plugins/alerts/server/alerts_client_conflict_retries.test.ts index 1c5edb45c80fe..b1ac5ac4c6783 100644 --- a/x-pack/plugins/alerts/server/alerts_client_conflict_retries.test.ts +++ b/x-pack/plugins/alerts/server/alerts_client_conflict_retries.test.ts @@ -8,7 +8,7 @@ import { cloneDeep } from 'lodash'; import { AlertsClient, ConstructorOptions } from './alerts_client'; import { savedObjectsClientMock, loggingSystemMock } from '../../../../src/core/server/mocks'; -import { taskManagerMock } from 
'../../task_manager/server/task_manager.mock'; +import { taskManagerMock } from '../../task_manager/server/mocks'; import { alertTypeRegistryMock } from './alert_type_registry.mock'; import { alertsAuthorizationMock } from './authorization/alerts_authorization.mock'; import { encryptedSavedObjectsMock } from '../../encrypted_saved_objects/server/mocks'; @@ -25,7 +25,7 @@ const MockAlertId = 'alert-id'; const ConflictAfterRetries = RetryForConflictsAttempts + 1; -const taskManager = taskManagerMock.start(); +const taskManager = taskManagerMock.createStart(); const alertTypeRegistry = alertTypeRegistryMock.create(); const unsecuredSavedObjectsClient = savedObjectsClientMock.create(); From db869863363f0f53332d1850eb2764a60025e61a Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 16:11:16 +0100 Subject: [PATCH 25/42] added status on health api --- x-pack/plugins/task_manager/server/plugin.ts | 6 +- .../task_manager/server/routes/health.test.ts | 67 ++++++++++++++--- .../task_manager/server/routes/health.ts | 75 ++++++++++++------- 3 files changed, 109 insertions(+), 39 deletions(-) diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index f53418aec05ad..fd922bb0da782 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -55,8 +55,10 @@ export class TaskManagerPlugin router, this.taskManager.then((tm) => createMonitoringStats(tm, config, logger)), logger, - // if health is any more stale than the pollInterval (+1s buffer) consider the system unhealthy - config.poll_interval + 1000 + // if "hot" health stats are any more stale than the pollInterval (+1s buffer) consider the system unhealthy + config.poll_interval + 1000, + // if "cold" health stats are any more stale than the configured refresh, consider the system unhealthy + config.monitored_aggregated_stats_refresh_rate ); return { diff --git 
a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 52efa97ea4000..921acb31451fa 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -39,7 +39,7 @@ describe('healthRoute', () => { const stats = Promise.resolve(new Subject()); - healthRoute(router, stats, logger, 1000); + healthRoute(router, stats, logger, 1000, 60000); const stats$ = await stats; @@ -59,11 +59,11 @@ describe('healthRoute', () => { expect(logger.debug).toHaveBeenCalledTimes(2); }); - it('returns an error response if the stats are no longer fresh', async () => { + it('returns a red status if the stats have not been updated within the required hot freshness', async () => { const router = httpServiceMock.createRouter(); const mockStat = mockHealthStats(); - healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000); + healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000, 60000); const [, handler] = router.get.mock.calls[0]; @@ -73,7 +73,8 @@ describe('healthRoute', () => { expect(await handler(context, req, res)).toMatchObject({ body: { - attributes: summarizeMonitoringStats( + status: 'red', + ...summarizeMonitoringStats( mockHealthStats({ lastUpdate: expect.any(String), stats: { @@ -94,12 +95,58 @@ describe('healthRoute', () => { }, }) ), - message: new Error('Task Manager monitored stats are out of date'), }, }); }); - it('returns an error response if the poller hasnt polled within the required freshness', async () => { + it('returns a red status if the workload stats have not been updated within the required cold freshness', async () => { + const router = httpServiceMock.createRouter(); + + const lastUpdateOfWorkload = new Date(Date.now() - 120000).toISOString(); + const mockStat = mockHealthStats({ + stats: { + workload: { + timestamp: lastUpdateOfWorkload, + }, + }, + }); + healthRoute(router, 
Promise.resolve(of(mockStat)), mockLogger(), 5000, 60000); + + const [, handler] = router.get.mock.calls[0]; + + const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']); + + await sleep(2000); + + expect(await handler(context, req, res)).toMatchObject({ + body: { + status: 'red', + ...summarizeMonitoringStats( + mockHealthStats({ + lastUpdate: expect.any(String), + stats: { + configuration: { + timestamp: expect.any(String), + }, + workload: { + timestamp: expect.any(String), + }, + runtime: { + timestamp: expect.any(String), + value: { + polling: { + lastSuccessfulPoll: expect.any(String), + }, + }, + }, + }, + }) + ), + }, + }); + }); + + it('returns a red status if the poller hasnt polled within the required hot freshness', async () => { const router = httpServiceMock.createRouter(); const lastSuccessfulPoll = new Date(Date.now() - 2000).toISOString(); @@ -114,7 +161,7 @@ describe('healthRoute', () => { }, }, }); - healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000); + healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000, 60000); const [, handler] = router.get.mock.calls[0]; @@ -122,7 +169,8 @@ describe('healthRoute', () => { expect(await handler(context, req, res)).toMatchObject({ body: { - attributes: summarizeMonitoringStats( + status: 'red', + ...summarizeMonitoringStats( mockHealthStats({ lastUpdate: expect.any(String), stats: { @@ -143,7 +191,6 @@ describe('healthRoute', () => { }, }) ), - message: new Error('Task Manager monitored stats are out of date'), }, }); }); @@ -184,7 +231,7 @@ function mockHealthStats(overrides = {}) { runtime: { timestamp: new Date().toISOString(), value: { - drift: [1000, 1000], + drift: [1000, 60000], execution: { duration: [], resultFrequency: [], diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index d48775803c780..0eb34d6960aa5 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts 
+++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -15,18 +15,25 @@ import { Logger } from 'src/core/server'; import { Observable } from 'rxjs'; import { take } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; -import { isString } from 'lodash'; +import { isString, isNumber } from 'lodash'; import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; +enum HealthStatus { + Green = 'green', + Yellow = 'yellow', + Red = 'red', +} + export function healthRoute( router: IRouter, monitoringStats: Promise>, logger: Logger, - requiredFreshness: number + requiredHotStatsFreshness: number, + requiredColdStatsFreshness: number ) { /* Log Task Manager stats as a Debug log line at a fixed interval */ monitoringStats.then((monitoringStats$) => { - monitoringStats$.pipe(throttleTime(requiredFreshness)).subscribe((stats) => { + monitoringStats$.pipe(throttleTime(requiredHotStatsFreshness)).subscribe((stats) => { logger.debug(JSON.stringify(summarizeMonitoringStats(stats))); }); }); @@ -46,40 +53,54 @@ export function healthRoute( const timestamp = new Date(now).toISOString(); /** - * If the monitored stats aren't fresh, return an `500 internalError` with - * the stats in the body of the api call. This makes it easier for monitoring - * services to mark the service as broken + * If the monitored stats aren't fresh, return a red status */ - if ( - now - - getOldestTimestamp( - stats.lastUpdate, - stats.stats.runtime?.value.polling.lastSuccessfulPoll - ) > - requiredFreshness - ) { - return res.internalError({ - body: { - message: new Error('Task Manager monitored stats are out of date'), - attributes: { timestamp, ...summarizeMonitoringStats(stats) }, - }, - }); - } + const healthStatus = + hasExpiredHotTimestamps(stats, now, requiredHotStatsFreshness) || + hasExpiredColdTimestamps(stats, now, requiredColdStatsFreshness) + ? 
HealthStatus.Red + : HealthStatus.Green; + return res.ok({ - body: { timestamp, ...summarizeMonitoringStats(stats) }, + body: { timestamp, status: healthStatus, ...summarizeMonitoringStats(stats) }, }); } ); } -function getOldestTimestamp(...timestamps: unknown[]): number { - return Math.min( - ...timestamps - .map((timestamp) => (isString(timestamp) ? Date.parse(timestamp) : NaN)) - .filter((timestamp) => !isNaN(timestamp)) +/** + * If certain "hot" stats are not fresh, then the _health api will should return a Red status + * @param stats The monitored stats + * @param now The time to compare against + * @param requiredFreshness How fresh should these stats be + */ +function hasExpiredHotTimestamps( + stats: MonitoringStats, + now: number, + requiredFreshness: number +): boolean { + return ( + now - + getOldestTimestamp(stats.lastUpdate, stats.stats.runtime?.value.polling.lastSuccessfulPoll) > + requiredFreshness ); } +function hasExpiredColdTimestamps( + stats: MonitoringStats, + now: number, + requiredFreshness: number +): boolean { + return now - getOldestTimestamp(stats.stats.workload?.timestamp) > requiredFreshness; +} + +function getOldestTimestamp(...timestamps: unknown[]): number { + const validTimestamps = timestamps + .map((timestamp) => (isString(timestamp) ? Date.parse(timestamp) : NaN)) + .filter((timestamp) => !isNaN(timestamp)); + return validTimestamps.length ? 
Math.min(...validTimestamps) : 0; +} + async function getLatestStats(monitoringStats$: Observable) { return new Promise((resolve) => monitoringStats$.pipe(take(1)).subscribe((stats) => resolve(stats)) From 4ca65c3ec5448cbf7606c90d7599a89b100f2639 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 16:28:05 +0100 Subject: [PATCH 26/42] test status in aceptancve tests --- .../test_suites/task_manager/health_route.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 188cce9e0cc6c..243ea3084dc4b 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -75,7 +75,9 @@ export default function ({ getService }: FtrProviderContext) { describe('health', () => { it('should return basic configuration of task manager', async () => { - expect((await getHealth()).stats.configuration.value).to.eql({ + const health = await getHealth(); + expect(health.status).to.eql('green'); + expect(health.stats.configuration.value).to.eql({ poll_interval: 3000, max_poll_inactivity_cycles: 10, monitored_aggregated_stats_refresh_rate: monitoredAggregatedStatsRefreshRate, @@ -86,7 +88,14 @@ export default function ({ getService }: FtrProviderContext) { }); it('should return the task manager workload', async () => { - const { workload } = (await getHealth()).stats; + const health = await getHealth(); + const { + status, + stats: { workload }, + } = health; + + expect(status).to.eql('green'); + const sumSampleTaskInWorkload = (workload.value.taskTypes as { sampleTask?: { count: number }; From 44cb5789285216828e4cf8478bb2619c2f4ca4c2 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 18:54:02 +0100 Subject: [PATCH 27/42] corrected types --- 
x-pack/plugins/task_manager/server/plugin.ts | 2 +- x-pack/plugins/task_manager/server/routes/health.test.ts | 2 +- x-pack/plugins/task_manager/server/routes/health.ts | 2 +- .../test_suites/task_manager/health_route.ts | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index fd922bb0da782..5f627fee85f8f 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -58,7 +58,7 @@ export class TaskManagerPlugin // if "hot" health stats are any more stale than the pollInterval (+1s buffer) consider the system unhealthy config.poll_interval + 1000, // if "cold" health stats are any more stale than the configured refresh, consider the system unhealthy - config.monitored_aggregated_stats_refresh_rate + config.monitored_aggregated_stats_refresh_rate + 1000 ); return { diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 921acb31451fa..289a3a3b605c7 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -20,7 +20,7 @@ describe('healthRoute', () => { it('registers the route', async () => { const router = httpServiceMock.createRouter(); - healthRoute(router, Promise.resolve(of()), mockLogger(), 1000); + healthRoute(router, Promise.resolve(of()), mockLogger(), 1000, 1000); const [config] = router.get.mock.calls[0]; diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 0eb34d6960aa5..987f1cdbb0066 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -15,7 +15,7 @@ import { Logger } from 'src/core/server'; import { Observable } from 'rxjs'; import { take } from 'rxjs/operators'; import { throttleTime } from 
'rxjs/operators'; -import { isString, isNumber } from 'lodash'; +import { isString } from 'lodash'; import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; enum HealthStatus { diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 243ea3084dc4b..f70c4253f79f9 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -13,6 +13,7 @@ import { ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; interface MonitoringStats { lastUpdate: string; + status: string; stats: { configuration: { timestamp: string; From d29c866d23b9d7a9ff43eb48b98cdc9ebe86c64d Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 5 Oct 2020 19:29:41 +0100 Subject: [PATCH 28/42] change RGY to OK Error and Warn --- .../task_manager/server/routes/health.test.ts | 12 +++--- .../task_manager/server/routes/health.ts | 39 ++++++++++--------- .../test_suites/task_manager/health_route.ts | 4 +- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 289a3a3b605c7..d2e2ee707ffa1 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -59,7 +59,7 @@ describe('healthRoute', () => { expect(logger.debug).toHaveBeenCalledTimes(2); }); - it('returns a red status if the stats have not been updated within the required hot freshness', async () => { + it('returns a error status if the stats have not been updated within the required hot freshness', async () => { const router = httpServiceMock.createRouter(); const mockStat = mockHealthStats(); @@ -73,7 +73,7 @@ describe('healthRoute', () => { expect(await handler(context, req, 
res)).toMatchObject({ body: { - status: 'red', + status: 'error', ...summarizeMonitoringStats( mockHealthStats({ lastUpdate: expect.any(String), @@ -99,7 +99,7 @@ describe('healthRoute', () => { }); }); - it('returns a red status if the workload stats have not been updated within the required cold freshness', async () => { + it('returns a error status if the workload stats have not been updated within the required cold freshness', async () => { const router = httpServiceMock.createRouter(); const lastUpdateOfWorkload = new Date(Date.now() - 120000).toISOString(); @@ -120,7 +120,7 @@ describe('healthRoute', () => { expect(await handler(context, req, res)).toMatchObject({ body: { - status: 'red', + status: 'error', ...summarizeMonitoringStats( mockHealthStats({ lastUpdate: expect.any(String), @@ -146,7 +146,7 @@ describe('healthRoute', () => { }); }); - it('returns a red status if the poller hasnt polled within the required hot freshness', async () => { + it('returns a error status if the poller hasnt polled within the required hot freshness', async () => { const router = httpServiceMock.createRouter(); const lastSuccessfulPoll = new Date(Date.now() - 2000).toISOString(); @@ -169,7 +169,7 @@ describe('healthRoute', () => { expect(await handler(context, req, res)).toMatchObject({ body: { - status: 'red', + status: 'error', ...summarizeMonitoringStats( mockHealthStats({ lastUpdate: expect.any(String), diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 987f1cdbb0066..36e7a20e3bb30 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -19,9 +19,9 @@ import { isString } from 'lodash'; import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; enum HealthStatus { - Green = 'green', - Yellow = 'yellow', - Red = 'red', + OK = 'OK', + Warning = 'warn', + Error = 'error', } export function healthRoute( @@ -31,10 
+31,26 @@ export function healthRoute( requiredHotStatsFreshness: number, requiredColdStatsFreshness: number ) { + function calculateStatus(stats: MonitoringStats) { + const now = Date.now(); + const timestamp = new Date(now).toISOString(); + + /** + * If the monitored stats aren't fresh, return a red status + */ + const healthStatus = + hasExpiredHotTimestamps(stats, now, requiredHotStatsFreshness) || + hasExpiredColdTimestamps(stats, now, requiredColdStatsFreshness) + ? HealthStatus.Error + : HealthStatus.OK; + + return { timestamp, status: healthStatus, ...summarizeMonitoringStats(stats) }; + } + /* Log Task Manager stats as a Debug log line at a fixed interval */ monitoringStats.then((monitoringStats$) => { monitoringStats$.pipe(throttleTime(requiredHotStatsFreshness)).subscribe((stats) => { - logger.debug(JSON.stringify(summarizeMonitoringStats(stats))); + logger.debug(JSON.stringify(calculateStatus(stats))); }); }); @@ -48,21 +64,8 @@ export function healthRoute( req: KibanaRequest, res: KibanaResponseFactory ): Promise { - const stats = await getLatestStats(await monitoringStats); - const now = Date.now(); - const timestamp = new Date(now).toISOString(); - - /** - * If the monitored stats aren't fresh, return a red status - */ - const healthStatus = - hasExpiredHotTimestamps(stats, now, requiredHotStatsFreshness) || - hasExpiredColdTimestamps(stats, now, requiredColdStatsFreshness) - ? 
HealthStatus.Red - : HealthStatus.Green; - return res.ok({ - body: { timestamp, status: healthStatus, ...summarizeMonitoringStats(stats) }, + body: calculateStatus(await getLatestStats(await monitoringStats)), }); } ); diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index f70c4253f79f9..88e5910661477 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -77,7 +77,7 @@ export default function ({ getService }: FtrProviderContext) { describe('health', () => { it('should return basic configuration of task manager', async () => { const health = await getHealth(); - expect(health.status).to.eql('green'); + expect(health.status).to.eql('OK'); expect(health.stats.configuration.value).to.eql({ poll_interval: 3000, max_poll_inactivity_cycles: 10, @@ -95,7 +95,7 @@ export default function ({ getService }: FtrProviderContext) { stats: { workload }, } = health; - expect(status).to.eql('green'); + expect(status).to.eql('OK'); const sumSampleTaskInWorkload = (workload.value.taskTypes as { From 273d58d22bdf30de0b05709cbdd174275bb8c40e Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 6 Oct 2020 10:49:00 +0100 Subject: [PATCH 29/42] added readme --- .../plugins/task_manager/server/MONITORING.md | 243 ++++++++++++++++++ x-pack/plugins/task_manager/server/README.md | 8 + 2 files changed, 251 insertions(+) create mode 100644 x-pack/plugins/task_manager/server/MONITORING.md diff --git a/x-pack/plugins/task_manager/server/MONITORING.md b/x-pack/plugins/task_manager/server/MONITORING.md new file mode 100644 index 0000000000000..2fa08aa8bc1df --- /dev/null +++ b/x-pack/plugins/task_manager/server/MONITORING.md @@ -0,0 +1,243 @@ +# Task Manager Monitoring + +Task Manager has an internal monitoring mechanism in which keeps track of a variety of 
metrics which are exposed via a `health` api endpoint and Kibana Server Log debug messaging. + +## Exposed Metrics +There are three different sections to the stats returned by the `health` api. +- `configuration`: Summarizes Task Manager's current configuration. +- `workload`: Summarizes the workload in the current deployment. +- `runtime`: Tracks Task Manager's performance. + +### Configuring the Stats +There are two new configurations: + +- `xpack.task_manager.monitored_aggregated_stats_refresh_rate` - Dictates how often we refresh the "Cold" metrics. These metrics require an aggregation against Elasticsearch and add load to the system, hence we want to limit how often we execute these. This covers the entire `workload` section of the stats. By default this is set to `60s` +- `xpack.task_manager.monitored_stats_running_average_window` - Dictates the size of the window used to calculate the running average of various "Hot" stats, such as the time it takes to run a task, the _drift_ that tasks experience etc. These stats are collected throughout the lifecycle of tasks and this window will dictate how large the queue we keep in memory would be, and how many values we need to calculate the average against. We do not calculate the average on *every* new value, but rather only when the time comes to summarize the stats before logging them or returning them to the API endpoint. + +Other configurations are inferred from existing config values. +For example: +- The _required freshness_ of critical "Hot" stats is always `pollingInterval + 1s`, which means that if key stats (last polling cycle time, for example) haven't been refreshed within the time scale of a single interval + 1s the stat will report an `Error` status. 
+- The _required freshness_ of critical "Cold" stats is `monitored_aggregated_stats_refresh_rate + 1s`, which means that if these stats (workload, for example) have not been updated within the required refresh rate then the api will return an `Error` status. + +## Consuming Health Stats +Task Manager exposes a `/api/task_manager/_health` api which returns the _latest_ stats. +Calling this API is designed to be fast and doesn't actually perform any checks; rather it returns the result of the latest stats in the system, and is designed in such a way that you could call it from an external service on a regular basis without worrying that you'll be adding substantial load to the system. + +Additionally, the metrics are logged out into Task Manager's `DEBUG` logger at a regular cadence (dictated by the Polling Interval). +If you wish to enable DEBUG logging in your Kibana instance, you will need to add the following to your `kibana.yml`: +``` +logging: + loggers: + - context: plugins.taskManager + appenders: [console] + level: debug +``` + +Please bear in mind that these stats are logged as often as your `poll_interval` configuration, which means it could add substantial noise to your logs. +We would recommend only enabling this level of logging temporarily. + +### Understanding the Exposed Stats + +As mentioned above, the `health` api exposes three sections: `configuration`, `workload` and `runtime`. +Each section has a `timestamp` and a `status` which indicates when the last update to this section took place and whether the health of this section was evaluated as `OK`, `Warning` or `Error`. + +The root has its own `status` which indicates the state of the system overall as inferred from the `status` of the sections. +An `Error` status in any section will cause the whole system to display as `Error`. +A `Warning` status in any section will cause the whole system to display as `Warning`. +An `OK` status will only be displayed when all sections are marked as `OK`. 
+ +The root `timestamp` is the time at which the summary was exposed (either to the DEBUG logger or the http api) and the `lastUpdate` is the last time any one of the sections was updated. + +#### The Configuration Section +The `configuration` section summarizes Task Manager's current configuration, including dynamic configurations which change over time, such as `poll_interval` and `max_workers` which adjust in reaction to changing load on the system. + +These are "Hot" stats which are updated whenever a change happens in the configuration. + +#### The Workload Section +The `workload` section summarizes the workload in the current deployment, listing the tasks in the system, their types and what their current status is. + +It includes three sub sections: + - The number of tasks scheduled in the system, broken down by type and status. + - The number of idle `overdue` tasks, whose `runAt` has expired. + - Execution density in the next minute or so (configurable), which shows how many tasks are scheduled to execute in the scope of each polling interval. This can give us an idea of how much load there is on the current Kibana deployment. + +These are "Cold" stats which are updated at a regular cadence, configured by the `monitored_aggregated_stats_refresh_rate` config. + +#### The Runtime Section +The `runtime` section tracks Task Manager's performance as it runs, making note of task execution time, _drift_ etc. +These include: + - The time it takes a task to run (mean and median, using a configurable running average window, `50` by default) + - The average _drift_ that tasks experience (mean and median, using the same configurable running average window as above). Drift tells us how long after its scheduled time a task typically executes. 
+ - The polling rate (the timestamp of the last time a polling cycle completed) and the result [`No tasks | Filled task pool | Unexpectedly ran out of workers`] frequency over the past 50 polling cycles (using the same window size as the one used for running averages) + - The `Success | Retry | Failure ratio` by task type. This is different than the workload stats which tell you what's in the queue, but can't keep track of retries and of non-recurring tasks as they're wiped off the index when completed. + +These are "Hot" stats which are updated reactively as Tasks are executed and interacted with. + +### Example Stats + +For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might get these stats: +``` +{ + /* the time these stats were returned by the api */ + "timestamp": "2020-10-05T18:26:11.346Z", + /* the overall status of the system */ + "status": "OK", + /* last time any stat was updated in this output */ + "lastUpdate": "2020-10-05T17:57:55.411Z", + "stats": { + "configuration": { /* current configuration of TM */ + "timestamp": "2020-10-05T17:56:06.507Z", + "value": { + "max_workers": 10, + "poll_interval": 3000, + "request_capacity": 1000, + "max_poll_inactivity_cycles": 10, + "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_running_average_window": 50 + } + }, + "workload": { /* The workload of this deployment */ + "timestamp": "2020-10-05T17:57:06.534Z", + "value": { + "count": 6, /* count of tasks in the system */ + "taskTypes": { /* what tasks are there and what status are they in */ + "actions_telemetry": { + "count": 1, + "status": { + "idle": 1 + } + }, + "alerting_telemetry": { + "count": 1, + "status": { + "idle": 1 + } + }, + "apm-telemetry-task": { + "count": 1, + "status": { + "idle": 1 + } + }, + "endpoint:user-artifact-packager": { + "count": 1, + "status": { + "idle": 1 + } + }, + "lens_telemetry": { + "count": 1, + "status": { + "idle": 1 + } + }, + "session_cleanup": { + "count": 1, + "status": { + "idle": 
1 + } + } + }, + + /* Frequency of recurring tasks schedules */ + "schedule": [ + ["60s", 1], /* 1 task, every 60s */ + ["3600s", 3], /* 3 tasks every hour */ + ["720m", 1] + ], + /* There are no overdue tasks in this system at the moment */ + "overdue": 0, + /* This is the schedule density, it shows a histogram of all the polling intervals in the next minute (or, if + pollInterval is configured unusually high it will show a min of 2 refresh intervals into the future, and a max of 50 buckets). + Here we see that on the 3rd polling interval from *now* (which is ~9 seconds from now, as pollInterval is `3s`) there is one task due to run. + We also see that there are 5 due two intervals later, which is fine as we have a max workers of `10` + */ + "scheduleDensity": [0, 0, 1, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + } + }, + "runtime": { + "timestamp": "2020-10-05T17:57:55.411Z", + "value": { + "polling": { + /* When was the last polling cycle? */ + "lastSuccessfulPoll": "2020-10-05T17:57:55.411Z", + /* What is the frequency of polling cycle result? 
+ Here we see 94% of "NoTasksClaimed" and 6% "PoolFilled" */ + "resultFrequency": { + "NoTasksClaimed": 94, + "RanOutOfCapacity": 0, /* This is a legacy result, we might want to rename - it tells us when a polling cycle resulted in claiming more tasks than we had workers for, but the name doesn't make much sense outside of the context of the code */ + "PoolFilled": 6 + } + }, + /* on average, the tasks in this deployment run 1.7s after their scheduled time */ + "drift": { + "mean": 1720, + "median": 2276 + }, + "execution": { + "duration": { + /* on average, the `endpoint:user-artifact-packager` tasks take 15ms to run */ + "endpoint:user-artifact-packager": { + "mean": 15, + "median": 14.5 + }, + "session_cleanup": { + "mean": 28, + "median": 28 + }, + "lens_telemetry": { + "mean": 100, + "median": 100 + }, + "actions_telemetry": { + "mean": 135, + "median": 135 + }, + "alerting_telemetry": { + "mean": 197, + "median": 197 + }, + "apm-telemetry-task": { + "mean": 1347, + "median": 1347 + } + }, + "resultFrequency": { + /* and 100% of `endpoint:user-artifact-packager` have completed in success (within the running average window, so the past 50 runs (by default, configurable by `monitored_stats_running_average_window`)) */ + "endpoint:user-artifact-packager": { + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "session_cleanup": { + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "lens_telemetry": { + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "actions_telemetry": { + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "alerting_telemetry": { + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "apm-telemetry-task": { + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + } + } + } + } + } + } +} +``` diff --git a/x-pack/plugins/task_manager/server/README.md b/x-pack/plugins/task_manager/server/README.md index fd2409a7db0a5..4eb8a78cb4d97 100644 --- a/x-pack/plugins/task_manager/server/README.md +++ 
b/x-pack/plugins/task_manager/server/README.md @@ -48,6 +48,8 @@ The task_manager can be configured via `taskManager` config options (e.g. `taskM - `override_num_workers`: An object of `taskType: number` that overrides the `num_workers` for tasks - For example: `task_manager.override_num_workers.reporting: 2` would override the number of workers occupied by tasks of type `reporting` - This allows sysadmins to tweak the operational performance of Kibana, allowing more or fewer tasks of a specific type to run simultaneously +- `monitored_aggregated_stats_refresh_rate` - Dictates how often we refresh the "Cold" metrics. Learn More: [./MONITORING](./MONITORING.MD) +- `monitored_stats_running_average_window`- Dictates the size of the window used to calculate the running average of various "Hot" stats. Learn More: [./MONITORING](./MONITORING.MD) ## Task definitions @@ -460,3 +462,9 @@ The task manager's public API is create / delete / list. Updates aren't directly node scripts/functional_tests_server.js --config x-pack/test/plugin_api_integration/config.ts node scripts/functional_test_runner --config x-pack/test/plugin_api_integration/config.ts ``` + +## Monitoring + +Task Manager exposes runtime statistics which enable basic observability into its inner workings and makes it possible to monitor the system from external services. 
+ +Learn More: [./MONITORING](./MONITORING.MD) \ No newline at end of file From 2efb59904060faba2a2f13683d67bdb0bfdba702 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 6 Oct 2020 10:53:50 +0100 Subject: [PATCH 30/42] updated json in readme --- x-pack/plugins/task_manager/server/MONITORING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/x-pack/plugins/task_manager/server/MONITORING.md b/x-pack/plugins/task_manager/server/MONITORING.md index 2fa08aa8bc1df..36bb85d32612d 100644 --- a/x-pack/plugins/task_manager/server/MONITORING.md +++ b/x-pack/plugins/task_manager/server/MONITORING.md @@ -87,6 +87,7 @@ For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might g "stats": { "configuration": { /* current configuration of TM */ "timestamp": "2020-10-05T17:56:06.507Z", + "status": "OK", "value": { "max_workers": 10, "poll_interval": 3000, @@ -98,6 +99,7 @@ For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might g }, "workload": { /* The workload of this deployment */ "timestamp": "2020-10-05T17:57:06.534Z", + "status": "OK", "value": { "count": 6, /* count of tasks in the system */ "taskTypes": { /* what tasks are there and what status are they in */ @@ -157,6 +159,7 @@ For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might g }, "runtime": { "timestamp": "2020-10-05T17:57:55.411Z", + "status": "OK", "value": { "polling": { /* When was the last polling cycle? 
*/ From 911c827f0af8b544afd76b7ffd56995d5a28ac4d Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 6 Oct 2020 10:58:39 +0100 Subject: [PATCH 31/42] spaces -> tabs --- x-pack/plugins/task_manager/server/MONITORING.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/x-pack/plugins/task_manager/server/MONITORING.md b/x-pack/plugins/task_manager/server/MONITORING.md index 36bb85d32612d..10bf60ad7e58b 100644 --- a/x-pack/plugins/task_manager/server/MONITORING.md +++ b/x-pack/plugins/task_manager/server/MONITORING.md @@ -87,7 +87,7 @@ For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might g "stats": { "configuration": { /* current configuration of TM */ "timestamp": "2020-10-05T17:56:06.507Z", - "status": "OK", + "status": "OK", "value": { "max_workers": 10, "poll_interval": 3000, @@ -99,7 +99,7 @@ For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might g }, "workload": { /* The workload of this deployment */ "timestamp": "2020-10-05T17:57:06.534Z", - "status": "OK", + "status": "OK", "value": { "count": 6, /* count of tasks in the system */ "taskTypes": { /* what tasks are there and what status are they in */ @@ -159,7 +159,7 @@ For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might g }, "runtime": { "timestamp": "2020-10-05T17:57:55.411Z", - "status": "OK", + "status": "OK", "value": { "polling": { /* When was the last polling cycle? 
*/ From 2ab5e730337a25a1668b92572be52fd7fe0daf95 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 6 Oct 2020 15:59:40 +0100 Subject: [PATCH 32/42] Added health status in each section --- .../task_manager/server/monitoring/index.ts | 1 + .../monitoring/monitoring_stats_stream.ts | 66 +++++++++++++------ .../monitoring/task_run_statistics.test.ts | 8 +-- .../server/monitoring/task_run_statistics.ts | 32 +++++---- .../server/monitoring/workload_statistics.ts | 10 +++ .../task_manager/server/routes/health.test.ts | 34 +++++++--- .../task_manager/server/routes/health.ts | 45 ++++++++----- 7 files changed, 131 insertions(+), 65 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/index.ts b/x-pack/plugins/task_manager/server/monitoring/index.ts index ef447d6ef0620..ab431978b7531 100644 --- a/x-pack/plugins/task_manager/server/monitoring/index.ts +++ b/x-pack/plugins/task_manager/server/monitoring/index.ts @@ -16,6 +16,7 @@ import { export { MonitoringStats, + HealthStatus, RawMonitoringStats, summarizeMonitoringStats, createAggregators, diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts index edb22b6d79ae5..2975a26977c05 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -10,8 +10,17 @@ import { pick } from 'lodash'; import { Logger } from 'src/core/server'; import { JsonObject } from 'src/plugins/kibana_utils/common'; import { TaskManager } from '../task_manager'; -import { createWorkloadAggregator, WorkloadStat } from './workload_statistics'; -import { createTaskRunAggregator, summarizeTaskRunStat, TaskRunStat } from './task_run_statistics'; +import { + createWorkloadAggregator, + summarizeWorkloadStat, + WorkloadStat, +} from './workload_statistics'; +import { + createTaskRunAggregator, + summarizeTaskRunStat, + 
TaskRunStat, + SummarizedTaskRunStat, +} from './task_run_statistics'; import { TaskManagerConfig } from '../config'; import { AggregatedStatProvider } from './runtime_statistics_aggregator'; @@ -31,29 +40,33 @@ type ConfigStat = Pick; + workload?: MonitoredStat; + runtime?: MonitoredStat; }; } -interface MonitoredStat { +export enum HealthStatus { + OK = 'OK', + Warning = 'warn', + Error = 'error', +} + +interface MonitoredStat { timestamp: string; - value: JsonObject; + value: T; } +type RawMonitoredStat = MonitoredStat & { + status: HealthStatus; +}; export interface RawMonitoringStats { lastUpdate: string; - stats: Record; + stats: { + configuration: RawMonitoredStat; + workload?: RawMonitoredStat; + runtime?: RawMonitoredStat; + }; } export function createAggregators( @@ -100,17 +113,28 @@ export function createMonitoringStatsStream( export function summarizeMonitoringStats({ lastUpdate, - stats: { runtime, ...otherStats }, + stats: { runtime, workload, configuration }, }: MonitoringStats): RawMonitoringStats { return { lastUpdate, stats: { - ...((otherStats as unknown) as RawMonitoringStats['stats']), + configuration: { + ...configuration, + status: HealthStatus.OK, + }, ...(runtime ? { runtime: { - ...runtime, - value: summarizeTaskRunStat(runtime.value), + timestamp: runtime.timestamp, + ...summarizeTaskRunStat(runtime.value), + }, + } + : {}), + ...(workload + ? 
{ + workload: { + timestamp: workload.timestamp, + ...summarizeWorkloadStat(workload.value), }, } : {}), diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index fefe0fd62b874..247f78808e62c 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -63,7 +63,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value), + value: summarizeTaskRunStat(value).value, })), take(runAtDrift.length), bufferCount(runAtDrift.length) @@ -126,7 +126,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value), + value: summarizeTaskRunStat(value).value, })), take(runDurations.length * 2), bufferCount(runDurations.length * 2) @@ -219,7 +219,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value), + value: summarizeTaskRunStat(value).value, })), take(10), bufferCount(10) @@ -292,7 +292,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value), + value: summarizeTaskRunStat(value).value, })), tap(() => { expectedTimestamp.push(new Date().toISOString()); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index 5c3c3d12972aa..0e9f50b57adc4 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ 
b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -29,6 +29,7 @@ import { createRunningAveragedStat, createMapOfRunningAveragedStats, } from './task_run_calcultors'; +import { HealthStatus } from './monitoring_stats_stream'; interface FillPoolStat extends JsonObject { lastSuccessfulPoll: string; @@ -160,22 +161,25 @@ export function summarizeTaskRunStat({ polling: { lastSuccessfulPoll, resultFrequency: pollingResultFrequency }, drift, execution: { duration, resultFrequency: executionResultFrequency }, -}: TaskRunStat): SummarizedTaskRunStat { +}: TaskRunStat): { value: SummarizedTaskRunStat; status: HealthStatus } { return { - polling: { - ...(lastSuccessfulPoll ? { lastSuccessfulPoll } : {}), - resultFrequency: { - ...DEFAULT_POLLING_FREQUENCIES, - ...calculateFrequency(pollingResultFrequency as FillPoolResult[]), + value: { + polling: { + ...(lastSuccessfulPoll ? { lastSuccessfulPoll } : {}), + resultFrequency: { + ...DEFAULT_POLLING_FREQUENCIES, + ...calculateFrequency(pollingResultFrequency as FillPoolResult[]), + }, + }, + drift: calculateRunningAverage(drift), + execution: { + duration: mapValues(duration, (typedDurations) => calculateRunningAverage(typedDurations)), + resultFrequency: mapValues(executionResultFrequency, (typedResultFrequencies) => ({ + ...DEFAULT_TASK_RUN_FREQUENCIES, + ...calculateFrequency(typedResultFrequencies), + })), }, }, - drift: calculateRunningAverage(drift), - execution: { - duration: mapValues(duration, (typedDurations) => calculateRunningAverage(typedDurations)), - resultFrequency: mapValues(executionResultFrequency, (typedResultFrequencies) => ({ - ...DEFAULT_TASK_RUN_FREQUENCIES, - ...calculateFrequency(typedResultFrequencies), - })), - }, + status: HealthStatus.OK, }; } diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 28f412ce7cf4d..a46acb013a6c2 100644 --- 
a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -15,6 +15,7 @@ import { TaskManager } from '../task_manager'; import { ConcreteTaskInstance } from '../task'; import { parseIntervalAsSecond, asInterval } from '../lib/intervals'; import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations'; +import { HealthStatus } from './monitoring_stats_stream'; interface StatusStat extends JsonObject { [status: string]: number; @@ -246,3 +247,12 @@ function bucketsBetween(from: number, to: number, interval: number) { } return count; } + +export function summarizeWorkloadStat( + workloadStats: WorkloadStat +): { value: WorkloadStat; status: HealthStatus } { + return { + value: workloadStats, + status: HealthStatus.OK, + }; +} diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index d2e2ee707ffa1..294f87dfd1f9c 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -10,7 +10,7 @@ import { httpServiceMock } from 'src/core/server/mocks'; import { healthRoute } from './health'; import { mockHandlerArguments } from './_mock_handler_arguments'; import { sleep, mockLogger } from '../test_utils'; -import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; +import { MonitoringStats, summarizeMonitoringStats, HealthStatus } from '../monitoring'; describe('healthRoute', () => { beforeEach(() => { @@ -49,20 +49,34 @@ describe('healthRoute', () => { await sleep(600); stats$.next(nextMockStat); - expect(logger.debug).toHaveBeenCalledWith(JSON.stringify(summarizeMonitoringStats(mockStat))); - expect(logger.debug).not.toHaveBeenCalledWith( - JSON.stringify(summarizeMonitoringStats(skippedMockStat)) - ); - expect(logger.debug).toHaveBeenCalledWith( - 
JSON.stringify(summarizeMonitoringStats(nextMockStat)) - ); + const firstDebug = JSON.parse(logger.debug.mock.calls[0][0]); + expect(firstDebug).toMatchObject({ + timestamp: expect.any(String), + status: expect.any(String), + ...summarizeMonitoringStats(mockStat), + }); + + const secondDebug = JSON.parse(logger.debug.mock.calls[1][0]); + expect(secondDebug).not.toMatchObject({ + timestamp: expect.any(String), + status: expect.any(String), + ...summarizeMonitoringStats(skippedMockStat), + }); + expect(secondDebug).toMatchObject({ + timestamp: expect.any(String), + status: expect.any(String), + ...summarizeMonitoringStats(nextMockStat), + }); + expect(logger.debug).toHaveBeenCalledTimes(2); }); - it('returns a error status if the stats have not been updated within the required hot freshness', async () => { + it('returns a error status if the overall stats have not been updated within the required hot freshness', async () => { const router = httpServiceMock.createRouter(); - const mockStat = mockHealthStats(); + const mockStat = mockHealthStats({ + lastUpdate: new Date(Date.now() - 1500).toISOString(), + }); healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000, 60000); const [, handler] = router.get.mock.calls[0]; diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 36e7a20e3bb30..f614511faa98a 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -16,13 +16,12 @@ import { Observable } from 'rxjs'; import { take } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; import { isString } from 'lodash'; -import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; - -enum HealthStatus { - OK = 'OK', - Warning = 'warn', - Error = 'error', -} +import { + MonitoringStats, + summarizeMonitoringStats, + HealthStatus, + RawMonitoringStats, +} from '../monitoring'; export function 
healthRoute( router: IRouter, @@ -31,20 +30,25 @@ export function healthRoute( requiredHotStatsFreshness: number, requiredColdStatsFreshness: number ) { - function calculateStatus(stats: MonitoringStats) { + function calculateStatus(monitoredStats: MonitoringStats) { const now = Date.now(); const timestamp = new Date(now).toISOString(); + const summarizedStats = summarizeMonitoringStats(monitoredStats); + /** * If the monitored stats aren't fresh, return a red status */ const healthStatus = - hasExpiredHotTimestamps(stats, now, requiredHotStatsFreshness) || - hasExpiredColdTimestamps(stats, now, requiredColdStatsFreshness) + hasStatus(summarizedStats.stats, HealthStatus.Error) || + hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness) || + hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness) ? HealthStatus.Error + : hasStatus(summarizedStats.stats, HealthStatus.Warning) + ? HealthStatus.Warning : HealthStatus.OK; - return { timestamp, status: healthStatus, ...summarizeMonitoringStats(stats) }; + return { timestamp, status: healthStatus, ...summarizedStats }; } /* Log Task Manager stats as a Debug log line at a fixed interval */ @@ -73,28 +77,37 @@ export function healthRoute( /** * If certain "hot" stats are not fresh, then the _health api will should return a Red status - * @param stats The monitored stats + * @param monitoringStats The monitored stats * @param now The time to compare against * @param requiredFreshness How fresh should these stats be */ function hasExpiredHotTimestamps( - stats: MonitoringStats, + monitoringStats: RawMonitoringStats, now: number, requiredFreshness: number ): boolean { return ( now - - getOldestTimestamp(stats.lastUpdate, stats.stats.runtime?.value.polling.lastSuccessfulPoll) > + getOldestTimestamp( + monitoringStats.lastUpdate, + monitoringStats.stats.runtime?.value.polling.lastSuccessfulPoll + ) > requiredFreshness ); } function hasExpiredColdTimestamps( - stats: MonitoringStats, + 
monitoringStats: RawMonitoringStats, now: number, requiredFreshness: number ): boolean { - return now - getOldestTimestamp(stats.stats.workload?.timestamp) > requiredFreshness; + return now - getOldestTimestamp(monitoringStats.stats.workload?.timestamp) > requiredFreshness; +} + +function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean { + return Object.values(stats) + .map((stat) => stat?.status === status) + .includes(true); } function getOldestTimestamp(...timestamps: unknown[]): number { From 37a4041554f2bcda70165cd73a0112870623c51b Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Tue, 6 Oct 2020 16:07:02 +0100 Subject: [PATCH 33/42] removed unused import --- x-pack/plugins/task_manager/server/routes/health.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 294f87dfd1f9c..2fc1d818f826b 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -10,7 +10,7 @@ import { httpServiceMock } from 'src/core/server/mocks'; import { healthRoute } from './health'; import { mockHandlerArguments } from './_mock_handler_arguments'; import { sleep, mockLogger } from '../test_utils'; -import { MonitoringStats, summarizeMonitoringStats, HealthStatus } from '../monitoring'; +import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; describe('healthRoute', () => { beforeEach(() => { From 8c81a12faeb0db2d84ec2a98d44ef6d0cbc2f653 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Thu, 8 Oct 2020 10:20:33 +0100 Subject: [PATCH 34/42] replace startsWith with a timer that is scheduled to 0 --- .../managed_configuration.test.ts | 3 +++ .../server/polling/observable_monitor.ts | 15 +++------------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git 
a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts index 4fc8ae899518c..c2a4a455934af 100644 --- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -52,6 +52,9 @@ describe('managed configuration', () => { }, }); taskManager.start(); + // force rxjs timers t ofire when they are scheduled for setTimeout(0) as the + // sinon fake timers cause them to stall + clock.tick(0); }); afterEach(() => clock.restore()); diff --git a/x-pack/plugins/task_manager/server/polling/observable_monitor.ts b/x-pack/plugins/task_manager/server/polling/observable_monitor.ts index e0c31f7014a6a..b07bb6661163b 100644 --- a/x-pack/plugins/task_manager/server/polling/observable_monitor.ts +++ b/x-pack/plugins/task_manager/server/polling/observable_monitor.ts @@ -4,17 +4,9 @@ * you may not use this file except in compliance with the Elastic License. 
*/ -import { Subject, Observable, throwError, interval, timer, Subscription } from 'rxjs'; +import { Subject, Observable, throwError, timer, Subscription } from 'rxjs'; import { noop } from 'lodash'; -import { - exhaustMap, - tap, - takeUntil, - switchMap, - switchMapTo, - catchError, - startWith, -} from 'rxjs/operators'; +import { exhaustMap, tap, takeUntil, switchMap, switchMapTo, catchError } from 'rxjs/operators'; const DEFAULT_HEARTBEAT_INTERVAL = 1000; @@ -37,9 +29,8 @@ export function createObservableMonitor( }: ObservableMonitorOptions = {} ): Observable { return new Observable((subscriber) => { - const subscription: Subscription = interval(heartbeatInterval) + const subscription: Subscription = timer(0, heartbeatInterval) .pipe( - startWith(0), // switch from the heartbeat interval to the instantiated observable until it completes / errors exhaustMap(() => takeUntilDurationOfInactivity(observableFactory(), inactivityTimeout)), // if an error is thrown, catch it, notify and try to recover From 423b8ad764dce53fe6366f023701f06d90934573 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Thu, 8 Oct 2020 10:26:54 +0100 Subject: [PATCH 35/42] typo --- .../server/integration_tests/managed_configuration.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts index c2a4a455934af..443c811469002 100644 --- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -52,7 +52,7 @@ describe('managed configuration', () => { }, }); taskManager.start(); - // force rxjs timers t ofire when they are scheduled for setTimeout(0) as the + // force rxjs timers to fire when they are scheduled for setTimeout(0) as the // sinon fake timers cause them to stall clock.tick(0); }); 
From 486339138822d54bd2da00214fbfeb2f00ab7a2a Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Thu, 8 Oct 2020 12:51:58 +0100 Subject: [PATCH 36/42] plug health into service status --- x-pack/plugins/task_manager/server/plugin.ts | 16 ++++- .../task_manager/server/routes/health.test.ts | 46 ++++++++++++++- .../task_manager/server/routes/health.ts | 58 +++++++++++++++---- 3 files changed, 105 insertions(+), 15 deletions(-) diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 5f627fee85f8f..e93b639e2c8da 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -4,8 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ import { PluginInitializerContext, Plugin, CoreSetup, CoreStart, Logger } from 'src/core/server'; -import { Subject } from 'rxjs'; -import { first } from 'rxjs/operators'; +import { Subject, combineLatest } from 'rxjs'; +import { first, map } from 'rxjs/operators'; import { TaskDictionary, TaskDefinition } from './task'; import { TaskManager } from './task_manager'; import { TaskManagerConfig } from './config'; @@ -51,7 +51,7 @@ export class TaskManagerPlugin // Routes const router = core.http.createRouter(); - healthRoute( + const serviceStatus$ = healthRoute( router, this.taskManager.then((tm) => createMonitoringStats(tm, config, logger)), logger, @@ -61,6 +61,16 @@ export class TaskManagerPlugin config.monitored_aggregated_stats_refresh_rate + 1000 ); + core.getStartServices().then(async () => { + core.status.set( + combineLatest([core.status.derivedStatus$, serviceStatus$]).pipe( + map(([derivedStatus, serviceStatus]) => + serviceStatus.level > derivedStatus.level ? 
serviceStatus : derivedStatus + ) + ) + ); + }); + return { addMiddleware: (middleware: Middleware) => { this.taskManager.then((tm) => tm.addMiddleware(middleware)); diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 2fc1d818f826b..6b783b915009f 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -4,13 +4,15 @@ * you may not use this file except in compliance with the Elastic License. */ -import { of, Subject } from 'rxjs'; +import { Observable, of, Subject } from 'rxjs'; +import { take } from 'rxjs/operators'; import { merge } from 'lodash'; import { httpServiceMock } from 'src/core/server/mocks'; import { healthRoute } from './health'; import { mockHandlerArguments } from './_mock_handler_arguments'; import { sleep, mockLogger } from '../test_utils'; import { MonitoringStats, summarizeMonitoringStats } from '../monitoring'; +import { ServiceStatusLevels } from 'src/core/server'; describe('healthRoute', () => { beforeEach(() => { @@ -77,7 +79,14 @@ describe('healthRoute', () => { const mockStat = mockHealthStats({ lastUpdate: new Date(Date.now() - 1500).toISOString(), }); - healthRoute(router, Promise.resolve(of(mockStat)), mockLogger(), 1000, 60000); + + const serviceStatus$ = healthRoute( + router, + Promise.resolve(of(mockStat)), + mockLogger(), + 1000, + 60000 + ); const [, handler] = router.get.mock.calls[0]; @@ -111,6 +120,35 @@ describe('healthRoute', () => { ), }, }); + + expect(await getLatest(serviceStatus$)).toMatchObject({ + level: ServiceStatusLevels.unavailable, + summary: 'Task Manager is unavailable', + meta: { + status: 'error', + ...summarizeMonitoringStats( + mockHealthStats({ + lastUpdate: expect.any(String), + stats: { + configuration: { + timestamp: expect.any(String), + }, + workload: { + timestamp: expect.any(String), + }, + runtime: { + timestamp: expect.any(String), + 
value: { + polling: { + lastSuccessfulPoll: expect.any(String), + }, + }, + }, + }, + }) + ), + }, + }); }); it('returns a error status if the workload stats have not been updated within the required cold freshness', async () => { @@ -261,3 +299,7 @@ function mockHealthStats(overrides = {}) { overrides ) as unknown) as MonitoringStats; } + +async function getLatest(stream$: Observable) { + return new Promise((resolve) => stream$.pipe(take(1)).subscribe((stats) => resolve(stats))); +} diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index f614511faa98a..48aa5a346c436 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -11,9 +11,9 @@ import { IKibanaResponse, KibanaResponseFactory, } from 'kibana/server'; -import { Logger } from 'src/core/server'; -import { Observable } from 'rxjs'; -import { take } from 'rxjs/operators'; +import { Logger, ServiceStatus, ServiceStatusLevels } from 'src/core/server'; +import { Observable, from } from 'rxjs'; +import { take, mergeMap, map } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; import { isString } from 'lodash'; import { @@ -23,14 +23,22 @@ import { RawMonitoringStats, } from '../monitoring'; +type MonitoredHealth = RawMonitoringStats & { status: HealthStatus; timestamp: string }; + +const LEVEL_SUMMARY = { + [ServiceStatusLevels.available.toString()]: 'Task Manager is healthy', + [ServiceStatusLevels.degraded.toString()]: 'Task Manager is unhealthy', + [ServiceStatusLevels.unavailable.toString()]: 'Task Manager is unavailable', +}; + export function healthRoute( router: IRouter, monitoringStats: Promise>, logger: Logger, requiredHotStatsFreshness: number, requiredColdStatsFreshness: number -) { - function calculateStatus(monitoredStats: MonitoringStats) { +): Observable { + function calculateStatus(monitoredStats: MonitoringStats): MonitoredHealth { const now = 
Date.now(); const timestamp = new Date(now).toISOString(); @@ -47,15 +55,23 @@ export function healthRoute( : hasStatus(summarizedStats.stats, HealthStatus.Warning) ? HealthStatus.Warning : HealthStatus.OK; - return { timestamp, status: healthStatus, ...summarizedStats }; } + // Only calculate the summerized stats (calculates all runnign averages and evaluates state) + // when needed by throttling down to the requiredHotStatsFreshness + const throttledMonitoredStats$ = from(monitoringStats).pipe( + mergeMap((monitoringStats$) => + monitoringStats$.pipe( + throttleTime(requiredHotStatsFreshness), + map((stats) => calculateStatus(stats)) + ) + ) + ); + /* Log Task Manager stats as a Debug log line at a fixed interval */ - monitoringStats.then((monitoringStats$) => { - monitoringStats$.pipe(throttleTime(requiredHotStatsFreshness)).subscribe((stats) => { - logger.debug(JSON.stringify(calculateStatus(stats))); - }); + throttledMonitoredStats$.subscribe((stats) => { + logger.debug(JSON.stringify(stats)); }); router.get( @@ -73,6 +89,28 @@ export function healthRoute( }); } ); + + return asServiceStatus(throttledMonitoredStats$); +} + +export function asServiceStatus( + monitoredHealth$: Observable +): Observable { + return monitoredHealth$.pipe( + map((monitoredHealth) => { + const level = + monitoredHealth.status === HealthStatus.OK + ? ServiceStatusLevels.available + : monitoredHealth.status === HealthStatus.Warning + ? 
ServiceStatusLevels.degraded + : ServiceStatusLevels.unavailable; + return { + level, + summary: LEVEL_SUMMARY[level.toString()], + meta: monitoredHealth, + }; + }) + ); } /** From 9fc7da6dcdf1a0dfc9fa8b7084b382b967fb6e1f Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Thu, 8 Oct 2020 14:35:05 +0100 Subject: [PATCH 37/42] fixed src import --- x-pack/plugins/task_manager/server/routes/health.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 48aa5a346c436..2d255ab035509 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -11,11 +11,11 @@ import { IKibanaResponse, KibanaResponseFactory, } from 'kibana/server'; -import { Logger, ServiceStatus, ServiceStatusLevels } from 'src/core/server'; import { Observable, from } from 'rxjs'; import { take, mergeMap, map } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; import { isString } from 'lodash'; +import { Logger, ServiceStatus, ServiceStatusLevels } from '../../../../../src/core/server'; import { MonitoringStats, summarizeMonitoringStats, From 74039037adef487e5ce023b4105da029635c63cd Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Fri, 9 Oct 2020 19:57:31 +0100 Subject: [PATCH 38/42] estimate recurring tasks in schedule --- .../task_manager/server/lib/intervals.test.ts | 45 ++ .../task_manager/server/lib/intervals.ts | 12 +- .../monitoring/workload_statistics.test.ts | 452 +++++++++++------- .../server/monitoring/workload_statistics.ts | 136 +++++- 4 files changed, 461 insertions(+), 184 deletions(-) diff --git a/x-pack/plugins/task_manager/server/lib/intervals.test.ts b/x-pack/plugins/task_manager/server/lib/intervals.test.ts index 5ce6c33c57973..e79694915f926 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.test.ts +++ 
b/x-pack/plugins/task_manager/server/lib/intervals.test.ts @@ -8,6 +8,7 @@ import _ from 'lodash'; import sinon from 'sinon'; import { parseIntervalAsSecond, + parseIntervalAsMillisecond, intervalFromNow, intervalFromDate, secondsFromNow, @@ -50,6 +51,50 @@ describe('taskIntervals', () => { /Invalid interval "hello"\. Intervals must be of the form {number}m. Example: 5m/ ); }); + + test('returns an interval as s', () => { + expect(parseIntervalAsSecond('5s')).toEqual(5); + expect(parseIntervalAsSecond('15s')).toEqual(15); + expect(parseIntervalAsSecond('20m')).toEqual(20 * 60); + expect(parseIntervalAsSecond('61m')).toEqual(61 * 60); + expect(parseIntervalAsSecond('90m')).toEqual(90 * 60); + }); + }); + + describe('parseIntervalAsMillisecond', () => { + test('it accepts intervals in the form `Nm`', () => { + expect(() => parseIntervalAsMillisecond(`${_.random(1, 1000)}m`)).not.toThrow(); + }); + + test('it accepts intervals in the form `Ns`', () => { + expect(() => parseIntervalAsMillisecond(`${_.random(1, 1000)}s`)).not.toThrow(); + }); + + test('it rejects 0 based intervals', () => { + expect(() => parseIntervalAsMillisecond('0m')).toThrow( + /Invalid interval "0m"\. Intervals must be of the form {number}m. Example: 5m/ + ); + expect(() => parseIntervalAsMillisecond('0s')).toThrow( + /Invalid interval "0s"\. Intervals must be of the form {number}m. Example: 5m/ + ); + }); + + test('it rejects intervals are not of the form `Nm` or `Ns`', () => { + expect(() => parseIntervalAsMillisecond(`5m 2s`)).toThrow( + /Invalid interval "5m 2s"\. Intervals must be of the form {number}m. Example: 5m/ + ); + expect(() => parseIntervalAsMillisecond(`hello`)).toThrow( + /Invalid interval "hello"\. Intervals must be of the form {number}m. 
Example: 5m/ + ); + }); + + test('returns an interval as ms', () => { + expect(parseIntervalAsMillisecond('5s')).toEqual(5 * 1000); + expect(parseIntervalAsMillisecond('15s')).toEqual(15 * 1000); + expect(parseIntervalAsMillisecond('20m')).toEqual(20 * 60 * 1000); + expect(parseIntervalAsMillisecond('61m')).toEqual(61 * 60 * 1000); + expect(parseIntervalAsMillisecond('90m')).toEqual(90 * 60 * 1000); + }); }); describe('asInterval', () => { diff --git a/x-pack/plugins/task_manager/server/lib/intervals.ts b/x-pack/plugins/task_manager/server/lib/intervals.ts index 914bc35bb526f..a28dfa62a501f 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.ts @@ -11,9 +11,9 @@ export enum IntervalCadence { Second = 's', } const VALID_CADENCE = new Set(Object.values(IntervalCadence)); -const CADENCE_IN_SECONDS: Record = { - [IntervalCadence.Second]: 1, - [IntervalCadence.Minute]: 60, +const CADENCE_IN_MS: Record = { + [IntervalCadence.Second]: 1000, + [IntervalCadence.Minute]: 60 * 1000, }; function isCadence(cadence: IntervalCadence | string): cadence is IntervalCadence { @@ -81,6 +81,10 @@ export function secondsFromDate(date: Date, secs: number): Date { * @returns {number} The interval as seconds */ export const parseIntervalAsSecond = memoize((interval: string): number => { + return Math.round(parseIntervalAsMillisecond(interval) / 1000); +}); + +export const parseIntervalAsMillisecond = memoize((interval: string): number => { const numericAsStr: string = interval.slice(0, -1); const numeric: number = parseInt(numericAsStr, 10); const cadence: IntervalCadence | string = interval.slice(-1); @@ -89,7 +93,7 @@ export const parseIntervalAsSecond = memoize((interval: string): number => { `Invalid interval "${interval}". Intervals must be of the form {number}m. 
Example: 5m.` ); } - return numeric * CADENCE_IN_SECONDS[cadence]; + return numeric * CADENCE_IN_MS[cadence]; }); const isNumeric = (numAsStr: string) => /^\d+$/.test(numAsStr); diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index 0714401bdf4ec..f3a0c8e7e1ae7 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -5,12 +5,18 @@ */ import { first, take, bufferCount } from 'rxjs/operators'; -import { WorkloadAggregation, createWorkloadAggregator, padBuckets } from './workload_statistics'; +import { + WorkloadAggregation, + createWorkloadAggregator, + padBuckets, + estimateRecurringTaskScheduling, +} from './workload_statistics'; import { taskManagerMock } from '../task_manager.mock'; import { mockLogger } from '../test_utils'; import { ConcreteTaskInstance } from '../task'; import { ESSearchResponse } from '../../../apm/typings/elasticsearch'; import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations'; +import { times } from 'lodash'; type MockESResult = ESSearchResponse< ConcreteTaskInstance, @@ -102,6 +108,13 @@ describe('Workload Statistics Aggregator', () => { field: 'task.runAt', fixed_interval: '3s', }, + aggs: { + interval: { + terms: { + field: 'task.schedule.interval', + }, + }, + }, }, }, }, @@ -121,105 +134,93 @@ describe('Workload Statistics Aggregator', () => { }); }); - const mockAggregatedResult: MockESResult = { - hits: { - hits: [], - max_score: 0, - total: { value: 4, relation: 'eq' }, - }, - took: 1, - timed_out: false, - _shards: { - total: 1, - successful: 1, - skipped: 1, - failed: 0, - }, - aggregations: { - schedule: { - buckets: [ - { - key: '3600s', - doc_count: 1, - }, - { - key: '60s', - doc_count: 1, - }, - { - key: '720m', - doc_count: 1, - }, - ], + const mockAggregatedResult: () => 
MockESResult = () => + ({ + hits: { + hits: [], + max_score: 0, + total: { value: 4, relation: 'eq' }, }, - taskType: { - buckets: [ - { - key: 'actions_telemetry', - doc_count: 2, - status: { - buckets: [ - { - key: 'idle', - doc_count: 2, - }, - ], + took: 1, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 1, + failed: 0, + }, + aggregations: { + schedule: { + buckets: [ + { + key: '3600s', + doc_count: 1, }, - }, - { - key: 'alerting_telemetry', - doc_count: 1, - status: { - buckets: [ - { - key: 'idle', - doc_count: 1, - }, - ], + { + key: '60s', + doc_count: 1, }, - }, - { - key: 'session_cleanup', - doc_count: 1, - status: { - buckets: [ - { - key: 'idle', - doc_count: 1, - }, - ], + { + key: '720m', + doc_count: 1, }, - }, - ], - }, - idleTasks: { - doc_count: 13, - overdue: { - doc_count: 6, + ], }, - scheduleDensity: { + taskType: { buckets: [ - mockHistogram(Date.now(), Date.now() + 7 * 3000, Date.now() + 60000, 3000, [ - 2, - 2, - 5, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - ]), + { + key: 'actions_telemetry', + doc_count: 2, + status: { + buckets: [ + { + key: 'idle', + doc_count: 2, + }, + ], + }, + }, + { + key: 'alerting_telemetry', + doc_count: 1, + status: { + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + { + key: 'session_cleanup', + doc_count: 1, + status: { + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, ], }, + idleTasks: { + doc_count: 13, + overdue: { + doc_count: 6, + }, + scheduleDensity: { + buckets: [mockHistogram(0, 7 * 3000, 60 * 1000, 3000, [2, 2, 5, 0, 0, 0, 0, 0, 0, 1])], + }, + }, }, - }, - }; + } as MockESResult); test('returns a summary of the workload by task type', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult()); const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -241,7 
+242,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a count of the overdue workload', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult()); const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -258,7 +259,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a histogram of the upcoming workload for the upcoming minute when refresh rate is high', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult()); const workloadAggregator = createWorkloadAggregator(taskManager, 10, 3000, mockLogger()); @@ -269,9 +270,9 @@ describe('Workload Statistics Aggregator', () => { // we have intervals every 3s, so we aggregate buckets 3s apart // in this mock, Elasticsearch found tasks scheduled in 21 (8th bucket), 24, 27 and 48s seconds from now // 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57 - // [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0 ] + // [0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 ] // Above you see each bucket and the number of scheduled tasks we expect to have in them - scheduleDensity: [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0], + scheduleDensity: [0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], }); resolve(); }); @@ -280,38 +281,30 @@ describe('Workload Statistics Aggregator', () => { test('returns a histogram of the upcoming workload for twice refresh rate when rate is low', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult()); const workloadAggregator 
= createWorkloadAggregator(taskManager, 60 * 1000, 3000, mockLogger()); return new Promise((resolve) => { - workloadAggregator.pipe(first()).subscribe((result) => { - expect(result.key).toEqual('workload'); - expect(result.value).toMatchObject({ - // same schedule density as in previous test, but window of 40 buckets ((60s refresh * 2) / 3s = 40) - scheduleDensity: [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 2, - 2, - 5, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - ...new Array(20).fill(0), - ], + workloadAggregator.pipe(first()).subscribe(() => { + expect(taskManager.aggregate.mock.calls[0][0]).toMatchObject({ + aggs: { + idleTasks: { + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [ + { + from: 'now', + to: 'now+2m', + }, + ], + }, + }, + }, + }, + }, }); resolve(); }); @@ -320,7 +313,7 @@ describe('Workload Statistics Aggregator', () => { test('returns a histogram of the upcoming workload maxed out at 50 buckets when rate is too low', async () => { const taskManager = taskManagerMock.create(); - taskManager.aggregate.mockResolvedValue(mockAggregatedResult as MockESResult); + taskManager.aggregate.mockResolvedValue(mockAggregatedResult()); const workloadAggregator = createWorkloadAggregator( taskManager, @@ -331,32 +324,25 @@ describe('Workload Statistics Aggregator', () => { return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { - expect(result.key).toEqual('workload'); - expect(result.value).toMatchObject({ - // same schedule density as in previous test, but window of 40 buckets ((60s refresh * 2) / 3s = 40) - scheduleDensity: [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 2, - 2, - 5, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - ...new Array(30).fill(0), - ], + expect(taskManager.aggregate.mock.calls[0][0]).toMatchObject({ + aggs: { + idleTasks: { + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [ + { + from: 'now', + // 50 buckets of 3s = 50 * 3 = 150s + to: 'now+150s', 
+ }, + ], + }, + }, + }, + }, + }, }); resolve(); }); @@ -367,13 +353,13 @@ describe('Workload Statistics Aggregator', () => { const taskManager = taskManagerMock.create(); taskManager.aggregate .mockResolvedValueOnce( - setTaskTypeCount(mockAggregatedResult, 'alerting_telemetry', { + setTaskTypeCount(mockAggregatedResult(), 'alerting_telemetry', { idle: 2, }) ) .mockRejectedValueOnce(new Error('Elasticsearch has gone poof')) .mockResolvedValueOnce( - setTaskTypeCount(mockAggregatedResult, 'alerting_telemetry', { + setTaskTypeCount(mockAggregatedResult(), 'alerting_telemetry', { idle: 1, failed: 1, }) @@ -407,6 +393,116 @@ describe('Workload Statistics Aggregator', () => { }); }); +describe('estimateRecurringTaskScheduling', () => { + test('flattens out buckets with non recurring tasks', () => { + const now = Date.now(); + const schedule = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: index, + })); + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring task interval equals the interval', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[4].recurring = [[1, '3s']]; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([1, 1, 0, 0, 1, 1, 1, 1, 1, 1]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring task interval is larger than the interval', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + 
schedule[1].nonRecurring = 1; + schedule[4].recurring = [[1, '6s']]; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([1, 1, 0, 0, 1, 0, 1, 0, 1, 0]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring task interval doesnt divide by interval', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[4].recurring = [[1, '5s']]; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([1, 1, 0, 0, 1, 0, 1, 0, 1, 0]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring tasks overlap', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(20, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[3].recurring = [[1, '3s']]; + schedule[4].recurring = [ + [2, '6s'], + [1, '8s'], + ]; + schedule[5].recurring = [[1, '5s']]; + schedule[6].nonRecurring = 3; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([ + 1, + 1, + 0, + 1, + 4, + 2, + 6, + 3, + 3, + 2, + 4, + 2, + 3, + 3, + 3, + 2, + 4, + 2, + 3, + 3, + ]); + }); +}); + describe('padBuckets', () => { test('returns zeroed out bucklets when there are no buckets in the histogram', async () => { expect( @@ -430,8 +526,8 @@ describe('padBuckets', () => { key: '2020-10-02T19:47:28.128Z-2020-10-02T19:48:28.128Z', from: 1601668048128, from_as_string: '2020-10-02T19:47:28.128Z', - to: 1601668077128, - to_as_string: '2020-10-02T19:47:57.128Z', + to: 1601668075128, + to_as_string: '2020-10-02T19:47:55.128Z', doc_count: 3, histogram: { buckets: [ @@ -439,31 +535,55 @@ 
describe('padBuckets', () => { key_as_string: '2020-10-02T19:47:27.000Z', key: 1601668047000, doc_count: 1, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, }, { key_as_string: '2020-10-02T19:47:30.000Z', key: 1601668050000, doc_count: 1, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, }, { key_as_string: '2020-10-02T19:47:33.000Z', key: 1601668053000, doc_count: 0, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, }, { key_as_string: '2020-10-02T19:47:36.000Z', key: 1601668056000, doc_count: 0, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, }, { key_as_string: '2020-10-02T19:47:39.000Z', key: 1601668059000, doc_count: 0, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, }, { key_as_string: '2020-10-02T19:47:42.000Z', key: 1601668062000, doc_count: 1, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, }, ], }, @@ -486,11 +606,13 @@ describe('padBuckets', () => { key_as_string: '2020-10-02T20:40:09.000Z', key: 1601671209000, doc_count: 1, + interval: { buckets: [] }, }, { key_as_string: '2020-10-02T20:40:12.000Z', key: 1601671212000, doc_count: 1, + interval: { buckets: [] }, }, ], }, @@ -502,10 +624,10 @@ describe('padBuckets', () => { expect( padBuckets(20, 3000, { key: '2020-10-02T20:39:45.793Z-2020-10-02T20:40:14.793Z', - from: 1.601671185793e12, + from: 1601671185793, from_as_string: '2020-10-02T20:39:45.793Z', - to: 1.1601671244793, - to_as_string: '2020-10-02T20:40:44.793Z', + to: 1601671242793, + to_as_string: '2020-10-02T20:40:42.793Z', doc_count: 2, histogram: { buckets: [ @@ -513,11 +635,13 @@ describe('padBuckets', () => { key_as_string: '2020-10-02T20:40:09.000Z', key: 1601671209000, doc_count: 1, + interval: { buckets: [] }, }, { key_as_string: '2020-10-02T20:40:12.000Z', key: 1601671212000, doc_count: 1, + interval: { buckets: [] }, }, ], }, @@ -541,7 +665,6 @@ function setTaskTypeCount( key: taskType, doc_count: Object.values(status).reduce((sum, count) => sum + count, 0), status: { - 
doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets: Object.entries(status).map(([key, count]) => ({ key, @@ -557,7 +680,6 @@ function setTaskTypeCount( aggregations: { ...aggregations, taskType: { - doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets, }, @@ -583,24 +705,26 @@ function mockHistogram( interval: number, foundBuckets: Array ) { - const fromDate = new Date(from); - const toDate = new Date(to); + const now = Date.now(); + const fromDate = new Date(now + from); + const toDate = new Date(now + to); return { key: `${fromDate.toISOString()}-${toDate.toISOString()}`, - from, + from: now + from, from_as_string: fromDate.toISOString(), - to, + to: now + to, to_as_string: toDate.toISOString(), doc_count: foundBuckets.reduce((sum: number, count) => sum + (count ?? 0), 0), histogram: { buckets: foundBuckets.reduce( (histogramBuckets, count, index) => { if (typeof count === 'number') { - const key = new Date(findFrom + index * interval); + const key = new Date(now + findFrom + index * interval); histogramBuckets.push({ key_as_string: key.toISOString(), key: key.getTime(), doc_count: count, + interval: { buckets: [] }, }); } return histogramBuckets; @@ -609,6 +733,12 @@ function mockHistogram( key_as_string: string; key: number; doc_count: number; + interval: { + buckets: Array<{ + key: string; + doc_count: number; + }>; + }; }> ), }, diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index a46acb013a6c2..8ab16eaf5fdc2 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -13,7 +13,7 @@ import { ESSearchResponse } from '../../../apm/typings/elasticsearch'; import { AggregatedStatProvider } from './runtime_statistics_aggregator'; import { TaskManager } from '../task_manager'; import { ConcreteTaskInstance } from '../task'; -import { 
parseIntervalAsSecond, asInterval } from '../lib/intervals'; +import { parseIntervalAsSecond, asInterval, parseIntervalAsMillisecond } from '../lib/intervals'; import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations'; import { HealthStatus } from './monitoring_stats_stream'; @@ -64,6 +64,11 @@ export interface WorkloadAggregation { field: string; fixed_interval: string; }; + aggs: { + interval: { + terms: { field: string }; + }; + }; }; }; }; @@ -84,6 +89,7 @@ type ScheduleDensityResult = AggregationResultOf< WorkloadAggregation['aggs']['idleTasks']['aggs']['scheduleDensity'], {} >['buckets'][0]; +type ScheduledIntervals = ScheduleDensityResult['histogram']['buckets'][0]; // Set an upper bound just in case a customer sets a really high refresh rate const MAX_SHCEDULE_DENSITY_BUCKETS = 50; @@ -122,6 +128,7 @@ export function createWorkloadAggregator( }, aggs: { scheduleDensity: { + // create a window of upcoming tasks range: { field: 'task.runAt', ranges: [ @@ -129,11 +136,18 @@ export function createWorkloadAggregator( ], }, aggs: { + // create histogram of scheduling in the window, with each bucket being a polling interval histogram: { date_histogram: { field: 'task.runAt', fixed_interval: asInterval(pollInterval), }, + // break down each bucket in the histogram by schedule + aggs: { + interval: { + terms: { field: 'task.schedule.interval' }, + }, + }, }, }, }, @@ -216,38 +230,122 @@ export function createWorkloadAggregator( ); } +interface IntervalTaskCountTouple { + nonRecurring?: number; + recurring?: Array<[number, string]>; + key: number; +} + export function padBuckets( scheduleDensityBuckets: number, pollInterval: number, scheduleDensity: ScheduleDensityResult ): number[] { - if (scheduleDensity.from && scheduleDensity.histogram?.buckets?.length) { - const { histogram, from } = scheduleDensity; + if (scheduleDensity.from && scheduleDensity.to && scheduleDensity.histogram?.buckets?.length) { + const { histogram, from, to } = 
scheduleDensity; const firstBucket = histogram.buckets[0].key; - const bucketsToPadBeforeFirstBucket = bucketsBetween(from, firstBucket, pollInterval); - const bucketsToPadAfterLast = - scheduleDensityBuckets - (bucketsToPadBeforeFirstBucket + histogram.buckets.length); - return [ - ...(bucketsToPadBeforeFirstBucket > 0 - ? new Array(bucketsToPadBeforeFirstBucket).fill(0) - : []), - ...histogram.buckets.map((bucket) => bucket.doc_count), - ...(bucketsToPadAfterLast > 0 ? new Array(bucketsToPadAfterLast).fill(0) : []), - ]; + const lastBucket = histogram.buckets[histogram.buckets.length - 1].key; + const bucketsToPadBeforeFirstBucket = calculateBucketsBetween(from, firstBucket, pollInterval); + + const bucketsToPadAfterLast = calculateBucketsBetween( + lastBucket + pollInterval, + to, + pollInterval + ); + + return estimateRecurringTaskScheduling( + [ + ...bucketsToPadBeforeFirstBucket, + ...histogram.buckets.map(countByIntervalInBucket), + ...bucketsToPadAfterLast, + ], + pollInterval + ); } return new Array(scheduleDensityBuckets).fill(0); } -function bucketsBetween(from: number, to: number, interval: number) { +function countByIntervalInBucket(bucket: ScheduledIntervals): IntervalTaskCountTouple { + if (bucket.doc_count === 0) { + return { nonRecurring: 0, key: bucket.key }; + } + const recurring: Array<[number, string]> = []; + let nonRecurring = bucket.doc_count; + for (const intervalBucket of bucket.interval.buckets) { + recurring.push([intervalBucket.doc_count, intervalBucket.key as string]); + nonRecurring -= intervalBucket.doc_count; + } + + return { nonRecurring, recurring, key: bucket.key }; +} + +function calculateBucketsBetween( + from: number, + to: number, + interval: number, + bucketInterval: number = interval +): Array<{ key: number }> { + // as task interval might not divide by the pollInterval (aka the bucket interval) + // we have to adjust for the "drift" that occurs when estimating when the next + // bucket the task might actually get scheduled 
in + const actualInterval = Math.ceil(interval / bucketInterval) * bucketInterval; + + const buckets: Array<{ key: number }> = []; let fromBound = from; - let count = 0; - while (fromBound <= to) { - fromBound += interval; - count++; + while (fromBound < to) { + buckets.push({ key: fromBound }); + fromBound += actualInterval; } - return count; + + return buckets; +} + +export function estimateRecurringTaskScheduling( + scheduleDensity: IntervalTaskCountTouple[], + pollInterval: number +) { + const lastKey = scheduleDensity[scheduleDensity.length - 1].key; + + return scheduleDensity.map((bucket, currentBucketIndex) => { + for (const [count, interval] of bucket.recurring ?? []) { + for (const recurrance of calculateBucketsBetween( + bucket.key, + // `calculateBucketsBetween` uses the `to` as a non-inclusive upper bound + // but lastKey is a bucket we wish to include + lastKey + pollInterval, + parseIntervalAsMillisecond(interval), + pollInterval + )) { + const recurranceBucketIndex = + currentBucketIndex + Math.ceil((recurrance.key - bucket.key) / pollInterval); + + if (recurranceBucketIndex < scheduleDensity.length) { + scheduleDensity[recurranceBucketIndex].nonRecurring = + count + (scheduleDensity[recurranceBucketIndex].nonRecurring ?? 0); + } + } + } + return bucket.nonRecurring ?? 0; + }); } +// function estimateDriftInExecutionDueToPollInterval( +// scheduledExecutions: number[], +// pollInterval: number +// ) { +// const recuranceBeginsAt = scheduledExecutions[0]; +// let drift = 0; +// return scheduledExecutions.map((scheduledExecution, cycle) => { +// const estimatedExectionCycleTime = cycle * pollInterval; +// const estimatedExecution = scheduledExecution + drift; + +// drift = estimatedExectionCycleTime > estimatedExecution ? 
() +// // drift = (scheduledExecution - estimatedExecution) % pollInterval; + +// return estimatedExecution; +// }); +// } + export function summarizeWorkloadStat( workloadStats: WorkloadStat ): { value: WorkloadStat; status: HealthStatus } { From e47b6194b705464d99b2889560af3c6123eb38b5 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 12 Oct 2020 11:06:44 +0100 Subject: [PATCH 39/42] make hot stats refresh rate configurable --- .../task_manager/server/config.test.ts | 29 +++++ x-pack/plugins/task_manager/server/config.ts | 119 ++++++++++-------- x-pack/plugins/task_manager/server/plugin.ts | 5 +- 3 files changed, 101 insertions(+), 52 deletions(-) diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index f0c1937638991..cb78bc7945295 100644 --- a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -16,6 +16,7 @@ describe('config validation', () => { "max_poll_inactivity_cycles": 10, "max_workers": 10, "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_required_freshness": 4000, "monitored_stats_running_average_window": 50, "poll_interval": 3000, "request_capacity": 1000, @@ -33,4 +34,32 @@ describe('config validation', () => { `"[index]: \\".tasks\\" is an invalid Kibana Task Manager index, as it is already in use by the ElasticSearch Tasks Manager"` ); }); + + test('the required freshness of the monitored stats config must always be less-than-equal to the poll interval', () => { + const config: Record = { + monitored_stats_required_freshness: 100, + }; + expect(() => { + configSchema.validate(config); + }).toThrowErrorMatchingInlineSnapshot( + `"The specified monitored_stats_required_freshness (100) is invalid, as it is below the poll_interval (3000)"` + ); + }); + test('the default required freshness of the monitored stats is poll interval with a slight buffer', () => { + const config: Record = {}; + 
expect(configSchema.validate(config)).toMatchInlineSnapshot(` + Object { + "enabled": true, + "index": ".kibana_task_manager", + "max_attempts": 3, + "max_poll_inactivity_cycles": 10, + "max_workers": 10, + "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_required_freshness": 4000, + "monitored_stats_running_average_window": 50, + "poll_interval": 3000, + "request_capacity": 1000, + } + `); + }); }); diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index f2de109273714..de82a6ffa8886 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -12,60 +12,79 @@ export const DEFAULT_MAX_POLL_INACTIVITY_CYCLES = 10; // Monitoring Constants // =================== -// Refresh "pull based" monitored stats at a default rate of once a minute +// Refresh aggregated monitored stats at a default rate of once a minute export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000; export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50; -export const configSchema = schema.object({ - enabled: schema.boolean({ defaultValue: true }), - /* The maximum number of times a task will be attempted before being abandoned as failed */ - max_attempts: schema.number({ - defaultValue: 3, - min: 1, - }), - /* How often, in milliseconds, the task manager will look for more work. */ - poll_interval: schema.number({ - defaultValue: DEFAULT_POLL_INTERVAL, - min: 100, - }), - /* How many poll interval cycles can work take before it's timed out. */ - max_poll_inactivity_cycles: schema.number({ - defaultValue: DEFAULT_MAX_POLL_INACTIVITY_CYCLES, - min: 1, - }), - /* How many requests can Task Manager buffer before it rejects new requests. */ - request_capacity: schema.number({ - // a nice round contrived number, feel free to change as we learn how it behaves - defaultValue: 1000, - min: 1, - }), - /* The name of the index used to store task information. 
*/ - index: schema.string({ - defaultValue: '.kibana_task_manager', - validate: (val) => { - if (val.toLowerCase() === '.tasks') { - return `"${val}" is an invalid Kibana Task Manager index, as it is already in use by the ElasticSearch Tasks Manager`; +export const configSchema = schema.object( + { + enabled: schema.boolean({ defaultValue: true }), + /* The maximum number of times a task will be attempted before being abandoned as failed */ + max_attempts: schema.number({ + defaultValue: 3, + min: 1, + }), + /* How often, in milliseconds, the task manager will look for more work. */ + poll_interval: schema.number({ + defaultValue: DEFAULT_POLL_INTERVAL, + min: 100, + }), + /* How many poll interval cycles can work take before it's timed out. */ + max_poll_inactivity_cycles: schema.number({ + defaultValue: DEFAULT_MAX_POLL_INACTIVITY_CYCLES, + min: 1, + }), + /* How many requests can Task Manager buffer before it rejects new requests. */ + request_capacity: schema.number({ + // a nice round contrived number, feel free to change as we learn how it behaves + defaultValue: 1000, + min: 1, + }), + /* The name of the index used to store task information. */ + index: schema.string({ + defaultValue: '.kibana_task_manager', + validate: (val) => { + if (val.toLowerCase() === '.tasks') { + return `"${val}" is an invalid Kibana Task Manager index, as it is already in use by the ElasticSearch Tasks Manager`; + } + }, + }), + /* The maximum number of tasks that this Kibana instance will run simultaneously. */ + max_workers: schema.number({ + defaultValue: DEFAULT_MAX_WORKERS, + // disable the task manager rather than trying to specify it with 0 workers + min: 1, + }), + /* The rate at emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */ + monitored_stats_required_freshness: schema.number({ + defaultValue: (config?: unknown) => + ((config as { poll_interval: number })?.poll_interval ?? 
DEFAULT_POLL_INTERVAL) + 1000, + min: 100, + }), + /* The rate at which we refresh monitored stats that require aggregation queries against ES. */ + monitored_aggregated_stats_refresh_rate: schema.number({ + defaultValue: DEFAULT_MONITORING_REFRESH_RATE, + /* don't run monitored stat aggregations any faster than once every 5 seconds */ + min: 5000, + }), + /* The size of the running average window for monitored stats. */ + monitored_stats_running_average_window: schema.number({ + defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW, + max: 100, + min: 10, + }), + }, + { + validate: (config) => { + if ( + config.monitored_stats_required_freshness && + config.poll_interval && + config.monitored_stats_required_freshness < config.poll_interval + ) { + return `The specified monitored_stats_required_freshness (${config.monitored_stats_required_freshness}) is invalid, as it is below the poll_interval (${config.poll_interval})`; } }, - }), - /* The maximum number of tasks that this Kibana instance will run simultaneously. */ - max_workers: schema.number({ - defaultValue: DEFAULT_MAX_WORKERS, - // disable the task manager rather than trying to specify it with 0 workers - min: 1, - }), - /* The rate at which we refresh monitored stats that require aggregation queries against ES. */ - monitored_aggregated_stats_refresh_rate: schema.number({ - defaultValue: DEFAULT_MONITORING_REFRESH_RATE, - /* don't run monitored stat aggregations any faster than once every 5 seconds */ - min: 5000, - }), - /* The size of the running average window for monitored stats. 
*/ - monitored_stats_running_average_window: schema.number({ - defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW, - max: 100, - min: 10, - }), -}); + } +); export type TaskManagerConfig = TypeOf; diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index e93b639e2c8da..359e2ed3979bc 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -55,8 +55,9 @@ export class TaskManagerPlugin router, this.taskManager.then((tm) => createMonitoringStats(tm, config, logger)), logger, - // if "hot" health stats are any more stale than the pollInterval (+1s buffer) consider the system unhealthy - config.poll_interval + 1000, + // if "hot" health stats are any more stale than monitored_stats_required_freshness (pollInterval +1s buffer by default) + // consider the system unhealthy + config.monitored_stats_required_freshness, // if "cold" health stats are any more stale than the configured refresh, consider the system unhealthy config.monitored_aggregated_stats_refresh_rate + 1000 ); From 7798875e52d54cb5fbd8a75740a99293598154bd Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 12 Oct 2020 12:06:00 +0100 Subject: [PATCH 40/42] ensure we dont aggregate workload before tm is readyt --- .../task_manager/server/monitoring/workload_statistics.ts | 6 +++++- x-pack/plugins/task_manager/server/routes/health.ts | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 8ab16eaf5fdc2..82b61583e96e6 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -5,7 +5,7 @@ */ import { timer } from 'rxjs'; -import { concatMap, map, catchError } from 'rxjs/operators'; +import { concatMap, map, filter, catchError } 
from 'rxjs/operators'; import { Logger } from 'src/core/server'; import { JsonObject } from 'src/plugins/kibana_utils/common'; import { keyBy, mapValues } from 'lodash'; @@ -108,6 +108,10 @@ export function createWorkloadAggregator( ); return timer(0, refreshInterval).pipe( + // Setup might occur before Kibana is entirely set up + // To avoid errors due to ES not being ready, we'll wait until Start + // to begin polling for the workload + filter(() => taskManager.isStarted), concatMap(() => taskManager.aggregate({ aggs: { diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 2d255ab035509..9f84cc881d675 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -15,6 +15,7 @@ import { Observable, from } from 'rxjs'; import { take, mergeMap, map } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; import { isString } from 'lodash'; +import { JsonValue } from 'src/plugins/kibana_utils/common'; import { Logger, ServiceStatus, ServiceStatusLevels } from '../../../../../src/core/server'; import { MonitoringStats, @@ -148,7 +149,7 @@ function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): bo .includes(true); } -function getOldestTimestamp(...timestamps: unknown[]): number { +function getOldestTimestamp(...timestamps: Array): number { const validTimestamps = timestamps .map((timestamp) => (isString(timestamp) ? 
Date.parse(timestamp) : NaN)) .filter((timestamp) => !isNaN(timestamp)); From a47c7aa688b25eccfffb86df5eaaf394dcdba3fa Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 12 Oct 2020 15:45:32 +0100 Subject: [PATCH 41/42] fixed config mocks --- .../server/monitoring/monitoring_stats_stream.test.ts | 1 + x-pack/plugins/task_manager/server/task_manager.test.ts | 1 + 2 files changed, 2 insertions(+) diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts index 063947f2ecad7..b8bcf15101d26 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts @@ -21,6 +21,7 @@ describe('createMonitoringStatsStream', () => { index: 'foo', max_attempts: 9, poll_interval: 6000000, + monitored_stats_required_freshness: 6000000, max_poll_inactivity_cycles: 10, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, diff --git a/x-pack/plugins/task_manager/server/task_manager.test.ts b/x-pack/plugins/task_manager/server/task_manager.test.ts index 52a3beaf174d1..f8e25edcc0ae3 100644 --- a/x-pack/plugins/task_manager/server/task_manager.test.ts +++ b/x-pack/plugins/task_manager/server/task_manager.test.ts @@ -41,6 +41,7 @@ describe('TaskManager', () => { index: 'foo', max_attempts: 9, poll_interval: 6000000, + monitored_stats_required_freshness: 6000000, max_poll_inactivity_cycles: 10, monitored_aggregated_stats_refresh_rate: 5000, monitored_stats_running_average_window: 50, From 689d0147c5be8e629e3501acd58c6ef7085484cf Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Mon, 12 Oct 2020 15:55:07 +0100 Subject: [PATCH 42/42] updated docs --- x-pack/plugins/task_manager/server/MONITORING.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/x-pack/plugins/task_manager/server/MONITORING.md 
b/x-pack/plugins/task_manager/server/MONITORING.md index 10bf60ad7e58b..13120fb41b53a 100644 --- a/x-pack/plugins/task_manager/server/MONITORING.md +++ b/x-pack/plugins/task_manager/server/MONITORING.md @@ -9,16 +9,12 @@ There are three different sections to the stats returned by the `health` api. - `runtime`: Tracks Task Manager's performance. ### Configuring the Stats -There are two new configurations: +There are three new configurations: -- `xpack.task_manager.monitored_aggregated_stats_refresh_rate` - Dictates how often we refresh the "Cold" metrics. These metrics require an aggregation against Elasticsearch and adds load to the system, hence we want to limit how often we execute these. This covers the entire `workload` section of the stats. By default this is set to `60s` +- `xpack.task_manager.monitored_stats_required_freshness` - The _required freshness_ of critical "Hot" stats, which means that if key stats (last polling cycle time, for example) haven't been refreshed within the specified duration, the `_health` endpoint and service will report an `Error` status. By default this is inferred from the configured `poll_interval` and is set to `poll_interval` plus a `1s` buffer. +- `xpack.task_manager.monitored_aggregated_stats_refresh_rate` - Dictates how often we refresh the "Cold" metrics. These metrics require an aggregation against Elasticsearch and add load to the system, hence we want to limit how often we execute these. We also infer the _required freshness_ of these "Cold" metrics from this configuration, which means that if these stats have not been updated within the required duration then the `_health` endpoint and service will report an `Error` status. This covers the entire `workload` section of the stats. By default this is configured to `60s`, and as a result the _required freshness_ defaults to `61s` (refresh plus a `1s` buffer). 
- `xpack.task_manager.monitored_stats_running_average_window`- Dictates the size of the window used to calculate the running average of various "Hot" stats, such as the time it takes to run a task, the _drift_ that tasks experience etc. These stats are collected throughout the lifecycle of tasks and this window will dictate how large the queue we keep in memory would be, and how many values we need to calculate the average against. We do not calculate the average on *every* new value, but rather only when the time comes to summarize the stats before logging them or returning them to the API endpoint. -Other configurations are inferred from existing config values. -For example: -- The _required freshness_ of critical "Hot" stats in always `pollingInterval + 1s`, which means that if key stats (last polling cycle time, for example) haven't been refreshed within the time scale of a single interval + 1s the stat will report an `Error` status. -- The _required freshness_ of critical "Cold" stats is `monitored_aggregated_stats_refresh_rate + 1s` , which means that if these stats (workload, for example) has not been updated within the required refresh rate then the api will return an `Error` status. - ## Consuming Health Stats Task Manager exposes a `/api/task_manager/_health` api which returns the _latest_ stats. Calling this API is designed to be fast and doesn't actually perform any checks- rather it returns the result of the latest stats in the system, and is design in such a way that you could call it from an external service on a regular basis without worrying that you'll be adding substantial load to the system.