Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1382b1c
Log at different levels based on the state
chrisronline Jun 9, 2021
4ca7c80
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 9, 2021
6d9ae91
Fix types and add tests
chrisronline Jun 10, 2021
3f54951
Remove unnecessary code
chrisronline Jun 10, 2021
c3109d4
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 10, 2021
57219a4
Add more descriptive message
chrisronline Jun 11, 2021
b64356d
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 14, 2021
1dc0519
Partially fix failing tests
chrisronline Jun 14, 2021
0ed7d0a
Move into separate function
chrisronline Jun 14, 2021
35d89d6
Get rid of customStatus in favor of moving the logging logic to a sep…
chrisronline Jun 15, 2021
a9a2a53
Remove debug logging
chrisronline Jun 15, 2021
6715146
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 15, 2021
4ce5672
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 15, 2021
a219c6f
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 15, 2021
4567cce
Do not log as an error if the stats are empty
chrisronline Jun 15, 2021
069f7fa
PR feedback
chrisronline Jun 15, 2021
e4fc649
Add docker whitelist
chrisronline Jun 15, 2021
b85f83b
alpha order
chrisronline Jun 15, 2021
4df5433
English is hard
chrisronline Jun 15, 2021
24c8718
Removing extra newline
chrisronline Jun 15, 2021
30e13d7
PR feedback around ignoring capacity estimation
chrisronline Jun 16, 2021
28c6a75
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 16, 2021
868f6be
Merge remote-tracking branch 'elastic/master' into alerting/tm_health…
chrisronline Jun 16, 2021
fb1ded5
Move json utils
chrisronline Jun 16, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/settings/task-manager-settings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ Task Manager runs background tasks by polling for work on an interval. You can
| `xpack.task_manager.max_workers`
| The maximum number of tasks that this Kibana instance will run simultaneously. Defaults to 10.
Starting in 8.0, it will not be possible to set the value greater than 100.

| `xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds`
| The number of seconds a task's start can be delayed before a warning is written to the server log. Defaults to 60.
|===

[float]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ kibana_vars=(
xpack.task_manager.monitored_aggregated_stats_refresh_rate
xpack.task_manager.monitored_stats_required_freshness
xpack.task_manager.monitored_stats_running_average_window
xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_task_execution_thresholds
xpack.task_manager.poll_interval
xpack.task_manager.request_capacity
Expand Down
3 changes: 3 additions & 0 deletions x-pack/plugins/task_manager/server/config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ describe('config validation', () => {
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -68,6 +69,7 @@ describe('config validation', () => {
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -103,6 +105,7 @@ describe('config validation', () => {
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {
"alerting:always-fires": Object {
Expand Down
5 changes: 5 additions & 0 deletions x-pack/plugins/task_manager/server/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export const DEFAULT_VERSION_CONFLICT_THRESHOLD = 80;
// Refresh aggregated monitored stats at a default rate of once a minute
export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50;
export const DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS = 60;

export const taskExecutionFailureThresholdSchema = schema.object(
{
Expand Down Expand Up @@ -109,6 +110,10 @@ export const configSchema = schema.object(
defaultValue: {},
}),
}),
/* The number of seconds a task's start can be delayed before a warning is written to the server log */
monitored_stats_warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
},
{
validate: (config) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ describe('managed configuration', () => {
version_conflict_threshold: 80,
max_poll_inactivity_cycles: 10,
monitored_aggregated_stats_refresh_rate: 60000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_required_freshness: 4000,
monitored_stats_running_average_window: 50,
request_capacity: 1000,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * Produces a new, unconfigured Jest mock function on every call.
 * Presumably used by tests as a stand-in for `calculateHealthStatus`.
 */
const createCalculateHealthStatusMock = () => jest.fn();

/**
 * Mock factory: call `create()` to obtain a fresh mock function.
 */
export const calculateHealthStatusMock = {
  create: createCalculateHealthStatusMock,
};
79 changes: 79 additions & 0 deletions x-pack/plugins/task_manager/server/lib/calculate_health_status.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { isString } from 'lodash';
import { JsonValue } from '@kbn/common-utils';
import { HealthStatus, RawMonitoringStats } from '../monitoring';
import { TaskManagerConfig } from '../config';

/**
 * Derives a single overall health status from the summarized monitoring stats.
 *
 * The result is `Error` when any individual stat reports `Error`, or when the
 * "hot" or "cold" timestamps are older than their allowed freshness window.
 * Otherwise it is `Warning` when any stat reports `Warning`, and `OK` in all
 * remaining cases.
 *
 * @param summarizedStats The monitored stats to evaluate
 * @param config The Task Manager config supplying the freshness thresholds
 * @returns The computed health status
 */
export function calculateHealthStatus(
  summarizedStats: RawMonitoringStats,
  config: TaskManagerConfig
): HealthStatus {
  const now = Date.now();

  // "Hot" stats must be no more stale than monitored_stats_required_freshness
  // (pollInterval + 1s buffer by default), otherwise the system is unhealthy.
  const hotFreshnessLimit = config.monitored_stats_required_freshness;

  // "Cold" stats get the configured refresh rate plus a 50% buffer before
  // they are considered stale.
  const coldFreshnessLimit = config.monitored_aggregated_stats_refresh_rate * 1.5;

  // An explicit error, or stats that aren't fresh, yield a red status.
  if (hasStatus(summarizedStats.stats, HealthStatus.Error)) {
    return HealthStatus.Error;
  }
  if (hasExpiredHotTimestamps(summarizedStats, now, hotFreshnessLimit)) {
    return HealthStatus.Error;
  }
  if (hasExpiredColdTimestamps(summarizedStats, now, coldFreshnessLimit)) {
    return HealthStatus.Error;
  }

  if (hasStatus(summarizedStats.stats, HealthStatus.Warning)) {
    return HealthStatus.Warning;
  }
  return HealthStatus.OK;
}

/**
 * Checks whether at least one entry in the stats object carries the given status.
 * Entries that are undefined are treated as not matching.
 *
 * @param stats The per-section stats object to scan
 * @param status The status to look for
 * @returns true if any entry's `status` strictly equals `status`
 */
function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean {
  const sections = Object.values(stats);
  return sections.some((section) => section?.status === status);
}

/**
 * Determines whether the "hot" stats are stale, in which case the _health api
 * should return a Red status. The age is measured from the oldest of the
 * overall `last_update` and the runtime's `last_successful_poll`.
 * @param monitoringStats The monitored stats
 * @param now The time to compare against
 * @param requiredFreshness How fresh should these stats be
 */
function hasExpiredHotTimestamps(
  monitoringStats: RawMonitoringStats,
  now: number,
  requiredFreshness: number
): boolean {
  const lastUpdate = monitoringStats.last_update;
  const lastSuccessfulPoll = monitoringStats.stats.runtime?.value.polling.last_successful_poll;
  const age = now - getOldestTimestamp(lastUpdate, lastSuccessfulPoll);
  return age > requiredFreshness;
}

/**
 * Determines whether the "cold" stats are stale, judged by the age of the
 * workload stat's timestamp.
 * @param monitoringStats The monitored stats
 * @param now The time to compare against
 * @param requiredFreshness How fresh should these stats be
 */
function hasExpiredColdTimestamps(
  monitoringStats: RawMonitoringStats,
  now: number,
  requiredFreshness: number
): boolean {
  const workloadTimestamp = monitoringStats.stats.workload?.timestamp;
  const age = now - getOldestTimestamp(workloadTimestamp);
  return age > requiredFreshness;
}

/**
 * Finds the earliest of the given timestamps, in epoch milliseconds.
 * Values that are not strings, or that `Date.parse` cannot parse, are ignored.
 *
 * @param timestamps Candidate timestamp values
 * @returns The smallest parsed timestamp, or 0 when none of the values is valid
 */
function getOldestTimestamp(...timestamps: Array<JsonValue | undefined>): number {
  let oldest: number | undefined;
  for (const candidate of timestamps) {
    if (!isString(candidate)) {
      continue;
    }
    const parsed = Date.parse(candidate);
    if (isNaN(parsed)) {
      continue;
    }
    if (oldest === undefined || parsed < oldest) {
      oldest = parsed;
    }
  }
  // With no usable timestamp, fall back to 0 (the epoch).
  return oldest === undefined ? 0 : oldest;
}
14 changes: 14 additions & 0 deletions x-pack/plugins/task_manager/server/lib/log_health_metrics.mock.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * Produces a new, unconfigured Jest mock function on every call.
 * Presumably used by tests as a stand-in for `logHealthMetrics`.
 */
const createLogHealthMetricsMock = () => jest.fn();

/**
 * Mock factory: call `create()` to obtain a fresh mock function.
 */
export const logHealthMetricsMock = {
  create: createLogHealthMetricsMock,
};
Loading