Skip to content

Commit 5eefdbb

Browse files
authored
[Uptime] Use scripted metric for snapshot calculation (#58247) (#58389)
Fixes #58079. This is an improved version of #58078. Note: this is a bugfix targeting 7.6.1. I've decided to open this PR directly against 7.6 in the interest of time. We can forward-port this to 7.x / master later. This patch improves the handling of timespans with snapshot counts. This feature originally worked, but suffered a regression when we increased the default timespan in the query context to 5m. This means that without this patch the counts you get are the maximum total number of monitors that were down over the past 5m, which is not really that useful. We now use a scripted metric to always count precisely the number of up/down monitors. On my box this could process 400k summary docs in ~600ms. This should scale as shards are added. I attempted to keep memory usage relatively low by using simple maps of strings.
1 parent 13eacb5 commit 5eefdbb

File tree

2 files changed

+192
-107
lines changed

2 files changed

+192
-107
lines changed

x-pack/legacy/plugins/uptime/server/lib/requests/get_snapshot_counts.ts

Lines changed: 138 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import { UMElasticsearchQueryFn } from '../adapters';
88
import { Snapshot } from '../../../common/runtime_types';
9-
import { QueryContext, MonitorGroupIterator } from './search';
9+
import { QueryContext } from './search';
1010
import { CONTEXT_DEFAULTS, INDEX_NAMES } from '../../../common/constants';
1111

1212
export interface GetSnapshotCountParams {
@@ -16,49 +16,6 @@ export interface GetSnapshotCountParams {
1616
statusFilter?: string;
1717
}
1818

19-
const fastStatusCount = async (context: QueryContext): Promise<Snapshot> => {
20-
const params = {
21-
index: INDEX_NAMES.HEARTBEAT,
22-
body: {
23-
size: 0,
24-
query: { bool: { filter: await context.dateAndCustomFilters() } },
25-
aggs: {
26-
unique: {
27-
// We set the precision threshold to 40k which is the max precision supported by cardinality
28-
cardinality: { field: 'monitor.id', precision_threshold: 40000 },
29-
},
30-
down: {
31-
filter: { range: { 'summary.down': { gt: 0 } } },
32-
aggs: {
33-
unique: { cardinality: { field: 'monitor.id', precision_threshold: 40000 } },
34-
},
35-
},
36-
},
37-
},
38-
};
39-
40-
const statistics = await context.search(params);
41-
const total = statistics.aggregations.unique.value;
42-
const down = statistics.aggregations.down.unique.value;
43-
44-
return {
45-
total,
46-
down,
47-
up: total - down,
48-
};
49-
};
50-
51-
const slowStatusCount = async (context: QueryContext, status: string): Promise<number> => {
52-
const downContext = context.clone();
53-
downContext.statusFilter = status;
54-
const iterator = new MonitorGroupIterator(downContext);
55-
let count = 0;
56-
while (await iterator.next()) {
57-
count++;
58-
}
59-
return count;
60-
};
61-
6219
export const getSnapshotCount: UMElasticsearchQueryFn<GetSnapshotCountParams, Snapshot> = async ({
6320
callES,
6421
dateRangeStart,
@@ -81,22 +38,147 @@ export const getSnapshotCount: UMElasticsearchQueryFn<GetSnapshotCountParams, Sn
8138
);
8239

8340
// Calculate the total, up, and down counts.
84-
const counts = await fastStatusCount(context);
85-
86-
// Check if the last count was accurate, if not, we need to perform a slower count with the
87-
// MonitorGroupsIterator.
88-
if (!(await context.hasTimespan())) {
89-
// Figure out whether 'up' or 'down' is more common. It's faster to count the lower cardinality
90-
// one then use subtraction to figure out its opposite.
91-
const [leastCommonStatus, mostCommonStatus]: Array<'up' | 'down'> =
92-
counts.up > counts.down ? ['down', 'up'] : ['up', 'down'];
93-
counts[leastCommonStatus] = await slowStatusCount(context, leastCommonStatus);
94-
counts[mostCommonStatus] = counts.total - counts[leastCommonStatus];
95-
}
41+
const counts = await statusCount(context);
9642

9743
return {
9844
total: statusFilter ? counts[statusFilter] : counts.total,
9945
up: statusFilter === 'down' ? 0 : counts.up,
10046
down: statusFilter === 'up' ? 0 : counts.down,
10147
};
10248
};
49+
50+
const statusCount = async (context: QueryContext): Promise<Snapshot> => {
51+
const res = await context.search({
52+
index: INDEX_NAMES.HEARTBEAT,
53+
body: statusCountBody(await context.dateAndCustomFilters()),
54+
});
55+
56+
return res.aggregations.counts.value;
57+
};
58+
59+
const statusCountBody = (filters: any): any => {
60+
return {
61+
size: 0,
62+
query: {
63+
bool: {
64+
filter: [
65+
{
66+
exists: {
67+
field: 'summary',
68+
},
69+
},
70+
filters,
71+
],
72+
},
73+
},
74+
aggs: {
75+
counts: {
76+
scripted_metric: {
77+
init_script: 'state.locStatus = new HashMap(); state.totalDocs = 0;',
78+
map_script: `
79+
def loc = doc["observer.geo.name"].size() == 0 ? "" : doc["observer.geo.name"][0];
80+
81+
// One concern here is memory since we could build pretty gigantic maps. I've opted to
82+
// stick to a simple <String,String> map to reduce memory overhead. This means we do
83+
// a little string parsing to treat these strings as records that stay lexicographically
84+
// sortable (which is important later).
85+
// We encode the ID and location as $id.len:$id$loc
86+
String id = doc["monitor.id"][0];
87+
String idLenDelim = Integer.toHexString(id.length()) + ":" + id;
88+
String idLoc = loc == null ? idLenDelim : idLenDelim + loc;
89+
90+
String status = doc["summary.down"][0] > 0 ? "d" : "u";
91+
String timeAndStatus = doc["@timestamp"][0].toInstant().toEpochMilli().toString() + status;
92+
state.locStatus[idLoc] = timeAndStatus;
93+
state.totalDocs++;
94+
`,
95+
combine_script: `
96+
return state;
97+
`,
98+
reduce_script: `
99+
// Use a treemap since it's traversable in sorted order.
100+
// This is important later.
101+
TreeMap locStatus = new TreeMap();
102+
long totalDocs = 0;
103+
int uniqueIds = 0;
104+
for (state in states) {
105+
totalDocs += state.totalDocs;
106+
for (entry in state.locStatus.entrySet()) {
107+
// Update the value for the given key if we have a more recent check from this location.
108+
locStatus.merge(entry.getKey(), entry.getValue(), (a,b) -> a.compareTo(b) > 0 ? a : b)
109+
}
110+
}
111+
112+
HashMap locTotals = new HashMap();
113+
int total = 0;
114+
int down = 0;
115+
String curId = "";
116+
boolean curIdDown = false;
117+
// We now iterate through our tree map in order, which means records for a given ID
118+
// always are encountered one after the other. This saves us having to make an intermediate
119+
// map.
120+
for (entry in locStatus.entrySet()) {
121+
String idLoc = entry.getKey();
122+
String timeStatus = entry.getValue();
123+
124+
// Parse the length delimited id/location strings described in the map section
125+
int colonIndex = idLoc.indexOf(":");
126+
int idEnd = Integer.parseInt(idLoc.substring(0, colonIndex), 16) + colonIndex + 1;
127+
String id = idLoc.substring(colonIndex + 1, idEnd);
128+
String loc = idLoc.substring(idEnd, idLoc.length());
129+
String status = timeStatus.substring(timeStatus.length() - 1);
130+
131+
// Here we increment counters for the up/down key per location
132+
// We also create a new hashmap in locTotals if we've never seen this location
133+
// before.
134+
locTotals.compute(loc, (k,v) -> {
135+
HashMap res = v;
136+
if (v == null) {
137+
res = new HashMap();
138+
res.put('up', 0);
139+
res.put('down', 0);
140+
}
141+
142+
if (status == 'u') {
143+
res.up++;
144+
} else {
145+
res.down++;
146+
}
147+
148+
return res;
149+
});
150+
151+
152+
// We've encountered a new ID
153+
if (curId != id) {
154+
total++;
155+
curId = id;
156+
if (status == "d") {
157+
curIdDown = true;
158+
down++;
159+
} else {
160+
curIdDown = false;
161+
}
162+
} else if (!curIdDown) {
163+
if (status == "d") {
164+
curIdDown = true;
165+
down++;
166+
} else {
167+
curIdDown = false;
168+
}
169+
}
170+
}
171+
172+
Map result = new HashMap();
173+
result.total = total;
174+
result.location_totals = locTotals;
175+
result.up = total - down;
176+
result.down = down;
177+
result.totalDocs = totalDocs;
178+
return result;
179+
`,
180+
},
181+
},
182+
},
183+
};
184+
};

x-pack/test/api_integration/apis/uptime/rest/snapshot.ts

Lines changed: 54 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -34,66 +34,69 @@ export default function({ getService }: FtrProviderContext) {
3434
let dateRange: { start: string; end: string };
3535

3636
[true, false].forEach(async (includeTimespan: boolean) => {
37-
describe(`with timespans ${includeTimespan ? 'included' : 'missing'}`, async () => {
38-
before(async () => {
39-
const promises: Array<Promise<any>> = [];
40-
41-
// When includeTimespan is false we have to remove the values there.
42-
let mogrify = (d: any) => d;
43-
if ((includeTimespan = false)) {
44-
mogrify = (d: any): any => {
45-
d.monitor.delete('timespan');
37+
[true, false].forEach(async (includeObserver: boolean) => {
38+
describe(`with timespans=${includeTimespan} and observer=${includeObserver}`, async () => {
39+
before(async () => {
40+
const promises: Array<Promise<any>> = [];
41+
42+
const mogrify = (d: any) => {
43+
if (!includeTimespan) {
44+
delete d.monitor.timespan;
45+
}
46+
if (!includeObserver) {
47+
delete d.observer;
48+
}
4649
return d;
4750
};
48-
}
49-
50-
const makeMonitorChecks = async (monitorId: string, status: 'up' | 'down') => {
51-
return makeChecksWithStatus(
52-
getService('legacyEs'),
53-
monitorId,
54-
checksPerMonitor,
55-
numIps,
56-
scheduleEvery,
57-
{},
58-
status,
59-
mogrify
60-
);
61-
};
6251

63-
for (let i = 0; i < numUpMonitors; i++) {
64-
promises.push(makeMonitorChecks(`up-${i}`, 'up'));
65-
}
66-
for (let i = 0; i < numDownMonitors; i++) {
67-
promises.push(makeMonitorChecks(`down-${i}`, 'down'));
68-
}
52+
const makeMonitorChecks = async (monitorId: string, status: 'up' | 'down') => {
53+
return makeChecksWithStatus(
54+
getService('legacyEs'),
55+
monitorId,
56+
checksPerMonitor,
57+
numIps,
58+
scheduleEvery,
59+
{},
60+
status,
61+
mogrify
62+
);
63+
};
6964

70-
const allResults = await Promise.all(promises);
71-
dateRange = getChecksDateRange(allResults);
72-
});
65+
for (let i = 0; i < numUpMonitors; i++) {
66+
promises.push(makeMonitorChecks(`up-${i}`, 'up'));
67+
}
68+
for (let i = 0; i < numDownMonitors; i++) {
69+
promises.push(makeMonitorChecks(`down-${i}`, 'down'));
70+
}
7371

74-
it('will count all statuses correctly', async () => {
75-
const apiResponse = await supertest.get(
76-
`/api/uptime/snapshot/count?dateRangeStart=${dateRange.start}&dateRangeEnd=${dateRange.end}`
77-
);
72+
const allResults = await Promise.all(promises);
73+
dateRange = getChecksDateRange(allResults);
74+
});
7875

79-
expectFixtureEql(apiResponse.body, 'snapshot');
80-
});
76+
it('will count all statuses correctly', async () => {
77+
const apiResponse = await supertest.get(
78+
`/api/uptime/snapshot/count?dateRangeStart=${dateRange.start}&dateRangeEnd=${dateRange.end}`
79+
);
8180

82-
it('will fetch a monitor snapshot filtered by down status', async () => {
83-
const statusFilter = 'down';
84-
const apiResponse = await supertest.get(
85-
`/api/uptime/snapshot/count?dateRangeStart=${dateRange.start}&dateRangeEnd=${dateRange.end}&statusFilter=${statusFilter}`
86-
);
81+
expectFixtureEql(apiResponse.body, 'snapshot');
82+
});
8783

88-
expectFixtureEql(apiResponse.body, 'snapshot_filtered_by_down');
89-
});
84+
it('will fetch a monitor snapshot filtered by down status', async () => {
85+
const statusFilter = 'down';
86+
const apiResponse = await supertest.get(
87+
`/api/uptime/snapshot/count?dateRangeStart=${dateRange.start}&dateRangeEnd=${dateRange.end}&statusFilter=${statusFilter}`
88+
);
9089

91-
it('will fetch a monitor snapshot filtered by up status', async () => {
92-
const statusFilter = 'up';
93-
const apiResponse = await supertest.get(
94-
`/api/uptime/snapshot/count?dateRangeStart=${dateRange.start}&dateRangeEnd=${dateRange.end}&statusFilter=${statusFilter}`
95-
);
96-
expectFixtureEql(apiResponse.body, 'snapshot_filtered_by_up');
90+
expectFixtureEql(apiResponse.body, 'snapshot_filtered_by_down');
91+
});
92+
93+
it('will fetch a monitor snapshot filtered by up status', async () => {
94+
const statusFilter = 'up';
95+
const apiResponse = await supertest.get(
96+
`/api/uptime/snapshot/count?dateRangeStart=${dateRange.start}&dateRangeEnd=${dateRange.end}&statusFilter=${statusFilter}`
97+
);
98+
expectFixtureEql(apiResponse.body, 'snapshot_filtered_by_up');
99+
});
97100
});
98101
});
99102
});

0 commit comments

Comments
 (0)