-
Notifications
You must be signed in to change notification settings - Fork 8.5k
[Uptime] Use scripted metric for snapshot calculation #58247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c5e4a50
ae072a6
388c5e6
f7f7a95
8c558fb
afeaf31
e305bec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,7 +7,6 @@ | |
| import { UMMonitorStatesAdapter } from './adapter_types'; | ||
| import { INDEX_NAMES, CONTEXT_DEFAULTS } from '../../../../common/constants'; | ||
| import { fetchPage } from './search'; | ||
| import { MonitorGroupIterator } from './search/monitor_group_iterator'; | ||
| import { Snapshot } from '../../../../common/runtime_types'; | ||
| import { QueryContext } from './search/query_context'; | ||
|
|
||
|
|
@@ -66,18 +65,7 @@ export const elasticsearchMonitorStatesAdapter: UMMonitorStatesAdapter = { | |
| ); | ||
|
|
||
| // Calculate the total, up, and down counts. | ||
| const counts = await fastStatusCount(context); | ||
|
|
||
| // Check if the last count was accurate, if not, we need to perform a slower count with the | ||
| // MonitorGroupsIterator. | ||
| if (!(await context.hasTimespan())) { | ||
| // Figure out whether 'up' or 'down' is more common. It's faster to count the lower cardinality | ||
| // one then use subtraction to figure out its opposite. | ||
| const [leastCommonStatus, mostCommonStatus]: Array<'up' | 'down'> = | ||
| counts.up > counts.down ? ['down', 'up'] : ['up', 'down']; | ||
| counts[leastCommonStatus] = await slowStatusCount(context, leastCommonStatus); | ||
| counts[mostCommonStatus] = counts.total - counts[leastCommonStatus]; | ||
| } | ||
| const counts = await statusCount(context); | ||
|
|
||
| return { | ||
| total: statusFilter ? counts[statusFilter] : counts.total, | ||
|
|
@@ -110,45 +98,138 @@ const jsonifyPagination = (p: any): string | null => { | |
| return JSON.stringify(p); | ||
| }; | ||
|
|
||
| const fastStatusCount = async (context: QueryContext): Promise<Snapshot> => { | ||
| const params = { | ||
| const statusCount = async (context: QueryContext): Promise<Snapshot> => { | ||
| const res = await context.search({ | ||
| index: INDEX_NAMES.HEARTBEAT, | ||
| body: { | ||
| size: 0, | ||
| query: { bool: { filter: await context.dateAndCustomFilters() } }, | ||
| aggs: { | ||
| unique: { | ||
| // We set the precision threshold to 40k which is the max precision supported by cardinality | ||
| cardinality: { field: 'monitor.id', precision_threshold: 40000 }, | ||
| }, | ||
| down: { | ||
| filter: { range: { 'summary.down': { gt: 0 } } }, | ||
| aggs: { | ||
| unique: { cardinality: { field: 'monitor.id', precision_threshold: 40000 } }, | ||
| body: statusCountBody(await context.dateAndCustomFilters()), | ||
| }); | ||
|
|
||
| return res.aggregations.counts.value; | ||
| }; | ||
|
|
||
| const statusCountBody = (filters: any): any => { | ||
| return { | ||
| size: 0, | ||
| query: { | ||
| bool: { | ||
| filter: [ | ||
| { | ||
| exists: { | ||
| field: 'summary', | ||
| }, | ||
| }, | ||
| filters, | ||
| ], | ||
| }, | ||
| }, | ||
| aggs: { | ||
| counts: { | ||
| scripted_metric: { | ||
| init_script: 'state.locStatus = new HashMap(); state.totalDocs = 0;', | ||
| map_script: ` | ||
| def loc = doc["observer.geo.name"].size() == 0 ? "" : doc["observer.geo.name"][0]; | ||
|
|
||
| // One concern here is memory since we could build pretty gigantic maps. I've opted to | ||
| // stick to a simple <String,String> map to reduce memory overhead. This means we do | ||
| // a little string parsing to treat these strings as records that stay lexicographically | ||
| // sortable (which is important later). | ||
| // We encode the ID and location as $id.len:$id$loc | ||
| String id = doc["monitor.id"][0]; | ||
| String idLenDelim = Integer.toHexString(id.length()) + ":" + id; | ||
| String idLoc = loc == null ? idLenDelim : idLenDelim + loc; | ||
|
|
||
| String status = doc["summary.down"][0] > 0 ? "d" : "u"; | ||
| String timeAndStatus = doc["@timestamp"][0].toInstant().toEpochMilli().toString() + status; | ||
| state.locStatus[idLoc] = timeAndStatus; | ||
| state.totalDocs++; | ||
| `, | ||
| combine_script: ` | ||
| return state; | ||
| `, | ||
| reduce_script: ` | ||
| // Use a treemap since it's traversable in sorted order. | ||
| // This is important later. | ||
| TreeMap locStatus = new TreeMap(); | ||
| long totalDocs = 0; | ||
| int uniqueIds = 0; | ||
| for (state in states) { | ||
| totalDocs += state.totalDocs; | ||
| for (entry in state.locStatus.entrySet()) { | ||
| // Update the value for the given key if we have a more recent check from this location. | ||
| locStatus.merge(entry.getKey(), entry.getValue(), (a,b) -> a.compareTo(b) > 0 ? a : b) | ||
| } | ||
| } | ||
|
|
||
| HashMap locTotals = new HashMap(); | ||
| int total = 0; | ||
| int down = 0; | ||
| String curId = ""; | ||
| boolean curIdDown = false; | ||
| // We now iterate through our tree map in order, which means records for a given ID | ||
| // always are encountered one after the other. This saves us having to make an intermediate | ||
| // map. | ||
| for (entry in locStatus.entrySet()) { | ||
| String idLoc = entry.getKey(); | ||
| String timeStatus = entry.getValue(); | ||
|
|
||
| // Parse the length delimited id/location strings described in the map section | ||
| int colonIndex = idLoc.indexOf(":"); | ||
| int idEnd = Integer.parseInt(idLoc.substring(0, colonIndex), 16) + colonIndex + 1; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is 16 the radix?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Exactly, since we hex-encode the numbers for density |
||
| String id = idLoc.substring(colonIndex + 1, idEnd); | ||
| String loc = idLoc.substring(idEnd, idLoc.length()); | ||
| String status = timeStatus.substring(timeStatus.length() - 1); | ||
|
|
||
| // Here we increment counters for the up/down key per location | ||
| // We also create a new hashmap in locTotals if we've never seen this location | ||
| // before. | ||
| locTotals.compute(loc, (k,v) -> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A comment heading this block would be helpful. My understanding is we are updating the value for the key
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that's correct. I'll add a comment |
||
| HashMap res = v; | ||
| if (v == null) { | ||
| res = new HashMap(); | ||
| res.put('up', 0); | ||
| res.put('down', 0); | ||
| } | ||
|
|
||
| if (status == 'u') { | ||
| res.up++; | ||
| } else { | ||
| res.down++; | ||
| } | ||
|
|
||
| return res; | ||
| }); | ||
|
|
||
|
|
||
| // We've encountered a new ID | ||
| if (curId != id) { | ||
| total++; | ||
| curId = id; | ||
| if (status == "d") { | ||
| curIdDown = true; | ||
| down++; | ||
| } else { | ||
| curIdDown = false; | ||
| } | ||
| } else if (!curIdDown) { | ||
| if (status == "d") { | ||
| curIdDown = true; | ||
| down++; | ||
| } else { | ||
| curIdDown = false; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Map result = new HashMap(); | ||
| result.total = total; | ||
| result.location_totals = locTotals; | ||
| result.up = total - down; | ||
| result.down = down; | ||
| result.totalDocs = totalDocs; | ||
| return result; | ||
| `, | ||
| }, | ||
| }, | ||
| }, | ||
| }; | ||
|
|
||
| const statistics = await context.search(params); | ||
| const total = statistics.aggregations.unique.value; | ||
| const down = statistics.aggregations.down.unique.value; | ||
|
|
||
| return { | ||
| total, | ||
| down, | ||
| up: total - down, | ||
| }; | ||
| }; | ||
|
|
||
| const slowStatusCount = async (context: QueryContext, status: string): Promise<number> => { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So now rather than having a fast/slow count, we're able to just have one counter (slower, but still fast, and always accurate), right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly |
||
| const downContext = context.clone(); | ||
| downContext.statusFilter = status; | ||
| const iterator = new MonitorGroupIterator(downContext); | ||
| let count = 0; | ||
| while (await iterator.next()) { | ||
| count++; | ||
| } | ||
| return count; | ||
| }; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think it'd be better to name this function
getStatusCount? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if
get has any particular meaning, at least in my head, unless there's something to juxtapose it against. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's fair