Skip to content

Commit dbb22cb

Browse files
author
David Roberts
committed
[ML] Adjust assertion in Grok pattern creation code (#65421)
It turns out that there _is_ a situation in which the regex for a category definition will not match all of the examples: when one or more of the examples have been truncated. Previously we had an assertion that this would never happen. This only affected development, because assertions are disabled in production. However, it makes sense to adjust the assertion and comment to reflect reality.
1 parent 74923ec commit dbb22cb

File tree

2 files changed

+45
-4
lines changed

2 files changed

+45
-4
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,12 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C
118118
groupsMatchesFromExamples.get(groupNum - 1).add(matcher.group(groupNum));
119119
}
120120
} else {
121-
// We should never get here. If we do it implies a bug in the original categorization,
122-
// as it's produced a regex that doesn't match the examples.
123-
assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
124-
logger.error("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
121+
// If we get here it implies the original categorization has produced a
122+
// regex that doesn't match one of the examples. This can happen when
123+
// the message was very long, and the example was truncated. In this
124+
// case we will have appended an ellipsis to indicate truncation.
125+
assert example.endsWith("...") : exampleProcessor.pattern() + " did not match non-truncated example " + example;
126+
logger.warn("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
125127
}
126128
}
127129

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import java.util.ArrayList;
1212
import java.util.Arrays;
1313
import java.util.Collection;
14+
import java.util.Collections;
1415
import java.util.HashMap;
1516
import java.util.Map;
1617

@@ -344,4 +345,42 @@ public void testFindBestGrokMatchFromExamplesGivenMatchAllRegex() {
344345
"[tweets_by_location] Killing job");
345346
assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
346347
}
348+
349+
public void testFindBestGrokMatchFromExamplesGivenTruncated() {
350+
String regex = ".*?BST.+?dave.+?bank3.+?CONTEXT.+?SQL.+?statement.+?SELECT.+?time_series_ids_tmp\\.evidence_id" +
351+
".+?time_series_ids_tmp\\.time_series_id.+?is_delta.+?GREATEST.+?usual_interval.+?FROM.+?time_series_ids_tmp.+?" +
352+
"WHERE.+?found_peak_value.+?FALSE.+?ORDER.+?BY.+?time_series_ids_tmp\\.magnitude.+?DESC.+?" +
353+
"time_series_ids_tmp\\.scaling_factor.+?DESC.+?time_series_ids_tmp\\.significance.+?DESC.+?" +
354+
"time_series_ids_tmp\\.evidence_id.+?DESC.+?LIMIT.+?PL.+?pgSQL.+?function.+?probable_cause_list_common.+?" +
355+
"integer.+?integer.+?integer.+?line.+?at.+?SQL.+?statement.+?SQL.+?statement.+?SELECT.+?" +
356+
"probable_cause_list_common.+?evidenceIdIn.+?linkGroupId.+?timeSpanSeconds.+?PL.+?pgSQL.+?function.+?" +
357+
"probable_cause_list.+?integer.+?integer.+?line.+?at.+?PERFORM.*";
358+
Collection<String> examples = Collections.singletonList("2013-05-16 12:13:45 BST:192.168.61.59(51438):dave:@bank3:[19084]: " +
359+
"CONTEXT: SQL statement \"SELECT\n" +
360+
" time_series_ids_tmp.evidence_id,\n" +
361+
" time_series_ids_tmp.time_series_id,\n" +
362+
" is_delta,\n" +
363+
" GREATEST(usual_interval, 1)\n" +
364+
" FROM\n" +
365+
" time_series_ids_tmp\n" +
366+
" WHERE\n" +
367+
" found_peak_value = FALSE\n" +
368+
" ORDER BY\n" +
369+
" \n" +
370+
" \n" +
371+
" \n" +
372+
" time_series_ids_tmp.magnitude DESC,\n" +
373+
" time_series_ids_tmp.scaling_factor DESC,\n" +
374+
" time_series_ids_tmp.significance DESC,\n" +
375+
" time_series_ids_tmp.evidence_id DESC\n" +
376+
" LIMIT\n" +
377+
" 1\"\n" +
378+
" PL/pgSQL function probable_cause_list_common(integer,integer,integer) line 255 at SQL statement\n" +
379+
" SQL statement \"SELECT probable_cause_list_common(evidenceIdIn, linkGroupId, timeSpanSeconds)\"\n" +
380+
" PL/pgSQL function probable_cause_list...");
381+
// Our algorithm for converting examples to Grok patterns that pick out useful fields doesn't work in
382+
// this case because the regex doesn't match the example (because the example has been truncated and
383+
// the regex contains pieces that would match parts of the original message beyond the truncation point)
384+
assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
385+
}
347386
}

0 commit comments

Comments
 (0)