diff --git a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java index d26a9b083df..ac78238ca60 100644 --- a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java +++ b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java @@ -172,8 +172,11 @@ public BrainLogParser( * @return list of tokens by splitting preprocessed log message */ public List preprocess(String logMessage, String logId) { - if (logMessage == null || logId == null) { - throw new IllegalArgumentException("log message or logId must not be null"); + if (logId == null) { + throw new IllegalArgumentException("logId must not be null"); + } + if (logMessage == null) { + return Arrays.asList("", logId); } List tokens = preprocess(logMessage, this.filterPatternVariableMap, this.delimiters); @@ -224,7 +227,7 @@ public void processTokenHistogram(List tokens) { * @return list of token lists */ public List> preprocessAllLogs(List logMessages) { - List> preprocessedLogs = new ArrayList<>(); + List> preprocessedLogs = new ArrayList<>(logMessages.size()); for (int i = 0; i < logMessages.size(); i++) { String logId = String.valueOf(i); @@ -291,7 +294,8 @@ public static List parseLogPattern( String groupCandidateStr = logIdGroupCandidateMap.get(logId); String[] groupCandidate = groupCandidateStr.split(","); Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group - return IntStream.range(0, tokens.size() - 1) + int tokenCapacity = Math.max(0, tokens.size() - 1); + return IntStream.range(0, tokenCapacity) .mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))) .map( entry -> { @@ -334,7 +338,7 @@ public static List parseLogPattern( } return token; }) - .collect(Collectors.toList()); + .collect(Collectors.toCollection(() -> new ArrayList<>(tokenCapacity))); } /** @@ -349,7 +353,10 @@ public Map> parseAllLogPatterns( Map> logPatternMap 
= new HashMap<>(); for (int i = 0; i < processedMessages.size(); i++) { - List logPattern = this.parseLogPattern(processedMessages.get(i)); + List logPattern = + this.parseLogPattern(processedMessages.get(i)).stream() + .map(BrainLogParser::collapseContinuousWildcards) + .collect(Collectors.toList()); String patternKey = String.join(" ", logPattern); String sampleLog = logMessages.get(i); logPatternMap.compute( @@ -379,6 +386,29 @@ public Map> parseAllLogPatterns( return logPatternMap; } + static String collapseContinuousWildcards(String part) { + // The minimum of continuous wildcards are 6 characters: <*><*> + if (part == null || part.length() < 6) { + return part; + } + int tokenLen = VARIABLE_DENOTER.length(); + StringBuilder sb = new StringBuilder(part.length()); + int i = 0; + while (i < part.length()) { + int j = part.indexOf(VARIABLE_DENOTER, i); + if (j < 0) { + sb.append(part, i, part.length()); + break; + } + sb.append(part, i, j).append(VARIABLE_DENOTER); + do { + j += tokenLen; + } while (j <= part.length() - tokenLen && part.startsWith(VARIABLE_DENOTER, j)); + i = j; + } + return sb.toString(); + } + private Map getWordOccurrences(List tokens) { Map occurrences = new HashMap<>(); for (int i = 0; i < tokens.size() - 1; i++) { diff --git a/common/src/main/java/org/opensearch/sql/common/patterns/PatternUtils.java b/common/src/main/java/org/opensearch/sql/common/patterns/PatternUtils.java index 13efbf8a8fe..afc2afb6e70 100644 --- a/common/src/main/java/org/opensearch/sql/common/patterns/PatternUtils.java +++ b/common/src/main/java/org/opensearch/sql/common/patterns/PatternUtils.java @@ -5,6 +5,7 @@ package org.opensearch.sql.common.patterns; +import com.google.common.collect.ImmutableList; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -20,6 +21,8 @@ public final class PatternUtils { public static final Pattern WILDCARD_PATTERN = Pattern.compile("<\\*[^>]*>"); public static final String TOKEN_PREFIX = ""); + public static 
final List VALID_BRAIN_PARAMETERS = + ImmutableList.of("variable_count_threshold", "frequency_threshold_percentage"); public static Map> mergePatternGroups( Map> left, @@ -55,6 +58,7 @@ public static Map> mergePatternGroups( public static void extractVariables( ParseResult parseResult, String original, Map> result, String prefix) { List parts = parseResult.parts; + List isToken = parseResult.isToken; List tokenOrder = parseResult.tokenOrder; if (parts.isEmpty()) { @@ -67,7 +71,7 @@ public static void extractVariables( while (i < parts.size()) { String currentPart = parts.get(i); - if (currentPart.startsWith(prefix)) { // Process already labeled part + if (isToken.get(i)) { // Process already labeled part String tokenKey = tokenOrder.get(tokenIndex++); if (i == parts.size() - 1) { // The last part String value = original.substring(pos); @@ -97,19 +101,22 @@ public static void extractVariables( } public static class ParseResult { - List parts; - List tokenOrder; + final List parts; + final List isToken; + final List tokenOrder; - public ParseResult(List parts, List tokenOrder) { + public ParseResult(List parts, List isToken, List tokenOrder) { this.parts = parts; + this.isToken = isToken; this.tokenOrder = tokenOrder; } public String toTokenOrderString(String prefix) { StringBuilder result = new StringBuilder(); int tokenIndex = 0; - for (String currentPart : parts) { - if (currentPart.startsWith(prefix)) { + for (int i = 0; i < parts.size(); i++) { + String currentPart = parts.get(i); + if (isToken.get(i)) { result.append(tokenOrder.get(tokenIndex++)); } else { result.append(currentPart); @@ -126,6 +133,7 @@ public String toTokenOrderString(String prefix) { */ public static ParseResult parsePattern(String pattern, Pattern compiledPattern) { List parts = new ArrayList<>(); + List isToken = new ArrayList<>(); List tokenOrder = new ArrayList<>(); Matcher matcher = compiledPattern.matcher(pattern); int lastEnd = 0; @@ -137,10 +145,12 @@ public static ParseResult 
parsePattern(String pattern, Pattern compiledPattern) // Add static part before the found match if there is if (start > lastEnd) { parts.add(pattern.substring(lastEnd, start)); + isToken.add(false); } // Add matched wildcard part and generate token order key String wildcard = matcher.group(); parts.add(wildcard); + isToken.add(true); tokenOrder.add(""); lastEnd = end; } @@ -148,9 +158,10 @@ public static ParseResult parsePattern(String pattern, Pattern compiledPattern) // Add static part at the end if (lastEnd < pattern.length()) { parts.add(pattern.substring(lastEnd)); + isToken.add(false); } - return new ParseResult(parts, tokenOrder); + return new ParseResult(parts, isToken, tokenOrder); } private static void addToResult(Map> result, String key, String value) { diff --git a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java index 61fd70e5b17..48fa5ff1632 100644 --- a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java +++ b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java @@ -31,6 +31,7 @@ public enum Key { PATTERN_MODE("plugins.ppl.pattern.mode"), PATTERN_MAX_SAMPLE_COUNT("plugins.ppl.pattern.max.sample.count"), PATTERN_BUFFER_LIMIT("plugins.ppl.pattern.buffer.limit"), + PATTERN_SHOW_NUMBERED_TOKEN("plugins.ppl.pattern.show.numbered.token"), PPL_REX_MAX_MATCH_LIMIT("plugins.ppl.rex.max_match.limit"), PPL_VALUES_MAX_LIMIT("plugins.ppl.values.max.limit"), PPL_SYNTAX_LEGACY_PREFERRED("plugins.ppl.syntax.legacy.preferred"), diff --git a/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java b/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java index 055e42f5298..f00d604586b 100644 --- a/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java +++ b/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java @@ -10,6 +10,7 @@ import static 
org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import java.util.AbstractMap; import java.util.Arrays; @@ -104,6 +105,15 @@ public void testPreprocess() { assertEquals(expectedResult, result); } + @Test + public void testPreprocessNullString() { + String logMessage = null; + String logId = "log1"; + List expectedResult = Arrays.asList("", "log1"); + List result = parser.preprocess(logMessage, logId); + assertEquals(expectedResult, result); + } + @Test public void testPreprocessWithUUID() { String logMessage = "127.0.0.1 - 1234 something, user_id:c78ac970-f0c3-4954-8cf8-352a8458d01c"; @@ -124,15 +134,11 @@ public void testPreprocessWithUUID() { public void testPreprocessWithIllegalInput() { String logMessage = "127.0.0.1 - 1234 something"; String logId = "log1"; - String exceptionMessage = "log message or logId must not be null"; + String exceptionMessage = "logId must not be null"; + assertEquals(ImmutableList.of("", logId), parser.preprocess(null, logId)); Throwable throwable = - assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, logId)); - assertEquals(exceptionMessage, throwable.getMessage()); - throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(logMessage, null)); assertEquals(exceptionMessage, throwable.getMessage()); - throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, null)); - assertEquals(exceptionMessage, throwable.getMessage()); } @Test @@ -209,6 +215,29 @@ public void testParseLogPattern() { assertEquals(expectedLogPattern, logPattern); } + @Test + public void testParseAllLogPatternsWithNullInput() { + List messages = + Arrays.asList( + null, + "PacketResponder failed for blk_6996194389878584395", + "PacketResponder failed for blk_-1547954353065580372"); + Map> logPatternMap = parser.parseAllLogPatterns(messages, 1); + Map> expectedResult = + 
ImmutableMap.of( + "", + ImmutableMap.of("pattern_count", 1L, "pattern", "", "sample_logs", ImmutableList.of()), + "PacketResponder failed for blk_<*>", + ImmutableMap.of( + "pattern_count", + 2L, + "pattern", + "PacketResponder failed for blk_<*>", + "sample_logs", + ImmutableList.of("PacketResponder failed for blk_6996194389878584395"))); + assertEquals(expectedResult, logPatternMap); + } + @Test public void testParseAllLogPatterns() { Map> logPatternMap = parser.parseAllLogPatterns(TEST_HDFS_LOGS, 2); @@ -286,6 +315,19 @@ public void testParseLogPatternWhenHigherFrequencyTokenIsVariable() { assertTrue(parser.getGroupTokenSetMap().get("4-3,3-0").size() > 1); } + @Test + public void testCollapseContinuousWildcards() { + String correctTokenPattern = + "BLOCK* NameSystem.allocateBlock: /user/root/_temporary/_task_<*>_r_<*>"; + String continuousTokenPattern = + "BLOCK* NameSystem.allocateBlock: /user/root/_temporary/_task_<*><*>_r_<*><*><*>"; + + assertEquals( + correctTokenPattern, BrainLogParser.collapseContinuousWildcards(continuousTokenPattern)); + assertEquals( + correctTokenPattern, BrainLogParser.collapseContinuousWildcards(correctTokenPattern)); + } + private Map collectPatternByCountMap( Map> logPatternMap) { return logPatternMap.entrySet().stream() diff --git a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java index 338b7d04126..b2a24eb5333 100644 --- a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java +++ b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java @@ -546,6 +546,7 @@ public static Patterns patterns( PatternMode patternMode, UnresolvedExpression patternMaxSampleCount, UnresolvedExpression patternBufferLimit, + UnresolvedExpression showNumberedToken, java.util.Map arguments) { return new Patterns( sourceField, @@ -555,6 +556,7 @@ public static Patterns patterns( patternMode, patternMaxSampleCount, patternBufferLimit, + showNumberedToken, arguments, input); } diff 
--git a/core/src/main/java/org/opensearch/sql/ast/tree/Patterns.java b/core/src/main/java/org/opensearch/sql/ast/tree/Patterns.java index 7d4ee0f8369..35a64b162b5 100644 --- a/core/src/main/java/org/opensearch/sql/ast/tree/Patterns.java +++ b/core/src/main/java/org/opensearch/sql/ast/tree/Patterns.java @@ -39,6 +39,7 @@ public class Patterns extends UnresolvedPlan { private final PatternMode patternMode; private final UnresolvedExpression patternMaxSampleCount; private final UnresolvedExpression patternBufferLimit; + private final UnresolvedExpression showNumberedToken; private final Map arguments; private UnresolvedPlan child; diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 7395d5d8310..efa24647128 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -655,6 +655,9 @@ public RelNode visitSpath(SPath node, CalcitePlanContext context) { @Override public RelNode visitPatterns(Patterns node, CalcitePlanContext context) { visitChildren(node, context); + RexNode showNumberedTokenExpr = rexVisitor.analyze(node.getShowNumberedToken(), context); + Boolean showNumberedToken = + Boolean.TRUE.equals(((RexLiteral) showNumberedTokenExpr).getValueAs(Boolean.class)); if (PatternMethod.SIMPLE_PATTERN.equals(node.getPatternMethod())) { Parse parseNode = new Parse( @@ -686,42 +689,46 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) { .collect(Collectors.toList())); context.relBuilder.aggregate(context.relBuilder.groupKey(groupByList), aggCalls); - RexNode parsedNode = - PPLFuncImpTable.INSTANCE.resolve( - context.rexBuilder, - BuiltinFunctionName.INTERNAL_PATTERN_PARSER, - context.relBuilder.field(node.getAlias()), - context.relBuilder.field(PatternUtils.SAMPLE_LOGS)); - flattenParsedPattern(node.getAlias(), parsedNode, context, 
false); - // Reorder fields for consistency with Brain's output - projectPlusOverriding( - List.of( - context.relBuilder.field(node.getAlias()), - context.relBuilder.field(PatternUtils.PATTERN_COUNT), - context.relBuilder.field(PatternUtils.TOKENS), - context.relBuilder.field(PatternUtils.SAMPLE_LOGS)), - List.of( - node.getAlias(), - PatternUtils.PATTERN_COUNT, - PatternUtils.TOKENS, - PatternUtils.SAMPLE_LOGS), - context); - } else { + if (showNumberedToken) { + RexNode parsedNode = + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_PATTERN_PARSER, + context.relBuilder.field(node.getAlias()), + context.relBuilder.field(PatternUtils.SAMPLE_LOGS)); + flattenParsedPattern(node.getAlias(), parsedNode, context, false, true); + // Reorder fields for consistency with Brain's output + projectPlusOverriding( + List.of( + context.relBuilder.field(node.getAlias()), + context.relBuilder.field(PatternUtils.PATTERN_COUNT), + context.relBuilder.field(PatternUtils.TOKENS), + context.relBuilder.field(PatternUtils.SAMPLE_LOGS)), + List.of( + node.getAlias(), + PatternUtils.PATTERN_COUNT, + PatternUtils.TOKENS, + PatternUtils.SAMPLE_LOGS), + context); + } + } else if (showNumberedToken) { RexNode parsedNode = PPLFuncImpTable.INSTANCE.resolve( context.rexBuilder, BuiltinFunctionName.INTERNAL_PATTERN_PARSER, context.relBuilder.field(node.getAlias()), rexVisitor.analyze(node.getSourceField(), context)); - flattenParsedPattern(node.getAlias(), parsedNode, context, false); + flattenParsedPattern(node.getAlias(), parsedNode, context, false, true); } } else { List funcParamList = new ArrayList<>(); funcParamList.add(node.getSourceField()); funcParamList.add(node.getPatternMaxSampleCount()); funcParamList.add(node.getPatternBufferLimit()); + funcParamList.add(node.getShowNumberedToken()); funcParamList.addAll( node.getArguments().entrySet().stream() + .filter(entry -> PatternUtils.VALID_BRAIN_PARAMETERS.contains(entry.getKey())) .map(entry -> new 
Argument(entry.getKey(), entry.getValue())) .sorted(Comparator.comparing(Argument::getArgName)) .collect(Collectors.toList())); @@ -742,11 +749,16 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) { context.rexBuilder, BuiltinFunctionName.INTERNAL_PATTERN_PARSER, rexVisitor.analyze(node.getSourceField(), context), - windowNode), + windowNode, + showNumberedTokenExpr), node.getAlias()); context.relBuilder.projectPlus(nestedNode); flattenParsedPattern( - node.getAlias(), context.relBuilder.field(node.getAlias()), context, false); + node.getAlias(), + context.relBuilder.field(node.getAlias()), + context, + false, + showNumberedToken); } else { // Aggregation mode, resolve plan as aggregation AggCall aggCall = aggVisitor @@ -764,7 +776,11 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) { buildExpandRelNode( context.relBuilder.field(node.getAlias()), node.getAlias(), node.getAlias(), context); flattenParsedPattern( - node.getAlias(), context.relBuilder.field(node.getAlias()), context, true); + node.getAlias(), + context.relBuilder.field(node.getAlias()), + context, + true, + showNumberedToken); } } return context.relBuilder.peek(); @@ -2332,9 +2348,21 @@ private void buildParseRelNode(Parse node, CalcitePlanContext context) { pattern, context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), true) }; if (ParseMethod.PATTERNS.equals(parseMethod)) { - rexNodeList = ArrayUtils.add(rexNodeList, context.relBuilder.literal("<*>")); + rexNodeList = + ArrayUtils.add( + rexNodeList, + context.rexBuilder.makeLiteral( + "<*>", + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true)); } else { - rexNodeList = ArrayUtils.add(rexNodeList, context.relBuilder.literal(parseMethod.getName())); + rexNodeList = + ArrayUtils.add( + rexNodeList, + context.rexBuilder.makeLiteral( + parseMethod.getName(), + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true)); } List 
newFields = new ArrayList<>(); for (String groupCandidate : groupCandidates) { @@ -2347,9 +2375,29 @@ private void buildParseRelNode(Parse node, CalcitePlanContext context) { context.rexBuilder, BuiltinFunctionName.INTERNAL_ITEM, innerRex, - context.relBuilder.literal(groupCandidate))); + context.rexBuilder.makeLiteral( + groupCandidate, + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true))); } else { - newFields.add(innerRex); + RexNode emptyString = + context.rexBuilder.makeLiteral( + "", context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), true); + RexNode isEmptyCondition = + context.rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, sourceField, emptyString); + RexNode isNullCondition = + context.rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, sourceField); + // Calcite regexp_replace(string, string, string) doesn't accept empty string. + // So use case when condition here to handle corner cases + newFields.add( + context.rexBuilder.makeCall( + SqlStdOperatorTable.CASE, // case + isNullCondition, + emptyString, // when field is NULL then '' + isEmptyCondition, + emptyString, // when field = '' then '' + innerRex // else regexp_replace(field, regex, replace_string) + )); } } projectPlusOverriding(newFields, groupCandidates, context); @@ -2359,7 +2407,8 @@ private void flattenParsedPattern( String originalPatternResultAlias, RexNode parsedNode, CalcitePlanContext context, - boolean flattenPatternAggResult) { + boolean flattenPatternAggResult, + Boolean showNumberedToken) { List fattenedNodes = new ArrayList<>(); List projectNames = new ArrayList<>(); // Flatten map struct fields @@ -2389,18 +2438,20 @@ private void flattenParsedPattern( fattenedNodes.add(context.relBuilder.alias(patternCountExpr, PatternUtils.PATTERN_COUNT)); projectNames.add(PatternUtils.PATTERN_COUNT); } - RexNode tokensExpr = - context.rexBuilder.makeCast( - UserDefinedFunctionUtils.tokensMap, - PPLFuncImpTable.INSTANCE.resolve( - 
context.rexBuilder, - BuiltinFunctionName.INTERNAL_ITEM, - parsedNode, - context.rexBuilder.makeLiteral(PatternUtils.TOKENS)), - true, - true); - fattenedNodes.add(context.relBuilder.alias(tokensExpr, PatternUtils.TOKENS)); - projectNames.add(PatternUtils.TOKENS); + if (showNumberedToken) { + RexNode tokensExpr = + context.rexBuilder.makeCast( + UserDefinedFunctionUtils.tokensMap, + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_ITEM, + parsedNode, + context.rexBuilder.makeLiteral(PatternUtils.TOKENS)), + true, + true); + fattenedNodes.add(context.relBuilder.alias(tokensExpr, PatternUtils.TOKENS)); + projectNames.add(PatternUtils.TOKENS); + } if (flattenPatternAggResult) { RexNode sampleLogsExpr = context.rexBuilder.makeCast( diff --git a/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/LogPatternAggFunction.java b/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/LogPatternAggFunction.java index 4003708e3ce..129a1af9e19 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/LogPatternAggFunction.java +++ b/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/LogPatternAggFunction.java @@ -11,6 +11,7 @@ import com.google.common.collect.ImmutableMap; import java.math.BigDecimal; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; @@ -27,6 +28,7 @@ public class LogPatternAggFunction implements UserDefinedAggFunction { private int bufferLimit = 100000; private int maxSampleCount = 10; + private boolean showNumberedToken = false; private int variableCountThreshold = BrainLogParser.DEFAULT_VARIABLE_COUNT_THRESHOLD; private double thresholdPercentage = BrainLogParser.DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE; @@ -41,7 +43,8 @@ public Object result(LogParserAccumulator acc) { return null; } - return acc.value(maxSampleCount, variableCountThreshold, thresholdPercentage); + return acc.value( + maxSampleCount, 
variableCountThreshold, thresholdPercentage, showNumberedToken); } @Override @@ -49,8 +52,9 @@ public LogParserAccumulator add(LogParserAccumulator acc, Object... values) { throw new SyntaxCheckException( "Unsupported function signature for pattern aggregate. Valid parameters include (field:" + " required string), (max_sample_count: required integer)," - + " (buffer_limit: required integer), [variable_count_threshold: optional" - + " integer], [frequency_threshold_percentage: optional double]"); + + " (buffer_limit: required integer), (show_numbered_token: required boolean)," + + " [variable_count_threshold: optional integer]," + + " [frequency_threshold_percentage: optional double]"); } public LogParserAccumulator add( @@ -58,6 +62,7 @@ public LogParserAccumulator add( String field, int maxSampleCount, int bufferLimit, + boolean showNumberedToken, BigDecimal thresholdPercentage, int variableCountThreshold) { return add( @@ -65,6 +70,7 @@ public LogParserAccumulator add( field, maxSampleCount, bufferLimit, + showNumberedToken, thresholdPercentage.doubleValue(), variableCountThreshold); } @@ -74,6 +80,7 @@ public LogParserAccumulator add( String field, int maxSampleCount, int bufferLimit, + boolean showNumberedToken, double thresholdPercentage, int variableCountThreshold) { if (Objects.isNull(field)) { @@ -81,11 +88,13 @@ public LogParserAccumulator add( } this.bufferLimit = bufferLimit; this.maxSampleCount = maxSampleCount; + this.showNumberedToken = showNumberedToken; this.variableCountThreshold = variableCountThreshold; this.thresholdPercentage = thresholdPercentage; acc.evaluate(field); if (bufferLimit > 0 && acc.size() == bufferLimit) { - acc.partialMerge(maxSampleCount, variableCountThreshold, thresholdPercentage); + acc.partialMerge( + maxSampleCount, variableCountThreshold, thresholdPercentage, showNumberedToken); acc.clearBuffer(); } return acc; @@ -96,9 +105,16 @@ public LogParserAccumulator add( String field, int maxSampleCount, int bufferLimit, + boolean 
showNumberedToken, int variableCountThreshold) { return add( - acc, field, maxSampleCount, bufferLimit, this.thresholdPercentage, variableCountThreshold); + acc, + field, + maxSampleCount, + bufferLimit, + showNumberedToken, + this.thresholdPercentage, + variableCountThreshold); } public LogParserAccumulator add( @@ -106,23 +122,30 @@ public LogParserAccumulator add( String field, int maxSampleCount, int bufferLimit, + boolean showNumberedToken, BigDecimal thresholdPercentage) { return add( acc, field, maxSampleCount, bufferLimit, + showNumberedToken, thresholdPercentage.doubleValue(), this.variableCountThreshold); } public LogParserAccumulator add( - LogParserAccumulator acc, String field, int maxSampleCount, int bufferLimit) { + LogParserAccumulator acc, + String field, + int maxSampleCount, + int bufferLimit, + boolean showNumberedToken) { return add( acc, field, maxSampleCount, bufferLimit, + showNumberedToken, this.thresholdPercentage, this.variableCountThreshold); } @@ -151,7 +174,7 @@ public void partialMerge(Object... argList) { if (logMessages.isEmpty()) { return; } - assert argList.length == 3 : "partialMerge of LogParserAccumulator requires 3 parameters"; + assert argList.length == 4 : "partialMerge of LogParserAccumulator requires 4 parameters"; int maxSampleCount = (int) argList[0]; BrainLogParser logParser = new BrainLogParser((int) argList[1], ((Double) argList[2]).floatValue()); @@ -166,6 +189,7 @@ public Object value(Object... argList) { partialMerge(argList); clearBuffer(); + Boolean showToken = (Boolean) argList[3]; return patternGroupMap.values().stream() .sorted( Comparator.comparing( @@ -177,18 +201,25 @@ public Object value(Object... 
argList) { Long count = (Long) m.get(PatternUtils.PATTERN_COUNT); List sampleLogs = (List) m.get(PatternUtils.SAMPLE_LOGS); Map> tokensMap = new HashMap<>(); - ParseResult parseResult = - PatternUtils.parsePattern(pattern, PatternUtils.WILDCARD_PATTERN); - for (String sampleLog : sampleLogs) { - PatternUtils.extractVariables( - parseResult, sampleLog, tokensMap, PatternUtils.WILDCARD_PREFIX); + ParseResult parseResult = null; + if (showToken) { + parseResult = PatternUtils.parsePattern(pattern, PatternUtils.WILDCARD_PATTERN); + for (String sampleLog : sampleLogs) { + PatternUtils.extractVariables( + parseResult, sampleLog, tokensMap, PatternUtils.WILDCARD_PREFIX); + } } return ImmutableMap.of( PatternUtils.PATTERN, - parseResult.toTokenOrderString(PatternUtils.WILDCARD_PREFIX), - PatternUtils.PATTERN_COUNT, count, - PatternUtils.TOKENS, tokensMap, - PatternUtils.SAMPLE_LOGS, sampleLogs); + showToken + ? parseResult.toTokenOrderString(PatternUtils.WILDCARD_PREFIX) + : pattern, + PatternUtils.PATTERN_COUNT, + count, + PatternUtils.TOKENS, + showToken ? 
tokensMap : Collections.EMPTY_MAP, + PatternUtils.SAMPLE_LOGS, + sampleLogs); }) .collect(Collectors.toList()); } diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PatternParserFunctionImpl.java b/core/src/main/java/org/opensearch/sql/expression/function/PatternParserFunctionImpl.java index 401516346d2..660420a7e04 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PatternParserFunctionImpl.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PatternParserFunctionImpl.java @@ -55,7 +55,7 @@ public SqlReturnTypeInference getReturnTypeInference() { public UDFOperandMetadata getOperandMetadata() { return UDFOperandMetadata.wrap( (CompositeOperandTypeChecker) - OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.ANY) + OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.ARRAY, SqlTypeFamily.BOOLEAN) .or(OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER)) .or(OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.ARRAY))); } @@ -64,8 +64,11 @@ public static class PatternParserImplementor implements NotNullImplementor { @Override public Expression implement( RexToLixTranslator translator, RexCall call, List translatedOperands) { - assert call.getOperands().size() == 2 : "PATTERN_PARSER should have 2 arguments"; - assert translatedOperands.size() == 2 : "PATTERN_PARSER should have 2 arguments"; + int operandCount = call.getOperands().size(); + int translatedOperandCount = translatedOperands.size(); + assert operandCount == 3 || operandCount == 2 : "PATTERN_PARSER should have 2 or 3 arguments"; + assert translatedOperandCount == 3 || translatedOperandCount == 2 + : "PATTERN_PARSER should have 2 or 3 arguments"; RelDataType inputType = call.getOperands().get(1).getType(); Method method = resolveEvaluationMethod(inputType); @@ -81,7 +84,12 @@ private Method resolveEvaluationMethod(RelDataType inputType) { RelDataType componentType = inputType.getComponentType(); return 
(componentType.getSqlTypeName() == SqlTypeName.MAP) - ? getMethod(Object.class, "evalAgg") + ? Types.lookupMethod( + PatternParserFunctionImpl.class, + "evalAgg", + String.class, + Object.class, + Boolean.class) : getMethod(List.class, "evalSamples"); } @@ -96,7 +104,9 @@ private Method getMethod(Class paramType, String methodName) { * Drain algorithm(see https://ieeexplore.ieee.org/document/8029742). */ public static Object evalAgg( - @Parameter(name = "field") String field, @Parameter(name = "aggObject") Object aggObject) { + @Parameter(name = "field") String field, + @Parameter(name = "aggObject") Object aggObject, + @Parameter(name = "showNumberedToken") Boolean showNumberedToken) { if (Strings.isBlank(field)) { return EMPTY_RESULT; } @@ -118,9 +128,11 @@ public static Object evalAgg( if (bestCandidate != null) { String bestCandidatePattern = String.join(" ", bestCandidate); Map> tokensMap = new HashMap<>(); - ParseResult parseResult = - PatternUtils.parsePattern(bestCandidatePattern, PatternUtils.TOKEN_PATTERN); - PatternUtils.extractVariables(parseResult, field, tokensMap, PatternUtils.TOKEN_PREFIX); + if (showNumberedToken) { + ParseResult parseResult = + PatternUtils.parsePattern(bestCandidatePattern, PatternUtils.TOKEN_PATTERN); + PatternUtils.extractVariables(parseResult, field, tokensMap, PatternUtils.TOKEN_PREFIX); + } return ImmutableMap.of( PatternUtils.PATTERN, bestCandidatePattern, PatternUtils.TOKENS, tokensMap); @@ -137,7 +149,6 @@ public static Object evalField( Map> tokensMap = new HashMap<>(); ParseResult parseResult = PatternUtils.parsePattern(pattern, PatternUtils.WILDCARD_PATTERN); - PatternUtils.extractVariables(parseResult, field, tokensMap, PatternUtils.WILDCARD_PREFIX); return ImmutableMap.of( PatternUtils.PATTERN, @@ -149,12 +160,11 @@ public static Object evalSamples( @Parameter(name = "pattern") String pattern, @Parameter(name = "sample_logs") List sampleLogs) { - if (sampleLogs.isEmpty()) { + if 
(Strings.isBlank(pattern)) { return EMPTY_RESULT; } Map> tokensMap = new HashMap<>(); ParseResult parseResult = PatternUtils.parsePattern(pattern, PatternUtils.WILDCARD_PATTERN); - for (String sampleLog : sampleLogs) { PatternUtils.extractVariables( parseResult, sampleLog, tokensMap, PatternUtils.WILDCARD_PREFIX); diff --git a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java index f60bbde6f2f..5930df74ee5 100644 --- a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java +++ b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java @@ -1876,6 +1876,7 @@ public void brain_patterns_command_with_no_additional_args() { PatternMode.LABEL, AstDSL.intLiteral(10), AstDSL.intLiteral(100000), + AstDSL.booleanLiteral(false), ImmutableMap.of()), AstDSL.field("string_value")); LogicalPlan expectedPlan = @@ -1903,6 +1904,7 @@ public void brain_patterns_command() { PatternMode.LABEL, AstDSL.intLiteral(10), AstDSL.intLiteral(100000), + AstDSL.booleanLiteral(false), ImmutableMap.of( "variable_count_threshold", AstDSL.intLiteral(10), // with integer argument diff --git a/docs/category.json b/docs/category.json index 4e40495e364..014ba172d74 100644 --- a/docs/category.json +++ b/docs/category.json @@ -14,10 +14,7 @@ "user/ppl/cmd/information_schema.rst", "user/ppl/cmd/eval.rst", "user/ppl/cmd/fillnull.rst", - "user/ppl/cmd/grok.rst", "user/ppl/cmd/head.rst", - "user/ppl/cmd/parse.rst", - "user/ppl/cmd/patterns.rst", "user/ppl/cmd/rare.rst", "user/ppl/cmd/search.rst", "user/ppl/cmd/sort.rst", @@ -59,6 +56,9 @@ "user/ppl/functions/condition.rst", "user/ppl/cmd/eventstats.rst", "user/ppl/cmd/fields.rst", + "user/ppl/cmd/grok.rst", + "user/ppl/cmd/parse.rst", + "user/ppl/cmd/patterns.rst", "user/ppl/cmd/regex.rst", "user/ppl/cmd/rename.rst", "user/ppl/cmd/rex.rst", diff --git a/docs/user/ppl/cmd/patterns.rst b/docs/user/ppl/cmd/patterns.rst index de5f1c8b527..c3a785ce274 100644 --- 
a/docs/user/ppl/cmd/patterns.rst +++ b/docs/user/ppl/cmd/patterns.rst @@ -18,11 +18,12 @@ Description * ``patterns`` command supports two modes, aka ``label`` and ``aggregation``. ``label`` mode is similar to previous 3.0.0 output. ``aggregation`` mode returns aggregated results on target field. * V2 Engine engine still have the same output in ``label`` mode as before. In ``aggregation`` mode, it returns aggregated pattern count on labeled pattern as well as sample logs (sample count is configurable) per pattern. -* Calcite engine's ``label`` mode not only labels pattern of text but also labels variable tokens in map. In ``aggregation`` mode, it will also output labeled pattern as well as variable tokens per pattern. +* By default, the Calcite engine labels variables with the '<*>' placeholder. +* If ``show_numbered_token`` option is turned on, Calcite engine's ``label`` mode not only labels pattern of text but also labels variable tokens in map. In ``aggregation`` mode, it will also output labeled pattern as well as variable tokens per pattern. The variable placeholder is in the format of '<token%d>' instead of '<*>'. Syntax ============ -patterns [by byClause...] [method=simple_pattern | brain] [mode=label | aggregation] [max_sample_count=integer] [buffer_limit=integer] [new_field=] (algorithm parameters...) +patterns <field> [by byClause...] [method=simple_pattern | brain] [mode=label | aggregation] [max_sample_count=integer] [buffer_limit=integer] [show_numbered_token=boolean] [new_field=<new_field_name>] (algorithm parameters...) * field: mandatory. The text(string) field to analyze for patterns. * byClause: optional. Fields or scalar functions used to group logs for labeling/aggregation. @@ -30,10 +31,15 @@ patterns [by byClause...] [method=simple_pattern | brain] [mode=label | * mode: optional. Output mode: ``label`` (default) or ``aggregation``. The mode is configured by the setting ``plugins.ppl.pattern.mode``. * max_sample_count: optional.
Max sample logs returned per pattern in aggregation mode (default: 10). The max_sample_count is configured by the setting ``plugins.ppl.pattern.max.sample.count``. * buffer_limit: optional. Safeguard parameter for ``brain`` algorithm to limit internal temporary buffer size (default: 100,000, min: 50,000). The buffer_limit is configured by the setting ``plugins.ppl.pattern.buffer.limit``. +* show_numbered_token: optional. Flag that turns on the numbered token output format (default: false). The show_numbered_token is configured by the setting ``plugins.ppl.pattern.show.numbered.token``. * new_field: Alias of the output pattern field. (default: "patterns_field"). * algorithm parameters: optional. Algorithm-specific tuning: - - ``simple_pattern`` : Define regex via "pattern". - - ``brain`` : Adjust sensitivity with variable_count_threshold (int > 0) and frequency_threshold_percentage (double 0.0 - 1.0). + + - ``simple_pattern`` : Define regex via "pattern". + - ``brain`` : Adjust sensitivity with variable_count_threshold (int > 0) and frequency_threshold_percentage (double 0.0 - 1.0). + + - ``variable_count_threshold``: Optional integer (default: 5). Words (i.e. tokens) are split by space. The algorithm counts how many distinct words appear at a specific position in the initial log groups. Ideally, a constant word has a single distinct value at its position within a log group, but this is not guaranteed because some words could be enums. Adjusting this threshold primarily determines the sensitivity of constant-word detection. + - ``frequency_threshold_percentage``: Optional double (default: 0.3). Brain's log pattern is selected based on the longest word combination. A word combination is a set of words with the same frequency per message. To select the longest word combination, the algorithm needs a lower bound on frequency so that very low-frequency words are ignored. The representative frequency of the longest word combination should be >= highest token frequency of log * threshold percentage.
Adjusting this threshold could prune some low frequency words. Change default pattern method ============ @@ -47,14 +53,15 @@ To override default pattern parameters, users can run following command "plugins.ppl.pattern.method": "brain", "plugins.ppl.pattern.mode": "aggregation", "plugins.ppl.pattern.max.sample.count": 5, - "plugins.ppl.pattern.buffer.limit": 50000 + "plugins.ppl.pattern.buffer.limit": 50000, + "plugins.ppl.pattern.show.numbered.token": true } } Simple Pattern Example 1: Create the new field =============================== -The example shows how to use extract punctuations in ``email`` for each document. Parsing a null field will return an empty string. +The example shows how to extract patterns in ``email`` for each document. Parsing a null field will return an empty string. PPL query:: @@ -63,100 +70,84 @@ PPL query:: +-----------------------+----------------+ | email | patterns_field | |-----------------------+----------------| - | amberduke@pyrami.com | @. | - | hattiebond@netagy.com | @. | + | amberduke@pyrami.com | <*>@<*>.<*> | + | hattiebond@netagy.com | <*>@<*>.<*> | | null | | - | daleadams@boink.com | @. | + | daleadams@boink.com | <*>@<*>.<*> | +-----------------------+----------------+ Simple Pattern Example 2: Extract log patterns =============================== -The example shows how to extract punctuations from a raw log field using the default patterns. +The example shows how to extract patterns from a raw log field using the default patterns. 
PPL query:: os> source=apache | patterns message method=simple_pattern | fields message, patterns_field ; fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------+ - | message | patterns_field | - |-----------------------------------------------------------------------------------------------------------------------------+---------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | ... - [//::: -] " /-/ /." | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | ... - [//::: -] " //// /." | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | ... - - [//::: -] " //--- /." | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | ... - - [//::: -] " / /." 
| - +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------+ + +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ + | message | patterns_field | + |-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------| + | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | + | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | + | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | + | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | + +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ Simple Pattern Example 3: Extract log patterns with custom regex pattern ========================================================= -The example shows how to extract punctuations from a raw log field using user defined patterns. 
+The example shows how to extract patterns from a raw log field using user defined patterns. PPL query:: os> source=apache | patterns message method=simple_pattern new_field='no_numbers' pattern='[0-9]' | fields message, no_numbers ; fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+ - | message | no_numbers | - |-----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | ... - upton [/Sep/::: -] "HEAD /e-business/mindshare HTTP/." | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | ... - pouros [/Sep/::: -] "GET /architectures/convergence/niches/mindshare HTTP/." | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | ... - - [/Sep/::: -] "PATCH /strategize/out-of-the-box HTTP/." | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | ... - - [/Sep/::: -] "POST /users HTTP/." 
| - +-----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+ + +-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | message | no_numbers | + |-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*><*><*>.<*><*>.<*>.<*><*> - upton<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "HEAD /e-business/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | + | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*><*><*>.<*><*>.<*><*><*>.<*> - pouros<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | + | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*><*><*>.<*><*><*>.<*><*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "PATCH /strategize/out-of-the-box HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | + | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*><*><*>.<*><*><*>.<*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> 
-<*><*><*><*>] "POST /users HTTP/<*>.<*>" <*><*><*> <*><*><*><*> | + +-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Simple Pattern Example 4: Return log patterns aggregation result ========================================================= -Version -------- -3.1.0 - Starting 3.1.0, patterns command support aggregation mode. The example shows how to get aggregated results from a raw log field. PPL query:: os> source=apache | patterns message method=simple_pattern mode=aggregation | fields patterns_field, pattern_count, sample_logs ; fetched rows / total rows = 4/4 - +---------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ - | patterns_field | pattern_count | sample_logs | - |---------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------| - | ... - [//::: -] " /-/ /." | 1 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927] | - | ... - [//::: -] " //// /." | 1 | [127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722] | - | ... - - [//::: -] " / /." | 1 | [210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | - | ... - - [//::: -] " //--- /." 
| 1 | [118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439] | - +---------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ + +---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ + | patterns_field | pattern_count | sample_logs | + |---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------| + | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | 1 | [210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | + | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | 1 | [118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439] | + | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927] | + | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722] | + +---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ Simple Pattern Example 5: Return log patterns aggregation result with 
detected variable tokens ========================================================= -Version -------- -3.1.0 +Starting 3.1.0, patterns command support aggregation mode. Configuration ------------- -New output format requires Calcite enabled. - -Enable Calcite: - - >> curl -H 'Content-Type: application/json' -X PUT localhost:9200/_plugins/_query/settings -d '{ - "persistent" : { - "plugins.calcite.enabled" : true - } - }' - -Starting 3.1.0, patterns command support aggregation mode. With Calcite engine enabled, the output can detect variable tokens from the pattern field. +With Calcite specific option ``show_numbered_token`` enabled, the output can detect numbered variable tokens from the pattern field. PPL query:: - PPL> source=apache | patterns message method=simple_pattern mode=aggregation | fields patterns_field, pattern_count, sample_logs | head 1 ; + os> source=apache | patterns message method=simple_pattern mode=aggregation show_numbered_token=true | fields patterns_field, pattern_count, tokens | head 1 ; fetched rows / total rows = 1/1 - |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | patterns_field | pattern_count | tokens | - 
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | ... - [//::: -] " /-/ /." | 1 | {"":["e"],"":["HEAD"],"":["mindshare"],"":["business"],"":["1"],"":["HTTP"],"":["0"],"":["upton5450"],"":["74"],"":["Sep"],"":["28"],"":["10"],"":["2022"],"":["19927"],"":["15"],"":["177"],"":["404"],"":["0700"],"":["8"],"":["57"],"":["95"]} | - |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + 
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | patterns_field | pattern_count | tokens | + |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | ... - - [//::: -] " / /." 
| 1 | {'': ['HTTP'], '': ['users'], '': ['1'], '': ['1'], '': ['9481'], '': ['301'], '': ['28'], '': ['104'], '': ['2022'], '': ['Sep'], '': ['15'], '': ['10'], '': ['57'], '': ['210'], '': ['POST'], '': ['15'], '': ['0700'], '': ['204']} | + +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Brain Example 1: Extract log patterns =============================== @@ -167,14 +158,14 @@ PPL query:: os> source=apache | patterns message method=brain | fields message, patterns_field ; fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+ - | message | patterns_field | - |-----------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "HEAD /e-business/mindshare HTTP/<*><*>" 404 <*> | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "GET /architectures/convergence/niches/mindshare 
HTTP/<*><*>" 100 <*> | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "PATCH /strategize/out-of-the-box HTTP/<*><*>" 401 <*> | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*><*>" 301 <*> | - +-----------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+ + +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ + | message | patterns_field | + |-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------| + | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "HEAD /e-business/mindshare HTTP/<*>" 404 <*> | + | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>" 100 <*> | + | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "PATCH /strategize/out-of-the-box HTTP/<*>" 401 <*> | + | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*>" 301 <*> | + 
+-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ Brain Example 2: Extract log patterns with custom parameters =============================== @@ -185,64 +176,48 @@ PPL query:: os> source=apache | patterns message method=brain variable_count_threshold=2 | fields message, patterns_field ; fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+ - | message | patterns_field | - |-----------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | - +-----------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+ + 
+-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------+ + | message | patterns_field | + |-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------| + | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | + | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | + | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | + | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | + +-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------+ Brain Example 3: Return log patterns aggregation result =============================== -Version -------- -3.1.0 - -Starting 3.1.0, patterns command support aggregation mode. The example shows how to get aggregated results from a raw log field for brain algorithm. +Starting 3.1.0, patterns command support aggregation mode. 
PPL query:: os> source=apache | patterns message method=brain mode=aggregation variable_count_threshold=2 | fields patterns_field, pattern_count, sample_logs ; fetched rows / total rows = 1/1 - +-------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | patterns_field | pattern_count | sample_logs | - |-------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | 4 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927,127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722,118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439,210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | - 
+-------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | patterns_field | pattern_count | sample_logs | + |----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | 4 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927,127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722,118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439,210.204.15.104 - - 
[28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | + +----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Brain Example 4: Return log patterns aggregation result with detected variable tokens ========================================================= -Version -------- -3.1.0 +Starting 3.1.0, patterns command support aggregation mode. Configuration ------------- -New output format requires Calcite enabled. - -Enable Calcite: - - >> curl -H 'Content-Type: application/json' -X PUT localhost:9200/_plugins/_query/settings -d '{ - "persistent" : { - "plugins.calcite.enabled" : true - } - }' - -Starting 3.1.0, patterns command support aggregation mode. With Calcite engine enabled, the output can detect variable tokens from the pattern field. +With Calcite specific option ``show_numbered_token`` enabled, the output can detect numbered variable tokens from the pattern field. 
PPL query:: - PPL> source=apache | patterns message method=brain mode=aggregation variable_count_threshold=2 | fields patterns_field, pattern_count, tokens ; + os> source=apache | patterns message method=brain mode=aggregation show_numbered_token=true variable_count_threshold=2 | fields patterns_field, pattern_count, tokens ; fetched rows / total rows = 1/1 - |--------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | patterns_field | pattern_count | tokens | - |--------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | - [/Sep/::: ] HTTP/\" | 4 | 
{"":["10","10","10","10"],"":["2022","2022","2022","2022"],"":["57","57","57","57"],"":["15","15","15","15"],"":["\"HEAD","\"GET","\"PATCH","\"POST"],"":["-0700","-0700","-0700","-0700"],"":["/e-business/mindshare","/architectures/convergence/niches/mindshare","/strategize/out-of-the-box","/users"],"":["177.95.8.74","127.45.152.6","118.223.210.105","210.204.15.104"],"":["28","28","28","28"],"":["upton5450","pouros8756","-","-"]} | - |--------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + +----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | patterns_field | pattern_count | tokens | + 
|----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | - [/Sep/::: ] HTTP/" | 4 | {'': ['19927', '28722', '27439', '9481'], '': ['10', '10', '10', '10'], '': ['2022', '2022', '2022', '2022'], '': ['57', '57', '57', '57'], '': ['15', '15', '15', '15'], '': ['"HEAD', '"GET', '"PATCH', '"POST'], '': ['-0700', '-0700', '-0700', '-0700'], '': ['/e-business/mindshare', '/architectures/convergence/niches/mindshare', '/strategize/out-of-the-box', '/users'], '': ['177.95.8.74', '127.45.152.6', '118.223.210.10... 
| + +----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Limitations ========== diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLPatternsIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLPatternsIT.java index 7193c867fcd..761fe9b965b 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLPatternsIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLPatternsIT.java @@ -12,6 +12,7 @@ import static org.opensearch.sql.util.MatcherUtils.schema; import static org.opensearch.sql.util.MatcherUtils.verifyDataRows; import static org.opensearch.sql.util.MatcherUtils.verifySchema; +import static org.opensearch.sql.util.MatcherUtils.verifySchemaInOrder; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -32,12 +33,23 @@ public void init() throws Exception { } @Test - public void testSimplePatternLabelMode() throws IOException { + public void testSimplePatternLabelMode_NotShowNumberedToken() throws IOException { JSONObject result = executeQuery( String.format( - "source = %s | patterns email mode=label | head 1 | fields email, patterns_field," - + " tokens", + "source = %s | patterns email mode=label | head 1 | fields email, patterns_field", + TEST_INDEX_BANK)); + verifySchema(result, schema("email", "string"), 
schema("patterns_field", "string")); + verifyDataRows(result, rows("amberduke@pyrami.com", "<*>@<*>.<*>")); + } + + @Test + public void testSimplePatternLabelMode_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source = %s | patterns email mode=label show_numbered_token=true | head 1 | fields" + + " email, patterns_field, tokens", TEST_INDEX_BANK)); verifySchema( result, @@ -71,12 +83,24 @@ public void testSimplePatternLabelMode_NullFieldReturnEmpty() throws IOException } @Test - public void testSimplePatternLabelModeWithCustomPattern() throws IOException { + public void testSimplePatternLabelMode_EmptyStringReturnEmpty() throws IOException { JSONObject result = executeQuery( String.format( - "source = %s | patterns email mode=label pattern='@.*' | head 1 | fields email," - + " patterns_field, tokens", + "source = %s | eval message2 = '' | head 1 | patterns message2 | fields message2," + + " patterns_field", + TEST_INDEX_WEBLOGS)); + verifySchema(result, schema("message2", "string"), schema("patterns_field", "string")); + verifyDataRows(result, rows("", "")); + } + + @Test + public void testSimplePatternLabelModeWithCustomPattern_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source = %s | patterns email mode=label show_numbered_token=true pattern='@.*' |" + + " head 1 | fields email, patterns_field, tokens", TEST_INDEX_BANK)); verifySchema( result, @@ -92,11 +116,33 @@ public void testSimplePatternLabelModeWithCustomPattern() throws IOException { } @Test - public void testSimplePatternAggregationMode() throws IOException { + public void testSimplePatternAggregationMode_NotShowNumberedToken() throws IOException { JSONObject result = executeQuery( String.format( "source=%s | patterns email mode=aggregation max_sample_count=3", TEST_INDEX_BANK)); + verifySchema( + result, + schema("pattern_count", "bigint"), + schema("patterns_field", "string"), + 
schema("sample_logs", "array")); + verifyDataRows( + result, + rows( + "<*>@<*>.<*>", + 7, + ImmutableList.of( + "amberduke@pyrami.com", "hattiebond@netagy.com", "nanettebates@quility.com"))); + } + + @Test + public void testSimplePatternAggregationMode_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | patterns email mode=aggregation max_sample_count=3" + + " show_numbered_token=true", + TEST_INDEX_BANK)); verifySchema( result, schema("pattern_count", "bigint"), @@ -120,13 +166,83 @@ public void testSimplePatternAggregationMode() throws IOException { } @Test - public void testBrainLabelMode() throws IOException { + public void testSimplePatternAggregationMode_WithGroupBy_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | patterns email by male mode=aggregation max_sample_count=1" + + " show_numbered_token=true", + TEST_INDEX_BANK)); + verifySchemaInOrder( + result, + schema("male", "boolean"), + schema("patterns_field", "string"), + schema("pattern_count", "bigint"), + schema("tokens", "struct"), + schema("sample_logs", "array")); + verifyDataRows( + result, + rows( + false, + "@.", + 3, + ImmutableMap.of( + "", + ImmutableList.of("nanettebates"), + "", + ImmutableList.of("quility"), + "", + ImmutableList.of("com")), + ImmutableList.of("nanettebates@quility.com")), + rows( + true, + "@.", + 4, + ImmutableMap.of( + "", + ImmutableList.of("amberduke"), + "", + ImmutableList.of("pyrami"), + "", + ImmutableList.of("com")), + ImmutableList.of("amberduke@pyrami.com"))); + } + + @Test + public void testBrainLabelMode_NotShowNumberedToken() throws IOException { JSONObject result = executeQuery( String.format( "source=%s | patterns content method=BRAIN mode=label" + " max_sample_count=5 variable_count_threshold=5" + " frequency_threshold_percentage=0.2 | head 2 | fields content," + + " patterns_field", + TEST_INDEX_HDFS_LOGS)); + verifySchema(result, 
schema("content", "string"), schema("patterns_field", "string")); + verifyDataRows( + result, + rows( + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.31.85:50010 is added to" + + " blk_-7017553867379051457 size 67108864", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: <*IP*> is added to blk_<*>" + + " size <*>"), + rows( + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000296_0/part-00296." + + " blk_-6620182933895093708", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_<*>_<*>_r_<*>_<*>/part<*>" + + " blk_<*>")); + } + + @Test + public void testBrainLabelMode_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | patterns content method=BRAIN mode=label" + + " max_sample_count=5 show_numbered_token=true variable_count_threshold=5" + + " frequency_threshold_percentage=0.2 | head 2 | fields content," + " patterns_field, tokens", TEST_INDEX_HDFS_LOGS)); verifySchema( @@ -152,34 +268,82 @@ public void testBrainLabelMode() throws IOException { "BLOCK* NameSystem.allocateBlock:" + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000296_0/part-00296." 
+ " blk_-6620182933895093708", - " NameSystem.allocateBlock:" - + " /user/root/sortrand/_temporary/_task___r__/part" - + " blk_", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task___r__/part" + + " blk_", ImmutableMap.of( "", - ImmutableList.of("BLOCK*"), - "", ImmutableList.of("200811092030"), - "", + "", ImmutableList.of("0002"), - "", + "", ImmutableList.of("000296"), - "", + "", ImmutableList.of("0"), - "", + "", ImmutableList.of("-00296."), - "", + "", ImmutableList.of("-6620182933895093708")))); } @Test - public void testBrainAggregationMode() throws IOException { + public void testBrainAggregationMode_NotShowNumberedToken() throws IOException { JSONObject result = executeQuery( String.format( "source=%s | patterns content method=brain mode=aggregation" + " variable_count_threshold=5", TEST_INDEX_HDFS_LOGS)); + verifySchema( + result, + schema("patterns_field", "string"), + schema("pattern_count", "bigint"), + schema("sample_logs", "array")); + verifyDataRows( + result, + rows( + "Verification succeeded <*> blk_<*>", + 2, + ImmutableList.of( + "Verification succeeded for blk_-1547954353065580372", + "Verification succeeded for blk_6996194389878584395")), + rows( + "BLOCK* NameSystem.addStoredBlock: blockMap updated: <*IP*> is added to blk_<*>" + + " size <*>", + 2, + ImmutableList.of( + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.31.85:50010 is added to" + + " blk_-7017553867379051457 size 67108864", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.107.19:50010 is added" + + " to blk_-3249711809227781266 size 67108864")), + rows( + "<*> NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_<*>_<*>_r_<*>_<*>/part<*>" + + " blk_<*>", + 2, + ImmutableList.of( + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000296_0/part-00296." 
+ + " blk_-6620182933895093708", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318." + + " blk_2096692261399680562")), + rows( + "PacketResponder failed <*> blk_<*>", + 2, + ImmutableList.of( + "PacketResponder failed for blk_6996194389878584395", + "PacketResponder failed for blk_-1547954353065580372"))); + } + + @Test + public void testBrainAggregationMode_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | patterns content method=brain mode=aggregation" + + " show_numbered_token=true variable_count_threshold=5", + TEST_INDEX_HDFS_LOGS)); verifySchema( result, schema("patterns_field", "string"), @@ -256,12 +420,12 @@ public void testBrainAggregationMode() throws IOException { } @Test - public void testBrainAggregationModeWithGroupByClause() throws IOException { + public void testBrainAggregationModeWithGroupByClause_ShowNumberedToken() throws IOException { JSONObject result = executeQuery( String.format( "source=%s | patterns content by level method=BRAIN" - + " mode=aggregation max_sample_count=5" + + " mode=aggregation show_numbered_token=true max_sample_count=5" + " variable_count_threshold=2 frequency_threshold_percentage=0.2", TEST_INDEX_HDFS_LOGS)); System.out.println(result); @@ -338,13 +502,26 @@ public void testBrainAggregationModeWithGroupByClause() throws IOException { } @Test - public void testBrainParseWithUUID() throws IOException { + public void testBrainParseWithUUID_NotShowNumberedToken() throws IOException { JSONObject result = executeQuery( String.format( "source=%s | eval body = '[PlaceOrder] user_id=d664d7be-77d8-11f0-8880-0242f00b101d" + " user_currency=USD' | head 1 | patterns body method=BRAIN mode=label |" - + " fields patterns_field, tokens", + + " fields patterns_field", + Index.WEBLOG.getName())); + verifySchema(result, schema("patterns_field", "string")); + verifyDataRows(result, rows("[PlaceOrder] 
user_id=<*UUID*> user_currency=USD")); + } + + @Test + public void testBrainParseWithUUID_ShowNumberedToken() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | eval body = '[PlaceOrder] user_id=d664d7be-77d8-11f0-8880-0242f00b101d" + + " user_currency=USD' | head 1 | patterns body method=BRAIN mode=label" + + " show_numbered_token=true | fields patterns_field, tokens", Index.WEBLOG.getName())); verifySchema(result, schema("patterns_field", "string"), schema("tokens", "struct")); verifyDataRows( diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/ExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/ExplainIT.java index 9ab7f4b12d3..6dcab13d1cf 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/ExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/ExplainIT.java @@ -456,8 +456,8 @@ public void testExplainModeUnsupportedInV2() throws IOException { @Test public void testPatternsSimplePatternMethodWithoutAggExplain() throws IOException { // TODO: Correct calcite expected result once pushdown is supported - String expected = loadExpectedPlan("explain_patterns_simple_pattern.json"); - assertJsonEqualsIgnoreId( + String expected = loadExpectedPlan("explain_patterns_simple_pattern.yaml"); + assertYamlEqualsJsonIgnoreId( expected, explainQueryToString("source=opensearch-sql_test_index_account | patterns email")); } @@ -468,18 +468,19 @@ public void testPatternsSimplePatternMethodWithAggPushDownExplain() throws IOExc assertYamlEqualsJsonIgnoreId( expected, explainQueryToString( - "source=opensearch-sql_test_index_account | patterns email mode=aggregation")); + "source=opensearch-sql_test_index_account | patterns email mode=aggregation" + + " show_numbered_token=true")); } @Test public void testPatternsBrainMethodWithAggPushDownExplain() throws IOException { // TODO: Correct calcite expected result once pushdown is supported - String expected = 
loadExpectedPlan("explain_patterns_brain_agg_push.json"); - assertJsonEqualsIgnoreId( + String expected = loadExpectedPlan("explain_patterns_brain_agg_push.yaml"); + assertYamlEqualsJsonIgnoreId( expected, explainQueryToString( "source=opensearch-sql_test_index_account" - + "| patterns email method=brain mode=aggregation")); + + "| patterns email method=brain mode=aggregation show_numbered_token=true")); } @Test diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_agg_on_window.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_agg_on_window.json index e5c93f8aa41..20aaa1a6f9b 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_agg_on_window.json +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_agg_on_window.json @@ -1,6 +1,6 @@ { "calcite": { - "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(count()=[$1], patterns_field=[$0])\n LogicalAggregate(group=[{0}], count()=[COUNT()])\n LogicalProject(patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($2, pattern($2, 10, 100000) OVER ()), 'pattern'))])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", - "physical": "EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], patterns_field=[$t0])\n EnumerableAggregate(group=[{0}], count()=[COUNT()])\n EnumerableCalc(expr#0..1=[{inputs}], expr#2=[PATTERN_PARSER($t0, $t1)], expr#3=['pattern'], expr#4=[ITEM($t2, $t3)], expr#5=[SAFE_CAST($t4)], patterns_field=[$t5])\n EnumerableWindow(window#0=[window(aggs [pattern($0, $1, $2)])])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[address]], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"address\"],\"excludes\":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)])\n" + "logical": "LogicalSystemLimit(fetch=[10000], 
type=[QUERY_SIZE_LIMIT])\n LogicalProject(count()=[$1], patterns_field=[$0])\n LogicalAggregate(group=[{0}], count()=[COUNT()])\n LogicalProject(patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($2, pattern($2, 10, 100000, false) OVER (), false), 'pattern'))])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", + "physical": "EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], patterns_field=[$t0])\n EnumerableAggregate(group=[{0}], count()=[COUNT()])\n EnumerableCalc(expr#0..1=[{inputs}], expr#2=[false], expr#3=[PATTERN_PARSER($t0, $t1, $t2)], expr#4=['pattern'], expr#5=[ITEM($t3, $t4)], expr#6=[SAFE_CAST($t5)], patterns_field=[$t6])\n EnumerableWindow(window#0=[window(aggs [pattern($0, $1, $2, $3)])])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[address]], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"address\"],\"excludes\":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)])\n" } } diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_brain_agg_push.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_brain_agg_push.json deleted file mode 100644 index c3fc80d1eb0..00000000000 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_brain_agg_push.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "calcite": { - "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(patterns_field=[SAFE_CAST(ITEM($1, 'pattern'))], pattern_count=[SAFE_CAST(ITEM($1, 'pattern_count'))], tokens=[SAFE_CAST(ITEM($1, 'tokens'))], sample_logs=[SAFE_CAST(ITEM($1, 'sample_logs'))])\n LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}])\n LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2)])\n LogicalProject(email=[$9], $f17=[10], $f18=[100000])\n 
CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n Uncollect\n LogicalProject(patterns_field=[$cor0.patterns_field])\n LogicalValues(tuples=[[{ 0 }]])\n", - "physical": "EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..1=[{inputs}], expr#2=['pattern'], expr#3=[ITEM($t1, $t2)], expr#4=[SAFE_CAST($t3)], expr#5=['pattern_count'], expr#6=[ITEM($t1, $t5)], expr#7=[SAFE_CAST($t6)], expr#8=['tokens'], expr#9=[ITEM($t1, $t8)], expr#10=[SAFE_CAST($t9)], expr#11=['sample_logs'], expr#12=[ITEM($t1, $t11)], expr#13=[SAFE_CAST($t12)], patterns_field=[$t4], pattern_count=[$t7], tokens=[$t10], sample_logs=[$t13])\n EnumerableCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}])\n EnumerableAggregate(group=[{}], patterns_field=[pattern($0, $1, $2)])\n EnumerableCalc(expr#0=[{inputs}], expr#1=[10], expr#2=[100000], proj#0..2=[{exprs}])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[email]], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"email\"],\"excludes\":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)])\n EnumerableUncollect\n EnumerableCalc(expr#0=[{inputs}], expr#1=[$cor0], expr#2=[$t1.patterns_field], patterns_field=[$t2])\n EnumerableValues(tuples=[[{ 0 }]])\n" - } -} diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_brain_agg_push.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_brain_agg_push.yaml new file mode 100644 index 00000000000..0b2d4584804 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_brain_agg_push.yaml @@ -0,0 +1,21 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(patterns_field=[SAFE_CAST(ITEM($1, 'pattern'))], pattern_count=[SAFE_CAST(ITEM($1, 'pattern_count'))], tokens=[SAFE_CAST(ITEM($1, 'tokens'))], 
sample_logs=[SAFE_CAST(ITEM($1, 'sample_logs'))]) + LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}]) + LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2, $3)]) + LogicalProject(email=[$9], $f17=[10], $f18=[100000], $f19=[true]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + Uncollect + LogicalProject(patterns_field=[$cor0.patterns_field]) + LogicalValues(tuples=[[{ 0 }]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..1=[{inputs}], expr#2=['pattern'], expr#3=[ITEM($t1, $t2)], expr#4=[SAFE_CAST($t3)], expr#5=['pattern_count'], expr#6=[ITEM($t1, $t5)], expr#7=[SAFE_CAST($t6)], expr#8=['tokens'], expr#9=[ITEM($t1, $t8)], expr#10=[SAFE_CAST($t9)], expr#11=['sample_logs'], expr#12=[ITEM($t1, $t11)], expr#13=[SAFE_CAST($t12)], patterns_field=[$t4], pattern_count=[$t7], tokens=[$t10], sample_logs=[$t13]) + EnumerableCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}]) + EnumerableAggregate(group=[{}], patterns_field=[pattern($0, $1, $2, $3)]) + EnumerableCalc(expr#0=[{inputs}], expr#1=[10], expr#2=[100000], expr#3=[true], proj#0..3=[{exprs}]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[email]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["email"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableUncollect + EnumerableCalc(expr#0=[{inputs}], expr#1=[$cor0], expr#2=[$t1.patterns_field], patterns_field=[$t2]) + EnumerableValues(tuples=[[{ 0 }]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern.json deleted file mode 100644 index 5dadfd4e13b..00000000000 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern.json +++ /dev/null 
@@ -1,6 +0,0 @@ -{ - "calcite": { - "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>'), $9), 'pattern'))], tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>'), $9), 'tokens'))])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", - "physical": "EnumerableCalc(expr#0..10=[{inputs}], expr#11=['[a-zA-Z0-9]+':VARCHAR], expr#12=['<*>'], expr#13=[REGEXP_REPLACE($t9, $t11, $t12)], expr#14=[PATTERN_PARSER($t13, $t9)], expr#15=['pattern'], expr#16=[ITEM($t14, $t15)], expr#17=[SAFE_CAST($t16)], expr#18=['tokens'], expr#19=[ITEM($t14, $t18)], expr#20=[SAFE_CAST($t19)], proj#0..10=[{exprs}], $f11=[$t17], $f12=[$t20])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"],\"excludes\":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])\n" - } -} diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern.yaml new file mode 100644 index 00000000000..5b6fdebfd14 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern.yaml @@ -0,0 +1,8 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + 
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], patterns_field=[CASE(SEARCH($9, Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR))]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0..10=[{inputs}], expr#11=[Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR], expr#12=[SEARCH($t9, $t11)], expr#13=['':VARCHAR], expr#14=['[a-zA-Z0-9]+':VARCHAR], expr#15=['<*>':VARCHAR], expr#16=[REGEXP_REPLACE($t9, $t14, $t15)], expr#17=[CASE($t12, $t13, $t16)], proj#0..10=[{exprs}], $f11=[$t17]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern_agg_push.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern_agg_push.yaml index 59f80796fa7..fe1413c12e8 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern_agg_push.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_patterns_simple_pattern_agg_push.yaml @@ -3,8 +3,8 @@ calcite: LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($0, $2), 'pattern'))], pattern_count=[$1], tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($0, $2), 'tokens'))], sample_logs=[$2]) LogicalAggregate(group=[{1}], 
pattern_count=[COUNT($1)], sample_logs=[TAKE($0, $2)]) - LogicalProject(email=[$9], patterns_field=[REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>')], $f18=[10]) + LogicalProject(email=[$9], patterns_field=[CASE(SEARCH($9, Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR))], $f18=[10]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableCalc(expr#0..2=[{inputs}], expr#3=[PATTERN_PARSER($t0, $t2)], expr#4=['pattern'], expr#5=[ITEM($t3, $t4)], expr#6=[SAFE_CAST($t5)], expr#7=['tokens'], expr#8=[ITEM($t3, $t7)], expr#9=[SAFE_CAST($t8)], patterns_field=[$t6], pattern_count=[$t1], tokens=[$t9], sample_logs=[$t2]) - CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[AGGREGATION->rel#:LogicalAggregate.NONE.[](input=RelSubset#,group={1},pattern_count=COUNT($1),sample_logs=TAKE($0, $2)), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"patterns_field":{"terms":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXNyABFqYXZhLnV0aWwuQ29sbFNlcleOq7Y6G6gRAwABSQADdGFneHAAAAADdwQAAAAGdAAHcm93VHlwZXQGy3sKICAiZmllbGRzIjogWwogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhY2NvdW50X251bWJlciIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImZpcnN0bmFtZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImFkZHJlc3MiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJiYWxhbmNlIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiZ2VuZGVyIgogICAgfSwKICAgIHsKICAgIC
AgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiY2l0eSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImVtcGxveWVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAic3RhdGUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhZ2UiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgInByZWNpc2lvbiI6IC0xLAogICAgICAibmFtZSI6ICJlbWFpbCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImxhc3RuYW1lIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2lkIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2luZGV4IgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiUkVBTCIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJuYW1lIjogIl9zY29yZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlJFQUwiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfbWF4c2NvcmUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfc29ydCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogIl9yb3V0aW5nIgogICAgfQogIF0sCiAgIm51bGxhYmxlIjogdHJ1ZQp9dAAEZXhwcnQBx3sKICAib3AiOiB7CiAgICAibmFtZSI6ICJSRUdFWFBfUkVQTEFDRSIsCiAgICAia2luZCI6ICJPVEhFUl9GVU5DVElPTiIsCiAgICAic3ludGF4IjogIkZVTkNUSU9OIgogIH0sCiAgIm9wZXJhbmRzIjogWwogICAgewogICAgICAiaW5wdXQiOiA5LAogICAgICAibmFtZSI6ICIkOSIKICAgIH0sCiAgICB7CiAgICAgICJsaXRlcmFsIjogIlthLXpBLVowLTldKyIsCiAgICAgICJ0eXBlIjogewogICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICJudWxsYWJsZSI6IGZhbHNlLAogICAgIC
AgICJwcmVjaXNpb24iOiAtMQogICAgICB9CiAgICB9LAogICAgewogICAgICAibGl0ZXJhbCI6ICI8Kj4iLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJDSEFSIiwKICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAicHJlY2lzaW9uIjogMwogICAgICB9CiAgICB9CiAgXQp9dAAKZmllbGRUeXBlc3NyABdqYXZhLnV0aWwuTGlua2VkSGFzaE1hcDTATlwQbMD7AgABWgALYWNjZXNzT3JkZXJ4cgARamF2YS51dGlsLkhhc2hNYXAFB9rBwxZg0QMAAkYACmxvYWRGYWN0b3JJAAl0aHJlc2hvbGR4cD9AAAAAAAAMdwgAAAAQAAAAC3QADmFjY291bnRfbnVtYmVyfnIAKW9yZy5vcGVuc2VhcmNoLnNxbC5kYXRhLnR5cGUuRXhwckNvcmVUeXBlAAAAAAAAAAASAAB4cgAOamF2YS5sYW5nLkVudW0AAAAAAAAAABIAAHhwdAAETE9OR3QACWZpcnN0bmFtZXNyADpvcmcub3BlbnNlYXJjaC5zcWwub3BlbnNlYXJjaC5kYXRhLnR5cGUuT3BlblNlYXJjaFRleHRUeXBlrYOjkwTjMUQCAAFMAAZmaWVsZHN0AA9MamF2YS91dGlsL01hcDt4cgA6b3JnLm9wZW5zZWFyY2guc3FsLm9wZW5zZWFyY2guZGF0YS50eXBlLk9wZW5TZWFyY2hEYXRhVHlwZcJjvMoC+gU1AgADTAAMZXhwckNvcmVUeXBldAArTG9yZy9vcGVuc2VhcmNoL3NxbC9kYXRhL3R5cGUvRXhwckNvcmVUeXBlO0wAC21hcHBpbmdUeXBldABITG9yZy9vcGVuc2VhcmNoL3NxbC9vcGVuc2VhcmNoL2RhdGEvdHlwZS9PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGU7TAAKcHJvcGVydGllc3EAfgAReHB+cQB+AAt0AAdVTktOT1dOfnIARm9yZy5vcGVuc2VhcmNoLnNxbC5vcGVuc2VhcmNoLmRhdGEudHlwZS5PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGUAAAAAAAAAABIAAHhxAH4ADHQABFRleHRzcgA8c2hhZGVkLmNvbS5nb29nbGUuY29tbW9uLmNvbGxlY3QuSW1tdXRhYmxlTWFwJFNlcmlhbGl6ZWRGb3JtAAAAAAAAAAACAAJMAARrZXlzdAASTGphdmEvbGFuZy9PYmplY3Q7TAAGdmFsdWVzcQB+ABx4cHVyABNbTGphdmEubGFuZy5PYmplY3Q7kM5YnxBzKWwCAAB4cAAAAAB1cQB+AB4AAAAAc3EAfgAAAAAAA3cEAAAAAnQAB2tleXdvcmRzcQB+ABJ+cQB+AAt0AAZTVFJJTkd+cQB+ABh0AAdLZXl3b3JkcQB+AB14dAAHYWRkcmVzc3NxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAB4dAAHYmFsYW5jZXEAfgANdAAGZ2VuZGVyc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAEY2l0eXNxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAJxAH4AInEAfgAjeHQACGVtcGxveWVyc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAFc3RhdGVzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AANhZ2VxAH4ADXQABWVtYWlsc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAIbGFzdG5hbW
VzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h4AHg=\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0}},"missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"pattern_count":{"value_count":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXNyABFqYXZhLnV0aWwuQ29sbFNlcleOq7Y6G6gRAwABSQADdGFneHAAAAADdwQAAAAGdAAHcm93VHlwZXQGy3sKICAiZmllbGRzIjogWwogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhY2NvdW50X251bWJlciIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImZpcnN0bmFtZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImFkZHJlc3MiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJiYWxhbmNlIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiZ2VuZGVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiY2l0eSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImVtcGxveWVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAic3RhdGUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhZ2UiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgInByZWNpc2lvbiI6IC0xLAogICAgICAibmFtZSI6ICJlbWFpbCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImxhc3RuYW1lIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsC
iAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2lkIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2luZGV4IgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiUkVBTCIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJuYW1lIjogIl9zY29yZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlJFQUwiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfbWF4c2NvcmUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfc29ydCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogIl9yb3V0aW5nIgogICAgfQogIF0sCiAgIm51bGxhYmxlIjogdHJ1ZQp9dAAEZXhwcnQBx3sKICAib3AiOiB7CiAgICAibmFtZSI6ICJSRUdFWFBfUkVQTEFDRSIsCiAgICAia2luZCI6ICJPVEhFUl9GVU5DVElPTiIsCiAgICAic3ludGF4IjogIkZVTkNUSU9OIgogIH0sCiAgIm9wZXJhbmRzIjogWwogICAgewogICAgICAiaW5wdXQiOiA5LAogICAgICAibmFtZSI6ICIkOSIKICAgIH0sCiAgICB7CiAgICAgICJsaXRlcmFsIjogIlthLXpBLVowLTldKyIsCiAgICAgICJ0eXBlIjogewogICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICJudWxsYWJsZSI6IGZhbHNlLAogICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICB9CiAgICB9LAogICAgewogICAgICAibGl0ZXJhbCI6ICI8Kj4iLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJDSEFSIiwKICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAicHJlY2lzaW9uIjogMwogICAgICB9CiAgICB9CiAgXQp9dAAKZmllbGRUeXBlc3NyABdqYXZhLnV0aWwuTGlua2VkSGFzaE1hcDTATlwQbMD7AgABWgALYWNjZXNzT3JkZXJ4cgARamF2YS51dGlsLkhhc2hNYXAFB9rBwxZg0QMAAkYACmxvYWRGYWN0b3JJAAl0aHJlc2hvbGR4cD9AAAAAAAAMdwgAAAAQAAAAC3QADmFjY291bnRfbnVtYmVyfnIAKW9yZy5vcGVuc2VhcmNoLnNxbC5kYXRhLnR5cGUuRXhwckNvcmVUeXBlAAAAAAAAAAASAAB4cgAOamF2YS5sYW5nLkVudW0AAAAAAAAAABIAAHhwdAAETE9OR3QACWZpcnN0bmFtZXNyADpvcmcub3BlbnNlYXJjaC5zcWwub3BlbnNlYXJjaC5kYXRhLnR5cGUuT3BlblNlYXJjaFRleHRUeXBlrYOjkwTjMUQCAAFMAAZmaWVsZHN0AA9MamF2YS91dGlsL01hcDt4cgA6b3JnLm9wZW5zZWFyY2guc3FsLm9wZW5zZWFyY2guZGF0YS50eXBlLk9wZW5TZWFyY2hEYXRhVHlwZcJjvMoC+gU1AgADTAAMZXhwckNvcmVUeXBldAArTG9yZy9vcGVuc2VhcmNoL3NxbC9kYXRhL3R5cGUvRXhwckNvc
mVUeXBlO0wAC21hcHBpbmdUeXBldABITG9yZy9vcGVuc2VhcmNoL3NxbC9vcGVuc2VhcmNoL2RhdGEvdHlwZS9PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGU7TAAKcHJvcGVydGllc3EAfgAReHB+cQB+AAt0AAdVTktOT1dOfnIARm9yZy5vcGVuc2VhcmNoLnNxbC5vcGVuc2VhcmNoLmRhdGEudHlwZS5PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGUAAAAAAAAAABIAAHhxAH4ADHQABFRleHRzcgA8c2hhZGVkLmNvbS5nb29nbGUuY29tbW9uLmNvbGxlY3QuSW1tdXRhYmxlTWFwJFNlcmlhbGl6ZWRGb3JtAAAAAAAAAAACAAJMAARrZXlzdAASTGphdmEvbGFuZy9PYmplY3Q7TAAGdmFsdWVzcQB+ABx4cHVyABNbTGphdmEubGFuZy5PYmplY3Q7kM5YnxBzKWwCAAB4cAAAAAB1cQB+AB4AAAAAc3EAfgAAAAAAA3cEAAAAAnQAB2tleXdvcmRzcQB+ABJ+cQB+AAt0AAZTVFJJTkd+cQB+ABh0AAdLZXl3b3JkcQB+AB14dAAHYWRkcmVzc3NxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAB4dAAHYmFsYW5jZXEAfgANdAAGZ2VuZGVyc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAEY2l0eXNxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAJxAH4AInEAfgAjeHQACGVtcGxveWVyc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAFc3RhdGVzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AANhZ2VxAH4ADXQABWVtYWlsc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAIbGFzdG5hbWVzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h4AHg=\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0}}}},"sample_logs":{"top_hits":{"from":0,"size":10,"version":false,"seq_no_primary_term":false,"explain":false,"_source":{"includes":["email"],"excludes":[]}}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[AGGREGATION->rel#:LogicalAggregate.NONE.[](input=RelSubset#,group={1},pattern_count=COUNT($1),sample_logs=TAKE($0, $2)), LIMIT->10000], 
OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"patterns_field":{"terms":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXNyABFqYXZhLnV0aWwuQ29sbFNlcleOq7Y6G6gRAwABSQADdGFneHAAAAADdwQAAAAGdAAHcm93VHlwZXQGy3sKICAiZmllbGRzIjogWwogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhY2NvdW50X251bWJlciIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImZpcnN0bmFtZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImFkZHJlc3MiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJiYWxhbmNlIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiZ2VuZGVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiY2l0eSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImVtcGxveWVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAic3RhdGUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhZ2UiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgInByZWNpc2lvbiI6IC0xLAogICAgICAibmFtZSI6ICJlbWFpbCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImxhc3RuYW1lIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2lkIgogICAgfSwKICAgIHsKICAgICA
gInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2luZGV4IgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiUkVBTCIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJuYW1lIjogIl9zY29yZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlJFQUwiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfbWF4c2NvcmUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfc29ydCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogIl9yb3V0aW5nIgogICAgfQogIF0sCiAgIm51bGxhYmxlIjogdHJ1ZQp9dAAEZXhwcnQGGXsKICAib3AiOiB7CiAgICAibmFtZSI6ICJDQVNFIiwKICAgICJraW5kIjogIkNBU0UiLAogICAgInN5bnRheCI6ICJTUEVDSUFMIgogIH0sCiAgIm9wZXJhbmRzIjogWwogICAgewogICAgICAib3AiOiB7CiAgICAgICAgIm5hbWUiOiAiU0VBUkNIIiwKICAgICAgICAia2luZCI6ICJTRUFSQ0giLAogICAgICAgICJzeW50YXgiOiAiSU5URVJOQUwiCiAgICAgIH0sCiAgICAgICJvcGVyYW5kcyI6IFsKICAgICAgICB7CiAgICAgICAgICAiaW5wdXQiOiA5LAogICAgICAgICAgIm5hbWUiOiAiJDkiCiAgICAgICAgfSwKICAgICAgICB7CiAgICAgICAgICAibGl0ZXJhbCI6IHsKICAgICAgICAgICAgInJhbmdlU2V0IjogWwogICAgICAgICAgICAgIFsKICAgICAgICAgICAgICAgICJzaW5nbGV0b24iLAogICAgICAgICAgICAgICAgIntcInZhbHVlXCI6XCJcIixcImNoYXJzZXROYW1lXCI6XCJJU08tODg1OS0xXCIsXCJjb2xsYXRpb25cIjp7XCJjb2xsYXRpb25OYW1lXCI6XCJJU08tODg1OS0xJGVuX1VTJHByaW1hcnlcIixcImNvZXJjaWJpbGl0eVwiOlwiSU1QTElDSVRcIixcImxvY2FsZVwiOlwiZW5fVVNcIn0sXCJjaGFyc2V0XCI6XCJJU08tODg1OS0xXCIsXCJ2YWx1ZUJ5dGVzXCI6bnVsbH0iCiAgICAgICAgICAgICAgXQogICAgICAgICAgICBdLAogICAgICAgICAgICAibnVsbEFzIjogIlRSVUUiCiAgICAgICAgICB9LAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAgICAgInByZWNpc2lvbiI6IC0xCiAgICAgICAgICB9CiAgICAgICAgfQogICAgICBdCiAgICB9LAogICAgewogICAgICAibGl0ZXJhbCI6ICIiLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgfQogICAgfSwKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJ
uYW1lIjogIlJFR0VYUF9SRVBMQUNFIiwKICAgICAgICAia2luZCI6ICJPVEhFUl9GVU5DVElPTiIsCiAgICAgICAgInN5bnRheCI6ICJGVU5DVElPTiIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJpbnB1dCI6IDksCiAgICAgICAgICAibmFtZSI6ICIkOSIKICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICJsaXRlcmFsIjogIlthLXpBLVowLTldKyIsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IGZhbHNlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICJsaXRlcmFsIjogIjwqPiIsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IGZhbHNlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0KICBdCn10AApmaWVsZFR5cGVzc3IAF2phdmEudXRpbC5MaW5rZWRIYXNoTWFwNMBOXBBswPsCAAFaAAthY2Nlc3NPcmRlcnhyABFqYXZhLnV0aWwuSGFzaE1hcAUH2sHDFmDRAwACRgAKbG9hZEZhY3RvckkACXRocmVzaG9sZHhwP0AAAAAAAAx3CAAAABAAAAALdAAOYWNjb3VudF9udW1iZXJ+cgApb3JnLm9wZW5zZWFyY2guc3FsLmRhdGEudHlwZS5FeHByQ29yZVR5cGUAAAAAAAAAABIAAHhyAA5qYXZhLmxhbmcuRW51bQAAAAAAAAAAEgAAeHB0AARMT05HdAAJZmlyc3RuYW1lc3IAOm9yZy5vcGVuc2VhcmNoLnNxbC5vcGVuc2VhcmNoLmRhdGEudHlwZS5PcGVuU2VhcmNoVGV4dFR5cGWtg6OTBOMxRAIAAUwABmZpZWxkc3QAD0xqYXZhL3V0aWwvTWFwO3hyADpvcmcub3BlbnNlYXJjaC5zcWwub3BlbnNlYXJjaC5kYXRhLnR5cGUuT3BlblNlYXJjaERhdGFUeXBlwmO8ygL6BTUCAANMAAxleHByQ29yZVR5cGV0ACtMb3JnL29wZW5zZWFyY2gvc3FsL2RhdGEvdHlwZS9FeHByQ29yZVR5cGU7TAALbWFwcGluZ1R5cGV0AEhMb3JnL29wZW5zZWFyY2gvc3FsL29wZW5zZWFyY2gvZGF0YS90eXBlL09wZW5TZWFyY2hEYXRhVHlwZSRNYXBwaW5nVHlwZTtMAApwcm9wZXJ0aWVzcQB+ABF4cH5xAH4AC3QAB1VOS05PV05+cgBGb3JnLm9wZW5zZWFyY2guc3FsLm9wZW5zZWFyY2guZGF0YS50eXBlLk9wZW5TZWFyY2hEYXRhVHlwZSRNYXBwaW5nVHlwZQAAAAAAAAAAEgAAeHEAfgAMdAAEVGV4dHNyADxzaGFkZWQuY29tLmdvb2dsZS5jb21tb24uY29sbGVjdC5JbW11dGFibGVNYXAkU2VyaWFsaXplZEZvcm0AAAAAAAAAAAIAAkwABGtleXN0ABJMamF2YS9sYW5nL09iamVjdDtMAAZ2YWx1ZXNxAH4AHHhwdXIAE1tMamF2YS5sYW5nLk9iamVjdDuQzlifEHMpbAIAAHhwAAAAAHVxAH4AHgAAAABzcQB+AAAAAAADdwQAAAACdAAHa2V5d29yZHNxAH4AEn5xAH4AC3QABlNUUklOR35xAH4AGHQAB0t
leXdvcmRxAH4AHXh0AAdhZGRyZXNzc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAHh0AAdiYWxhbmNlcQB+AA10AAZnZW5kZXJzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AARjaXR5c3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAIZW1wbG95ZXJzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AAVzdGF0ZXNxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAJxAH4AInEAfgAjeHQAA2FnZXEAfgANdAAFZW1haWxzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AAhsYXN0bmFtZXNxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAJxAH4AInEAfgAjeHgAeA==\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0}},"missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"pattern_count":{"value_count":{"script":{"source":"{\"langType\":\"calcite\",\"script\":\"rO0ABXNyABFqYXZhLnV0aWwuQ29sbFNlcleOq7Y6G6gRAwABSQADdGFneHAAAAADdwQAAAAGdAAHcm93VHlwZXQGy3sKICAiZmllbGRzIjogWwogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhY2NvdW50X251bWJlciIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImZpcnN0bmFtZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImFkZHJlc3MiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJiYWxhbmNlIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiZ2VuZGVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiY2l0eSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImVtcGxveWVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKIC
AgICAgIm5hbWUiOiAic3RhdGUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhZ2UiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgInByZWNpc2lvbiI6IC0xLAogICAgICAibmFtZSI6ICJlbWFpbCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImxhc3RuYW1lIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2lkIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2luZGV4IgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiUkVBTCIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJuYW1lIjogIl9zY29yZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlJFQUwiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfbWF4c2NvcmUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfc29ydCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogIl9yb3V0aW5nIgogICAgfQogIF0sCiAgIm51bGxhYmxlIjogdHJ1ZQp9dAAEZXhwcnQGGXsKICAib3AiOiB7CiAgICAibmFtZSI6ICJDQVNFIiwKICAgICJraW5kIjogIkNBU0UiLAogICAgInN5bnRheCI6ICJTUEVDSUFMIgogIH0sCiAgIm9wZXJhbmRzIjogWwogICAgewogICAgICAib3AiOiB7CiAgICAgICAgIm5hbWUiOiAiU0VBUkNIIiwKICAgICAgICAia2luZCI6ICJTRUFSQ0giLAogICAgICAgICJzeW50YXgiOiAiSU5URVJOQUwiCiAgICAgIH0sCiAgICAgICJvcGVyYW5kcyI6IFsKICAgICAgICB7CiAgICAgICAgICAiaW5wdXQiOiA5LAogICAgICAgICAgIm5hbWUiOiAiJDkiCiAgICAgICAgfSwKICAgICAgICB7CiAgICAgICAgICAibGl0ZXJhbCI6IHsKICAgICAgICAgICAgInJhbmdlU2V0IjogWwogICAgICAgICAgICAgIFsKICAgICAgICAgICAgICAgICJzaW5nbGV0b24iLAogICAgICAgICAgICAgICAgIntcInZhbHVlXCI6XCJcIixcImNoYXJzZXROYW1lXCI6XCJJU08tODg1OS0xXCIsXCJjb2xsYXRpb25cIjp7XCJjb2xsYXRpb25OYW1lXCI6XCJJU08tODg1OS0xJGVuX1VTJHByaW1hcnlcIixcImNvZXJjaWJpbGl0eVwiOlwiSU1QTElDSVRcIixcImxvY2FsZVwiOlwiZW
5fVVNcIn0sXCJjaGFyc2V0XCI6XCJJU08tODg1OS0xXCIsXCJ2YWx1ZUJ5dGVzXCI6bnVsbH0iCiAgICAgICAgICAgICAgXQogICAgICAgICAgICBdLAogICAgICAgICAgICAibnVsbEFzIjogIlRSVUUiCiAgICAgICAgICB9LAogICAgICAgICAgInR5cGUiOiB7CiAgICAgICAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAgICAgInByZWNpc2lvbiI6IC0xCiAgICAgICAgICB9CiAgICAgICAgfQogICAgICBdCiAgICB9LAogICAgewogICAgICAibGl0ZXJhbCI6ICIiLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgfQogICAgfSwKICAgIHsKICAgICAgIm9wIjogewogICAgICAgICJuYW1lIjogIlJFR0VYUF9SRVBMQUNFIiwKICAgICAgICAia2luZCI6ICJPVEhFUl9GVU5DVElPTiIsCiAgICAgICAgInN5bnRheCI6ICJGVU5DVElPTiIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJpbnB1dCI6IDksCiAgICAgICAgICAibmFtZSI6ICIkOSIKICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICJsaXRlcmFsIjogIlthLXpBLVowLTldKyIsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IGZhbHNlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICJsaXRlcmFsIjogIjwqPiIsCiAgICAgICAgICAidHlwZSI6IHsKICAgICAgICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICAgICAgICJudWxsYWJsZSI6IGZhbHNlLAogICAgICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIF0KICAgIH0KICBdCn10AApmaWVsZFR5cGVzc3IAF2phdmEudXRpbC5MaW5rZWRIYXNoTWFwNMBOXBBswPsCAAFaAAthY2Nlc3NPcmRlcnhyABFqYXZhLnV0aWwuSGFzaE1hcAUH2sHDFmDRAwACRgAKbG9hZEZhY3RvckkACXRocmVzaG9sZHhwP0AAAAAAAAx3CAAAABAAAAALdAAOYWNjb3VudF9udW1iZXJ+cgApb3JnLm9wZW5zZWFyY2guc3FsLmRhdGEudHlwZS5FeHByQ29yZVR5cGUAAAAAAAAAABIAAHhyAA5qYXZhLmxhbmcuRW51bQAAAAAAAAAAEgAAeHB0AARMT05HdAAJZmlyc3RuYW1lc3IAOm9yZy5vcGVuc2VhcmNoLnNxbC5vcGVuc2VhcmNoLmRhdGEudHlwZS5PcGVuU2VhcmNoVGV4dFR5cGWtg6OTBOMxRAIAAUwABmZpZWxkc3QAD0xqYXZhL3V0aWwvTWFwO3hyADpvcmcub3BlbnNlYXJjaC5zcWwub3BlbnNlYXJjaC5kYXRhLnR5cGUuT3BlblNlYXJjaERhdGFUeXBlwmO8ygL6BTUCAANMAAxleHByQ29yZVR5cGV0ACtMb3JnL29wZW5zZWFyY2gvc3FsL2RhdGEvdHlwZS9FeHByQ29yZVR5cGU7TAALbWFwcG
luZ1R5cGV0AEhMb3JnL29wZW5zZWFyY2gvc3FsL29wZW5zZWFyY2gvZGF0YS90eXBlL09wZW5TZWFyY2hEYXRhVHlwZSRNYXBwaW5nVHlwZTtMAApwcm9wZXJ0aWVzcQB+ABF4cH5xAH4AC3QAB1VOS05PV05+cgBGb3JnLm9wZW5zZWFyY2guc3FsLm9wZW5zZWFyY2guZGF0YS50eXBlLk9wZW5TZWFyY2hEYXRhVHlwZSRNYXBwaW5nVHlwZQAAAAAAAAAAEgAAeHEAfgAMdAAEVGV4dHNyADxzaGFkZWQuY29tLmdvb2dsZS5jb21tb24uY29sbGVjdC5JbW11dGFibGVNYXAkU2VyaWFsaXplZEZvcm0AAAAAAAAAAAIAAkwABGtleXN0ABJMamF2YS9sYW5nL09iamVjdDtMAAZ2YWx1ZXNxAH4AHHhwdXIAE1tMamF2YS5sYW5nLk9iamVjdDuQzlifEHMpbAIAAHhwAAAAAHVxAH4AHgAAAABzcQB+AAAAAAADdwQAAAACdAAHa2V5d29yZHNxAH4AEn5xAH4AC3QABlNUUklOR35xAH4AGHQAB0tleXdvcmRxAH4AHXh0AAdhZGRyZXNzc3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAHh0AAdiYWxhbmNlcQB+AA10AAZnZW5kZXJzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AARjaXR5c3EAfgAQcQB+ABZxAH4AGXEAfgAdc3EAfgAAAAAAA3cEAAAAAnEAfgAicQB+ACN4dAAIZW1wbG95ZXJzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AAVzdGF0ZXNxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAJxAH4AInEAfgAjeHQAA2FnZXEAfgANdAAFZW1haWxzcQB+ABBxAH4AFnEAfgAZcQB+AB1zcQB+AAAAAAADdwQAAAACcQB+ACJxAH4AI3h0AAhsYXN0bmFtZXNxAH4AEHEAfgAWcQB+ABlxAH4AHXNxAH4AAAAAAAN3BAAAAAJxAH4AInEAfgAjeHgAeA==\"}","lang":"opensearch_compounded_script","params":{"utcTimestamp": 0}}}},"sample_logs":{"top_hits":{"from":0,"size":10,"version":false,"seq_no_primary_term":false,"explain":false,"_source":{"includes":["email"],"excludes":[]}}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_brain_agg_push.json b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_brain_agg_push.json deleted file mode 100644 index bd4d36474e2..00000000000 --- a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_brain_agg_push.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "calcite": { - "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n 
LogicalProject(patterns_field=[SAFE_CAST(ITEM($1, 'pattern'))], pattern_count=[SAFE_CAST(ITEM($1, 'pattern_count'))], tokens=[SAFE_CAST(ITEM($1, 'tokens'))], sample_logs=[SAFE_CAST(ITEM($1, 'sample_logs'))])\n LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}])\n LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2)])\n LogicalProject(email=[$9], $f17=[10], $f18=[100000])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n Uncollect\n LogicalProject(patterns_field=[$cor0.patterns_field])\n LogicalValues(tuples=[[{ 0 }]])\n", - "physical": "EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..1=[{inputs}], expr#2=['pattern'], expr#3=[ITEM($t1, $t2)], expr#4=[SAFE_CAST($t3)], expr#5=['pattern_count'], expr#6=[ITEM($t1, $t5)], expr#7=[SAFE_CAST($t6)], expr#8=['tokens'], expr#9=[ITEM($t1, $t8)], expr#10=[SAFE_CAST($t9)], expr#11=['sample_logs'], expr#12=[ITEM($t1, $t11)], expr#13=[SAFE_CAST($t12)], patterns_field=[$t4], pattern_count=[$t7], tokens=[$t10], sample_logs=[$t13])\n EnumerableCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}])\n EnumerableAggregate(group=[{}], patterns_field=[pattern($0, $1, $2)])\n EnumerableCalc(expr#0..16=[{inputs}], expr#17=[10], expr#18=[100000], email=[$t9], $f17=[$t17], $f18=[$t18])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n EnumerableUncollect\n EnumerableCalc(expr#0=[{inputs}], expr#1=[$cor0], expr#2=[$t1.patterns_field], patterns_field=[$t2])\n EnumerableValues(tuples=[[{ 0 }]])\n" - } -} diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_brain_agg_push.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_brain_agg_push.yaml new file mode 100644 index 00000000000..bc9bc027e34 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_brain_agg_push.yaml @@ -0,0 +1,21 @@ 
+calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(patterns_field=[SAFE_CAST(ITEM($1, 'pattern'))], pattern_count=[SAFE_CAST(ITEM($1, 'pattern_count'))], tokens=[SAFE_CAST(ITEM($1, 'tokens'))], sample_logs=[SAFE_CAST(ITEM($1, 'sample_logs'))]) + LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}]) + LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2, $3)]) + LogicalProject(email=[$9], $f17=[10], $f18=[100000], $f19=[true]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + Uncollect + LogicalProject(patterns_field=[$cor0.patterns_field]) + LogicalValues(tuples=[[{ 0 }]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..1=[{inputs}], expr#2=['pattern'], expr#3=[ITEM($t1, $t2)], expr#4=[SAFE_CAST($t3)], expr#5=['pattern_count'], expr#6=[ITEM($t1, $t5)], expr#7=[SAFE_CAST($t6)], expr#8=['tokens'], expr#9=[ITEM($t1, $t8)], expr#10=[SAFE_CAST($t9)], expr#11=['sample_logs'], expr#12=[ITEM($t1, $t11)], expr#13=[SAFE_CAST($t12)], patterns_field=[$t4], pattern_count=[$t7], tokens=[$t10], sample_logs=[$t13]) + EnumerableCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}]) + EnumerableAggregate(group=[{}], patterns_field=[pattern($0, $1, $2, $3)]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[10], expr#18=[100000], expr#19=[true], email=[$t9], $f17=[$t17], $f18=[$t18], $f19=[$t19]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableUncollect + EnumerableCalc(expr#0=[{inputs}], expr#1=[$cor0], expr#2=[$t1.patterns_field], patterns_field=[$t2]) + EnumerableValues(tuples=[[{ 0 }]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern.json b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern.json deleted file mode 100644 index 80057b52d75..00000000000 --- 
a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "calcite": { - "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>'), $9), 'pattern'))], tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>'), $9), 'tokens'))])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", - "physical": "EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..16=[{inputs}], expr#17=['[a-zA-Z0-9]+':VARCHAR], expr#18=['<*>'], expr#19=[REGEXP_REPLACE($t9, $t17, $t18)], expr#20=[PATTERN_PARSER($t19, $t9)], expr#21=['pattern'], expr#22=[ITEM($t20, $t21)], expr#23=[SAFE_CAST($t22)], expr#24=['tokens'], expr#25=[ITEM($t20, $t24)], expr#26=[SAFE_CAST($t25)], proj#0..10=[{exprs}], patterns_field=[$t23], tokens=[$t26])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n" - } -} diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern.yaml new file mode 100644 index 00000000000..056b8a463a7 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], patterns_field=[CASE(SEARCH($9, Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($9, 
'[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR))]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR], expr#18=[SEARCH($t9, $t17)], expr#19=['':VARCHAR], expr#20=['[a-zA-Z0-9]+':VARCHAR], expr#21=['<*>':VARCHAR], expr#22=[REGEXP_REPLACE($t9, $t20, $t21)], expr#23=[CASE($t18, $t19, $t22)], proj#0..10=[{exprs}], patterns_field=[$t23]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern_agg_push.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern_agg_push.yaml index f9d28c5503b..bd40ed4bb7a 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern_agg_push.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_patterns_simple_pattern_agg_push.yaml @@ -3,11 +3,11 @@ calcite: LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) LogicalProject(patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($0, $2), 'pattern'))], pattern_count=[$1], tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($0, $2), 'tokens'))], sample_logs=[$2]) LogicalAggregate(group=[{1}], pattern_count=[COUNT($1)], sample_logs=[TAKE($0, $2)]) - LogicalProject(email=[$9], patterns_field=[REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>')], $f18=[10]) + LogicalProject(email=[$9], patterns_field=[CASE(SEARCH($9, Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($9, '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR))], $f18=[10]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableLimit(fetch=[10000]) EnumerableCalc(expr#0..2=[{inputs}], expr#3=[PATTERN_PARSER($t0, $t2)], expr#4=['pattern'], expr#5=[ITEM($t3, $t4)], 
expr#6=[SAFE_CAST($t5)], expr#7=['tokens'], expr#8=[ITEM($t3, $t7)], expr#9=[SAFE_CAST($t8)], patterns_field=[$t6], pattern_count=[$t1], tokens=[$t9], sample_logs=[$t2]) EnumerableAggregate(group=[{1}], pattern_count=[COUNT($1)], sample_logs=[TAKE($0, $2)]) - EnumerableCalc(expr#0..16=[{inputs}], expr#17=['[a-zA-Z0-9]+':VARCHAR], expr#18=['<*>'], expr#19=[REGEXP_REPLACE($t9, $t17, $t18)], expr#20=[10], email=[$t9], patterns_field=[$t19], $f18=[$t20]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR], expr#18=[SEARCH($t9, $t17)], expr#19=['':VARCHAR], expr#20=['[a-zA-Z0-9]+':VARCHAR], expr#21=['<*>':VARCHAR], expr#22=[REGEXP_REPLACE($t9, $t20, $t21)], expr#23=[CASE($t18, $t19, $t22)], expr#24=[10], email=[$t9], patterns_field=[$t23], $f18=[$t24]) CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_brain_agg_push.json b/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_brain_agg_push.json deleted file mode 100644 index 946e447b9fc..00000000000 --- a/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_brain_agg_push.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "root":{ - "name":"ProjectOperator", - "description":{ - "fields":"[pattern_count, sample_logs, patterns_field]" - }, - "children":[ - { - "name":"AggregationOperator", - "description":{ - "aggregators":"[pattern_count, sample_logs]", - "groupBy":"[patterns_field]" - }, - "children":[ - { - "name":"WindowOperator", - "description":{ - "function":"patterns_field", - "definition":{ - "partitionBy":"[]", - "sortList":{} - } - }, - "children":[ - { - "name":"OpenSearchIndexScan", - "description":{ - "request":"OpenSearchQueryRequest(indexName=opensearch-sql_test_index_account, sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\"}, needClean=true, searchDone=false, pitId=null, cursorKeepAlive=1m, searchAfter=null, searchResponse=null)" - 
}, - "children":[] - } - ] - } - ] - } - ] - } -} \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_brain_agg_push.yaml b/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_brain_agg_push.yaml new file mode 100644 index 00000000000..a73e2590d4d --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_brain_agg_push.yaml @@ -0,0 +1,24 @@ +root: + name: ProjectOperator + description: + fields: "[pattern_count, sample_logs, patterns_field]" + children: + - name: AggregationOperator + description: + aggregators: "[pattern_count, sample_logs]" + groupBy: "[patterns_field]" + children: + - name: WindowOperator + description: + function: patterns_field + definition: + partitionBy: "[]" + sortList: {} + children: + - name: OpenSearchIndexScan + description: + request: "OpenSearchQueryRequest(indexName=opensearch-sql_test_index_account,\ + \ sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\"},\ + \ needClean=true, searchDone=false, pitId=*,\ + \ cursorKeepAlive=1m, searchAfter=null, searchResponse=null)" + children: [] diff --git a/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_simple_pattern.json b/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_simple_pattern.json deleted file mode 100644 index f9e1ceaf901..00000000000 --- a/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_simple_pattern.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "root": { - "name": "ProjectOperator", - "description": { - "fields": "[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname, patterns_field]" - }, - "children": [ - { - "name": "OpenSearchIndexScan", - "description": { - "request": "OpenSearchQueryRequest(indexName=opensearch-sql_test_index_account, 
sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\",\"patterns_field\"],\"excludes\":[]}}, needClean=true, searchDone=false, pitId=*, cursorKeepAlive=1m, searchAfter=null, searchResponse=null)" - }, - "children": [] - } - ] - } -} diff --git a/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_simple_pattern.yaml b/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_simple_pattern.yaml new file mode 100644 index 00000000000..ec791b34e14 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/ppl/explain_patterns_simple_pattern.yaml @@ -0,0 +1,16 @@ +root: + name: ProjectOperator + description: + fields: "[account_number, firstname, address, balance, gender, city, employer,\ + \ state, age, email, lastname, patterns_field]" + children: + - name: OpenSearchIndexScan + description: + request: "OpenSearchQueryRequest(indexName=opensearch-sql_test_index_account,\ + \ sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"_source\"\ + :{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\"\ + ,\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"\ + ,\"patterns_field\"],\"excludes\":[]}}, needClean=true, searchDone=false,\ + \ pitId=*,\ + \ cursorKeepAlive=1m, searchAfter=null, searchResponse=null)" + children: [] \ No newline at end of file diff --git a/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/3570.yml b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/3570.yml index 10e1ff7e1ec..30175754646 100644 --- a/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/3570.yml +++ b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/3570.yml @@ -49,7 +49,7 @@ teardown: Content-Type: 'application/json' ppl: body: - query: 'source=hdfs_logs | patterns content | head 1 | fields 
content, patterns_field, tokens' + query: 'source=hdfs_logs | patterns content show_numbered_token=true | head 1 | fields content, patterns_field, tokens' - match: {"total": 1} - match: {"schema": [{"name": "content", "type": "string"}, {"name": "patterns_field", "type": "string"}, {"name": "tokens", "type": "struct"}]} - match: {"datarows": [ @@ -74,7 +74,7 @@ teardown: Content-Type: 'application/json' ppl: body: - query: 'source=hdfs_logs | patterns content mode=aggregation | sort patterns_field | fields patterns_field, pattern_count' + query: 'source=hdfs_logs | patterns content mode=aggregation show_numbered_token=true | sort patterns_field | fields patterns_field, pattern_count' - match: {"total": 5} - match: {"schema": [{"name": "patterns_field", "type": "string"}, {"name": "pattern_count", "type": "bigint"}]} - match: {"datarows": [ @@ -114,7 +114,7 @@ teardown: Content-Type: 'application/json' ppl: body: - query: 'source=hdfs_logs | patterns content method=brain mode=label | head 1 | fields content, patterns_field, tokens' + query: 'source=hdfs_logs | patterns content method=brain mode=label show_numbered_token=true | head 1 | fields content, patterns_field, tokens' - match: {"total": 1} - match: {"schema": [{"name": "content", "type": "string"}, {"name": "patterns_field", "type": "string"}, {"name": "tokens", "type": "struct"}]} - match: {"datarows": [ @@ -148,7 +148,7 @@ teardown: Content-Type: 'application/json' ppl: body: - query: 'source=hdfs_logs | patterns content method=brain mode=aggregation | fields patterns_field, pattern_count' + query: 'source=hdfs_logs | patterns content method=brain mode=aggregation show_numbered_token=true | fields patterns_field, pattern_count' - match: {"total": 4} - match: {"schema": [{"name": "patterns_field", "type": "string"}, {"name": "pattern_count", "type": "bigint"}]} - match: {"datarows": [ diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java 
b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java index 86c623bfd44..b8a23378e2b 100644 --- a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java @@ -122,6 +122,13 @@ public class OpenSearchSettings extends Settings { Setting.Property.NodeScope, Setting.Property.Dynamic); + public static final Setting DEFAULT_PATTERN_SHOW_NUMBERED_TOKEN_SETTING = + Setting.boolSetting( + Key.PATTERN_SHOW_NUMBERED_TOKEN.getKeyValue(), + false, + Setting.Property.NodeScope, + Setting.Property.Dynamic); + public static final Setting PPL_REX_MAX_MATCH_LIMIT_SETTING = Setting.intSetting( Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), @@ -394,6 +401,12 @@ public OpenSearchSettings(ClusterSettings clusterSettings) { Key.PATTERN_BUFFER_LIMIT, DEFAULT_PATTERN_BUFFER_LIMIT_SETTING, new Updater(Key.PATTERN_BUFFER_LIMIT)); + register( + settingBuilder, + clusterSettings, + Key.PATTERN_SHOW_NUMBERED_TOKEN, + DEFAULT_PATTERN_SHOW_NUMBERED_TOKEN_SETTING, + new Updater(Key.PATTERN_SHOW_NUMBERED_TOKEN)); register( settingBuilder, clusterSettings, @@ -620,6 +633,7 @@ public static List> pluginSettings() { .add(DEFAULT_PATTERN_MODE_SETTING) .add(DEFAULT_PATTERN_MAX_SAMPLE_COUNT_SETTING) .add(DEFAULT_PATTERN_BUFFER_LIMIT_SETTING) + .add(DEFAULT_PATTERN_SHOW_NUMBERED_TOKEN_SETTING) .add(PPL_REX_MAX_MATCH_LIMIT_SETTING) .add(PPL_VALUES_MAX_LIMIT_SETTING) .add(QUERY_MEMORY_LIMIT_SETTING) diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index b8ae9a90619..f7eb7f3e3c1 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -58,6 +58,7 @@ MAX_MATCH: 'MAX_MATCH'; OFFSET_FIELD: 'OFFSET_FIELD'; BUFFER_LIMIT: 'BUFFER_LIMIT'; LABEL: 'LABEL'; +SHOW_NUMBERED_TOKEN: 'SHOW_NUMBERED_TOKEN'; AGGREGATION: 'AGGREGATION'; //Native JOIN KEYWORDS diff --git 
a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index e25222aae35..05afef5b587 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -371,11 +371,20 @@ patternsMethod ; patternsCommand - : PATTERNS (source_field = expression) (statsByClause)? (METHOD EQUAL method = patternMethod)? (MODE EQUAL pattern_mode = patternMode)? (MAX_SAMPLE_COUNT EQUAL max_sample_count = integerLiteral)? (BUFFER_LIMIT EQUAL buffer_limit = integerLiteral)? (NEW_FIELD EQUAL new_field = stringLiteral)? (patternsParameter)* + : PATTERNS (source_field = expression) (statsByClause)? (patternsCommandOption)* (patternsParameter)* + ; + +patternsCommandOption + : (METHOD EQUAL method = patternMethod) + | (MODE EQUAL pattern_mode = patternMode) + | (MAX_SAMPLE_COUNT EQUAL max_sample_count = integerLiteral) + | (BUFFER_LIMIT EQUAL buffer_limit = integerLiteral) + | (SHOW_NUMBERED_TOKEN EQUAL show_numbered_token = booleanLiteral) ; patternsParameter : (PATTERN EQUAL pattern = stringLiteral) + | (NEW_FIELD EQUAL new_field = stringLiteral) | (VARIABLE_COUNT_THRESHOLD EQUAL variable_count_threshold = integerLiteral) | (FREQUENCY_THRESHOLD_PERCENTAGE EQUAL frequency_threshold_percentage = decimalLiteral) ; @@ -1467,6 +1476,7 @@ searchableKeyWord | FREQUENCY_THRESHOLD_PERCENTAGE | MAX_SAMPLE_COUNT | BUFFER_LIMIT + | SHOW_NUMBERED_TOKEN | WITH | REGEX | PUNCT diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 95273552be8..b5a9fef1a3e 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -773,11 +773,6 @@ public UnresolvedPlan visitSpathCommand(OpenSearchPPLParser.SpathCommandContext public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandContext ctx) { UnresolvedExpression sourceField = 
internalVisitExpression(ctx.source_field); ImmutableMap.Builder builder = ImmutableMap.builder(); - Literal newField = null; - if (ctx.new_field != null) { - newField = (Literal) internalVisitExpression(ctx.new_field); - builder.put("new_field", newField); - } ctx.patternsParameter() .forEach( x -> { @@ -786,32 +781,48 @@ public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandCo builder.put(argName, value); }); java.util.Map arguments = builder.build(); + + ImmutableMap.Builder cmdOptionsBuilder = ImmutableMap.builder(); + ctx.patternsCommandOption() + .forEach( + option -> { + String argName = option.children.get(0).toString().toLowerCase(Locale.ROOT); /* keywords match case-insensitively (e.g. METHOD=brain); normalize so the lower-case key lookups below find the option */ + Literal value = (Literal) internalVisitExpression(option.children.get(2)); + cmdOptionsBuilder.put(argName, value); + }); + java.util.Map cmdOptions = cmdOptionsBuilder.build(); String patternMethod = - ctx.method != null - ? StringUtils.unquoteIdentifier(ctx.method.getText()).toLowerCase(Locale.ROOT) - : settings.getSettingValue(Key.PATTERN_METHOD).toString().toLowerCase(Locale.ROOT); + cmdOptions + .getOrDefault( + "method", AstDSL.stringLiteral(settings.getSettingValue(Key.PATTERN_METHOD))) + .toString(); String patternMode = - ctx.pattern_mode != null - ? StringUtils.unquoteIdentifier(ctx.pattern_mode.getText()).toLowerCase(Locale.ROOT) - : settings.getSettingValue(Key.PATTERN_MODE).toString().toLowerCase(Locale.ROOT); + cmdOptions + .getOrDefault("mode", AstDSL.stringLiteral(settings.getSettingValue(Key.PATTERN_MODE))) + .toString(); Literal patternMaxSampleCount = - ctx.max_sample_count != null - ? (Literal) internalVisitExpression(ctx.max_sample_count) - : AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT)); + cmdOptions.getOrDefault( + "max_sample_count", + AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT))); Literal patternBufferLimit = - ctx.buffer_limit != null - ? 
(Literal) internalVisitExpression(ctx.buffer_limit) - : AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT)); + cmdOptions.getOrDefault( + "buffer_limit", /* key of the BUFFER_LIMIT option; the PATTERN_BUFFER_LIMIT setting is the fallback */ + AstDSL.intLiteral(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT))); + Literal showNumberedToken = + cmdOptions.getOrDefault( + "show_numbered_token", + AstDSL.booleanLiteral(settings.getSettingValue(Key.PATTERN_SHOW_NUMBERED_TOKEN))); List partitionByList = getPartitionExprList(ctx.statsByClause()); return new Patterns( sourceField, partitionByList, - newField != null ? newField.getValue().toString() : "patterns_field", + arguments.getOrDefault("new_field", AstDSL.stringLiteral("patterns_field")).toString(), PatternMethod.valueOf(patternMethod.toUpperCase(Locale.ROOT)), PatternMode.valueOf(patternMode.toUpperCase(Locale.ROOT)), patternMaxSampleCount, patternBufferLimit, + showNumberedToken, arguments); } diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index 1d177f09f16..dbd79eaead0 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -57,6 +57,8 @@ import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.LogicalOrContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.LogicalXorContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.MultiFieldRelevanceFunctionContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.PatternMethodContext; +import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.PatternModeContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.RenameFieldExpressionContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SingleFieldRelevanceFunctionContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.SortFieldContext; @@ -235,6 +237,16 
@@ public UnresolvedExpression visitSortField(SortFieldContext ctx) { return new Field(fieldExpression, ArgumentFactory.getArgumentList(ctx)); } + @Override + public UnresolvedExpression visitPatternMethod(PatternMethodContext ctx) { + return new Literal(StringUtils.unquoteText(ctx.getText()), DataType.STRING); + } + + @Override + public UnresolvedExpression visitPatternMode(PatternModeContext ctx) { + return new Literal(StringUtils.unquoteText(ctx.getText()), DataType.STRING); + } + /** Aggregation function. */ @Override public UnresolvedExpression visitStatsFunctionCall(StatsFunctionCallContext ctx) { diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLGrokTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLGrokTest.java index cbe45b70b67..376a47d0c9a 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLGrokTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLGrokTest.java @@ -19,8 +19,8 @@ public void testGrok() { String ppl = "source=EMP | grok ENAME '.+@%{HOSTNAME:host}' | fields ENAME, host"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(ENAME=[$1], host=[ITEM(GROK($1, '.+@%{HOSTNAME:host}':VARCHAR, 'grok')," - + " 'host')])\n" + "LogicalProject(ENAME=[$1], host=[ITEM(GROK($1, '.+@%{HOSTNAME:host}':VARCHAR," + + " 'grok':VARCHAR), 'host':VARCHAR)])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); @@ -35,8 +35,8 @@ public void testGrokOverriding() { String ppl = "source=EMP | grok ENAME '%{NUMBER} %{GREEDYDATA:ENAME}' | fields ENAME"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(ENAME=[ITEM(GROK($1, '%{NUMBER} %{GREEDYDATA:ENAME}':VARCHAR, 'grok')," - + " 'ENAME')])\n" + "LogicalProject(ENAME=[ITEM(GROK($1, '%{NUMBER} %{GREEDYDATA:ENAME}':VARCHAR," + + " 'grok':VARCHAR), 'ENAME':VARCHAR)])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); diff 
--git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLParseTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLParseTest.java index 776f473c38b..6b613f346bb 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLParseTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLParseTest.java @@ -22,7 +22,7 @@ public void testParse() { RelNode root = getRelNode(ppl); String expectedLogical = "LogicalProject(JOB=[$2], year=[ITEM(PARSE(DATE_FORMAT($4, '%Y-%m-%d':VARCHAR)," - + " '(?\\d{4})-\\d{2}-\\d{2}':VARCHAR, 'regex'), 'year')])\n" + + " '(?\\d{4})-\\d{2}-\\d{2}':VARCHAR, 'regex':VARCHAR), 'year':VARCHAR)])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); @@ -41,7 +41,7 @@ public void testParseOverriding() { RelNode root = getRelNode(ppl); String expectedLogical = "LogicalProject(JOB=[$2], MGR=[ITEM(PARSE(DATE_FORMAT($4, '%Y-%m-%d':VARCHAR)," - + " '(?\\d{4})-\\d{2}-\\d{2}':VARCHAR, 'regex'), 'MGR')])\n" + + " '(?\\d{4})-\\d{2}-\\d{2}':VARCHAR, 'regex':VARCHAR), 'MGR':VARCHAR)])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLPatternsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLPatternsTest.java index 467dd36f548..9a88f25f270 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLPatternsTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLPatternsTest.java @@ -24,157 +24,279 @@ public void setUp() { doReturn("label").when(settings).getSettingValue(Key.PATTERN_MODE); doReturn(10).when(settings).getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT); doReturn(100000).when(settings).getSettingValue(Key.PATTERN_BUFFER_LIMIT); + doReturn(false).when(settings).getSettingValue(Key.PATTERN_SHOW_NUMBERED_TOKEN); } @Test - public void testPatternsLabelModeForSimplePatternMethod() { - String ppl = "source=EMP 
| patterns ENAME | fields ENAME, patterns_field, tokens"; + public void testPatternsLabelMode_NotShowNumberedToken_ForSimplePatternMethod() { + String ppl = "source=EMP | patterns ENAME | fields ENAME, patterns_field"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(ENAME=[$1]," - + " patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($1," - + " '[a-zA-Z0-9]+':VARCHAR, '<*>'), $1), 'pattern'))]," - + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," - + " '<*>'), $1), 'tokens'))])\n" + "LogicalProject(ENAME=[$1], patterns_field=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS" + + " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," + + " '<*>':VARCHAR))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')," - + " `ENAME`)['pattern'] AS STRING) `patterns_field`," - + " SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')," - + " `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + "SELECT `ENAME`, CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END `patterns_field`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsLabelModeWithCustomPatternForSimplePatternMethod() { + public void testPatternsLabelMode_ShowNumberedToken_ForSimplePatternMethod() { String ppl = - "source=EMP | patterns ENAME pattern='[A-H]' | fields ENAME, patterns_field, tokens"; + "source=EMP | patterns ENAME show_numbered_token=true | fields ENAME, patterns_field," + + " tokens"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(ENAME=[$1]," - + " patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($1, '[A-H]':VARCHAR," - + " '<*>'), $1), 'pattern'))], 
tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($1," - + " '[A-H]':VARCHAR, '<*>'), $1), 'tokens'))])\n" + "LogicalProject(ENAME=[$1], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1," + + " Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1," + + " '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR)), $1), 'pattern'))]," + + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS" + + " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," + + " '<*>':VARCHAR)), $1), 'tokens'))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>')," - + " `ENAME`)['pattern'] AS STRING) `patterns_field`," - + " SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>')," + "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN" + + " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `ENAME`)['pattern'] AS" + + " STRING) `patterns_field`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR" + + " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END," + " `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsLabelModeWithPartitionBySimplePatternMethod() { + public void testPatternsLabelModeWithCustomPattern_ShowNumberedToken_ForSimplePatternMethod() { + String ppl = + "source=EMP | patterns ENAME show_numbered_token=true pattern='[A-H]' | fields ENAME," + + " patterns_field, tokens"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(ENAME=[$1], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1," + + " Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1," + + " '[A-H]':VARCHAR, '<*>':VARCHAR)), $1), 'pattern'))]," + + " 
tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS" + + " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[A-H]':VARCHAR, '<*>':VARCHAR))," + + " $1), 'tokens'))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN" + + " '' ELSE REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>') END, `ENAME`)['pattern'] AS STRING)" + + " `patterns_field`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` =" + + " '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>') END, `ENAME`)['tokens'] AS" + + " MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsLabelModeWithCustomField_NotShowNumberedToken_ForSimplePatternMethod() { + String ppl = + "source=EMP | patterns ENAME new_field='upper' pattern='[A-H]' | fields ENAME," + " upper"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(ENAME=[$1], upper=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR)," + + " '':VARCHAR, REGEXP_REPLACE($1, '[A-H]':VARCHAR, '<*>':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " REGEXP_REPLACE(`ENAME`, '[A-H]', '<*>') END `upper`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsLabelModeWithPartitionBy_ShowNumberedToken_SimplePatternMethod() { String ppl = - "source=EMP | patterns ENAME by DEPTNO | fields ENAME, DEPTNO, patterns_field, tokens"; + "source=EMP | patterns ENAME by DEPTNO show_numbered_token=true | fields ENAME, DEPTNO," + + " patterns_field, tokens"; RelNode root = getRelNode(ppl); String expectedLogical = 
"LogicalProject(ENAME=[$1], DEPTNO=[$7]," - + " patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($1," - + " '[a-zA-Z0-9]+':VARCHAR, '<*>'), $1), 'pattern'))]," - + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," - + " '<*>'), $1), 'tokens'))])\n" + + " patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL" + + " AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," + + " '<*>':VARCHAR)), $1), 'pattern'))]," + + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER(CASE(SEARCH($1, Sarg['':VARCHAR; NULL AS" + + " TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," + + " '<*>':VARCHAR)), $1), 'tokens'))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, `DEPTNO`, SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`," - + " '[a-zA-Z0-9]+', '<*>'), `ENAME`)['pattern'] AS STRING) `patterns_field`," - + " SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')," - + " `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + "SELECT `ENAME`, `DEPTNO`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME`" + + " = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END," + + " `ENAME`)['pattern'] AS STRING) `patterns_field`, SAFE_CAST(`PATTERN_PARSER`(CASE" + + " WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`," + + " '[a-zA-Z0-9]+', '<*>') END, `ENAME`)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >)" + + " `tokens`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsLabelModeForBrainMethod() { - String ppl = "source=EMP | patterns ENAME method=BRAIN | fields ENAME, patterns_field, tokens"; + public void testPatternsLabelMode_NotShowNumberedToken_ForBrainMethod() { + String ppl = "source=EMP | patterns ENAME method=BRAIN | fields ENAME, patterns_field"; RelNode root = 
getRelNode(ppl); String expectedLogical = "LogicalProject(ENAME=[$1], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($1, pattern($1," - + " 10, 100000) OVER ()), 'pattern'))], tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($1," - + " pattern($1, 10, 100000) OVER ()), 'tokens'))])\n" + + " 10, 100000, false) OVER (), false), 'pattern'))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10, 100000) OVER" - + " (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))['pattern'] AS STRING)" - + " `patterns_field`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10," - + " 100000) OVER (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))['tokens']" - + " AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10, 100000, FALSE)" + + " OVER (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), FALSE)['pattern']" + + " AS STRING) `patterns_field`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsLabelModeWithPartitionByForBrainMethod() { + public void testPatternsLabelMode_ShowNumberedToken_ForBrainMethod() { String ppl = - "source=EMP | patterns ENAME by DEPTNO method=BRAIN | fields ENAME, DEPTNO," + "source=EMP | patterns ENAME method=BRAIN show_numbered_token=true | fields ENAME," + " patterns_field, tokens"; RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($1, pattern($1," + + " 10, 100000, true) OVER (), true), 'pattern'))]," + + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($1, pattern($1, 10, 100000, true) OVER ()," + + " true), 'tokens'))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, 
`pattern`(`ENAME`, 10, 100000, TRUE)" + + " OVER (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), TRUE)['pattern']" + + " AS STRING) `patterns_field`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`," + + " 10, 100000, TRUE) OVER (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)," + + " TRUE)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsLabelModeWithPartitionBy_NotShowNumberedToken_ForBrainMethod() { + String ppl = + "source=EMP | patterns ENAME by DEPTNO method=BRAIN | fields ENAME, DEPTNO," + + " patterns_field"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(ENAME=[$1], DEPTNO=[$7], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($1," + + " pattern($1, 10, 100000, false) OVER (PARTITION BY $7), false), 'pattern'))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, `DEPTNO`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10," + + " 100000, FALSE) OVER (PARTITION BY `DEPTNO` RANGE BETWEEN UNBOUNDED PRECEDING AND" + + " UNBOUNDED FOLLOWING), FALSE)['pattern'] AS STRING) `patterns_field`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsLabelModeWithPartitionBy_ShowNumberedToken_ForBrainMethod() { + String ppl = + "source=EMP | patterns ENAME by DEPTNO method=BRAIN show_numbered_token=true | fields" + + " ENAME, DEPTNO, patterns_field, tokens"; + RelNode root = getRelNode(ppl); + String expectedLogical = "LogicalProject(ENAME=[$1], DEPTNO=[$7], patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($1," - + " pattern($1, 10, 100000) OVER (PARTITION BY $7)), 'pattern'))]," - + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($1, pattern($1, 10, 100000) OVER (PARTITION" - + " BY $7)), 'tokens'))])\n" + + " pattern($1, 10, 100000, true) OVER 
(PARTITION BY $7), true), 'pattern'))]," + + " tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($1, pattern($1, 10, 100000, true) OVER" + + " (PARTITION BY $7), true), 'tokens'))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = "SELECT `ENAME`, `DEPTNO`, SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10," - + " 100000) OVER (PARTITION BY `DEPTNO` RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED" - + " FOLLOWING))['pattern'] AS STRING) `patterns_field`," - + " SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10, 100000) OVER (PARTITION" - + " BY `DEPTNO` RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING))['tokens']" - + " AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + + " 100000, TRUE) OVER (PARTITION BY `DEPTNO` RANGE BETWEEN UNBOUNDED PRECEDING AND" + + " UNBOUNDED FOLLOWING), TRUE)['pattern'] AS STRING) `patterns_field`," + + " SAFE_CAST(`PATTERN_PARSER`(`ENAME`, `pattern`(`ENAME`, 10, 100000, TRUE) OVER" + + " (PARTITION BY `DEPTNO` RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)," + + " TRUE)['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsAggregationModeForSimplePatternMethod() { + public void testPatternsAggregationMode_NotShowNumberedToken_ForSimplePatternMethod() { String ppl = "source=EMP | patterns ENAME mode=aggregation"; RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalAggregate(group=[{1}], pattern_count=[COUNT($1)], sample_logs=[TAKE($0, $2)])\n" + + " LogicalProject(ENAME=[$1], patterns_field=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL" + + " AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," + + " '<*>':VARCHAR))], $f9=[10])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE 
REGEXP_REPLACE(`ENAME`," + + " '[a-zA-Z0-9]+', '<*>') END `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR" + + " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END)" + + " `pattern_count`, `TAKE`(`ENAME`, 10) `sample_logs`\n" + + "FROM `scott`.`EMP`\n" + + "GROUP BY CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsAggregationMode_ShowNumberedToken_ForSimplePatternMethod() { + String ppl = "source=EMP | patterns ENAME mode=aggregation show_numbered_token=true"; + RelNode root = getRelNode(ppl); + String expectedLogical = "LogicalProject(patterns_field=[SAFE_CAST(ITEM(PATTERN_PARSER($0, $2), 'pattern'))]," + " pattern_count=[$1], tokens=[SAFE_CAST(ITEM(PATTERN_PARSER($0, $2), 'tokens'))]," + " sample_logs=[$2])\n" + " LogicalAggregate(group=[{1}], pattern_count=[COUNT($1)], sample_logs=[TAKE($0," + " $2)])\n" - + " LogicalProject(ENAME=[$1], patterns_field=[REGEXP_REPLACE($1," - + " '[a-zA-Z0-9]+':VARCHAR, '<*>')], $f9=[10])\n" + + " LogicalProject(ENAME=[$1], patterns_field=[CASE(SEARCH($1, Sarg['':VARCHAR; NULL" + + " AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1, '[a-zA-Z0-9]+':VARCHAR," + + " '<*>':VARCHAR))], $f9=[10])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')," - + " `TAKE`(`ENAME`, 10))['pattern'] AS STRING) `patterns_field`," - + " COUNT(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')) `pattern_count`," - + " SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')," - + " `TAKE`(`ENAME`, 10))['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`," - + " `TAKE`(`ENAME`, 10) `sample_logs`\n" + "SELECT SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " 
REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`, 10))['pattern']" + + " AS STRING) `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN" + + " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END) `pattern_count`," + + " SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`, 10))['tokens']" + + " AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`, `TAKE`(`ENAME`, 10) `sample_logs`\n" + "FROM `scott`.`EMP`\n" - + "GROUP BY REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')"; + + "GROUP BY CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsAggregationModeWithGroupByForSimplePatternMethod() { - String ppl = "source=EMP | patterns ENAME by DEPTNO mode=aggregation"; + public void testPatternsAggregationModeWithGroupBy_ShowNumberedToken_ForSimplePatternMethod() { + String ppl = "source=EMP | patterns ENAME by DEPTNO mode=aggregation show_numbered_token=true"; RelNode root = getRelNode(ppl); String expectedLogical = @@ -183,35 +305,73 @@ public void testPatternsAggregationModeWithGroupByForSimplePatternMethod() { + " 'tokens'))], sample_logs=[$3])\n" + " LogicalAggregate(group=[{1, 2}], pattern_count=[COUNT($2)], sample_logs=[TAKE($0," + " $3)])\n" - + " LogicalProject(ENAME=[$1], DEPTNO=[$7], patterns_field=[REGEXP_REPLACE($1," - + " '[a-zA-Z0-9]+':VARCHAR, '<*>')], $f9=[10])\n" + + " LogicalProject(ENAME=[$1], DEPTNO=[$7], patterns_field=[CASE(SEARCH($1," + + " Sarg['':VARCHAR; NULL AS TRUE]:VARCHAR), '':VARCHAR, REGEXP_REPLACE($1," + + " '[a-zA-Z0-9]+':VARCHAR, '<*>':VARCHAR))], $f9=[10])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `DEPTNO`, SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+'," - + " 
'<*>'), `TAKE`(`ENAME`, 10))['pattern'] AS STRING) `patterns_field`," - + " COUNT(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')) `pattern_count`," - + " SAFE_CAST(`PATTERN_PARSER`(REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')," - + " `TAKE`(`ENAME`, 10))['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`," - + " `TAKE`(`ENAME`, 10) `sample_logs`\n" + "SELECT `DEPTNO`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN" + + " '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`," + + " 10))['pattern'] AS STRING) `patterns_field`, COUNT(CASE WHEN `ENAME` IS NULL OR" + + " `ENAME` = '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END)" + + " `pattern_count`, SAFE_CAST(`PATTERN_PARSER`(CASE WHEN `ENAME` IS NULL OR `ENAME` =" + + " '' THEN '' ELSE REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END, `TAKE`(`ENAME`," + + " 10))['tokens'] AS MAP< VARCHAR, VARCHAR ARRAY >) `tokens`, `TAKE`(`ENAME`, 10)" + + " `sample_logs`\n" + "FROM `scott`.`EMP`\n" - + "GROUP BY `DEPTNO`, REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>')"; + + "GROUP BY `DEPTNO`, CASE WHEN `ENAME` IS NULL OR `ENAME` = '' THEN '' ELSE" + + " REGEXP_REPLACE(`ENAME`, '[a-zA-Z0-9]+', '<*>') END"; verifyPPLToSparkSQL(root, expectedSparkSql); } @Test - public void testPatternsAggregationModeForBrainMethod() { + public void testPatternsAggregationMode_NotShowNumberedToken_ForBrainMethod() { String ppl = "source=EMP | patterns ENAME method=BRAIN mode=aggregation"; RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(patterns_field=[SAFE_CAST(ITEM($1, 'pattern'))]," + + " pattern_count=[SAFE_CAST(ITEM($1, 'pattern_count'))]," + + " sample_logs=[SAFE_CAST(ITEM($1, 'sample_logs'))])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}])\n" + + " LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2, $3)])\n" + + " LogicalProject(ENAME=[$1], $f8=[10], $f9=[100000], $f10=[false])\n" + + " 
LogicalTableScan(table=[[scott, EMP]])\n" + + " Uncollect\n" + + " LogicalProject(patterns_field=[$cor0.patterns_field])\n" + + " LogicalValues(tuples=[[{ 0 }]])\n"; + verifyLogical(root, expectedLogical); + + /* + * TODO: Fix Spark SQL conformance + * Spark doesn't have SAFE_CAST and UNNEST + */ + String expectedSparkSql = + "SELECT SAFE_CAST(`t20`.`patterns_field`['pattern'] AS STRING) `patterns_field`," + + " SAFE_CAST(`t20`.`patterns_field`['pattern_count'] AS BIGINT) `pattern_count`," + + " SAFE_CAST(`t20`.`patterns_field`['sample_logs'] AS VARCHAR ARRAY) `sample_logs`\n" + + "FROM (SELECT `pattern`(`ENAME`, 10, 100000, FALSE) `patterns_field`\n" + + "FROM `scott`.`EMP`) `$cor0`,\n" + + "LATERAL UNNEST (SELECT `$cor0`.`patterns_field`\n" + + "FROM (VALUES (0)) `t` (`ZERO`)) `t2` (`patterns_field`) `t20`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsAggregationMode_ShowNumberedToken_ForBrainMethod() { + String ppl = + "source=EMP | patterns ENAME method=BRAIN mode=aggregation show_numbered_token=true"; + RelNode root = getRelNode(ppl); + String expectedLogical = "LogicalProject(patterns_field=[SAFE_CAST(ITEM($1, 'pattern'))]," + " pattern_count=[SAFE_CAST(ITEM($1, 'pattern_count'))], tokens=[SAFE_CAST(ITEM($1," + " 'tokens'))], sample_logs=[SAFE_CAST(ITEM($1, 'sample_logs'))])\n" + " LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{0}])\n" - + " LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2)])\n" - + " LogicalProject(ENAME=[$1], $f8=[10], $f9=[100000])\n" + + " LogicalAggregate(group=[{}], patterns_field=[pattern($0, $1, $2, $3)])\n" + + " LogicalProject(ENAME=[$1], $f8=[10], $f9=[100000], $f10=[true])\n" + " LogicalTableScan(table=[[scott, EMP]])\n" + " Uncollect\n" + " LogicalProject(patterns_field=[$cor0.patterns_field])\n" @@ -228,7 +388,7 @@ public void testPatternsAggregationModeForBrainMethod() { + " SAFE_CAST(`t20`.`patterns_field`['tokens'] AS MAP< VARCHAR, VARCHAR 
ARRAY >)" + " `tokens`, SAFE_CAST(`t20`.`patterns_field`['sample_logs'] AS VARCHAR ARRAY)" + " `sample_logs`\n" - + "FROM (SELECT `pattern`(`ENAME`, 10, 100000) `patterns_field`\n" + + "FROM (SELECT `pattern`(`ENAME`, 10, 100000, TRUE) `patterns_field`\n" + "FROM `scott`.`EMP`) `$cor0`,\n" + "LATERAL UNNEST (SELECT `$cor0`.`patterns_field`\n" + "FROM (VALUES (0)) `t` (`ZERO`)) `t2` (`patterns_field`) `t20`"; @@ -236,17 +396,55 @@ public void testPatternsAggregationModeForBrainMethod() { } @Test - public void testPatternsAggregationModeWithGroupByForBrainMethod() { + public void testPatternsAggregationModeWithGroupBy_NotShowNumberedToken_ForBrainMethod() { String ppl = "source=EMP | patterns ENAME by DEPTNO method=BRAIN mode=aggregation"; RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(DEPTNO=[$0], patterns_field=[SAFE_CAST(ITEM($2, 'pattern'))]," + + " pattern_count=[SAFE_CAST(ITEM($2, 'pattern_count'))]," + + " sample_logs=[SAFE_CAST(ITEM($2, 'sample_logs'))])\n" + + " LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{1}])\n" + + " LogicalAggregate(group=[{1}], patterns_field=[pattern($0, $2, $3, $4)])\n" + + " LogicalProject(ENAME=[$1], DEPTNO=[$7], $f8=[10], $f9=[100000]," + + " $f10=[false])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n" + + " Uncollect\n" + + " LogicalProject(patterns_field=[$cor0.patterns_field])\n" + + " LogicalValues(tuples=[[{ 0 }]])\n"; + verifyLogical(root, expectedLogical); + + /* + * TODO: Fix Spark SQL conformance + * Spark doesn't have SAFE_CAST and UNNEST + */ + String expectedSparkSql = + "SELECT `$cor0`.`DEPTNO`, SAFE_CAST(`t20`.`patterns_field`['pattern'] AS STRING)" + + " `patterns_field`, SAFE_CAST(`t20`.`patterns_field`['pattern_count'] AS BIGINT)" + + " `pattern_count`, SAFE_CAST(`t20`.`patterns_field`['sample_logs'] AS VARCHAR ARRAY)" + + " `sample_logs`\n" + + "FROM (SELECT `DEPTNO`, `pattern`(`ENAME`, 10, 100000, FALSE) `patterns_field`\n" + + "FROM `scott`.`EMP`\n" 
+ + "GROUP BY `DEPTNO`) `$cor0`,\n" + + "LATERAL UNNEST (SELECT `$cor0`.`patterns_field`\n" + + "FROM (VALUES (0)) `t` (`ZERO`)) `t2` (`patterns_field`) `t20`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testPatternsAggregationModeWithGroupBy_ShowNumberedToken_ForBrainMethod() { + String ppl = + "source=EMP | patterns ENAME by DEPTNO method=BRAIN mode=aggregation" + + " show_numbered_token=true"; + RelNode root = getRelNode(ppl); + String expectedLogical = "LogicalProject(DEPTNO=[$0], patterns_field=[SAFE_CAST(ITEM($2, 'pattern'))]," + " pattern_count=[SAFE_CAST(ITEM($2, 'pattern_count'))], tokens=[SAFE_CAST(ITEM($2," + " 'tokens'))], sample_logs=[SAFE_CAST(ITEM($2, 'sample_logs'))])\n" + " LogicalCorrelate(correlation=[$cor0], joinType=[inner], requiredColumns=[{1}])\n" - + " LogicalAggregate(group=[{1}], patterns_field=[pattern($0, $2, $3)])\n" - + " LogicalProject(ENAME=[$1], DEPTNO=[$7], $f8=[10], $f9=[100000])\n" + + " LogicalAggregate(group=[{1}], patterns_field=[pattern($0, $2, $3, $4)])\n" + + " LogicalProject(ENAME=[$1], DEPTNO=[$7], $f8=[10], $f9=[100000], $f10=[true])\n" + " LogicalTableScan(table=[[scott, EMP]])\n" + " Uncollect\n" + " LogicalProject(patterns_field=[$cor0.patterns_field])\n" @@ -263,7 +461,7 @@ public void testPatternsAggregationModeWithGroupByForBrainMethod() { + " `pattern_count`, SAFE_CAST(`t20`.`patterns_field`['tokens'] AS MAP< VARCHAR," + " VARCHAR ARRAY >) `tokens`, SAFE_CAST(`t20`.`patterns_field`['sample_logs'] AS" + " VARCHAR ARRAY) `sample_logs`\n" - + "FROM (SELECT `DEPTNO`, `pattern`(`ENAME`, 10, 100000) `patterns_field`\n" + + "FROM (SELECT `DEPTNO`, `pattern`(`ENAME`, 10, 100000, TRUE) `patterns_field`\n" + "FROM `scott`.`EMP`\n" + "GROUP BY `DEPTNO`) `$cor0`,\n" + "LATERAL UNNEST (SELECT `$cor0`.`patterns_field`\n" diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index 
ea534e0d8d5..a988599ab5c 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -1011,6 +1011,7 @@ public void testPatternsCommand() { when(settings.getSettingValue(Key.PATTERN_MODE)).thenReturn("LABEL"); when(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT)).thenReturn(10); when(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT)).thenReturn(100000); + when(settings.getSettingValue(Key.PATTERN_SHOW_NUMBERED_TOKEN)).thenReturn(false); assertEqual( "source=t | patterns raw new_field=\"custom_field\" " + "pattern=\"custom_pattern\"", patterns( @@ -1022,6 +1023,7 @@ public void testPatternsCommand() { PatternMode.LABEL, AstDSL.intLiteral(10), AstDSL.intLiteral(100000), + AstDSL.booleanLiteral(false), ImmutableMap.of( "new_field", AstDSL.stringLiteral("custom_field"), "pattern", AstDSL.stringLiteral("custom_pattern")))); @@ -1033,6 +1035,7 @@ public void testPatternsCommandWithBrainMethod() { when(settings.getSettingValue(Key.PATTERN_MODE)).thenReturn("LABEL"); when(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT)).thenReturn(10); when(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT)).thenReturn(100000); + when(settings.getSettingValue(Key.PATTERN_SHOW_NUMBERED_TOKEN)).thenReturn(false); assertEqual( "source=t | patterns raw method=BRAIN variable_count_threshold=2" + " frequency_threshold_percentage=0.1", @@ -1045,6 +1048,7 @@ public void testPatternsCommandWithBrainMethod() { PatternMode.LABEL, AstDSL.intLiteral(10), AstDSL.intLiteral(100000), + AstDSL.booleanLiteral(false), ImmutableMap.of( "frequency_threshold_percentage", new Literal(0.1, DataType.DECIMAL), "variable_count_threshold", new Literal(2, DataType.INTEGER)))); @@ -1056,6 +1060,7 @@ public void testPatternsWithoutArguments() { when(settings.getSettingValue(Key.PATTERN_MODE)).thenReturn("LABEL"); when(settings.getSettingValue(Key.PATTERN_MAX_SAMPLE_COUNT)).thenReturn(10); 
when(settings.getSettingValue(Key.PATTERN_BUFFER_LIMIT)).thenReturn(100000); + when(settings.getSettingValue(Key.PATTERN_SHOW_NUMBERED_TOKEN)).thenReturn(false); assertEqual( "source=t | patterns raw", patterns( @@ -1067,6 +1072,7 @@ public void testPatternsWithoutArguments() { PatternMode.LABEL, AstDSL.intLiteral(10), AstDSL.intLiteral(100000), + AstDSL.booleanLiteral(false), ImmutableMap.of())); }