@@ -172,8 +172,11 @@ public BrainLogParser(
* @return list of tokens by splitting preprocessed log message
*/
public List<String> preprocess(String logMessage, String logId) {
if (logMessage == null || logId == null) {
throw new IllegalArgumentException("log message or logId must not be null");
if (logId == null) {
throw new IllegalArgumentException("logId must not be null");
}
if (logMessage == null) {
return Arrays.asList("", logId);
}

List<String> tokens = preprocess(logMessage, this.filterPatternVariableMap, this.delimiters);
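A minimal usage sketch of the new null-handling contract (the no-arg constructor and the org.opensearch.sql.common.patterns package are assumptions inferred from this diff, not shown in it):

BrainLogParser parser = new BrainLogParser(); // assumed default constructor
List<String> tokens = parser.preprocess(null, "log1");
// tokens -> ["", "log1"]: a null message now degrades to an empty token plus the logId
try {
  parser.preprocess("127.0.0.1 - 1234 something", null);
} catch (IllegalArgumentException e) {
  // e.getMessage() -> "logId must not be null": a null logId is still rejected
}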
@@ -224,7 +227,7 @@ public void processTokenHistogram(List<String> tokens) {
* @return list of token lists
*/
public List<List<String>> preprocessAllLogs(List<String> logMessages) {
List<List<String>> preprocessedLogs = new ArrayList<>();
List<List<String>> preprocessedLogs = new ArrayList<>(logMessages.size());

for (int i = 0; i < logMessages.size(); i++) {
String logId = String.valueOf(i);
@@ -291,7 +294,8 @@ public static List<String> parseLogPattern(
String groupCandidateStr = logIdGroupCandidateMap.get(logId);
String[] groupCandidate = groupCandidateStr.split(",");
Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group
return IntStream.range(0, tokens.size() - 1)
int tokenCapacity = Math.max(0, tokens.size() - 1);
return IntStream.range(0, tokenCapacity)
.mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i)))
.map(
entry -> {
@@ -334,7 +338,7 @@ public static List<String> parseLogPattern(
}
return token;
})
.collect(Collectors.toList());
.collect(Collectors.toCollection(() -> new ArrayList<>(tokenCapacity)));
}
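The same pre-sizing idiom in isolation, for readers unfamiliar with Collectors.toCollection (the variable names here are illustrative, not taken from the parser):

int capacity = Math.max(0, tokens.size() - 1); // the trailing element is the logId, not a token
List<String> labeled =
    IntStream.range(0, capacity)
        .mapToObj(i -> i + ":" + tokens.get(i))
        // collect into an ArrayList created at the right size, so it never resizes while collecting
        .collect(Collectors.toCollection(() -> new ArrayList<>(capacity)));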

/**
@@ -349,7 +353,10 @@ public Map<String, Map<String, Object>> parseAllLogPatterns(

Map<String, Map<String, Object>> logPatternMap = new HashMap<>();
for (int i = 0; i < processedMessages.size(); i++) {
List<String> logPattern = this.parseLogPattern(processedMessages.get(i));
List<String> logPattern =
this.parseLogPattern(processedMessages.get(i)).stream()
.map(BrainLogParser::collapseContinuousWildcards)
.collect(Collectors.toList());
String patternKey = String.join(" ", logPattern);
String sampleLog = logMessages.get(i);
logPatternMap.compute(
@@ -379,6 +386,29 @@ public Map<String, Map<String, Object>> parseAllLogPatterns(
return logPatternMap;
}

static String collapseContinuousWildcards(String part) {
// The shortest string that can contain consecutive wildcards is 6 characters: <*><*>
if (part == null || part.length() < 6) {
return part;
}
int tokenLen = VARIABLE_DENOTER.length();
StringBuilder sb = new StringBuilder(part.length());
int i = 0;
while (i < part.length()) {
int j = part.indexOf(VARIABLE_DENOTER, i);
if (j < 0) {
sb.append(part, i, part.length());
break;
}
sb.append(part, i, j).append(VARIABLE_DENOTER);
do {
j += tokenLen;
} while (j <= part.length() - tokenLen && part.startsWith(VARIABLE_DENOTER, j));
i = j;
}
return sb.toString();
}
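A quick illustration of what this helper does, with inputs borrowed from the unit test added later in this diff (assumes VARIABLE_DENOTER is the <*> wildcard):

String collapsed = BrainLogParser.collapseContinuousWildcards("/_temporary/_task_<*><*>_r_<*><*><*>");
// collapsed -> "/_temporary/_task_<*>_r_<*>": each run of adjacent wildcards shrinks to a single <*>
String untouched = BrainLogParser.collapseContinuousWildcards("/_temporary/_task_<*>_r_<*>");
// untouched -> "/_temporary/_task_<*>_r_<*>": already-collapsed patterns pass through unchanged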

private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
Map<Long, Integer> occurrences = new HashMap<>();
for (int i = 0; i < tokens.size() - 1; i++) {
@@ -5,6 +5,7 @@

package org.opensearch.sql.common.patterns;

import com.google.common.collect.ImmutableList;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -20,6 +21,8 @@ public final class PatternUtils {
public static final Pattern WILDCARD_PATTERN = Pattern.compile("<\\*[^>]*>");
public static final String TOKEN_PREFIX = "<token";
public static final Pattern TOKEN_PATTERN = Pattern.compile("<token\\d+>");
public static final List<String> VALID_BRAIN_PARAMETERS =
ImmutableList.of("variable_count_threshold", "frequency_threshold_percentage");

public static Map<String, Map<String, Object>> mergePatternGroups(
Map<String, Map<String, Object>> left,
@@ -55,6 +58,7 @@ public static Map<String, Map<String, Object>> mergePatternGroups(
public static void extractVariables(
ParseResult parseResult, String original, Map<String, List<String>> result, String prefix) {
List<String> parts = parseResult.parts;
List<Boolean> isToken = parseResult.isToken;
List<String> tokenOrder = parseResult.tokenOrder;

if (parts.isEmpty()) {
@@ -67,7 +71,7 @@ public static void extractVariables(

while (i < parts.size()) {
String currentPart = parts.get(i);
if (currentPart.startsWith(prefix)) { // Process already labeled part
if (isToken.get(i)) { // Process already labeled part
String tokenKey = tokenOrder.get(tokenIndex++);
if (i == parts.size() - 1) { // The last part
String value = original.substring(pos);
@@ -97,19 +101,22 @@
}

public static class ParseResult {
List<String> parts;
List<String> tokenOrder;
final List<String> parts;
final List<Boolean> isToken;
final List<String> tokenOrder;

public ParseResult(List<String> parts, List<String> tokenOrder) {
public ParseResult(List<String> parts, List<Boolean> isToken, List<String> tokenOrder) {
this.parts = parts;
this.isToken = isToken;
this.tokenOrder = tokenOrder;
}

public String toTokenOrderString(String prefix) {
StringBuilder result = new StringBuilder();
int tokenIndex = 0;
for (String currentPart : parts) {
if (currentPart.startsWith(prefix)) {
for (int i = 0; i < parts.size(); i++) {
String currentPart = parts.get(i);
if (isToken.get(i)) {
result.append(tokenOrder.get(tokenIndex++));
} else {
result.append(currentPart);
@@ -126,6 +133,7 @@
*/
public static ParseResult parsePattern(String pattern, Pattern compiledPattern) {
List<String> parts = new ArrayList<>();
List<Boolean> isToken = new ArrayList<>();
List<String> tokenOrder = new ArrayList<>();
Matcher matcher = compiledPattern.matcher(pattern);
int lastEnd = 0;
@@ -137,20 +145,23 @@ public static ParseResult parsePattern(String pattern, Pattern compiledPattern)
// Add static part before the found match if there is
if (start > lastEnd) {
parts.add(pattern.substring(lastEnd, start));
isToken.add(false);
}
// Add matched wildcard part and generate token order key
String wildcard = matcher.group();
parts.add(wildcard);
isToken.add(true);
tokenOrder.add("<token" + tokenCount++ + ">");
lastEnd = end;
}

// Add static part at the end
if (lastEnd < pattern.length()) {
parts.add(pattern.substring(lastEnd));
isToken.add(false);
}

return new ParseResult(parts, tokenOrder);
return new ParseResult(parts, isToken, tokenOrder);
}
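For reference, what the reworked parser produces for a simple wildcard pattern (the values follow directly from the code above; the pattern string is just an example):

ParseResult pr =
    parsePattern("PacketResponder failed for blk_<*> on <*>", WILDCARD_PATTERN);
// pr.parts      -> ["PacketResponder failed for blk_", "<*>", " on ", "<*>"]
// pr.isToken    -> [false, true, false, true]
// pr.tokenOrder -> ["<token0>", "<token1>"]
// pr.toTokenOrderString(TOKEN_PREFIX)
//               -> "PacketResponder failed for blk_<token0> on <token1>"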

private static void addToResult(Map<String, List<String>> result, String key, String value) {
@@ -31,6 +31,7 @@ public enum Key {
PATTERN_MODE("plugins.ppl.pattern.mode"),
PATTERN_MAX_SAMPLE_COUNT("plugins.ppl.pattern.max.sample.count"),
PATTERN_BUFFER_LIMIT("plugins.ppl.pattern.buffer.limit"),
PATTERN_SHOW_NUMBERED_TOKEN("plugins.ppl.pattern.show.numbered.token"),
PPL_REX_MAX_MATCH_LIMIT("plugins.ppl.rex.max_match.limit"),
PPL_VALUES_MAX_LIMIT("plugins.ppl.values.max.limit"),
PPL_SYNTAX_LEGACY_PREFERRED("plugins.ppl.syntax.legacy.preferred"),
@@ -10,6 +10,7 @@
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.AbstractMap;
import java.util.Arrays;
@@ -104,6 +105,15 @@ public void testPreprocess() {
assertEquals(expectedResult, result);
}

@Test
public void testPreprocessNullString() {
String logMessage = null;
String logId = "log1";
List<String> expectedResult = Arrays.asList("", "log1");
List<String> result = parser.preprocess(logMessage, logId);
assertEquals(expectedResult, result);
}

@Test
public void testPreprocessWithUUID() {
String logMessage = "127.0.0.1 - 1234 something, user_id:c78ac970-f0c3-4954-8cf8-352a8458d01c";
@@ -124,15 +134,11 @@ public void testPreprocessWithUUID() {
public void testPreprocessWithIllegalInput() {
String logMessage = "127.0.0.1 - 1234 something";
String logId = "log1";
String exceptionMessage = "log message or logId must not be null";
String exceptionMessage = "logId must not be null";
assertEquals(ImmutableList.of("", logId), parser.preprocess(null, logId));
Throwable throwable =
assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, logId));
assertEquals(exceptionMessage, throwable.getMessage());
throwable =
assertThrows(IllegalArgumentException.class, () -> parser.preprocess(logMessage, null));
assertEquals(exceptionMessage, throwable.getMessage());
throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, null));
assertEquals(exceptionMessage, throwable.getMessage());
}

@Test
@@ -209,6 +215,29 @@ public void testParseLogPattern() {
assertEquals(expectedLogPattern, logPattern);
}

@Test
public void testParseAllLogPatternsWithNullInput() {
List<String> messages =
Arrays.asList(
null,
"PacketResponder failed for blk_6996194389878584395",
"PacketResponder failed for blk_-1547954353065580372");
Map<String, Map<String, Object>> logPatternMap = parser.parseAllLogPatterns(messages, 1);
Map<String, Map<String, Object>> expectedResult =
ImmutableMap.of(
"",
ImmutableMap.of("pattern_count", 1L, "pattern", "", "sample_logs", ImmutableList.of()),
"PacketResponder failed for blk_<*>",
ImmutableMap.of(
"pattern_count",
2L,
"pattern",
"PacketResponder failed for blk_<*>",
"sample_logs",
ImmutableList.of("PacketResponder failed for blk_6996194389878584395")));
assertEquals(expectedResult, logPatternMap);
}

@Test
public void testParseAllLogPatterns() {
Map<String, Map<String, Object>> logPatternMap = parser.parseAllLogPatterns(TEST_HDFS_LOGS, 2);
@@ -286,6 +315,19 @@ public void testParseLogPatternWhenHigherFrequencyTokenIsVariable() {
assertTrue(parser.getGroupTokenSetMap().get("4-3,3-0").size() > 1);
}

@Test
public void testCollapseContinuousWildcards() {
String correctTokenPattern =
"BLOCK* NameSystem.allocateBlock: /user/root/_temporary/_task_<*>_r_<*>";
String continuousTokenPattern =
"BLOCK* NameSystem.allocateBlock: /user/root/_temporary/_task_<*><*>_r_<*><*><*>";

assertEquals(
correctTokenPattern, BrainLogParser.collapseContinuousWildcards(continuousTokenPattern));
assertEquals(
correctTokenPattern, BrainLogParser.collapseContinuousWildcards(correctTokenPattern));
}

private Map<String, Long> collectPatternByCountMap(
Map<String, Map<String, Object>> logPatternMap) {
return logPatternMap.entrySet().stream()
2 changes: 2 additions & 0 deletions core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java
@@ -546,6 +546,7 @@ public static Patterns patterns(
PatternMode patternMode,
UnresolvedExpression patternMaxSampleCount,
UnresolvedExpression patternBufferLimit,
UnresolvedExpression showNumberedToken,
java.util.Map<String, Literal> arguments) {
return new Patterns(
sourceField,
Expand All @@ -555,6 +556,7 @@ public static Patterns patterns(
patternMode,
patternMaxSampleCount,
patternBufferLimit,
showNumberedToken,
arguments,
input);
}
@@ -39,6 +39,7 @@ public class Patterns extends UnresolvedPlan {
private final PatternMode patternMode;
private final UnresolvedExpression patternMaxSampleCount;
private final UnresolvedExpression patternBufferLimit;
private final UnresolvedExpression showNumberedToken;
private final Map<String, Literal> arguments;
private UnresolvedPlan child;
