diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java
index 805c17b6af8..719818e4c78 100644
--- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java
+++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java
@@ -79,6 +79,7 @@
 import org.opensearch.sql.ast.tree.Patterns;
 import org.opensearch.sql.ast.tree.Project;
 import org.opensearch.sql.ast.tree.RareTopN;
+import org.opensearch.sql.ast.tree.Regex;
 import org.opensearch.sql.ast.tree.Relation;
 import org.opensearch.sql.ast.tree.RelationSubquery;
 import org.opensearch.sql.ast.tree.Rename;
@@ -743,6 +744,12 @@ public LogicalPlan visitReverse(Reverse node, AnalysisContext context) {
         "REVERSE is supported only when " + CALCITE_ENGINE_ENABLED.getKeyValue() + "=true");
   }
 
+  @Override
+  public LogicalPlan visitRegex(Regex node, AnalysisContext context) {
+    throw new UnsupportedOperationException(
+        "REGEX is supported only when " + CALCITE_ENGINE_ENABLED.getKeyValue() + "=true");
+  }
+
   @Override
   public LogicalPlan visitPaginate(Paginate paginate, AnalysisContext context) {
     LogicalPlan child = paginate.getChild().get(0).accept(this, context);
diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
index 9b9f036a2df..cc3a26abe4f 100644
--- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
+++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
@@ -67,6 +67,7 @@
 import org.opensearch.sql.ast.tree.Patterns;
 import org.opensearch.sql.ast.tree.Project;
 import org.opensearch.sql.ast.tree.RareTopN;
+import org.opensearch.sql.ast.tree.Regex;
 import org.opensearch.sql.ast.tree.Relation;
 import org.opensearch.sql.ast.tree.RelationSubquery;
 import org.opensearch.sql.ast.tree.Rename;
@@ -259,6 +260,10 @@ public T visitReverse(Reverse node, C context) {
     return visitChildren(node, context);
   }
 
+  public T visitRegex(Regex node, C context) {
+    return visitChildren(node, context);
+  }
+
   public T visitLambdaFunction(LambdaFunction node, C context) {
     return visitChildren(node, context);
   }
diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/Regex.java b/core/src/main/java/org/opensearch/sql/ast/tree/Regex.java
new file mode 100644
index 00000000000..50913b4edba
--- /dev/null
+++ b/core/src/main/java/org/opensearch/sql/ast/tree/Regex.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.ast.tree;
+
+import com.google.common.collect.ImmutableList;
+import java.util.List;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+import org.opensearch.sql.ast.AbstractNodeVisitor;
+import org.opensearch.sql.ast.expression.Literal;
+import org.opensearch.sql.ast.expression.UnresolvedExpression;
+
+/**
+ * AST node for the PPL {@code regex} command, which filters the rows of its child plan by testing
+ * a field against a regular expression literal.
+ */
+@Getter
+@ToString
+@EqualsAndHashCode(callSuper = false)
+public class Regex extends UnresolvedPlan {
+  public static final String EQUALS_OPERATOR = "=";
+
+  public static final String NOT_EQUALS_OPERATOR = "!=";
+
+  /** Field whose value is tested against {@link #pattern}. */
+  private final UnresolvedExpression field;
+
+  /** When {@code true}, rows that match the pattern are excluded rather than kept. */
+  private final boolean negated;
+
+  /** Regular expression literal the field is matched against. */
+  private final Literal pattern;
+
+  /** Input plan; stays {@code null} until {@link #attach} or the Lombok setter assigns it. */
+  @Setter private UnresolvedPlan child;
+
+  public Regex(UnresolvedExpression field, boolean negated, Literal pattern) {
+    this.field = field;
+    this.negated = negated;
+    this.pattern = pattern;
+  }
+
+  @Override
+  public Regex attach(UnresolvedPlan child) {
+    this.child = child;
+    return this;
+  }
+
+  /** Returns the attached child as a one-element list, or an empty list if none is attached. */
+  @Override
+  public List<UnresolvedPlan> getChild() {
+    return this.child == null ? ImmutableList.of() : ImmutableList.of(this.child);
+  }
+
+  @Override
+  public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
+    return nodeVisitor.visitRegex(this, context);
+  }
+}
diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
index 9b5478cc0be..13e62e5df07 100644
--- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
+++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
@@ -48,6 +48,7 @@
 import org.apache.calcite.rex.RexNode;
 import org.apache.calcite.rex.RexWindowBounds;
 import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.type.SqlTypeFamily;
 import org.apache.calcite.sql.type.SqlTypeName;
 import org.apache.calcite.tools.RelBuilder;
 import org.apache.calcite.tools.RelBuilder.AggCall;
@@ -99,6 +100,7 @@
 import org.opensearch.sql.ast.tree.Patterns;
 import org.opensearch.sql.ast.tree.Project;
 import org.opensearch.sql.ast.tree.RareTopN;
+import org.opensearch.sql.ast.tree.Regex;
 import org.opensearch.sql.ast.tree.Relation;
 import org.opensearch.sql.ast.tree.Rename;
 import org.opensearch.sql.ast.tree.SPath;
@@ -174,6 +176,32 @@ public RelNode visitFilter(Filter node, CalcitePlanContext context) {
     return context.relBuilder.peek();
   }
 
+  @Override
+  public RelNode visitRegex(Regex node, CalcitePlanContext context) {
+    visitChildren(node, context);
+
+    RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
+    RexNode patternRex = rexVisitor.analyze(node.getPattern(), context);
+
+    if (!SqlTypeFamily.CHARACTER.contains(fieldRex.getType())) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Regex command requires field of string type, but got %s for field '%s'",
+              fieldRex.getType().getSqlTypeName(), node.getField().toString()));
+    }
+
+    RexNode regexCondition =
+        context.rexBuilder.makeCall(
+
org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS, fieldRex, patternRex);
+
+    if (node.isNegated()) {
+      regexCondition = context.rexBuilder.makeCall(SqlStdOperatorTable.NOT, regexCondition);
+    }
+
+    context.relBuilder.filter(regexCondition);
+    return context.relBuilder.peek();
+  }
+
   private boolean containsSubqueryExpression(Node expr) {
     if (expr == null) {
       return false;
diff --git a/core/src/main/java/org/opensearch/sql/expression/parse/RegexCommonUtils.java b/core/src/main/java/org/opensearch/sql/expression/parse/RegexCommonUtils.java
new file mode 100644
index 00000000000..f2c9092c346
--- /dev/null
+++ b/core/src/main/java/org/opensearch/sql/expression/parse/RegexCommonUtils.java
@@ -0,0 +1,113 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.expression.parse;
+
+import com.google.common.collect.ImmutableList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Common utilities for regex operations. Provides pattern caching and consistent matching behavior.
+ */
+public class RegexCommonUtils {
+
+  /** Matches the start of a named group such as {@code (?<name>}; group 1 captures the name. */
+  private static final Pattern NAMED_GROUP_PATTERN =
+      Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
+
+  /** Maximum number of compiled patterns kept in {@link #patternCache}. */
+  private static final int MAX_CACHE_SIZE = 1000;
+
+  /**
+   * LRU cache of compiled patterns, keyed by regex string. The access-ordered map drops its
+   * eldest entry once it holds more than {@link #MAX_CACHE_SIZE} patterns; the synchronized
+   * wrapper serializes every operation on it.
+   */
+  private static final Map<String, Pattern> patternCache =
+      Collections.synchronizedMap(
+          new LinkedHashMap<>(MAX_CACHE_SIZE + 1, 0.75f, true) {
+            @Override
+            protected boolean removeEldestEntry(Map.Entry<String, Pattern> eldest) {
+              return size() > MAX_CACHE_SIZE;
+            }
+          });
+
+  /**
+   * Get compiled pattern from cache or compile and cache it.
+   *
+   * <p>Lookup and insertion happen in one {@code computeIfAbsent} call on the synchronized cache,
+   * so two threads asking for the same uncached regex cannot both compile and insert it.
+   *
+   * @param regex The regex pattern string
+   * @return Compiled Pattern object
+   * @throws PatternSyntaxException if the regex is invalid
+   */
+  public static Pattern getCompiledPattern(String regex) {
+    return patternCache.computeIfAbsent(regex, Pattern::compile);
+  }
+
+  /**
+   * Extract list of named group candidates from a regex pattern.
+   *
+   * @param pattern The regex pattern string
+   * @return List of named group names found in the pattern
+   */
+  public static List<String> getNamedGroupCandidates(String pattern) {
+    ImmutableList.Builder<String> namedGroups = ImmutableList.builder();
+    Matcher m = NAMED_GROUP_PATTERN.matcher(pattern);
+    while (m.find()) {
+      namedGroups.add(m.group(1));
+    }
+    return namedGroups.build();
+  }
+
+  /**
+   * Match using find() for partial match semantics with string pattern.
+   *
+   * @param text The text to match against
+   * @param patternStr The pattern string
+   * @return true if pattern is found anywhere in the text
+   * @throws PatternSyntaxException if the regex is invalid
+   */
+  public static boolean matchesPartial(String text, String patternStr) {
+    if (text == null || patternStr == null) {
+      return false;
+    }
+    Pattern pattern = getCompiledPattern(patternStr);
+    return pattern.matcher(text).find();
+  }
+
+  /**
+   * Extract a specific named group from text using the pattern. Used by parse command regex method.
+ * + * @param text The text to extract from + * @param pattern The compiled pattern with named groups + * @param groupName The name of the group to extract + * @return The extracted value or null if not found + */ + public static String extractNamedGroup(String text, Pattern pattern, String groupName) { + if (text == null || pattern == null || groupName == null) { + return null; + } + + Matcher matcher = pattern.matcher(text); + + if (matcher.matches()) { + try { + return matcher.group(groupName); + } catch (IllegalArgumentException e) { + return null; + } + } + + return null; + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/parse/RegexExpression.java b/core/src/main/java/org/opensearch/sql/expression/parse/RegexExpression.java index 7514c9df69a..a65dc04f683 100644 --- a/core/src/main/java/org/opensearch/sql/expression/parse/RegexExpression.java +++ b/core/src/main/java/org/opensearch/sql/expression/parse/RegexExpression.java @@ -5,9 +5,6 @@ package org.opensearch.sql.expression.parse; -import com.google.common.collect.ImmutableList; -import java.util.List; -import java.util.regex.Matcher; import java.util.regex.Pattern; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -24,7 +21,6 @@ @ToString public class RegexExpression extends ParseExpression { private static final Logger log = LogManager.getLogger(RegexExpression.class); - private static final Pattern GROUP_PATTERN = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>"); @Getter @EqualsAndHashCode.Exclude private final Pattern regexPattern; /** @@ -36,32 +32,19 @@ public class RegexExpression extends ParseExpression { */ public RegexExpression(Expression sourceField, Expression pattern, Expression identifier) { super("regex", sourceField, pattern, identifier); - this.regexPattern = Pattern.compile(pattern.valueOf().stringValue()); + this.regexPattern = RegexCommonUtils.getCompiledPattern(pattern.valueOf().stringValue()); } @Override ExprValue parseValue(ExprValue value) throws 
ExpressionEvaluationException { String rawString = value.stringValue(); - Matcher matcher = regexPattern.matcher(rawString); - if (matcher.matches()) { - return new ExprStringValue(matcher.group(identifierStr)); + + String extracted = RegexCommonUtils.extractNamedGroup(rawString, regexPattern, identifierStr); + + if (extracted != null) { + return new ExprStringValue(extracted); } log.debug("failed to extract pattern {} from input ***", regexPattern.pattern()); return new ExprStringValue(""); } - - /** - * Get list of derived fields based on parse pattern. - * - * @param pattern pattern used for parsing - * @return list of names of the derived fields - */ - public static List getNamedGroupCandidates(String pattern) { - ImmutableList.Builder namedGroups = ImmutableList.builder(); - Matcher m = GROUP_PATTERN.matcher(pattern); - while (m.find()) { - namedGroups.add(m.group(1)); - } - return namedGroups.build(); - } } diff --git a/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java b/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java index 581aecfebe3..3e84a8b6d24 100644 --- a/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java +++ b/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java @@ -16,6 +16,7 @@ import org.opensearch.sql.expression.parse.GrokExpression; import org.opensearch.sql.expression.parse.ParseExpression; import org.opensearch.sql.expression.parse.PatternsExpression; +import org.opensearch.sql.expression.parse.RegexCommonUtils; import org.opensearch.sql.expression.parse.RegexExpression; /** Utils for {@link ParseExpression}. 
*/ @@ -57,7 +58,7 @@ public static List getNamedGroupCandidates( ParseMethod parseMethod, String pattern, Map arguments) { switch (parseMethod) { case REGEX: - return RegexExpression.getNamedGroupCandidates(pattern); + return RegexCommonUtils.getNamedGroupCandidates(pattern); case GROK: return GrokExpression.getNamedGroupCandidates(pattern); default: diff --git a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java index 271c1b7b046..47ea2e5c5d3 100644 --- a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java +++ b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java @@ -1929,4 +1929,18 @@ public void brain_patterns_command() { assertAnalyzeEqual(expectedPlan, patterns); } + + @Test + public void regex_command_throws_unsupported_exception_with_legacy_engine() { + UnsupportedOperationException exception = + assertThrows( + UnsupportedOperationException.class, + () -> + analyze( + new org.opensearch.sql.ast.tree.Regex( + field("lastname"), false, stringLiteral("^[A-Z][a-z]+$")) + .attach(relation("schema")))); + assertEquals( + "REGEX is supported only when plugins.calcite.enabled=true", exception.getMessage()); + } } diff --git a/core/src/test/java/org/opensearch/sql/expression/parse/RegexCommonUtilsTest.java b/core/src/test/java/org/opensearch/sql/expression/parse/RegexCommonUtilsTest.java new file mode 100644 index 00000000000..5e0f58e3b95 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/parse/RegexCommonUtilsTest.java @@ -0,0 +1,214 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.parse; + +import static org.junit.jupiter.api.Assertions.*; + +import java.util.List; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.junit.jupiter.api.Test; + +public class RegexCommonUtilsTest { + + @Test + public void 
testGetCompiledPattern() { + String regex = "test.*pattern"; + Pattern pattern1 = RegexCommonUtils.getCompiledPattern(regex); + Pattern pattern2 = RegexCommonUtils.getCompiledPattern(regex); + + assertNotNull(pattern1); + assertSame(pattern1, pattern2, "Should return cached pattern"); + assertEquals(regex, pattern1.pattern()); + } + + @Test + public void testGetCompiledPatternWithInvalidRegex() { + String invalidRegex = "[invalid"; + + assertThrows( + PatternSyntaxException.class, + () -> { + RegexCommonUtils.getCompiledPattern(invalidRegex); + }); + } + + @Test + public void testGetNamedGroupCandidatesSingle() { + String pattern = "(?[a-z]+)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(1, groups.size()); + assertEquals("name", groups.get(0)); + } + + @Test + public void testGetNamedGroupCandidatesMultiple() { + String pattern = "(?[a-z]+)\\s+(?[0-9]+)\\s+(?.*)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(3, groups.size()); + assertEquals("first", groups.get(0)); + assertEquals("second", groups.get(1)); + assertEquals("third", groups.get(2)); + } + + @Test + public void testGetNamedGroupCandidatesWithMixedGroups() { + String pattern = "([a-z]+)\\s+(?[0-9]+)\\s+(\\d+)\\s+(?.*)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(2, groups.size()); + assertEquals("named1", groups.get(0)); + assertEquals("named2", groups.get(1)); + } + + @Test + public void testGetNamedGroupCandidatesNoGroups() { + String pattern = "[a-z]+\\s+[0-9]+"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(0, groups.size()); + } + + @Test + public void testGetNamedGroupCandidatesEmailPattern() { + String pattern = ".+@(?.+)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(1, groups.size()); + assertEquals("host", groups.get(0)); + } + + @Test + public void testMatchesPartialWithMatch() { + 
assertTrue(RegexCommonUtils.matchesPartial("test string", "test")); + assertTrue(RegexCommonUtils.matchesPartial("test string", "string")); + assertTrue(RegexCommonUtils.matchesPartial("test string", "st.*ng")); + assertTrue(RegexCommonUtils.matchesPartial("user@domain.com", ".*@domain\\.com")); + } + + @Test + public void testMatchesPartialWithoutMatch() { + assertFalse(RegexCommonUtils.matchesPartial("test string", "notfound")); + assertFalse(RegexCommonUtils.matchesPartial("test string", "^string")); + assertFalse(RegexCommonUtils.matchesPartial("user@domain.com", ".*@other\\.com")); + } + + @Test + public void testMatchesPartialWithNullInputs() { + assertFalse(RegexCommonUtils.matchesPartial(null, "pattern")); + assertFalse(RegexCommonUtils.matchesPartial("text", null)); + assertFalse(RegexCommonUtils.matchesPartial(null, null)); + } + + @Test + public void testMatchesPartialWithEmptyString() { + assertTrue(RegexCommonUtils.matchesPartial("", "")); + assertTrue(RegexCommonUtils.matchesPartial("text", "")); + assertFalse(RegexCommonUtils.matchesPartial("", "pattern")); + } + + @Test + public void testMatchesPartialWithInvalidRegex() { + assertThrows( + PatternSyntaxException.class, + () -> { + RegexCommonUtils.matchesPartial("text", "[invalid"); + }); + } + + @Test + public void testExtractNamedGroupSuccess() { + Pattern pattern = Pattern.compile("(?[^@]+)@(?.+)"); + String text = "john@example.com"; + + assertEquals("john", RegexCommonUtils.extractNamedGroup(text, pattern, "user")); + assertEquals("example.com", RegexCommonUtils.extractNamedGroup(text, pattern, "domain")); + } + + @Test + public void testExtractNamedGroupNoMatch() { + Pattern pattern = Pattern.compile("(?[^@]+)@(?.+)"); + String text = "not_an_email"; + + assertNull(RegexCommonUtils.extractNamedGroup(text, pattern, "user")); + assertNull(RegexCommonUtils.extractNamedGroup(text, pattern, "domain")); + } + + @Test + public void testExtractNamedGroupNonExistentGroup() { + Pattern pattern = 
Pattern.compile("(?[^@]+)@(?.+)"); + String text = "john@example.com"; + + assertNull(RegexCommonUtils.extractNamedGroup(text, pattern, "nonexistent")); + } + + @Test + public void testExtractNamedGroupWithNullInputs() { + Pattern pattern = Pattern.compile("(?[^@]+)@(?.+)"); + String text = "john@example.com"; + + assertNull(RegexCommonUtils.extractNamedGroup(null, pattern, "user")); + assertNull(RegexCommonUtils.extractNamedGroup(text, null, "user")); + assertNull(RegexCommonUtils.extractNamedGroup(text, pattern, null)); + assertNull(RegexCommonUtils.extractNamedGroup(null, null, null)); + } + + @Test + public void testExtractNamedGroupComplexPattern() { + Pattern pattern = Pattern.compile("(?\\d{4})-(?\\d{2})-(?\\d{2})"); + String text = "2024-03-15"; + + assertEquals("2024", RegexCommonUtils.extractNamedGroup(text, pattern, "year")); + assertEquals("03", RegexCommonUtils.extractNamedGroup(text, pattern, "month")); + assertEquals("15", RegexCommonUtils.extractNamedGroup(text, pattern, "day")); + } + + @Test + public void testPatternCachingBehavior() { + String regex1 = "test1.*"; + String regex2 = "test2.*"; + + Pattern p1a = RegexCommonUtils.getCompiledPattern(regex1); + Pattern p2a = RegexCommonUtils.getCompiledPattern(regex2); + Pattern p1b = RegexCommonUtils.getCompiledPattern(regex1); + Pattern p2b = RegexCommonUtils.getCompiledPattern(regex2); + + assertSame(p1a, p1b, "Same regex should return cached pattern"); + assertSame(p2a, p2b, "Same regex should return cached pattern"); + assertNotSame(p1a, p2a, "Different regex should return different patterns"); + } + + @Test + public void testGetNamedGroupCandidatesWithNumericNames() { + String pattern = "(?[a-z]+)\\s+(?[0-9]+)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(2, groups.size()); + assertEquals("group1", groups.get(0)); + assertEquals("group2", groups.get(1)); + } + + @Test + public void testGetNamedGroupCandidatesSpecialCharacters() { + // Test that groups with 
special characters are not captured (only alphanumeric starting with + // letter) + String pattern = "(?[a-z]+)\\s+(?<123invalid>[0-9]+)\\s+(?.*)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + assertEquals(0, groups.size()); + } + + @Test + public void testGetNamedGroupCandidatesValidAlphanumeric() { + String pattern = "(?[a-z]+)\\s+(?[0-9]+)"; + List groups = RegexCommonUtils.getNamedGroupCandidates(pattern); + + assertEquals(2, groups.size()); + assertEquals("groupA", groups.get(0)); + assertEquals("group2B", groups.get(1)); + } +} diff --git a/core/src/test/java/org/opensearch/sql/expression/parse/RegexExpressionTest.java b/core/src/test/java/org/opensearch/sql/expression/parse/RegexExpressionTest.java index 846aff69117..f6cd0c774c2 100644 --- a/core/src/test/java/org/opensearch/sql/expression/parse/RegexExpressionTest.java +++ b/core/src/test/java/org/opensearch/sql/expression/parse/RegexExpressionTest.java @@ -66,7 +66,7 @@ public void resolve_regex_groups_and_parsed_values() { "userAgent", "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1"); List identifiers = new ArrayList<>(expected.keySet()); - assertEquals(identifiers, RegexExpression.getNamedGroupCandidates(rawPattern)); + assertEquals(identifiers, RegexCommonUtils.getNamedGroupCandidates(rawPattern)); identifiers.forEach( identifier -> assertEquals( diff --git a/docs/category.json b/docs/category.json index 47098ffc032..d475b416a32 100644 --- a/docs/category.json +++ b/docs/category.json @@ -57,6 +57,7 @@ "user/dql/metadata.rst" ], "ppl_cli_calcite": [ + "user/ppl/cmd/regex.rst", "user/ppl/cmd/stats.rst" ] } diff --git a/docs/user/ppl/cmd/regex.rst b/docs/user/ppl/cmd/regex.rst new file mode 100644 index 00000000000..307aa0129d1 --- /dev/null +++ b/docs/user/ppl/cmd/regex.rst @@ -0,0 +1,144 @@ +============= +regex +============= + +.. rubric:: Table of contents + +.. 
contents:: + :local: + :depth: 2 + + +Description +============ +| The ``regex`` command filters search results by matching field values against a regular expression pattern. Only documents where the specified field matches the pattern are included in the results. The pattern may match anywhere in the field value; anchor it with ``^`` and ``$`` to require the whole value to match. + +Version +======= +3.3.0 + +Syntax +============ +regex <field> = <pattern> +regex <field> != <pattern> + +* field: mandatory. The field name to match against. +* pattern: mandatory string. The regular expression pattern to match. Supports Java regex syntax including named groups, lookahead/lookbehind, and character classes. +* = : operator for positive matching (include matches) +* != : operator for negative matching (exclude matches) + +Regular Expression Engine +========================== + +The regex command uses Java's built-in regular expression engine, which supports: + +* **Standard regex features**: Character classes, quantifiers, anchors +* **Named capture groups**: ``(?<name>pattern)`` syntax +* **Lookahead/lookbehind**: ``(?=...)`` and ``(?<=...)`` assertions +* **Inline flags**: Case-insensitive ``(?i)``, multiline ``(?m)``, dotall ``(?s)``, and other modes + +For complete documentation of Java regex patterns and available modes, see the `Java Pattern documentation <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_. + +Example 1: Basic pattern matching +================================= + +The example shows how to filter documents where the ``lastname`` field starts with an uppercase letter followed only by lowercase letters. + +PPL query:: + + os> source=accounts | regex lastname="^[A-Z][a-z]+$" | fields account_number, firstname, lastname; + fetched rows / total rows = 4/4 + +----------------+-----------+----------+ + | account_number | firstname | lastname | + |----------------+-----------+----------| + | 1 | Amber | Duke | + | 6 | Hattie | Bond | + | 13 | Nanette | Bates | + | 18 | Dale | Adams | + +----------------+-----------+----------+ + + +Example 2: Negative matching +============================ + +The example shows how to exclude documents where the ``lastname`` field ends with "son". 
+ +PPL query:: + + os> source=accounts | regex lastname!=".*son$" | fields account_number, lastname; + fetched rows / total rows = 4/4 + +----------------+----------+ + | account_number | lastname | + |----------------+----------| + | 1 | Duke | + | 6 | Bond | + | 13 | Bates | + | 18 | Adams | + +----------------+----------+ + + +Example 3: Email domain matching +================================ + +The example shows how to filter documents by email domain patterns. + +PPL query:: + + os> source=accounts | regex email="@pyrami\.com$" | fields account_number, email; + fetched rows / total rows = 1/1 + +----------------+----------------------+ + | account_number | email | + |----------------+----------------------| + | 1 | amberduke@pyrami.com | + +----------------+----------------------+ + + +Example 4: Complex patterns with character classes +================================================== + +The example shows how to use complex regex patterns with character classes and quantifiers. + +PPL query:: + + os> source=accounts | regex address="\d{3,4}\s+[A-Z][a-z]+\s+(Street|Lane|Court)" | fields account_number, address; + fetched rows / total rows = 4/4 + +----------------+----------------------+ + | account_number | address | + |----------------+----------------------| + | 1 | 880 Holmes Lane | + | 6 | 671 Bristol Street | + | 13 | 789 Madison Street | + | 18 | 467 Hutchinson Court | + +----------------+----------------------+ + + +Example 5: Case-sensitive matching +================================== + +The example demonstrates that regex matching is case-sensitive by default. 
+ +PPL query:: + + os> source=accounts | regex state="va" | fields account_number, state; + fetched rows / total rows = 0/0 + +----------------+-------+ + | account_number | state | + |----------------+-------| + +----------------+-------+ + +PPL query:: + + os> source=accounts | regex state="VA" | fields account_number, state; + fetched rows / total rows = 1/1 + +----------------+-------+ + | account_number | state | + |----------------+-------| + | 13 | VA | + +----------------+-------+ + + +Limitations +=========== + +* **Field specification required**: A field name must be specified in the regex command. Pattern-only syntax (e.g., ``regex "pattern"``) is not currently supported. +* **String fields only**: The regex command currently only supports string fields. Using it on numeric or boolean fields will result in an error. diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst index c74f5c82bf5..386001bc5ef 100644 --- a/docs/user/ppl/index.rst +++ b/docs/user/ppl/index.rst @@ -94,6 +94,8 @@ The query start with search command and then flowing a set of command delimited - `rename command <cmd/rename.rst>`_ + - `regex command <cmd/regex.rst>`_ + - `search command <cmd/search.rst>`_ - `show datasources command <cmd/showdatasources.rst>`_ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index 45bc34dcba7..d4d9f61f42e 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -83,6 +83,7 @@ CalcitePrometheusDataSourceCommandsIT.class, CalciteQueryAnalysisIT.class, CalciteRareCommandIT.class, + CalciteRegexCommandIT.class, CalciteRenameCommandIT.class, CalciteResourceMonitorIT.class, CalciteSearchCommandIT.class, diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index 
264a76d1b12..317997f7661 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -365,6 +365,23 @@ public void testExplainOnEarliestLatestWithCustomTimeField() throws IOException TEST_INDEX_LOGS))); } + @Test + public void testRegexExplain() throws IOException { + String query = + "source=opensearch-sql_test_index_account | regex lastname='^[A-Z][a-z]+$' | head 5"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_regex.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testRegexNegatedExplain() throws IOException { + String query = "source=opensearch-sql_test_index_account | regex lastname!='.*son$' | head 5"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_regex_negated.json"); + assertJsonEqualsIgnoreId(expected, result); + } + /** * Executes the PPL query and returns the result as a string with windows-style line breaks * replaced with Unix-style ones. 
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRegexCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRegexCommandIT.java new file mode 100644 index 00000000000..3367901e1f8 --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRegexCommandIT.java @@ -0,0 +1,92 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.legacy.TestsConstants.*; + +import java.io.IOException; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +public class CalciteRegexCommandIT extends PPLIntegTestCase { + + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.ACCOUNT); + } + + @Test + public void testRegexBasicStringMatch() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | regex firstname='Amber' | fields account_number, firstname", + TEST_INDEX_ACCOUNT)); + + assertEquals(1, result.getJSONArray("datarows").length()); + assertEquals("Amber", result.getJSONArray("datarows").getJSONArray(0).get(1)); + } + + @Test + public void testRegexPartialStringMatch() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | regex firstname='nan' | fields account_number, firstname", + TEST_INDEX_ACCOUNT)); + + // Should match names containing "nan": Fernandez, Buchanan + assertEquals(2, result.getJSONArray("datarows").length()); + // Verify one of the results contains "nan" + String firstName = result.getJSONArray("datarows").getJSONArray(0).get(1).toString(); + assertTrue(firstName.contains("nan")); + } + + @Test + public void testRegexPatternMatch() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | regex firstname='A.*' | fields account_number, firstname", + 
TEST_INDEX_ACCOUNT)); + + // Should match names starting with A - there are 66 such names in accounts.json + assertEquals(66, result.getJSONArray("datarows").length()); + // Verify first result is a name starting with A + assertTrue(result.getJSONArray("datarows").getJSONArray(0).get(1).toString().startsWith("A")); + } + + @Test + public void testRegexNegatedMatch() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | regex firstname!='Amber' | fields account_number, firstname | head 3", + TEST_INDEX_ACCOUNT)); + + assertEquals(3, result.getJSONArray("datarows").length()); + // Verify Amber is not in results + for (int i = 0; i < result.getJSONArray("datarows").length(); i++) { + assertNotEquals("Amber", result.getJSONArray("datarows").getJSONArray(i).get(1)); + } + } + + @Test + public void testRegexWithStateField() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | regex state='CA' | fields account_number, firstname, state", + TEST_INDEX_ACCOUNT)); + + // There are 17 CA records in accounts.json + assertEquals(17, result.getJSONArray("datarows").length()); + assertEquals("CA", result.getJSONArray("datarows").getJSONArray(0).get(2)); + } +} diff --git a/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java b/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java index 344919990f1..8fcd756edac 100644 --- a/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java @@ -239,4 +239,31 @@ public void testTimeBinCrossCluster() throws IOException { rows("2025-07-28 01:00:00", 7623), rows("2025-07-28 02:00:00", 9187)); } + + @Test + public void testCrossClusterRegexBasic() throws IOException { + JSONObject result = + executeQuery( + String.format( + "search source=%s | regex firstname='.*att.*' | fields firstname", + 
TEST_INDEX_BANK_REMOTE)); + verifyDataRows(result, rows("Hattie")); + } + + @Test + public void testCrossClusterRegexWithNegation() throws IOException { + JSONObject result = + executeQuery( + String.format( + "search source=%s | regex firstname!='.*att.*' | fields firstname", + TEST_INDEX_BANK_REMOTE)); + verifyDataRows( + result, + rows("Virginia"), + rows("Elinor"), + rows("Dillard"), + rows("Dale"), + rows("Amber JOHnny"), + rows("Nanette")); + } } diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_regex.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_regex.json new file mode 100644 index 00000000000..a1a971f290e --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_regex.json @@ -0,0 +1 @@ +{"calcite":{"logical":"LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10])\n LogicalSort(fetch=[5])\n LogicalFilter(condition=[REGEXP_CONTAINS($10, '^[A-Z][a-z]+$':VARCHAR)])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n","physical":"CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[SCRIPT->REGEXP_CONTAINS($10, '^[A-Z][a-z]+$':VARCHAR), LIMIT->5, PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->10000], 
OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":5,\"timeout\":\"1m\",\"query\":{\"script\":{\"script\":{\"source\":\"{\\\"langType\\\":\\\"calcite\\\",\\\"script\\\":\\\"rO0ABXNyABFqYXZhLnV0aWwuQ29sbFNlcleOq7Y6G6gRAwABSQADdGFneHAAAAADdwQAAAAGdAAHcm93VHlwZXQGy3sKICAiZmllbGRzIjogWwogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhY2NvdW50X251bWJlciIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImZpcnN0bmFtZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImFkZHJlc3MiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJiYWxhbmNlIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiZ2VuZGVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiY2l0eSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImVtcGxveWVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAic3RhdGUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhZ2UiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgInByZWNpc2lvbiI6IC0xLAogICAgICAibmFtZSI6ICJlbWFpbCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImxhc3RuYW1lIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2lkIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IH
RydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2luZGV4IgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiUkVBTCIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJuYW1lIjogIl9zY29yZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlJFQUwiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfbWF4c2NvcmUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfc29ydCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogIl9yb3V0aW5nIgogICAgfQogIF0sCiAgIm51bGxhYmxlIjogdHJ1ZQp9dAAEZXhwcnQBRHsKICAib3AiOiB7CiAgICAibmFtZSI6ICJSRUdFWFBfQ09OVEFJTlMiLAogICAgImtpbmQiOiAiT1RIRVJfRlVOQ1RJT04iLAogICAgInN5bnRheCI6ICJGVU5DVElPTiIKICB9LAogICJvcGVyYW5kcyI6IFsKICAgIHsKICAgICAgImlucHV0IjogMTAsCiAgICAgICJuYW1lIjogIiQxMCIKICAgIH0sCiAgICB7CiAgICAgICJsaXRlcmFsIjogIl5bQS1aXVthLXpdKyQiLAogICAgICAidHlwZSI6IHsKICAgICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgICAibnVsbGFibGUiOiBmYWxzZSwKICAgICAgICAicHJlY2lzaW9uIjogLTEKICAgICAgfQogICAgfQogIF0KfXQACmZpZWxkVHlwZXNzcgARamF2YS51dGlsLkhhc2hNYXAFB9rBwxZg0QMAAkYACmxvYWRGYWN0b3JJAAl0aHJlc2hvbGR4cD9AAAAAAAAMdwgAAAAQAAAAC3QADmFjY291bnRfbnVtYmVyfnIAKW9yZy5vcGVuc2VhcmNoLnNxbC5kYXRhLnR5cGUuRXhwckNvcmVUeXBlAAAAAAAAAAASAAB4cgAOamF2YS5sYW5nLkVudW0AAAAAAAAAABIAAHhwdAAETE9OR3QACWZpcnN0bmFtZXNyADpvcmcub3BlbnNlYXJjaC5zcWwub3BlbnNlYXJjaC5kYXRhLnR5cGUuT3BlblNlYXJjaFRleHRUeXBlrYOjkwTjMUQCAAFMAAZmaWVsZHN0AA9MamF2YS91dGlsL01hcDt4cgA6b3JnLm9wZW5zZWFyY2guc3FsLm9wZW5zZWFyY2guZGF0YS50eXBlLk9wZW5TZWFyY2hEYXRhVHlwZcJjvMoC+gU1AgADTAAMZXhwckNvcmVUeXBldAArTG9yZy9vcGVuc2VhcmNoL3NxbC9kYXRhL3R5cGUvRXhwckNvcmVUeXBlO0wAC21hcHBpbmdUeXBldABITG9yZy9vcGVuc2VhcmNoL3NxbC9vcGVuc2VhcmNoL2RhdGEvdHlwZS9PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGU7TAAKcHJvcGVydGllc3EAfgAQeHB+cQB+AAp0AAdVTktOT1dOfnIARm9yZy5vcGVuc2VhcmNoLnNxbC5vcGVuc2VhcmNoLmRhdGEudHlwZS5PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGUAAAAAAAAAABIAAHhxAH4AC3QABFRleHRzcgA8c2hhZGVkLmNvbS5nb29nbGUuY29tbW9uLmNvbGxlY3QuSW1tdXRhYmxlTWFwJFNlcm
lhbGl6ZWRGb3JtAAAAAAAAAAACAAJMAARrZXlzdAASTGphdmEvbGFuZy9PYmplY3Q7TAAGdmFsdWVzcQB+ABt4cHVyABNbTGphdmEubGFuZy5PYmplY3Q7kM5YnxBzKWwCAAB4cAAAAAB1cQB+AB0AAAAAc3EAfgAAAAAAA3cEAAAAAnQAB2tleXdvcmRzcQB+ABF+cQB+AAp0AAZTVFJJTkd+cQB+ABd0AAdLZXl3b3JkcQB+ABx4dAAHYWRkcmVzc3NxAH4AD3EAfgAVcQB+ABhxAH4AHHNxAH4AAAAAAAN3BAAAAAB4dAAHYmFsYW5jZXEAfgAMdAAGZ2VuZGVyc3EAfgAPcQB+ABVxAH4AGHEAfgAcc3EAfgAAAAAAA3cEAAAAAnEAfgAhcQB+ACJ4dAAEY2l0eXNxAH4AD3EAfgAVcQB+ABhxAH4AHHNxAH4AAAAAAAN3BAAAAAJxAH4AIXEAfgAieHQACGVtcGxveWVyc3EAfgAPcQB+ABVxAH4AGHEAfgAcc3EAfgAAAAAAA3cEAAAAAnEAfgAhcQB+ACJ4dAAFc3RhdGVzcQB+AA9xAH4AFXEAfgAYcQB+ABxzcQB+AAAAAAADdwQAAAACcQB+ACFxAH4AInh0AANhZ2VxAH4ADHQABWVtYWlsc3EAfgAPcQB+ABVxAH4AGHEAfgAcc3EAfgAAAAAAA3cEAAAAAnEAfgAhcQB+ACJ4dAAIbGFzdG5hbWVzcQB+AA9xAH4AFXEAfgAYcQB+ABxzcQB+AAAAAAADdwQAAAACcQB+ACFxAH4AInh4eA==\\\"}\",\"lang\":\"opensearch_compounded_script\",\"params\":{\"utcTimestamp\":*}},\"boost\":1.0}},\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"],\"excludes\":[]},\"sort\":[{\"_doc\":{\"order\":\"asc\"}}]}, requestedTotalSize=5, pageSize=null, startFrom=0)])\n"}} \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_regex_negated.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_regex_negated.json new file mode 100644 index 00000000000..254093b542a --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_regex_negated.json @@ -0,0 +1 @@ +{"calcite":{"logical":"LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10])\n LogicalSort(fetch=[5])\n LogicalFilter(condition=[NOT(REGEXP_CONTAINS($10, '.*son$':VARCHAR))])\n CalciteLogicalIndexScan(table=[[OpenSearch, 
opensearch-sql_test_index_account]])\n","physical":"CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[SCRIPT->NOT(REGEXP_CONTAINS($10, '.*son$':VARCHAR)), LIMIT->5, PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":5,\"timeout\":\"1m\",\"query\":{\"script\":{\"script\":{\"source\":\"{\\\"langType\\\":\\\"calcite\\\",\\\"script\\\":\\\"rO0ABXNyABFqYXZhLnV0aWwuQ29sbFNlcleOq7Y6G6gRAwABSQADdGFneHAAAAADdwQAAAAGdAAHcm93VHlwZXQGy3sKICAiZmllbGRzIjogWwogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhY2NvdW50X251bWJlciIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImZpcnN0bmFtZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImFkZHJlc3MiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJiYWxhbmNlIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiZ2VuZGVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiY2l0eSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImVtcGxveWVyIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAic3RhdGUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJhZ2UiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgIm51bGxhYmxlIjogdHJ1ZSwKICAgICAgInByZWNpc2lvbiI6IC0xLAogICAgICAibmFtZSI6ICJlbWFpbCIKICAgIH0sCiAgICB7CiAgI
CAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogImxhc3RuYW1lIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2lkIgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiVkFSQ0hBUiIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJwcmVjaXNpb24iOiAtMSwKICAgICAgIm5hbWUiOiAiX2luZGV4IgogICAgfSwKICAgIHsKICAgICAgInR5cGUiOiAiUkVBTCIsCiAgICAgICJudWxsYWJsZSI6IHRydWUsCiAgICAgICJuYW1lIjogIl9zY29yZSIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlJFQUwiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfbWF4c2NvcmUiCiAgICB9LAogICAgewogICAgICAidHlwZSI6ICJCSUdJTlQiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAibmFtZSI6ICJfc29ydCIKICAgIH0sCiAgICB7CiAgICAgICJ0eXBlIjogIlZBUkNIQVIiLAogICAgICAibnVsbGFibGUiOiB0cnVlLAogICAgICAicHJlY2lzaW9uIjogLTEsCiAgICAgICJuYW1lIjogIl9yb3V0aW5nIgogICAgfQogIF0sCiAgIm51bGxhYmxlIjogdHJ1ZQp9dAAEZXhwcnQB9XsKICAib3AiOiB7CiAgICAibmFtZSI6ICJOT1QiLAogICAgImtpbmQiOiAiTk9UIiwKICAgICJzeW50YXgiOiAiUFJFRklYIgogIH0sCiAgIm9wZXJhbmRzIjogWwogICAgewogICAgICAib3AiOiB7CiAgICAgICAgIm5hbWUiOiAiUkVHRVhQX0NPTlRBSU5TIiwKICAgICAgICAia2luZCI6ICJPVEhFUl9GVU5DVElPTiIsCiAgICAgICAgInN5bnRheCI6ICJGVU5DVElPTiIKICAgICAgfSwKICAgICAgIm9wZXJhbmRzIjogWwogICAgICAgIHsKICAgICAgICAgICJpbnB1dCI6IDEwLAogICAgICAgICAgIm5hbWUiOiAiJDEwIgogICAgICAgIH0sCiAgICAgICAgewogICAgICAgICAgImxpdGVyYWwiOiAiLipzb24kIiwKICAgICAgICAgICJ0eXBlIjogewogICAgICAgICAgICAidHlwZSI6ICJWQVJDSEFSIiwKICAgICAgICAgICAgIm51bGxhYmxlIjogZmFsc2UsCiAgICAgICAgICAgICJwcmVjaXNpb24iOiAtMQogICAgICAgICAgfQogICAgICAgIH0KICAgICAgXQogICAgfQogIF0KfXQACmZpZWxkVHlwZXNzcgARamF2YS51dGlsLkhhc2hNYXAFB9rBwxZg0QMAAkYACmxvYWRGYWN0b3JJAAl0aHJlc2hvbGR4cD9AAAAAAAAMdwgAAAAQAAAAC3QADmFjY291bnRfbnVtYmVyfnIAKW9yZy5vcGVuc2VhcmNoLnNxbC5kYXRhLnR5cGUuRXhwckNvcmVUeXBlAAAAAAAAAAASAAB4cgAOamF2YS5sYW5nLkVudW0AAAAAAAAAABIAAHhwdAAETE9OR3QACWZpcnN0bmFtZXNyADpvcmcub3BlbnNlYXJjaC5zcWwub3BlbnNlYXJjaC5kYXRhLnR5cGUuT3BlblNlYXJjaFRleHRUeXBlrYOjkwTjMUQCAAFMAAZma
WVsZHN0AA9MamF2YS91dGlsL01hcDt4cgA6b3JnLm9wZW5zZWFyY2guc3FsLm9wZW5zZWFyY2guZGF0YS50eXBlLk9wZW5TZWFyY2hEYXRhVHlwZcJjvMoC+gU1AgADTAAMZXhwckNvcmVUeXBldAArTG9yZy9vcGVuc2VhcmNoL3NxbC9kYXRhL3R5cGUvRXhwckNvcmVUeXBlO0wAC21hcHBpbmdUeXBldABITG9yZy9vcGVuc2VhcmNoL3NxbC9vcGVuc2VhcmNoL2RhdGEvdHlwZS9PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGU7TAAKcHJvcGVydGllc3EAfgAQeHB+cQB+AAp0AAdVTktOT1dOfnIARm9yZy5vcGVuc2VhcmNoLnNxbC5vcGVuc2VhcmNoLmRhdGEudHlwZS5PcGVuU2VhcmNoRGF0YVR5cGUkTWFwcGluZ1R5cGUAAAAAAAAAABIAAHhxAH4AC3QABFRleHRzcgA8c2hhZGVkLmNvbS5nb29nbGUuY29tbW9uLmNvbGxlY3QuSW1tdXRhYmxlTWFwJFNlcmlhbGl6ZWRGb3JtAAAAAAAAAAACAAJMAARrZXlzdAASTGphdmEvbGFuZy9PYmplY3Q7TAAGdmFsdWVzcQB+ABt4cHVyABNbTGphdmEubGFuZy5PYmplY3Q7kM5YnxBzKWwCAAB4cAAAAAB1cQB+AB0AAAAAc3EAfgAAAAAAA3cEAAAAAnQAB2tleXdvcmRzcQB+ABF+cQB+AAp0AAZTVFJJTkd+cQB+ABd0AAdLZXl3b3JkcQB+ABx4dAAHYWRkcmVzc3NxAH4AD3EAfgAVcQB+ABhxAH4AHHNxAH4AAAAAAAN3BAAAAAB4dAAHYmFsYW5jZXEAfgAMdAAGZ2VuZGVyc3EAfgAPcQB+ABVxAH4AGHEAfgAcc3EAfgAAAAAAA3cEAAAAAnEAfgAhcQB+ACJ4dAAEY2l0eXNxAH4AD3EAfgAVcQB+ABhxAH4AHHNxAH4AAAAAAAN3BAAAAAJxAH4AIXEAfgAieHQACGVtcGxveWVyc3EAfgAPcQB+ABVxAH4AGHEAfgAcc3EAfgAAAAAAA3cEAAAAAnEAfgAhcQB+ACJ4dAAFc3RhdGVzcQB+AA9xAH4AFXEAfgAYcQB+ABxzcQB+AAAAAAADdwQAAAACcQB+ACFxAH4AInh0AANhZ2VxAH4ADHQABWVtYWlsc3EAfgAPcQB+ABVxAH4AGHEAfgAcc3EAfgAAAAAAA3cEAAAAAnEAfgAhcQB+ACJ4dAAIbGFzdG5hbWVzcQB+AA9xAH4AFXEAfgAYcQB+ABxzcQB+AAAAAAADdwQAAAACcQB+ACFxAH4AInh4eA==\\\"}\",\"lang\":\"opensearch_compounded_script\",\"params\":{\"utcTimestamp\":*}},\"boost\":1.0}},\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"],\"excludes\":[]},\"sort\":[{\"_doc\":{\"order\":\"asc\"}}]}, requestedTotalSize=5, pageSize=null, startFrom=0)])\n"}} \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_regex.json b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_regex.json new file mode 100644 index 
00000000000..1b72a2e91ff --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_regex.json @@ -0,0 +1 @@ +{"calcite":{"logical":"LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10])\n LogicalSort(fetch=[5])\n LogicalFilter(condition=[REGEXP_CONTAINS($10, '^[A-Z][a-z]+$':VARCHAR)])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n","physical":"EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}])\n EnumerableLimit(fetch=[5])\n EnumerableCalc(expr#0..16=[{inputs}], expr#17=['^[A-Z][a-z]+$':VARCHAR], expr#18=[REGEXP_CONTAINS($t10, $t17)], proj#0..16=[{exprs}], $condition=[$t18])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n"}} \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_regex_negated.json b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_regex_negated.json new file mode 100644 index 00000000000..85a3071c029 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_regex_negated.json @@ -0,0 +1 @@ +{"calcite":{"logical":"LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10])\n LogicalSort(fetch=[5])\n LogicalFilter(condition=[NOT(REGEXP_CONTAINS($10, '.*son$':VARCHAR))])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n","physical":"EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..16=[{inputs}], proj#0..10=[{exprs}])\n EnumerableLimit(fetch=[5])\n EnumerableCalc(expr#0..16=[{inputs}], 
expr#17=['.*son$':VARCHAR], expr#18=[REGEXP_CONTAINS($t10, $t17)], expr#19=[NOT($t18)], proj#0..16=[{exprs}], $condition=[$t19])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n"}} \ No newline at end of file diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 5b047f3b730..6fbbebadc78 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -73,6 +73,7 @@ commands | expandCommand | flattenCommand | reverseCommand + | regexCommand ; commandName @@ -106,6 +107,7 @@ commandName | TRENDLINE | EXPLAIN | REVERSE + | REGEX ; searchCommand @@ -241,6 +243,13 @@ pathElement pathArrayAccess : LT_CURLY (INTEGER_LITERAL)? RT_CURLY ; +regexCommand + : REGEX regexExpr + ; + +regexExpr + : field=qualifiedName operator=(EQUAL | NOT_EQUAL) pattern=stringLiteral + ; patternsMethod : PUNCT diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index b092d10054b..faab2628d63 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -84,6 +84,7 @@ import org.opensearch.sql.ast.tree.RangeBin; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.RareTopN.CommandType; +import org.opensearch.sql.ast.tree.Regex; import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; import org.opensearch.sql.ast.tree.Reverse; @@ -906,6 +907,15 @@ public UnresolvedPlan visitAppendcolCommand(OpenSearchPPLParser.AppendcolCommand return new AppendCol(override, subsearch.get()); } + @Override + public UnresolvedPlan visitRegexCommand(OpenSearchPPLParser.RegexCommandContext ctx) { + UnresolvedExpression field = internalVisitExpression(ctx.regexExpr().field); + boolean negated = ctx.regexExpr().operator.getType() == 
OpenSearchPPLParser.NOT_EQUAL; + Literal pattern = (Literal) internalVisitExpression(ctx.regexExpr().pattern); + + return new Regex(field, negated, pattern); + } + /** Get original text in query. */ private String getTextInQuery(ParserRuleContext ctx) { Token start = ctx.getStart(); diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index 2a6a767668b..e969bd33f8f 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -73,6 +73,7 @@ import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RangeBin; import org.opensearch.sql.ast.tree.RareTopN; +import org.opensearch.sql.ast.tree.Regex; import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; import org.opensearch.sql.ast.tree.Reverse; @@ -442,6 +443,16 @@ public String visitParse(Parse node, String context) { : StringUtils.format("%s | %s %s '%s'", child, commandName, source, regex); } + @Override + public String visitRegex(Regex node, String context) { + String child = node.getChild().get(0).accept(this, context); + String operator = node.isNegated() ? 
Regex.NOT_EQUALS_OPERATOR : Regex.EQUALS_OPERATOR; + String pattern = MASK_LITERAL; + + String field = visitExpression(node.getField()); + return StringUtils.format("%s | regex %s%s%s", child, field, operator, pattern); + } + @Override public String visitFlatten(Flatten node, String context) { String child = node.getChild().getFirst().accept(this, context); @@ -734,5 +745,11 @@ public String visitCast(Cast node, String context) { String expr = analyze(node.getExpression(), context); return StringUtils.format("cast(%s as %s)", expr, node.getConvertedType().toString()); } + + @Override + public String visitQualifiedName( + org.opensearch.sql.ast.expression.QualifiedName node, String context) { + return String.join(".", node.getParts()); + } } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRegexTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRegexTest.java new file mode 100644 index 00000000000..cfc0722bcfb --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRegexTest.java @@ -0,0 +1,172 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.test.CalciteAssert; +import org.junit.Test; + +public class CalcitePPLRegexTest extends CalcitePPLAbstractTest { + public CalcitePPLRegexTest() { + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); + } + + @Test + public void testRegexBasic() { + String ppl = "source=EMP | regex ENAME='A.*' | fields ENAME, JOB"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], JOB=[$2])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, 'A.*':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, 
`JOB`\n" + + "FROM `scott`.`EMP`\n" + + "WHERE REGEXP_CONTAINS(`ENAME`, 'A.*')"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexChainedFilters() { + String ppl = "source=EMP | regex ENAME='A.*' | regex JOB='.*CLERK' | fields ENAME, JOB"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], JOB=[$2])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($2, '.*CLERK':VARCHAR)])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, 'A.*':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, `JOB`\n" + + "FROM (SELECT *\n" + + "FROM `scott`.`EMP`\n" + + "WHERE REGEXP_CONTAINS(`ENAME`, 'A.*')) `t`\n" + + "WHERE REGEXP_CONTAINS(`JOB`, '.*CLERK')"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexWithNotEqual() { + String ppl = "source=EMP | regex ENAME!='A.*' | fields ENAME, JOB"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], JOB=[$2])\n" + + " LogicalFilter(condition=[NOT(REGEXP_CONTAINS($1, 'A.*':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, `JOB`\n" + + "FROM `scott`.`EMP`\n" + + "WHERE NOT REGEXP_CONTAINS(`ENAME`, 'A.*')"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexComplexPattern() { + String ppl = "source=EMP | regex ENAME='[A-Z]{2,}' | fields ENAME"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '[A-Z]{2,}':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`\n" + "FROM `scott`.`EMP`\n" + "WHERE REGEXP_CONTAINS(`ENAME`, '[A-Z]{2,}')"; + verifyPPLToSparkSQL(root, expectedSparkSql); 
+ } + + @Test + public void testRegexWithEscapedCharacters() { + String ppl = "source=EMP | regex JOB='SALES\\sMAN' | fields JOB"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(JOB=[$2])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($2, 'SALES\\sMAN':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `JOB`\n" + "FROM `scott`.`EMP`\n" + "WHERE REGEXP_CONTAINS(`JOB`, 'SALES\\sMAN')"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexChainedCommands() { + String ppl = "source=EMP | regex ENAME='A.*' | fields ENAME | sort ENAME | head 5"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalSort(sort0=[$0], dir0=[ASC-nulls-first], fetch=[5])\n" + + " LogicalProject(ENAME=[$1])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, 'A.*':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`\n" + + "FROM `scott`.`EMP`\n" + + "WHERE REGEXP_CONTAINS(`ENAME`, 'A.*')\n" + + "ORDER BY `ENAME`\n" + + "LIMIT 5"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexWithAggregation() { + String ppl = "source=EMP | regex JOB='.*CLERK' | stats count() by JOB"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(count()=[$1], JOB=[$0])\n" + + " LogicalAggregate(group=[{0}], count()=[COUNT()])\n" + + " LogicalProject(JOB=[$2])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($2, '.*CLERK':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT COUNT(*) `count()`, `JOB`\n" + + "FROM `scott`.`EMP`\n" + + "WHERE REGEXP_CONTAINS(`JOB`, '.*CLERK')\n" + + "GROUP BY `JOB`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexCaseInsensitive() { 
+ String ppl = "source=EMP | regex ENAME='(?i)smith' | fields ENAME"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1])\n" + + " LogicalFilter(condition=[REGEXP_CONTAINS($1, '(?i)smith':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`\n" + "FROM `scott`.`EMP`\n" + "WHERE REGEXP_CONTAINS(`ENAME`, '(?i)smith')"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRegexWithNonStringFieldThrowsException() { + String ppl = "source=EMP | regex EMPNO='123.*'"; + try { + getRelNode(ppl); + fail("Expected IllegalArgumentException for non-string field type"); + } catch (IllegalArgumentException e) { + assertTrue( + "Expected error message about field type", + e.getMessage().contains("Regex command requires field of string type")); + assertTrue("Expected error message to mention field name", e.getMessage().contains("EMPNO")); + } + } +} diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index e453de711f4..85ece656ddf 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -511,6 +511,15 @@ public void testPatterns() { + " variable_count_threshold=5")); } + @Test + public void testRegex() { + assertEquals("source=t | regex field=***", anonymize("source=t | regex field='pattern'")); + assertEquals("source=t | regex field!=***", anonymize("source=t | regex field!='pattern'")); + assertEquals( + "source=t | regex email=*** | fields + email", + anonymize("source=t | regex email='.*@domain.com' | fields email")); + } + private String anonymize(String query) { AstBuilder astBuilder = new AstBuilder(query, settings); return anonymize(astBuilder.visit(parser.parse(query)));