diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java b/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java index c84ff79a606..c3b1d975ac1 100644 --- a/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java +++ b/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java @@ -23,38 +23,52 @@ public class Rex extends UnresolvedPlan { public enum RexMode { - EXTRACT + EXTRACT, + SED } /** Field to extract from. */ private final UnresolvedExpression field; - /** Pattern with named capture groups. */ + /** Pattern with named capture groups or sed expression. */ private final Literal pattern; - /** Rex mode (only EXTRACT supported). */ + /** Rex mode (extract or sed). */ private final RexMode mode; /** Maximum number of matches (optional). */ private final Optional maxMatch; + /** Offset field name for position tracking (optional). */ + private final Optional offsetField; + /** Child Plan. */ @Setter private UnresolvedPlan child; public Rex(UnresolvedExpression field, Literal pattern) { - this(field, pattern, RexMode.EXTRACT, Optional.empty()); + this(field, pattern, RexMode.EXTRACT, Optional.empty(), Optional.empty()); } public Rex(UnresolvedExpression field, Literal pattern, Optional maxMatch) { - this(field, pattern, RexMode.EXTRACT, maxMatch); + this(field, pattern, RexMode.EXTRACT, maxMatch, Optional.empty()); } public Rex( UnresolvedExpression field, Literal pattern, RexMode mode, Optional maxMatch) { + this(field, pattern, mode, maxMatch, Optional.empty()); + } + + public Rex( + UnresolvedExpression field, + Literal pattern, + RexMode mode, + Optional maxMatch, + Optional offsetField) { this.field = field; this.pattern = pattern; this.mode = mode; this.maxMatch = maxMatch; + this.offsetField = offsetField; } @Override diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 5261d438863..67aace5f203 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -217,6 +217,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) { RexNode fieldRex = rexVisitor.analyze(node.getField(), context); String patternStr = (String) node.getPattern().getValue(); + if (node.getMode() == Rex.RexMode.SED) { + RexNode sedCall = createOptimizedSedCall(fieldRex, patternStr, context); + String fieldName = node.getField().toString(); + projectPlusOverriding(List.of(sedCall), List.of(fieldName), context); + return context.relBuilder.peek(); + } + List namedGroups = RegexCommonUtils.getNamedGroupCandidates(patternStr); if (namedGroups.isEmpty()) { @@ -251,6 +258,17 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) { newFieldNames.add(namedGroups.get(i)); } + if (node.getOffsetField().isPresent()) { + RexNode offsetCall = + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.REX_OFFSET, + fieldRex, + context.rexBuilder.makeLiteral(patternStr)); + newFields.add(offsetCall); + newFieldNames.add(node.getOffsetField().get()); + } + projectPlusOverriding(newFields, newFieldNames, context); return context.relBuilder.peek(); } @@ -2136,4 +2154,115 @@ private void buildExpandRelNode( context.relBuilder.rename(names); } } + + /** Creates an optimized sed call using native Calcite functions */ + private RexNode createOptimizedSedCall( + RexNode fieldRex, String sedExpression, CalcitePlanContext context) { + if (sedExpression.startsWith("s/")) { + return createOptimizedSubstitution(fieldRex, sedExpression, context); + } else if (sedExpression.startsWith("y/")) { + return createOptimizedTransliteration(fieldRex, sedExpression, context); + } else { + throw new RuntimeException("Unsupported sed pattern: " + sedExpression); + } + } + + /** Creates optimized substitution calls for s/pattern/replacement/flags syntax. */ + private RexNode createOptimizedSubstitution( + RexNode fieldRex, String sedExpression, CalcitePlanContext context) { + try { + // Parse sed substitution: s/pattern/replacement/flags + if (!sedExpression.matches("s/.+/.*/.*")) { + throw new IllegalArgumentException("Invalid sed substitution format"); + } + + // Find the delimiters - sed format is s/pattern/replacement/flags + int firstDelimiter = sedExpression.indexOf('/', 2); // First '/' after 's/' + int secondDelimiter = sedExpression.indexOf('/', firstDelimiter + 1); // Second '/' + int thirdDelimiter = sedExpression.indexOf('/', secondDelimiter + 1); // Third '/' (optional) + + if (firstDelimiter == -1 || secondDelimiter == -1) { + throw new IllegalArgumentException("Invalid sed substitution format"); + } + + String pattern = sedExpression.substring(2, firstDelimiter); + String replacement = sedExpression.substring(firstDelimiter + 1, secondDelimiter); + String flags = + secondDelimiter + 1 < sedExpression.length() + ? sedExpression.substring(secondDelimiter + 1) + : ""; + + // Convert sed backreferences (\1, \2) to Java style ($1, $2) + String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1"); + + if (flags.isEmpty()) { + // 3-parameter REGEXP_REPLACE + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_3, + fieldRex, + context.rexBuilder.makeLiteral(pattern), + context.rexBuilder.makeLiteral(javaReplacement)); + } else if (flags.matches("[gi]+")) { + // 4-parameter REGEXP_REPLACE with flags + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4, + fieldRex, + context.rexBuilder.makeLiteral(pattern), + context.rexBuilder.makeLiteral(javaReplacement), + context.rexBuilder.makeLiteral(flags)); + } else if (flags.matches("\\d+")) { + // 5-parameter REGEXP_REPLACE with occurrence + int occurrence = Integer.parseInt(flags); + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5, + fieldRex, + context.rexBuilder.makeLiteral(pattern), + context.rexBuilder.makeLiteral(javaReplacement), + context.relBuilder.literal(1), // start position + context.relBuilder.literal(occurrence)); + } else { + throw new RuntimeException( + "Unsupported sed flags: " + flags + " in expression: " + sedExpression); + } + } catch (Exception e) { + throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e); + } + } + + /** Creates optimized transliteration calls for y/from/to/ syntax. */ + private RexNode createOptimizedTransliteration( + RexNode fieldRex, String sedExpression, CalcitePlanContext context) { + try { + // Parse sed transliteration: y/from/to/ + if (!sedExpression.matches("y/.+/.*/.*")) { + throw new IllegalArgumentException("Invalid sed transliteration format"); + } + + int firstSlash = sedExpression.indexOf('/', 1); + int secondSlash = sedExpression.indexOf('/', firstSlash + 1); + int thirdSlash = sedExpression.indexOf('/', secondSlash + 1); + + if (firstSlash == -1 || secondSlash == -1) { + throw new IllegalArgumentException("Invalid sed transliteration format"); + } + + String from = sedExpression.substring(firstSlash + 1, secondSlash); + String to = + sedExpression.substring( + secondSlash + 1, thirdSlash != -1 ? thirdSlash : sedExpression.length()); + + // Use Calcite's native TRANSLATE3 function + return PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.INTERNAL_TRANSLATE3, + fieldRex, + context.rexBuilder.makeLiteral(from), + context.rexBuilder.makeLiteral(to)); + } catch (Exception e) { + throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e); + } + } } diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index ded746e8657..a7ce86b9731 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -222,6 +222,7 @@ public enum BuiltinFunctionName { REGEX_MATCH(FunctionName.of("regex_match")), REX_EXTRACT(FunctionName.of("REX_EXTRACT")), REX_EXTRACT_MULTI(FunctionName.of("REX_EXTRACT_MULTI")), + REX_OFFSET(FunctionName.of("REX_OFFSET")), REPLACE(FunctionName.of("replace")), REVERSE(FunctionName.of("reverse")), RIGHT(FunctionName.of("right")), diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index 076dfbac46f..0eeb4f59812 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -58,6 +58,7 @@ import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction; import org.opensearch.sql.expression.function.udf.RexExtractFunction; import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction; +import org.opensearch.sql.expression.function.udf.RexOffsetFunction; import org.opensearch.sql.expression.function.udf.SpanFunction; import org.opensearch.sql.expression.function.udf.condition.EarliestFunction; import org.opensearch.sql.expression.function.udf.condition.EnhancedCoalesceFunction; @@ -406,6 +407,7 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT"); public static final SqlOperator REX_EXTRACT_MULTI = new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI"); + public static final SqlOperator REX_OFFSET = new RexOffsetFunction().toUDF("REX_OFFSET"); // Aggregation functions public static final SqlAggFunction AVG_NULLABLE = new NullableSqlAvgAggFunction(SqlKind.AVG); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index 74873e7a544..411d9cf237f 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -165,6 +165,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.REVERSE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT_MULTI; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_OFFSET; import static org.opensearch.sql.expression.function.BuiltinFunctionName.RIGHT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.RINT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.ROUND; @@ -715,6 +716,7 @@ void populate() { registerOperator(MULTI_MATCH, PPLBuiltinOperators.MULTI_MATCH); registerOperator(REX_EXTRACT, PPLBuiltinOperators.REX_EXTRACT); registerOperator(REX_EXTRACT_MULTI, PPLBuiltinOperators.REX_EXTRACT_MULTI); + registerOperator(REX_OFFSET, PPLBuiltinOperators.REX_OFFSET); // Register PPL Datetime UDF operator registerOperator(TIMESTAMP, PPLBuiltinOperators.TIMESTAMP); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/RexOffsetFunction.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/RexOffsetFunction.java new file mode 100644 index 00000000000..c52b27537e5 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/function/udf/RexOffsetFunction.java @@ -0,0 +1,94 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.apache.calcite.adapter.enumerable.NotNullImplementor; +import org.apache.calcite.adapter.enumerable.NullPolicy; +import org.apache.calcite.adapter.enumerable.RexToLixTranslator; +import org.apache.calcite.linq4j.tree.Expression; +import org.apache.calcite.linq4j.tree.Expressions; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.sql.type.ReturnTypes; +import org.apache.calcite.sql.type.SqlReturnTypeInference; +import org.opensearch.sql.calcite.utils.PPLOperandTypes; +import org.opensearch.sql.expression.function.ImplementorUDF; +import org.opensearch.sql.expression.function.UDFOperandMetadata; + +/** Custom REX_OFFSET function for calculating regex match positions. */ +public final class RexOffsetFunction extends ImplementorUDF { + + public RexOffsetFunction() { + super(new RexOffsetImplementor(), NullPolicy.ARG0); + } + + @Override + public SqlReturnTypeInference getReturnTypeInference() { + return ReturnTypes.VARCHAR_2000_NULLABLE; + } + + @Override + public UDFOperandMetadata getOperandMetadata() { + return PPLOperandTypes.STRING_STRING; + } + + private static class RexOffsetImplementor implements NotNullImplementor { + + @Override + public Expression implement( + RexToLixTranslator translator, RexCall call, List translatedOperands) { + Expression field = translatedOperands.get(0); + Expression pattern = translatedOperands.get(1); + + return Expressions.call(RexOffsetFunction.class, "calculateOffsets", field, pattern); + } + } + + public static String calculateOffsets(String text, String patternStr) { + if (text == null || patternStr == null) { + return null; + } + + try { + Pattern pattern = Pattern.compile(patternStr); + Matcher matcher = pattern.matcher(text); + + if (!matcher.find()) { + return null; + } + + List offsetPairs = new java.util.ArrayList<>(); + + Pattern namedGroupPattern = Pattern.compile("\\(\\?<([^>]+)>"); + Matcher namedGroupMatcher = namedGroupPattern.matcher(patternStr); + + int groupIndex = 1; + + while (namedGroupMatcher.find()) { + String groupName = namedGroupMatcher.group(1); + + if (groupIndex <= matcher.groupCount()) { + int start = matcher.start(groupIndex); + int end = matcher.end(groupIndex); + + if (start >= 0 && end >= 0) { + offsetPairs.add(groupName + "=" + start + "-" + (end - 1)); + } + } + groupIndex++; + } + + java.util.Collections.sort(offsetPairs); + return offsetPairs.isEmpty() ? null : String.join("&", offsetPairs); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + "Invalid regex pattern in rex command: " + e.getMessage(), e); + } + } +} diff --git a/core/src/test/java/org/opensearch/sql/expression/function/udf/RexOffsetFunctionTest.java b/core/src/test/java/org/opensearch/sql/expression/function/udf/RexOffsetFunctionTest.java new file mode 100644 index 00000000000..7c2777ba9a3 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/function/udf/RexOffsetFunctionTest.java @@ -0,0 +1,84 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.regex.PatternSyntaxException; +import org.junit.jupiter.api.Test; + +public class RexOffsetFunctionTest { + + @Test + public void testCalculateOffsetsWithSingleNamedGroup() { + String text = "SMITH"; + String pattern = "(?[A-Z])"; + String result = RexOffsetFunction.calculateOffsets(text, pattern); + assertEquals("first=0-0", result); + } + + @Test + public void testCalculateOffsetsWithMultipleNamedGroups() { + String text = "SMITH"; + String pattern = "(?[A-Z])(?[A-Z]+)"; + String result = RexOffsetFunction.calculateOffsets(text, pattern); + assertEquals("first=0-0&rest=1-4", result); + } + + @Test + public void testCalculateOffsetsWithNoMatch() { + String text = "smith"; + String pattern = "(?[0-9])"; + String result = RexOffsetFunction.calculateOffsets(text, pattern); + assertNull(result); + } + + @Test + public void testCalculateOffsetsWithNullInput() { + String result1 = RexOffsetFunction.calculateOffsets(null, "(?.*)"); + assertNull(result1); + + String result2 = RexOffsetFunction.calculateOffsets("test", null); + assertNull(result2); + + String result3 = RexOffsetFunction.calculateOffsets(null, null); + assertNull(result3); + } + + @Test + public void testCalculateOffsetsWithInvalidPattern() { + String text = "test"; + String pattern = "(? RexOffsetFunction.calculateOffsets(text, pattern)); + + assertTrue(exception.getMessage().contains("Invalid regex pattern in rex command")); + assertTrue(exception.getCause() instanceof PatternSyntaxException); + } + + @Test + public void testCalculateOffsetsWithComplexPattern() { + String text = "ABC123DEF"; + String pattern = "(?[A-Z]+)(?[0-9]+)(?[A-Z]+)"; + String result = RexOffsetFunction.calculateOffsets(text, pattern); + assertEquals("letters=0-2&moreletters=6-8&numbers=3-5", result); + } + + @Test + public void testCalculateOffsetsWithEmptyMatch() { + String text = "test"; + String pattern = "(?)test"; + String result = RexOffsetFunction.calculateOffsets(text, pattern); + // Empty groups should have start == end but end-1 might be negative, so handle appropriately + assertEquals("empty=0--1", result); + } +} diff --git a/docs/user/ppl/cmd/rex.rst b/docs/user/ppl/cmd/rex.rst index c816ed0fbf9..05d0e3757f8 100644 --- a/docs/user/ppl/cmd/rex.rst +++ b/docs/user/ppl/cmd/rex.rst @@ -19,11 +19,23 @@ Version Syntax ============ -rex field= [max_match=] +rex [mode=] field= [max_match=] [offset_field=] * field: mandatory. The field must be a string field to extract data from. * pattern: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using ``(?pattern)`` syntax. +* mode: optional. Either ``extract`` (default) or ``sed``. + + - **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior. + - **sed mode**: Performs text substitution on the field using sed-style patterns: + + - ``s/pattern/replacement/`` - Replace first occurrence + - ``s/pattern/replacement/g`` - Replace all occurrences (global) + - ``s/pattern/replacement/n`` - Replace only the nth occurrence (where n is a number) + - ``y/from_chars/to_chars/`` - Character-by-character transliteration + - Backreferences: ``\1``, ``\2``, etc. reference captured groups in replacement + * max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable via ``plugins.ppl.rex.max_match.limit``). +* offset_field: optional string. Field name to store the character offset positions of matches. Only available in extract mode. Example 1: Basic Field Extraction ================================== @@ -77,7 +89,41 @@ PPL query:: +--------------------+------------------+ -Example 4: Complex Email Pattern +Example 4: Text Replacement with mode=sed +========================================== + +Replace email domains using sed mode for text substitution. The extracted field is returned as string type. + +PPL query:: + + os> source=accounts | rex field=email mode=sed "s/@.*/@company.com/" | fields email | head 2 ; + fetched rows / total rows = 2/2 + +------------------------+ + | email | + |------------------------| + | amberduke@company.com | + | hattiebond@company.com | + +------------------------+ + + +Example 5: Using offset_field +============================== + +Track the character positions where matches occur. Extracted fields are string type, and the offset_field is also string type. + +PPL query:: + + os> source=accounts | rex field=email "(?[^@]+)@(?[^.]+)" offset_field=matchpos | fields email, username, domain, matchpos | head 2 ; + fetched rows / total rows = 2/2 + +-----------------------+------------+--------+---------------------------+ + | email | username | domain | matchpos | + |-----------------------+------------+--------+---------------------------| + | amberduke@pyrami.com | amberduke | pyrami | domain=10-15&username=0-8 | + | hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 | + +-----------------------+------------+--------+---------------------------+ + + +Example 6: Complex Email Pattern ================================= Extract comprehensive email components including top-level domain. All extracted fields are returned as string type. @@ -94,7 +140,7 @@ PPL query:: +-----------------------+------------+--------+-----+ -Example 5: Chaining Multiple rex Commands +Example 7: Chaining Multiple rex Commands ========================================== Extract initial letters from both first and last names. All extracted fields are returned as string type. @@ -112,7 +158,7 @@ PPL query:: +-----------+----------+--------------+-------------+ -Example 6: Named Capture Group Limitations +Example 8: Named Capture Group Limitations ============================================ Demonstrates naming restrictions for capture groups. Group names cannot contain underscores due to Java regex limitations. @@ -135,7 +181,7 @@ Correct PPL query without underscores:: +-----------------------+------------+-------------+ -Example 7: Max Match Limit Protection +Example 9: Max Match Limit Protection ====================================== Demonstrates the max_match limit protection mechanism. When max_match=0 (unlimited) is specified, the system automatically caps it to prevent memory exhaustion. @@ -167,6 +213,8 @@ Pattern Type Java Regex Java Regex Named Groups Required Yes Yes Filtering by Match No Yes Multiple Matches Yes No +Text Substitution Yes No +Offset Tracking Yes No Underscores in Group Names No No ============================= ============ ============ diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index b8f12d3c491..ff7944c5aa0 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -33,6 +33,7 @@ PARSE: 'PARSE'; SPATH: 'SPATH'; REGEX: 'REGEX'; REX: 'REX'; +SED: 'SED'; PUNCT: 'PUNCT'; GROK: 'GROK'; PATTERN: 'PATTERN'; @@ -54,6 +55,7 @@ FREQUENCY_THRESHOLD_PERCENTAGE: 'FREQUENCY_THRESHOLD_PERCENTAGE'; METHOD: 'METHOD'; MAX_SAMPLE_COUNT: 'MAX_SAMPLE_COUNT'; MAX_MATCH: 'MAX_MATCH'; +OFFSET_FIELD: 'OFFSET_FIELD'; BUFFER_LIMIT: 'BUFFER_LIMIT'; LABEL: 'LABEL'; AGGREGATION: 'AGGREGATION'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index c757ef497d2..ee261d79a53 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -286,7 +286,8 @@ rexExpr rexOption : MAX_MATCH EQUAL maxMatch=integerLiteral - | MODE EQUAL EXTRACT + | MODE EQUAL (EXTRACT | SED) + | OFFSET_FIELD EQUAL offsetField=qualifiedName ; patternsMethod : PUNCT diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 99ed93a5731..68e4575c88b 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -997,6 +997,7 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx) Literal pattern = (Literal) internalVisitExpression(ctx.rexExpr().pattern); Rex.RexMode mode = Rex.RexMode.EXTRACT; Optional maxMatch = Optional.empty(); + Optional offsetField = Optional.empty(); for (OpenSearchPPLParser.RexOptionContext optionCtx : ctx.rexExpr().rexOption()) { if (optionCtx.maxMatch != null) { @@ -1005,6 +1006,18 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx) if (optionCtx.EXTRACT() != null) { mode = Rex.RexMode.EXTRACT; } + if (optionCtx.SED() != null) { + mode = Rex.RexMode.SED; + } + if (optionCtx.offsetField != null) { + offsetField = Optional.of(optionCtx.offsetField.getText()); + } + } + + if (mode == Rex.RexMode.SED && offsetField.isPresent()) { + throw new IllegalArgumentException( + "Rex command: offset_field cannot be used with mode=sed. " + + "The offset_field option is only supported in extract mode."); } int maxMatchLimit = @@ -1029,7 +1042,7 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx) effectiveMaxMatch = userMaxMatch; } - return new Rex(field, pattern, mode, Optional.of(effectiveMaxMatch)); + return new Rex(field, pattern, mode, Optional.of(effectiveMaxMatch), offsetField); } /** Get original text in query. */ diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index 3c5001f39df..e17fc0272da 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -471,6 +471,10 @@ public String visitRex(Rex node, String context) { command.append(" max_match=").append(node.getMaxMatch().get()); } + if (node.getOffsetField().isPresent()) { + command.append(" offset_field=").append(node.getOffsetField().get()); + } + return command.toString(); } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java index 10072319ae2..c73a57eadfd 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java @@ -226,4 +226,77 @@ public void testRexWithMaxMatchAtLimit() { + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + @Test + public void testRexSedMode() { + String ppl = "source=EMP | rex field=ENAME mode=sed 's/([A-Z])([a-z]*)/\\1/' | fields ENAME"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[REGEXP_REPLACE($1, '([A-Z])([a-z]*)', '$1')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT REGEXP_REPLACE(`ENAME`, '([A-Z])([a-z]*)', '$1') `ENAME`\n" + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRexWithOffsetField() { + String ppl = + "source=EMP | rex field=ENAME '(?[A-Z]).*' offset_field=offsets | fields ENAME," + + " first, offsets"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?[A-Z]).*', 1)]," + + " offsets=[REX_OFFSET($1, '(?[A-Z]).*')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?[A-Z]).*', 1) `first`," + + " `REX_OFFSET`(`ENAME`, '(?[A-Z]).*') `offsets`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRexWithMultipleNamedGroupsAndOffsetField() { + String ppl = + "source=EMP | rex field=ENAME '(?[A-Z])(?.*)' offset_field=positions | fields" + + " ENAME, first, rest, positions"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?[A-Z])(?.*)', 1)]," + + " rest=[REX_EXTRACT($1, '(?[A-Z])(?.*)', 2)], positions=[REX_OFFSET($1," + + " '(?[A-Z])(?.*)')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?[A-Z])(?.*)', 1) `first`," + + " `REX_EXTRACT`(`ENAME`, '(?[A-Z])(?.*)', 2) `rest`," + + " `REX_OFFSET`(`ENAME`, '(?[A-Z])(?.*)') `positions`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testRexWithMaxMatchAndOffsetField() { + String ppl = + "source=EMP | rex field=ENAME '(?[A-Z])' max_match=3 offset_field=positions |" + + " fields ENAME, letter, positions"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?[A-Z])', 1, 3)]," + + " positions=[REX_OFFSET($1, '(?[A-Z])')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?[A-Z])', 1, 3) `letter`," + + " `REX_OFFSET`(`ENAME`, '(?[A-Z])') `positions`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index b55d36dd367..f059becc4e2 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -1090,4 +1090,10 @@ public void testBinCommandDuplicateParameter() { // Test that duplicate parameters throw an exception plan("search source=test | bin index_field span=10 span=20"); } + + @Test(expected = IllegalArgumentException.class) + public void testRexSedModeWithOffsetFieldThrowsException() { + // Test that SED mode and offset_field cannot be used together (align with Splunk behavior) + plan("source=test | rex field=email mode=sed offset_field=matchpos \"s/@.*/@company.com/\""); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 76643c20a8c..2946ba297eb 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -575,6 +575,28 @@ public void testRexCommand() { anonymize("source=t | rex field=name \"(?[A-Z])\" max_match=3")); } + @Test + public void testRexSedMode() { + when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10); + + assertEquals( + "source=t | rex field=lastname mode=sed \"s/^[A-Z]/X/\" max_match=1", + anonymize("source=t | rex field=lastname mode=sed \"s/^[A-Z]/X/\"")); + assertEquals( + "source=t | rex field=data mode=sed \"s/sensitive/clean/g\" max_match=1 | fields + data", + anonymize("source=t | rex field=data mode=sed \"s/sensitive/clean/g\" | fields data")); + } + + @Test + public void testRexWithOffsetField() { + when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10); + + assertEquals( + "source=t | rex field=message mode=extract \"(?[a-z]+)\" max_match=1" + + " offset_field=pos", + anonymize("source=t | rex field=message \"(?[a-z]+)\" offset_field=pos")); + } + private String anonymize(String query) { AstBuilder astBuilder = new AstBuilder(query, settings); return anonymize(astBuilder.visit(parser.parse(query)));