diff --git a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java index a56ac1d30c3..91666e40c01 100644 --- a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java +++ b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java @@ -29,6 +29,7 @@ public enum Key { PATTERN_MODE("plugins.ppl.pattern.mode"), PATTERN_MAX_SAMPLE_COUNT("plugins.ppl.pattern.max.sample.count"), PATTERN_BUFFER_LIMIT("plugins.ppl.pattern.buffer.limit"), + PPL_REX_MAX_MATCH_LIMIT("plugins.ppl.rex.max_match.limit"), /** Enable Calcite as execution engine */ CALCITE_ENGINE_ENABLED("plugins.calcite.enabled"), diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java index 73385a23c52..29042a2b36f 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java @@ -85,6 +85,7 @@ import org.opensearch.sql.ast.tree.RelationSubquery; import org.opensearch.sql.ast.tree.Rename; import org.opensearch.sql.ast.tree.Reverse; +import org.opensearch.sql.ast.tree.Rex; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; import org.opensearch.sql.ast.tree.SubqueryAlias; @@ -751,6 +752,11 @@ public LogicalPlan visitRegex(Regex node, AnalysisContext context) { throw getOnlyForCalciteException("Regex"); } + @Override + public LogicalPlan visitRex(Rex node, AnalysisContext context) { + throw getOnlyForCalciteException("Rex"); + } + @Override public LogicalPlan visitPaginate(Paginate paginate, AnalysisContext context) { LogicalPlan child = paginate.getChild().get(0).accept(this, context); diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index 4b7b2cc6354..a2d54d3ec05 100644 --- 
a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -73,6 +73,7 @@ import org.opensearch.sql.ast.tree.RelationSubquery; import org.opensearch.sql.ast.tree.Rename; import org.opensearch.sql.ast.tree.Reverse; +import org.opensearch.sql.ast.tree.Rex; import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SubqueryAlias; @@ -270,6 +271,10 @@ public T visitRegex(Regex node, C context) { return visitChildren(node, context); } + public T visitRex(Rex node, C context) { + return visitChildren(node, context); + } + public T visitLambdaFunction(LambdaFunction node, C context) { return visitChildren(node, context); } diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java b/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java new file mode 100644 index 00000000000..c84ff79a606 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/tree/Rex.java @@ -0,0 +1,75 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import java.util.Optional; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.Setter; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.expression.Literal; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +/** AST node represent Rex field extraction operation. */ +@Getter +@ToString +@EqualsAndHashCode(callSuper = false) +public class Rex extends UnresolvedPlan { + + public enum RexMode { + EXTRACT + } + + /** Field to extract from. */ + private final UnresolvedExpression field; + + /** Pattern with named capture groups. */ + private final Literal pattern; + + /** Rex mode (only EXTRACT supported). 
*/ + private final RexMode mode; + + /** Maximum number of matches (optional). */ + private final Optional maxMatch; + + /** Child Plan. */ + @Setter private UnresolvedPlan child; + + public Rex(UnresolvedExpression field, Literal pattern) { + this(field, pattern, RexMode.EXTRACT, Optional.empty()); + } + + public Rex(UnresolvedExpression field, Literal pattern, Optional maxMatch) { + this(field, pattern, RexMode.EXTRACT, maxMatch); + } + + public Rex( + UnresolvedExpression field, Literal pattern, RexMode mode, Optional maxMatch) { + this.field = field; + this.pattern = pattern; + this.mode = mode; + this.maxMatch = maxMatch; + } + + @Override + public Rex attach(UnresolvedPlan child) { + this.child = child; + return this; + } + + @Override + public List getChild() { + return ImmutableList.of(child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitRex(this, context); + } +} diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index d99f068fde0..1cc513c6b23 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -108,6 +108,7 @@ import org.opensearch.sql.ast.tree.Regex; import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; +import org.opensearch.sql.ast.tree.Rex; import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; @@ -130,6 +131,7 @@ import org.opensearch.sql.exception.SemanticCheckException; import org.opensearch.sql.expression.function.BuiltinFunctionName; import org.opensearch.sql.expression.function.PPLFuncImpTable; +import org.opensearch.sql.expression.parse.RegexCommonUtils; import org.opensearch.sql.utils.ParseUtils; public class CalciteRelNodeVisitor extends 
AbstractNodeVisitor { @@ -208,6 +210,50 @@ public RelNode visitRegex(Regex node, CalcitePlanContext context) { return context.relBuilder.peek(); } + public RelNode visitRex(Rex node, CalcitePlanContext context) { + visitChildren(node, context); + + RexNode fieldRex = rexVisitor.analyze(node.getField(), context); + String patternStr = (String) node.getPattern().getValue(); + + List namedGroups = RegexCommonUtils.getNamedGroupCandidates(patternStr); + + if (namedGroups.isEmpty()) { + throw new IllegalArgumentException( + "Rex pattern must contain at least one named capture group"); + } + + List newFields = new ArrayList<>(); + List newFieldNames = new ArrayList<>(); + + for (int i = 0; i < namedGroups.size(); i++) { + RexNode extractCall; + if (node.getMaxMatch().isPresent() && node.getMaxMatch().get() > 1) { + extractCall = + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.REX_EXTRACT_MULTI, + fieldRex, + context.rexBuilder.makeLiteral(patternStr), + context.relBuilder.literal(i + 1), + context.relBuilder.literal(node.getMaxMatch().get())); + } else { + extractCall = + PPLFuncImpTable.INSTANCE.resolve( + context.rexBuilder, + BuiltinFunctionName.REX_EXTRACT, + fieldRex, + context.rexBuilder.makeLiteral(patternStr), + context.relBuilder.literal(i + 1)); + } + newFields.add(extractCall); + newFieldNames.add(namedGroups.get(i)); + } + + projectPlusOverriding(newFields, newFieldNames, context); + return context.relBuilder.peek(); + } + private boolean containsSubqueryExpression(Node expr) { if (expr == null) { return false; diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/PPLOperandTypes.java b/core/src/main/java/org/opensearch/sql/calcite/utils/PPLOperandTypes.java index faa51825edd..e6797ee9960 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/PPLOperandTypes.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/PPLOperandTypes.java @@ -51,6 +51,17 @@ private PPLOperandTypes() {} 
UDFOperandMetadata.wrap((FamilyOperandTypeChecker) OperandTypes.NUMERIC_NUMERIC); public static final UDFOperandMetadata STRING_INTEGER = UDFOperandMetadata.wrap(OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.INTEGER)); + public static final UDFOperandMetadata STRING_STRING_INTEGER = + UDFOperandMetadata.wrap( + OperandTypes.family( + SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER, SqlTypeFamily.INTEGER)); + public static final UDFOperandMetadata STRING_STRING_INTEGER_INTEGER = + UDFOperandMetadata.wrap( + OperandTypes.family( + SqlTypeFamily.CHARACTER, + SqlTypeFamily.CHARACTER, + SqlTypeFamily.INTEGER, + SqlTypeFamily.INTEGER)); public static final UDFOperandMetadata NUMERIC_NUMERIC_OPTIONAL_NUMERIC = UDFOperandMetadata.wrap( diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index 645ee5a2f1e..e4c023fa45c 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -219,6 +219,8 @@ public enum BuiltinFunctionName { POSITION(FunctionName.of("position")), REGEXP(FunctionName.of("regexp")), REGEX_MATCH(FunctionName.of("regex_match")), + REX_EXTRACT(FunctionName.of("REX_EXTRACT")), + REX_EXTRACT_MULTI(FunctionName.of("REX_EXTRACT_MULTI")), REPLACE(FunctionName.of("replace")), REVERSE(FunctionName.of("reverse")), RIGHT(FunctionName.of("right")), @@ -315,7 +317,10 @@ public enum BuiltinFunctionName { INTERNAL_UNCOLLECT_PATTERNS(FunctionName.of("uncollect_patterns")), INTERNAL_REGEXP_EXTRACT(FunctionName.of("regexp_extract"), true), INTERNAL_GROK(FunctionName.of("grok"), true), - INTERNAL_REGEXP_REPLACE_3(FunctionName.of("regexp_replace_3"), true); + INTERNAL_REGEXP_REPLACE_3(FunctionName.of("regexp_replace_3"), true), + INTERNAL_REGEXP_REPLACE_PG_4(FunctionName.of("regexp_replace_pg_4"), true), + 
INTERNAL_REGEXP_REPLACE_5(FunctionName.of("regexp_replace_5"), true), + INTERNAL_TRANSLATE3(FunctionName.of("translate3"), true); private final FunctionName name; private boolean isInternal; diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index 3ff889252f8..076dfbac46f 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -56,6 +56,8 @@ import org.opensearch.sql.expression.function.udf.CryptographicFunction; import org.opensearch.sql.expression.function.udf.GrokFunction; import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction; +import org.opensearch.sql.expression.function.udf.RexExtractFunction; +import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction; import org.opensearch.sql.expression.function.udf.SpanFunction; import org.opensearch.sql.expression.function.udf.condition.EarliestFunction; import org.opensearch.sql.expression.function.udf.condition.EnhancedCoalesceFunction; @@ -401,6 +403,9 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { public static final SqlOperator RANGE_BUCKET = new org.opensearch.sql.expression.function.udf.binning.RangeBucketFunction() .toUDF("RANGE_BUCKET"); + public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT"); + public static final SqlOperator REX_EXTRACT_MULTI = + new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI"); // Aggregation functions public static final SqlAggFunction AVG_NULLABLE = new NullableSqlAvgAggFunction(SqlKind.AVG); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index b422f61b11e..07fddd67470 100644 --- 
a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -83,6 +83,9 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_PATTERN_PARSER; import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_EXTRACT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_3; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_TRANSLATE3; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_BLANK; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_EMPTY; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NOT_NULL; @@ -159,6 +162,8 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.REGEX_MATCH; import static org.opensearch.sql.expression.function.BuiltinFunctionName.REPLACE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.REVERSE; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT_MULTI; import static org.opensearch.sql.expression.function.BuiltinFunctionName.RIGHT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.RINT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.ROUND; @@ -678,6 +683,9 @@ void populate() { registerOperator(SHA1, SqlLibraryOperators.SHA1); registerOperator(INTERNAL_REGEXP_EXTRACT, SqlLibraryOperators.REGEXP_EXTRACT); registerOperator(INTERNAL_REGEXP_REPLACE_3, SqlLibraryOperators.REGEXP_REPLACE_3); + 
registerOperator(INTERNAL_REGEXP_REPLACE_PG_4, SqlLibraryOperators.REGEXP_REPLACE_PG_4); + registerOperator(INTERNAL_REGEXP_REPLACE_5, SqlLibraryOperators.REGEXP_REPLACE_5); + registerOperator(INTERNAL_TRANSLATE3, SqlLibraryOperators.TRANSLATE3); // Register PPL UDF operator registerOperator(COSH, PPLBuiltinOperators.COSH); @@ -703,6 +711,8 @@ void populate() { registerOperator(SIMPLE_QUERY_STRING, PPLBuiltinOperators.SIMPLE_QUERY_STRING); registerOperator(QUERY_STRING, PPLBuiltinOperators.QUERY_STRING); registerOperator(MULTI_MATCH, PPLBuiltinOperators.MULTI_MATCH); + registerOperator(REX_EXTRACT, PPLBuiltinOperators.REX_EXTRACT); + registerOperator(REX_EXTRACT_MULTI, PPLBuiltinOperators.REX_EXTRACT_MULTI); // Register PPL Datetime UDF operator registerOperator(TIMESTAMP, PPLBuiltinOperators.TIMESTAMP); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractFunction.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractFunction.java new file mode 100644 index 00000000000..fc1a1d0bef6 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractFunction.java @@ -0,0 +1,71 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.apache.calcite.adapter.enumerable.NotNullImplementor; +import org.apache.calcite.adapter.enumerable.NullPolicy; +import org.apache.calcite.adapter.enumerable.RexToLixTranslator; +import org.apache.calcite.linq4j.tree.Expression; +import org.apache.calcite.linq4j.tree.Expressions; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.sql.type.ReturnTypes; +import org.apache.calcite.sql.type.SqlReturnTypeInference; +import org.opensearch.sql.calcite.utils.PPLOperandTypes; +import 
org.opensearch.sql.expression.function.ImplementorUDF; +import org.opensearch.sql.expression.function.UDFOperandMetadata; + +/** Custom REX_EXTRACT function for extracting regex named capture groups. */ +public final class RexExtractFunction extends ImplementorUDF { + + public RexExtractFunction() { + super(new RexExtractImplementor(), NullPolicy.ARG0); + } + + @Override + public SqlReturnTypeInference getReturnTypeInference() { + return ReturnTypes.VARCHAR_2000_NULLABLE; + } + + @Override + public UDFOperandMetadata getOperandMetadata() { + return PPLOperandTypes.STRING_STRING_INTEGER; + } + + private static class RexExtractImplementor implements NotNullImplementor { + + @Override + public Expression implement( + RexToLixTranslator translator, RexCall call, List translatedOperands) { + Expression field = translatedOperands.get(0); + Expression pattern = translatedOperands.get(1); + Expression groupIndex = translatedOperands.get(2); + + return Expressions.call(RexExtractFunction.class, "extractGroup", field, pattern, groupIndex); + } + } + + public static String extractGroup(String text, String pattern, int groupIndex) { + try { + Pattern compiledPattern = Pattern.compile(pattern); + Matcher matcher = compiledPattern.matcher(text); + + if (matcher.find() && groupIndex > 0 && groupIndex <= matcher.groupCount()) { + return matcher.group(groupIndex); + } + return null; + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + "Error in 'rex' command: Encountered the following error while compiling the regex '" + + pattern + + "': " + + e.getMessage()); + } + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunction.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunction.java new file mode 100644 index 00000000000..599a518f9ce --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunction.java @@ -0,0 +1,94 @@ +/* + * 
Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.apache.calcite.adapter.enumerable.NotNullImplementor; +import org.apache.calcite.adapter.enumerable.NullPolicy; +import org.apache.calcite.adapter.enumerable.RexToLixTranslator; +import org.apache.calcite.linq4j.tree.Expression; +import org.apache.calcite.linq4j.tree.Expressions; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.sql.type.SqlReturnTypeInference; +import org.apache.calcite.sql.type.SqlTypeName; +import org.opensearch.sql.calcite.utils.PPLOperandTypes; +import org.opensearch.sql.expression.function.ImplementorUDF; +import org.opensearch.sql.expression.function.UDFOperandMetadata; + +/** Custom REX_EXTRACT_MULTI function for extracting multiple regex matches. 
*/ +public final class RexExtractMultiFunction extends ImplementorUDF { + + public RexExtractMultiFunction() { + super(new RexExtractMultiImplementor(), NullPolicy.ARG0); + } + + @Override + public SqlReturnTypeInference getReturnTypeInference() { + return call -> { + var elementType = call.getTypeFactory().createSqlType(SqlTypeName.VARCHAR, 2000); + return call.getTypeFactory() + .createArrayType(call.getTypeFactory().createTypeWithNullability(elementType, true), -1); + }; + } + + @Override + public UDFOperandMetadata getOperandMetadata() { + return PPLOperandTypes.STRING_STRING_INTEGER_INTEGER; + } + + private static class RexExtractMultiImplementor implements NotNullImplementor { + + @Override + public Expression implement( + RexToLixTranslator translator, RexCall call, List translatedOperands) { + Expression field = translatedOperands.get(0); + Expression pattern = translatedOperands.get(1); + Expression groupIndex = translatedOperands.get(2); + Expression maxMatch = translatedOperands.get(3); + + return Expressions.call( + RexExtractMultiFunction.class, + "extractMultipleGroups", + field, + pattern, + groupIndex, + maxMatch); + } + } + + public static List extractMultipleGroups( + String text, String pattern, int groupIndex, int maxMatch) { + // Query planner already validates null inputs via NullPolicy.ARG0 + try { + Pattern compiledPattern = Pattern.compile(pattern); + Matcher matcher = compiledPattern.matcher(text); + List matches = new ArrayList<>(); + + int matchCount = 0; + while (matcher.find() && (maxMatch == 0 || matchCount < maxMatch)) { + if (groupIndex > 0 && groupIndex <= matcher.groupCount()) { + String match = matcher.group(groupIndex); + if (match != null) { + matches.add(match); + matchCount++; + } + } + } + + return matches.isEmpty() ? 
null : matches; + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + "Error in 'rex' command: Encountered the following error while compiling the regex '" + + pattern + + "': " + + e.getMessage()); + } + } +} diff --git a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java index 0d4973e36a4..db97379d565 100644 --- a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java +++ b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java @@ -1943,4 +1943,17 @@ public void regex_command_throws_unsupported_exception_with_legacy_engine() { assertEquals( "Regex is supported only when plugins.calcite.enabled=true", exception.getMessage()); } + + @Test + public void rex_command_throws_unsupported_operation_exception_in_legacy_engine() { + UnsupportedOperationException exception = + assertThrows( + UnsupportedOperationException.class, + () -> + analyze( + new org.opensearch.sql.ast.tree.Rex( + field("email"), stringLiteral("(?[^@]+)@(?.+)")) + .attach(relation("schema")))); + assertEquals("Rex is supported only when plugins.calcite.enabled=true", exception.getMessage()); + } } diff --git a/core/src/test/java/org/opensearch/sql/expression/function/udf/RexExtractFunctionTest.java b/core/src/test/java/org/opensearch/sql/expression/function/udf/RexExtractFunctionTest.java new file mode 100644 index 00000000000..1899c810df3 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/function/udf/RexExtractFunctionTest.java @@ -0,0 +1,216 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import 
org.junit.jupiter.api.Test; + +public class RexExtractFunctionTest { + + private final RexExtractFunction function = new RexExtractFunction(); + + @Test + public void testExtractGroup_BasicPattern() { + String text = "user@domain.com"; + String pattern = "([^@]+)@([^.]+)\\.(.+)"; + + // Extract username (group 1) + String result = RexExtractFunction.extractGroup(text, pattern, 1); + assertEquals("user", result); + + // Extract domain (group 2) + result = RexExtractFunction.extractGroup(text, pattern, 2); + assertEquals("domain", result); + + // Extract TLD (group 3) + result = RexExtractFunction.extractGroup(text, pattern, 3); + assertEquals("com", result); + } + + @Test + public void testExtractGroup_NamedGroups() { + String text = "John Smith age:30"; + String pattern = "(?\\w+ \\w+) age:(?\\d+)"; + + // Extract name (group 1) + String name = RexExtractFunction.extractGroup(text, pattern, 1); + assertEquals("John Smith", name); + + // Extract age (group 2) + String age = RexExtractFunction.extractGroup(text, pattern, 2); + assertEquals("30", age); + } + + @Test + public void testExtractGroup_NoMatch() { + String text = "This text has no numbers"; + String pattern = "(\\d+)"; + + String result = RexExtractFunction.extractGroup(text, pattern, 1); + assertNull(result); + } + + @Test + public void testExtractGroup_InvalidGroupIndex() { + String text = "abc123"; + String pattern = "(\\w+)(\\d+)"; + + // Group index 3 doesn't exist (only groups 1 and 2) + String result = RexExtractFunction.extractGroup(text, pattern, 3); + assertNull(result); + + // Group index 0 is not valid + result = RexExtractFunction.extractGroup(text, pattern, 0); + assertNull(result); + + // Negative group index + result = RexExtractFunction.extractGroup(text, pattern, -1); + assertNull(result); + } + + @Test + public void testExtractGroup_InvalidPattern() { + String text = "test string"; + String invalidPattern = "(?["; // Unclosed bracket + + IllegalArgumentException exception = + 
assertThrows( + IllegalArgumentException.class, + () -> { + RexExtractFunction.extractGroup(text, invalidPattern, 1); + }); + + String expectedMessage = + "Error in 'rex' command: Encountered the following error while compiling the regex" + + " '(?[':"; + assertEquals(true, exception.getMessage().startsWith(expectedMessage)); + } + + @Test + public void testExtractGroup_EmptyString() { + String text = ""; + String pattern = "(\\w+)"; + + String result = RexExtractFunction.extractGroup(text, pattern, 1); + assertNull(result); + } + + @Test + public void testExtractGroup_SingleCharacter() { + String text = "a"; + String pattern = "([a-z])"; + + String result = RexExtractFunction.extractGroup(text, pattern, 1); + assertEquals("a", result); + } + + @Test + public void testExtractGroup_ComplexPattern() { + String text = "Error: File not found at line 42 in script.py"; + String pattern = "Error: (.+) at line (\\d+) in ([^\\s]+)"; + + String errorMsg = RexExtractFunction.extractGroup(text, pattern, 1); + assertEquals("File not found", errorMsg); + + String lineNum = RexExtractFunction.extractGroup(text, pattern, 2); + assertEquals("42", lineNum); + + String filename = RexExtractFunction.extractGroup(text, pattern, 3); + assertEquals("script.py", filename); + } + + @Test + public void testExtractGroup_EmailPattern() { + String text = "Contact: john.doe@example.org for support"; + String pattern = "([\\w.]+)@([\\w.]+)\\.([a-z]+)"; + + String username = RexExtractFunction.extractGroup(text, pattern, 1); + assertEquals("john.doe", username); + + String domain = RexExtractFunction.extractGroup(text, pattern, 2); + assertEquals("example", domain); + + String tld = RexExtractFunction.extractGroup(text, pattern, 3); + assertEquals("org", tld); + } + + @Test + public void testExtractGroup_IPAddressPattern() { + String text = "Server IP: 192.168.1.100:8080"; + String pattern = "(\\d+)\\.(\\d+)\\.(\\d+)\\.(\\d+):(\\d+)"; + + String firstOctet = RexExtractFunction.extractGroup(text, 
pattern, 1); + assertEquals("192", firstOctet); + + String port = RexExtractFunction.extractGroup(text, pattern, 5); + assertEquals("8080", port); + } + + @Test + public void testExtractGroup_MultiplePatternSyntaxErrors() { + String text = "test"; + + // Test various invalid patterns + String[] invalidPatterns = { + "[unclosed", + "(?", + "*+invalid", + "(?P\\w+)", // Python-style named groups not supported in Java + "\\k" + }; + + for (String invalidPattern : invalidPatterns) { + IllegalArgumentException exception = + assertThrows( + IllegalArgumentException.class, + () -> { + RexExtractFunction.extractGroup(text, invalidPattern, 1); + }); + + String expectedPrefix = + "Error in 'rex' command: Encountered the following error while compiling the regex"; + assertEquals( + true, + exception.getMessage().startsWith(expectedPrefix), + "Error message should start with SPL-style prefix for pattern: " + invalidPattern); + } + } + + @Test + public void testExtractGroup_CaseSensitivity() { + String text = "Hello World"; + String pattern = "(hello)"; + + // Should not match due to case sensitivity + String result = RexExtractFunction.extractGroup(text, pattern, 1); + assertNull(result); + + // Case-insensitive flag in pattern + String caseInsensitivePattern = "(?i)(hello)"; + result = RexExtractFunction.extractGroup(text, caseInsensitivePattern, 1); + assertEquals("Hello", result); + } + + @Test + public void testReturnTypeInference() { + assertNotNull(function.getReturnTypeInference(), "Return type inference should not be null"); + } + + @Test + public void testOperandMetadata() { + assertNotNull(function.getOperandMetadata(), "Operand metadata should not be null"); + } + + @Test + public void testFunctionConstructor() { + RexExtractFunction testFunction = new RexExtractFunction(); + assertNotNull(testFunction, "Function should be properly initialized"); + } +} diff --git a/core/src/test/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunctionTest.java 
b/core/src/test/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunctionTest.java new file mode 100644 index 00000000000..07f9da9302e --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunctionTest.java @@ -0,0 +1,198 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.List; +import org.junit.jupiter.api.Test; + +public class RexExtractMultiFunctionTest { + + private final RexExtractMultiFunction function = new RexExtractMultiFunction(); + + @Test + public void testExtractMultipleGroups_BasicPattern() { + String text = "user1@domain1.com, user2@domain2.com, user3@domain3.com"; + String pattern = "(\\w+)@(\\w+)"; + + // Extract first group (usernames) with max 2 matches + List<String> result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 2); + assertEquals(Arrays.asList("user1", "user2"), result); + + // Extract second group (domains) with max 3 matches + result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 2, 3); + assertEquals(Arrays.asList("domain1", "domain2", "domain3"), result); + } + + @Test + public void testExtractMultipleGroups_NamedGroups() { + String text = "John Smith age:30, Jane Doe age:25, Bob Johnson age:35"; + String pattern = "(?<name>\\w+ \\w+) age:(?<age>\\d+)"; + + // Extract first group (names) with unlimited matches (maxMatch = 0) + List<String> names = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 0); + assertEquals(Arrays.asList("John Smith", "Jane Doe", "Bob Johnson"), names); + + // Extract second group (ages)
with max 2 matches + List<String> ages = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 2, 2); + assertEquals(Arrays.asList("30", "25"), ages); + } + + @Test + public void testExtractMultipleGroups_SingleMatch() { + String text = "Error: File not found at line 42"; + String pattern = "Error: (.+) at line (\\d+)"; + + List<String> errorMsg = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 1); + assertEquals(Arrays.asList("File not found"), errorMsg); + + List<String> lineNum = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 2, 1); + assertEquals(Arrays.asList("42"), lineNum); + } + + @Test + public void testExtractMultipleGroups_NoMatches() { + String text = "This text has no matches"; + String pattern = "(\\d+)"; + + List<String> result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 5); + assertNull(result); + } + + @Test + public void testExtractMultipleGroups_InvalidGroupIndex() { + String text = "abc123def456"; + String pattern = "(\\w+)(\\d+)"; + + // Group index 3 doesn't exist (only groups 1 and 2) + List<String> result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 3, 5); + assertNull(result); + + // Group index 0 is not valid + result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 0, 5); + assertNull(result); + } + + @Test + public void testExtractMultipleGroups_InvalidPattern() { + String text = "test string"; + String invalidPattern = "[unclosed"; // Invalid regex pattern + + IllegalArgumentException exception = + assertThrows( + IllegalArgumentException.class, + () -> { + RexExtractMultiFunction.extractMultipleGroups(text, invalidPattern, 1, 5); + }); + + String expectedMessage = + "Error in 'rex' command: Encountered the following error while compiling the regex" + + " '[unclosed':"; + assertTrue(exception.getMessage().startsWith(expectedMessage)); + } + + @Test + public void testExtractMultipleGroups_EmptyString() { + String text = ""; + String pattern = "(\\w+)"; + + List<String> result =
RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 5); + assertNull(result); + } + + @Test + public void testExtractMultipleGroups_MaxMatchZero() { + String text = "a1 b2 c3 d4 e5"; + String pattern = "([a-z])(\\d)"; + + // maxMatch = 0 should extract all matches + List<String> letters = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 0); + assertEquals(Arrays.asList("a", "b", "c", "d", "e"), letters); + + List<String> numbers = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 2, 0); + assertEquals(Arrays.asList("1", "2", "3", "4", "5"), numbers); + } + + @Test + public void testExtractMultipleGroups_MaxMatchOne() { + String text = "test1 test2 test3"; + String pattern = "(test\\d)"; + + List<String> result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 1); + assertEquals(Arrays.asList("test1"), result); + } + + @Test + public void testExtractMultipleGroups_ComplexPattern() { + String text = "IP: 192.168.1.1:8080, IP: 10.0.0.1:9090, IP: 172.16.0.1:3000"; + String pattern = "IP: (\\d+\\.\\d+\\.\\d+\\.\\d+):(\\d+)"; + + // Extract IP addresses + List<String> ips = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 0); + assertEquals(Arrays.asList("192.168.1.1", "10.0.0.1", "172.16.0.1"), ips); + + // Extract ports with max 2 matches + List<String> ports = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 2, 2); + assertEquals(Arrays.asList("8080", "9090"), ports); + } + + @Test + public void testExtractMultipleGroups_NullMatchGroups() { + String text = "word1 word2 word3"; + String pattern = "(\\w+)"; + + // Extract words with max 3 matches + List<String> result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 3); + assertEquals(Arrays.asList("word1", "word2", "word3"), result); + } + + @Test + public void testReturnTypeInference() { + assertNotNull(function.getReturnTypeInference(), "Return type inference should not be null"); + } + + @Test + public void testNullReturnTypeCompatibility() { + List<String> result1 = +
RexExtractMultiFunction.extractMultipleGroups("no match", "(\\d+)", 1, 5); + assertNull(result1, "Should return null when no matches found"); + + List<String> result2 = RexExtractMultiFunction.extractMultipleGroups("abc", "(\\w+)", 2, 5); + assertNull(result2, "Should return null for invalid group index"); + + List<String> result3 = RexExtractMultiFunction.extractMultipleGroups("", "(\\w+)", 1, 5); + assertNull(result3, "Should return null for empty string with no matches"); + } + + @Test + public void testOperandMetadata() { + assertNotNull(function.getOperandMetadata(), "Operand metadata should not be null"); + } + + @Test + public void testFunctionConstructor() { + RexExtractMultiFunction testFunction = new RexExtractMultiFunction(); + assertNotNull(testFunction, "Function should be properly initialized"); + } + + @Test + public void testExtractMultipleGroups_EdgeCaseEmptyMatches() { + String text = "abc def ghi"; + String pattern = "(\\d*)"; + + List<String> result = RexExtractMultiFunction.extractMultipleGroups(text, pattern, 1, 3); + + assertTrue( + result == null || result.isEmpty() || result.size() <= 3, + "Should handle empty matches gracefully"); + } +} diff --git a/docs/category.json b/docs/category.json index b09c67e5e0f..ef3594796b7 100644 --- a/docs/category.json +++ b/docs/category.json @@ -59,6 +59,7 @@ "user/ppl/cmd/append.rst", "user/ppl/cmd/fields.rst", "user/ppl/cmd/regex.rst", + "user/ppl/cmd/rex.rst", "user/ppl/cmd/stats.rst", "user/ppl/cmd/timechart.rst" ] diff --git a/docs/user/ppl/cmd/rex.rst b/docs/user/ppl/cmd/rex.rst new file mode 100644 index 00000000000..c816ed0fbf9 --- /dev/null +++ b/docs/user/ppl/cmd/rex.rst @@ -0,0 +1,195 @@ +============= +rex +============= + +.. rubric:: Table of contents + +.. contents:: + :local: + :depth: 2 + + +Description +============ +| The ``rex`` command extracts fields from a raw text field using regular expression named capture groups.
+ +Version +======= +3.3.0 + +Syntax +============ +rex field=<field> <pattern> [max_match=<int>] + +* field: mandatory. The field must be a string field to extract data from. +* pattern: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using ``(?<name>pattern)`` syntax. +* max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable via ``plugins.ppl.rex.max_match.limit``). + +Example 1: Basic Field Extraction +================================== + +Extract username and domain from email addresses using named capture groups. Both extracted fields are returned as string type. + +PPL query:: + + os> source=accounts | rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" | fields email, username, domain | head 2 ; + fetched rows / total rows = 2/2 + +-----------------------+------------+--------+ + | email | username | domain | + |-----------------------+------------+--------| + | amberduke@pyrami.com | amberduke | pyrami | + | hattiebond@netagy.com | hattiebond | netagy | + +-----------------------+------------+--------+ + + +Example 2: Handling Non-matching Patterns +========================================== + +The rex command returns all events, setting extracted fields to null for non-matching patterns. Extracted fields are of string type when a match is found.
+ +PPL query:: + + os> source=accounts | rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)" | fields email, user, domain | head 2 ; + fetched rows / total rows = 2/2 + +-----------------------+------+--------+ + | email | user | domain | + |-----------------------+------+--------| + | amberduke@pyrami.com | null | null | + | hattiebond@netagy.com | null | null | + +-----------------------+------+--------+ + + +Example 3: Multiple Matches with max_match +=========================================== + +Extract multiple words from address field using max_match parameter. The extracted field is returned as an array type containing string elements. + +PPL query:: + + os> source=accounts | rex field=address "(?<words>[A-Za-z]+)" max_match=2 | fields address, words | head 3 ; + fetched rows / total rows = 3/3 + +--------------------+------------------+ + | address | words | + |--------------------+------------------| + | 880 Holmes Lane | [Holmes,Lane] | + | 671 Bristol Street | [Bristol,Street] | + | 789 Madison Street | [Madison,Street] | + +--------------------+------------------+ + + +Example 4: Complex Email Pattern +================================= + +Extract comprehensive email components including top-level domain. All extracted fields are returned as string type. + +PPL query:: + + os> source=accounts | rex field=email "(?<user>[a-zA-Z0-9._%+-]+)@(?<domain>[a-zA-Z0-9.-]+)\\.(?<tld>[a-zA-Z]{2,})" | fields email, user, domain, tld | head 2 ; + fetched rows / total rows = 2/2 + +-----------------------+------------+--------+-----+ + | email | user | domain | tld | + |-----------------------+------------+--------+-----| + | amberduke@pyrami.com | amberduke | pyrami | com | + | hattiebond@netagy.com | hattiebond | netagy | com | + +-----------------------+------------+--------+-----+ + + +Example 5: Chaining Multiple rex Commands +========================================== + +Extract initial letters from both first and last names. All extracted fields are returned as string type.
+ +PPL query:: + + os> source=accounts | rex field=firstname "(?<firstinitial>^.)" | rex field=lastname "(?<lastinitial>^.)" | fields firstname, lastname, firstinitial, lastinitial | head 3 ; + fetched rows / total rows = 3/3 + +-----------+----------+--------------+-------------+ + | firstname | lastname | firstinitial | lastinitial | + |-----------+----------+--------------+-------------| + | Amber | Duke | A | D | + | Hattie | Bond | H | B | + | Nanette | Bates | N | B | + +-----------+----------+--------------+-------------+ + + +Example 6: Named Capture Group Limitations +============================================ + +Demonstrates naming restrictions for capture groups. Group names cannot contain underscores due to Java regex limitations. + +Invalid PPL query with underscores:: + + os> source=accounts | rex field=email "(?<user_name>[^@]+)@(?<email_domain>[^.]+)" | fields email, user_name, email_domain ; + {'reason': 'Invalid Query', 'details': 'Rex pattern must contain at least one named capture group', 'type': 'IllegalArgumentException'} + Error: Query returned no data + +Correct PPL query without underscores:: + + os> source=accounts | rex field=email "(?<username>[^@]+)@(?<emaildomain>[^.]+)" | fields email, username, emaildomain | head 2 ; + fetched rows / total rows = 2/2 + +-----------------------+------------+-------------+ + | email | username | emaildomain | + |-----------------------+------------+-------------| + | amberduke@pyrami.com | amberduke | pyrami | + | hattiebond@netagy.com | hattiebond | netagy | + +-----------------------+------------+-------------+ + + +Example 7: Max Match Limit Protection +====================================== + +Demonstrates the max_match limit protection mechanism. When max_match=0 (unlimited) is specified, the system automatically caps it to prevent memory exhaustion.
+ +PPL query with max_match=0 automatically capped to default limit of 10:: + + os> source=accounts | rex field=address "(?<digit>\\d*)" max_match=0 | eval digit_count=array_length(digit) | fields address, digit_count | head 1 ; + fetched rows / total rows = 1/1 + +-----------------+-------------+ + | address | digit_count | + |-----------------+-------------| + | 880 Holmes Lane | 10 | + +-----------------+-------------+ + +PPL query exceeding the configured limit results in an error:: + + os> source=accounts | rex field=address "(?<digit>\\d*)" max_match=100 | fields address, digit | head 1 ; + {'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'} + Error: Query returned no data + + +Comparison with Related Commands +================================ + +============================= ============ ============ +Feature rex parse +============================= ============ ============ +Pattern Type Java Regex Java Regex +Named Groups Required Yes Yes +Filtering by Match No Yes +Multiple Matches Yes No +Underscores in Group Names No No +============================= ============ ============ + + +Limitations +=========== + +There are several important limitations with the rex command: + +**Named Capture Group Naming:** + +- Named capture groups cannot contain underscores due to Java regex limitations +- Group names must start with a letter and contain only letters and digits +- For detailed Java regex pattern syntax and usage, refer to the `official Java Pattern documentation <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_ + +**Pattern Requirements:** + +- Pattern must contain at least one named capture group +- Regular capture groups ``(...)`` without names are not allowed + +**Max Match Limit:** + +- The ``max_match`` parameter is subject to a configurable system limit to prevent memory exhaustion +- When ``max_match=0`` (unlimited) is
specified, it is automatically capped at the configured limit (default: 10) +- User-specified values exceeding the configured limit will result in an error +- Users can adjust the limit via the ``plugins.ppl.rex.max_match.limit`` cluster setting. Setting this limit to a large value is not recommended as it can lead to excessive memory consumption, especially with patterns that match empty strings (e.g., ``\d*``, ``\w*``) \ No newline at end of file diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst index 307ecc0abba..09fad5c853d 100644 --- a/docs/user/ppl/index.rst +++ b/docs/user/ppl/index.rst @@ -100,6 +100,8 @@ The query start with search command and then flowing a set of command delimited - `regex command <cmd/regex.rst>`_ + - `rex command <cmd/rex.rst>`_ + - `search command <cmd/search.rst>`_ - `show datasources command <cmd/showdatasources.rst>`_ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index f13dea97f4d..ac6e1d9bd6c 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -86,6 +86,7 @@ CalciteQueryAnalysisIT.class, CalciteRareCommandIT.class, CalciteRegexCommandIT.class, + CalciteRexCommandIT.class, CalciteRenameCommandIT.class, CalciteResourceMonitorIT.class, CalciteSearchCommandIT.class, diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index b0e0f3d89fd..78eda7294a4 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -430,6 +430,16 @@ public void testSimpleSortExpressionPushDownWithOnlyExprProjected() throws Excep assertJsonEqualsIgnoreId(expected, result); } + @Test + public void testRexExplain() throws
IOException { + String query = + "source=opensearch-sql_test_index_account | rex field=lastname \\\"(?<initial>^[A-Z])\\\" |" + + " head 5"; + var result = explainQueryToString(query); + String expected = loadExpectedPlan("explain_rex.json"); + assertJsonEqualsIgnoreId(expected, result); + } + @Test public void testExplainAppendCommand() throws IOException { String expected = loadExpectedPlan("explain_append_command.json"); diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java new file mode 100644 index 00000000000..c8facd98cc6 --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java @@ -0,0 +1,245 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_ACCOUNT; + +import java.io.IOException; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.common.setting.Settings; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +public class CalciteRexCommandIT extends PPLIntegTestCase { + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.ACCOUNT); + } + + @Test + public void testRexBasicFieldExtraction() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=email \\\"(?<user>[^@]+)@(?<domain>.+)\\\" | fields email," + + " user, domain", + TEST_INDEX_ACCOUNT)); + + assertEquals(1000, result.getJSONArray("datarows").length()); + assertEquals("amberduke@pyrami.com", result.getJSONArray("datarows").getJSONArray(0).get(0)); + assertEquals("amberduke", result.getJSONArray("datarows").getJSONArray(0).get(1)); + assertEquals("pyrami.com", result.getJSONArray("datarows").getJSONArray(0).get(2)); + } + + @Test + public void
testRexErrorNoNamedGroups() throws IOException { + try { + executeQuery( + String.format( + "source=%s | rex field=email \\\"([^@]+)@(.+)\\\" | fields email", + TEST_INDEX_ACCOUNT)); + fail("Should have thrown an exception for pattern without named capture groups"); + } catch (Exception e) { + assertTrue( + e.getMessage().contains("Rex pattern must contain at least one named capture group")); + } + } + + @Test + public void testRexWithFiltering() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=address" + + " \\\"(?<streetnum>\\\\\\\\d+)\\\\\\\\s+(?<streetname>.+)\\\" | fields" + + " address, streetnum, streetname", + TEST_INDEX_ACCOUNT)); + + assertEquals(1000, result.getJSONArray("datarows").length()); + assertEquals("880 Holmes Lane", result.getJSONArray("datarows").getJSONArray(0).get(0)); + assertEquals("880", result.getJSONArray("datarows").getJSONArray(0).get(1)); + assertEquals("Holmes Lane", result.getJSONArray("datarows").getJSONArray(0).get(2)); + } + + @Test + public void testRexMultipleMatches() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<words>[A-Za-z]+)\\\" max_match=3 | fields" + + " address, words", + TEST_INDEX_ACCOUNT)); + + assertEquals(1000, result.getJSONArray("datarows").length()); + String wordsArray = result.getJSONArray("datarows").getJSONArray(0).get(1).toString(); + assertTrue(wordsArray.contains("Holmes") && wordsArray.contains("Lane")); + assertTrue(wordsArray.startsWith("[") && wordsArray.endsWith("]")); + } + + @Test + public void testRexChainedCommands() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=firstname \\\"(?<firstinitial>^.)\\\" | rex field=lastname" + + " \\\"(?<lastinitial>^.)\\\" | fields firstname, lastname, firstinitial," + + " lastinitial", + TEST_INDEX_ACCOUNT)); + + assertEquals(1000, result.getJSONArray("datarows").length()); + assertEquals("Amber",
result.getJSONArray("datarows").getJSONArray(0).get(0)); + assertEquals("Duke", result.getJSONArray("datarows").getJSONArray(0).get(1)); + assertEquals("A", result.getJSONArray("datarows").getJSONArray(0).get(2)); + assertEquals("D", result.getJSONArray("datarows").getJSONArray(0).get(3)); + } + + @Test + public void testRexComplexPattern() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=email" + + " \\\"(?<user>[a-zA-Z0-9._%%+-]+)@(?<domain>[a-zA-Z0-9.-]+)\\\\\\\\.(?<tld>[a-zA-Z]{2,})\\\"" + + " | fields email, user, domain, tld", + TEST_INDEX_ACCOUNT)); + + assertEquals(1000, result.getJSONArray("datarows").length()); + assertEquals("amberduke@pyrami.com", result.getJSONArray("datarows").getJSONArray(0).get(0)); + assertEquals("amberduke", result.getJSONArray("datarows").getJSONArray(0).get(1)); + assertEquals("pyrami", result.getJSONArray("datarows").getJSONArray(0).get(2)); + assertEquals("com", result.getJSONArray("datarows").getJSONArray(0).get(3)); + } + + @Test + public void testRexWithWhere() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | where state=\\\"CA\\\" | rex field=email" + + " \\\"(?<user>[^@]+)@(?<domain>.+)\\\" | fields email, user, domain", + TEST_INDEX_ACCOUNT)); + + assertTrue(result.getJSONArray("datarows").length() > 0); + String email = result.getJSONArray("datarows").getJSONArray(0).get(0).toString(); + String user = result.getJSONArray("datarows").getJSONArray(0).get(1).toString(); + String domain = result.getJSONArray("datarows").getJSONArray(0).get(2).toString(); + assertTrue(email.startsWith(user)); + assertTrue(email.endsWith(domain)); + } + + @Test + public void testRexWithStatsCommand() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=email \\\"[^@]+@(?<domain>[^.]+)\\\" | stats count() by" + + " domain", + TEST_INDEX_ACCOUNT)); + + assertTrue(result.getJSONArray("datarows").length() > 0); + int count =
Integer.parseInt(result.getJSONArray("datarows").getJSONArray(0).get(0).toString()); + String domain = result.getJSONArray("datarows").getJSONArray(0).get(1).toString(); + assertTrue(count > 0); + assertFalse(domain.contains("@")); + assertTrue(domain.matches("[a-z]+")); + } + + @Test + public void testRexMaxMatchZeroLimitedToDefaultTen() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=0 | eval" + + " digit_count=array_length(digit) | fields address, digit_count | head 1", + TEST_INDEX_ACCOUNT)); + + assertEquals(1, result.getJSONArray("datarows").length()); + // Should be capped at 10 matches + assertEquals(10, result.getJSONArray("datarows").getJSONArray(0).get(1)); + } + + @Test + public void testRexMaxMatchExceedsDefaultLimit() throws IOException { + try { + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<digit>\\\\\\\\d+)\\\" max_match=100 | fields" + + " address, digit", + TEST_INDEX_ACCOUNT)); + fail("Should have thrown an exception for max_match exceeding default limit"); + } catch (Exception e) { + assertTrue(e.getMessage().contains("exceeds the configured limit (10)")); + assertTrue(e.getMessage().contains("Consider using a smaller max_match value")); + } + } + + @Test + public void testRexMaxMatchWithinDefaultLimit() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=5 | eval" + + " digit_count=array_length(digit) | fields address, digit_count | head 1", + TEST_INDEX_ACCOUNT)); + + assertEquals(1, result.getJSONArray("datarows").length()); + // Should respect the specified limit of 5 + assertEquals(5, result.getJSONArray("datarows").getJSONArray(0).get(1)); + } + + @Test + public void testRexMaxMatchAtDefaultLimit() throws IOException { + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=10 |
eval" + + " digit_count=array_length(digit) | fields address, digit_count | head 1", + TEST_INDEX_ACCOUNT)); + + assertEquals(1, result.getJSONArray("datarows").length()); + // Should accept exactly the limit + assertEquals(10, result.getJSONArray("datarows").getJSONArray(0).get(1)); + } + + @Test + public void testRexMaxMatchConfigurableLimit() throws IOException { + // Set a custom limit of 5 + updateClusterSettings( + new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), "5")); + + try { + // Test that max_match=0 is capped to the new limit + JSONObject result = + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<digit>\\\\\\\\d*)\\\" max_match=0 | eval" + + " digit_count=array_length(digit) | fields address, digit_count | head 1", + TEST_INDEX_ACCOUNT)); + + assertEquals(1, result.getJSONArray("datarows").length()); + // Should be capped at the configured limit of 5 + assertEquals(5, result.getJSONArray("datarows").getJSONArray(0).get(1)); + + // Test that exceeding the custom limit throws an error + try { + executeQuery( + String.format( + "source=%s | rex field=address \\\"(?<digit>\\\\\\\\d+)\\\" max_match=10 | fields" + + " address, digit", + TEST_INDEX_ACCOUNT)); + fail("Should have thrown an exception for max_match exceeding custom limit"); + } catch (Exception e) { + assertTrue(e.getMessage().contains("exceeds the configured limit (5)")); + assertTrue(e.getMessage().contains("adjust the plugins.ppl.rex.max_match.limit setting")); + } + } finally { + updateClusterSettings( + new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), null)); + } + } +} diff --git a/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java b/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java index 8fcd756edac..281455096d8 100644 --- a/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java +++
b/integ-test/src/test/java/org/opensearch/sql/security/CalciteCrossClusterSearchIT.java @@ -266,4 +266,38 @@ public void testCrossClusterRegexWithNegation() throws IOException { rows("Amber JOHnny"), rows("Nanette")); } + + @Test + public void testCrossClusterRexBasic() throws IOException { + JSONObject result = + executeQuery( + String.format( + "search source=%s | rex field=firstname \\\"(?<initial>^[A-Z])\\\" | fields" + + " firstname, initial | head 3", + TEST_INDEX_BANK_REMOTE)); + verifyDataRows(result, rows("Amber JOHnny", "A"), rows("Hattie", "H"), rows("Nanette", "N")); + } + + @Test + public void testCrossClusterRexMultipleGroups() throws IOException { + JSONObject result = + executeQuery( + String.format( + "search source=%s | rex field=lastname \\\"(?<first>[A-Z])(?<rest>[a-z]+)\\\" |" + + " fields lastname, first, rest | head 2", + TEST_INDEX_BANK_REMOTE)); + verifyDataRows(result, rows("Duke Willmington", "D", "uke"), rows("Bond", "B", "ond")); + } + + @Test + public void testCrossClusterRexWithMaxMatch() throws IOException { + JSONObject result = + executeQuery( + String.format( + "search source=%s | rex field=firstname \\\"(?<letter>[A-Z])\\\" max_match=2 |" + + " fields firstname, letter | head 2", + TEST_INDEX_BANK_REMOTE)); + verifyDataRows( + result, rows("Amber JOHnny", new String[] {"A", "J"}), rows("Hattie", new String[] {"H"})); + }
} diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_rex.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_rex.json new file mode 100644 index 00000000000..250524b8457 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_rex.json @@ -0,0 +1,6 @@ +{ + "calcite": { + "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])\n LogicalSort(fetch=[5])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", + "physical": "EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=[1], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[LIMIT->5, LIMIT->10000, PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":5,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"],\"excludes\":[]}}, requestedTotalSize=5, pageSize=null, startFrom=0)])\n" + } +} diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_rex.json
b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_rex.json new file mode 100644 index 00000000000..a64e2edeea2 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_rex.json @@ -0,0 +1,6 @@ +{ + "calcite": { + "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])\n LogicalSort(fetch=[5])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", + "physical": "EnumerableLimit(fetch=[10000])\n EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=[1], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])\n EnumerableLimit(fetch=[5])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n" + } +} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java index e12ff9f9ab8..5f5240e67a2 100644 --- a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java @@ -99,6 +99,14 @@ public class OpenSearchSettings extends Settings { Setting.Property.NodeScope, Setting.Property.Dynamic); + public static final Setting<Integer> PPL_REX_MAX_MATCH_LIMIT_SETTING = + Setting.intSetting( + Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), + 10, + 1, + Setting.Property.NodeScope, +
Setting.Property.Dynamic); + public static final Setting<Boolean> CALCITE_ENGINE_ENABLED_SETTING = Setting.boolSetting( Key.CALCITE_ENGINE_ENABLED.getKeyValue(), @@ -327,6 +335,12 @@ public OpenSearchSettings(ClusterSettings clusterSettings) { Key.PATTERN_BUFFER_LIMIT, DEFAULT_PATTERN_BUFFER_LIMIT_SETTING, new Updater(Key.PATTERN_BUFFER_LIMIT)); + register( + settingBuilder, + clusterSettings, + Key.PPL_REX_MAX_MATCH_LIMIT, + PPL_REX_MAX_MATCH_LIMIT_SETTING, + new Updater(Key.PPL_REX_MAX_MATCH_LIMIT)); register( settingBuilder, clusterSettings, @@ -531,6 +545,7 @@ public static List<Setting<?>> pluginSettings() { .add(DEFAULT_PATTERN_MODE_SETTING) .add(DEFAULT_PATTERN_MAX_SAMPLE_COUNT_SETTING) .add(DEFAULT_PATTERN_BUFFER_LIMIT_SETTING) + .add(PPL_REX_MAX_MATCH_LIMIT_SETTING) .add(QUERY_MEMORY_LIMIT_SETTING) .add(QUERY_SIZE_LIMIT_SETTING) .add(METRICS_ROLLING_WINDOW_SETTING) diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 939dde65fd1..b8f12d3c491 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -17,6 +17,7 @@ EXPLAIN: 'EXPLAIN'; FROM: 'FROM'; WHERE: 'WHERE'; FIELDS: 'FIELDS'; +FIELD: 'FIELD'; TABLE: 'TABLE'; // Alias for FIELDS command RENAME: 'RENAME'; STATS: 'STATS'; @@ -31,6 +32,7 @@ RARE: 'RARE'; PARSE: 'PARSE'; SPATH: 'SPATH'; REGEX: 'REGEX'; +REX: 'REX'; PUNCT: 'PUNCT'; GROK: 'GROK'; PATTERN: 'PATTERN'; @@ -51,6 +53,7 @@ VARIABLE_COUNT_THRESHOLD: 'VARIABLE_COUNT_THRESHOLD'; FREQUENCY_THRESHOLD_PERCENTAGE: 'FREQUENCY_THRESHOLD_PERCENTAGE'; METHOD: 'METHOD'; MAX_SAMPLE_COUNT: 'MAX_SAMPLE_COUNT'; +MAX_MATCH: 'MAX_MATCH'; BUFFER_LIMIT: 'BUFFER_LIMIT'; LABEL: 'LABEL'; AGGREGATION: 'AGGREGATION'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 6e8b7f374a5..2a7677cd5e3 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -76,6 +76,7 @@ commands | reverseCommand |
regexCommand
    | timechartCommand
+   | rexCommand
    ;
 
 commandName
@@ -112,6 +113,7 @@ commandName
    | REVERSE
    | REGEX
    | APPEND
+   | REX
    ;
 
 searchCommand
@@ -274,6 +276,20 @@ regexExpr
    : field=qualifiedName operator=(EQUAL | NOT_EQUAL) pattern=stringLiteral
    ;
 
+// rex command: REX followed by the source field, a pattern and optional settings.
+rexCommand
+   : REX rexExpr
+   ;
+
+// Options may be given before and/or after the pattern.
+rexExpr
+   : FIELD EQUAL field=qualifiedName (rexOption)* pattern=stringLiteral (rexOption)*
+   ;
+
+rexOption
+   : MAX_MATCH EQUAL maxMatch=integerLiteral
+   | MODE EQUAL EXTRACT
+   ;
 patternsMethod
    : PUNCT
    | REGEX
diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
index b4da891f754..c006afe2a20 100644
--- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
+++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
@@ -92,6 +92,7 @@
 import org.opensearch.sql.ast.tree.Relation;
 import org.opensearch.sql.ast.tree.Rename;
 import org.opensearch.sql.ast.tree.Reverse;
+import org.opensearch.sql.ast.tree.Rex;
 import org.opensearch.sql.ast.tree.SPath;
 import org.opensearch.sql.ast.tree.Sort;
 import org.opensearch.sql.ast.tree.SpanBin;
@@ -983,6 +984,64 @@ public UnresolvedPlan visitAppendCommand(OpenSearchPPLParser.AppendCommandContex
     return new Append(subsearch);
   }
 
+  /**
+   * Builds a {@link Rex} node from a {@code rex} command.
+   *
+   * <p>{@code max_match} defaults to 1 when omitted. A value of 0 is replaced by the limit read
+   * from {@link Settings.Key#PPL_REX_MAX_MATCH_LIMIT} (10 when no settings are available); a value
+   * above that limit is rejected.
+   *
+   * @param ctx parse tree context of the rex command
+   * @return a {@link Rex} node carrying the field, pattern, mode and effective max_match
+   * @throws IllegalArgumentException if max_match is negative or exceeds the limit
+   */
+  @Override
+  public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx) {
+    UnresolvedExpression field = internalVisitExpression(ctx.rexExpr().field);
+    Literal pattern = (Literal) internalVisitExpression(ctx.rexExpr().pattern);
+    Rex.RexMode mode = Rex.RexMode.EXTRACT;
+    Optional<Integer> maxMatch = Optional.empty();
+
+    // When an option is repeated, the last occurrence wins.
+    for (OpenSearchPPLParser.RexOptionContext optionCtx : ctx.rexExpr().rexOption()) {
+      if (optionCtx.maxMatch != null) {
+        maxMatch = Optional.of(Integer.parseInt(optionCtx.maxMatch.getText()));
+      }
+      if (optionCtx.EXTRACT() != null) {
+        mode = Rex.RexMode.EXTRACT;
+      }
+    }
+
+    int maxMatchLimit =
+        (settings != null) ? settings.getSettingValue(Settings.Key.PPL_REX_MAX_MATCH_LIMIT) : 10;
+
+    int userMaxMatch = maxMatch.orElse(1);
+    int effectiveMaxMatch;
+
+    if (userMaxMatch < 0) {
+      // Only 0 (meaning "use the limit") and positive counts are meaningful.
+      throw new IllegalArgumentException(
+          String.format(
+              "Rex command max_match value (%d) must not be negative.", userMaxMatch));
+    } else if (userMaxMatch == 0) {
+      effectiveMaxMatch = maxMatchLimit;
+    } else if (userMaxMatch > maxMatchLimit) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Rex command max_match value (%d) exceeds the configured limit (%d). "
+                  + "Consider using a smaller max_match value"
+                  + (settings != null
+                      ? " or adjust the plugins.ppl.rex.max_match.limit setting."
+                      : "."),
+              userMaxMatch,
+              maxMatchLimit));
+    } else {
+      effectiveMaxMatch = userMaxMatch;
+    }
+
+    return new Rex(field, pattern, mode, Optional.of(effectiveMaxMatch));
+  }
+
   /** Get original text in query. */
   private String getTextInQuery(ParserRuleContext ctx) {
     Token start = ctx.getStart();
diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java
index d71666d3424..3c5001f39df 100644
--- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java
+++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java
@@ -78,6 +78,7 @@
 import org.opensearch.sql.ast.tree.Relation;
 import org.opensearch.sql.ast.tree.Rename;
 import org.opensearch.sql.ast.tree.Reverse;
+import org.opensearch.sql.ast.tree.Rex;
 import org.opensearch.sql.ast.tree.Sort;
 import org.opensearch.sql.ast.tree.SpanBin;
 import org.opensearch.sql.ast.tree.SubqueryAlias;
@@ -455,6 +456,29 @@ public String visitTimechart(Timechart node, String context) {
     return StringUtils.format("%s%s", child, timechartCommand.toString());
   }
 
+  /**
+   * Renders a rex command after its child plan: the field, the lower-cased mode and the quoted
+   * {@code toString()} of the pattern literal, followed by {@code max_match} when one is present.
+   */
+  @Override
+  public String visitRex(Rex node, String context) {
+    String child = node.getChild().get(0).accept(this, context);
+    String field = visitExpression(node.getField());
+    String pattern = "\"" + node.getPattern().toString() + "\"";
+    StringBuilder command = new StringBuilder();
+
+    command.append(
+        String.format(
+            "%s | rex field=%s mode=%s %s",
+            child, field, node.getMode().toString().toLowerCase(), pattern));
+
+    if (node.getMaxMatch().isPresent()) { // max_match is appended only when set
+      command.append(" max_match=").append(node.getMaxMatch().get());
+    }
+
+    return command.toString();
+  }
+
   @Override
   public String visitParse(Parse node, String context) {
     String child = node.getChild().get(0).accept(this, context);
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java
new file mode 100644
index 00000000000..10072319ae2
--- /dev/null
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLRexTest.java
@@ -0,0 +1,230 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.ppl.calcite;
+
+import static org.mockito.Mockito.doReturn;
+
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.test.CalciteAssert;
+import org.junit.Before;
+import org.junit.Test;
+import org.opensearch.sql.common.setting.Settings;
+
+/** Checks the logical plan and Spark SQL generated for PPL {@code rex} queries. */
+public class CalcitePPLRexTest extends CalcitePPLAbstractTest {
+  public CalcitePPLRexTest() {
+    super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL);
+  }
+
+  @Before
+  public void setUp() {
+    doReturn(10).when(settings).getSettingValue(Settings.Key.PPL_REX_MAX_MATCH_LIMIT);
+  }
+
+  @Test
+  public void testRexBasicFieldExtraction() {
+    String ppl = "source=EMP | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexMultipleNamedGroups() {
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<first>[A-Z])(?<rest>.*)' | fields ENAME, first, rest";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 1)],"
+            + " rest=[REX_EXTRACT($1, '(?<first>[A-Z])(?<rest>.*)', 2)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 1) `first`,"
+            + " `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z])(?<rest>.*)', 2) `rest`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexWithMaxMatch() {
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=3 | fields ENAME, letter";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 3)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 3) `letter`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexChainedCommands() {
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<firstinitial>^.)' | rex field=JOB '(?<jobtype>\\w+)' |"
+            + " fields ENAME, JOB, firstinitial, jobtype";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], JOB=[$2], firstinitial=[REX_EXTRACT($1,"
+            + " '(?<firstinitial>^.)', 1)], jobtype=[REX_EXTRACT($2, '(?<jobtype>\\w+)', 1)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `JOB`, `REX_EXTRACT`(`ENAME`, '(?<firstinitial>^.)', 1) `firstinitial`,"
+            + " `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1) `jobtype`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexWithWhereClause() {
+    String ppl =
+        "source=EMP | where SAL > 1000 | rex field=ENAME '(?<first>[A-Z]).*' | fields ENAME, first,"
+            + " SAL";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], first=[REX_EXTRACT($1, '(?<first>[A-Z]).*', 1)], SAL=[$5])\n"
+            + "  LogicalFilter(condition=[>($5, 1000)])\n"
+            + "    LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<first>[A-Z]).*', 1) `first`, `SAL`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "WHERE `SAL` > 1000";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexWithAggregation() {
+    String ppl = "source=EMP | rex field=JOB '(?<jobtype>\\w+)' | stats count() by jobtype";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(count()=[$1], jobtype=[$0])\n"
+            + "  LogicalAggregate(group=[{0}], count()=[COUNT()])\n"
+            + "    LogicalProject(jobtype=[REX_EXTRACT($2, '(?<jobtype>\\w+)', 1)])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT COUNT(*) `count()`, `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1) `jobtype`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "GROUP BY `REX_EXTRACT`(`JOB`, '(?<jobtype>\\w+)', 1)";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexComplexPattern() {
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)' | fields ENAME,"
+            + " prefix, suffix";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], prefix=[REX_EXTRACT($1, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)',"
+            + " 1)], suffix=[REX_EXTRACT($1, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 2)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 1)"
+            + " `prefix`, `REX_EXTRACT`(`ENAME`, '(?<prefix>[A-Z]{2})(?<suffix>[A-Z]+)', 2)"
+            + " `suffix`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexWithSort() {
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<firstletter>^.)' | fields ENAME, firstletter | sort"
+            + " firstletter | head 5";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first], fetch=[5])\n"
+            + "  LogicalProject(ENAME=[$1], firstletter=[REX_EXTRACT($1, '(?<firstletter>^.)',"
+            + " 1)])\n"
+            + "    LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT`(`ENAME`, '(?<firstletter>^.)', 1) `firstletter`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "ORDER BY 2\n"
+            + "LIMIT 5";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexWithMaxMatchZero() {
+    // Test that max_match=0 (unlimited) is capped to the configured limit
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=0 | fields ENAME, letter";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 10)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 10) `letter`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testRexWithMaxMatchExceedsLimit() {
+    // Test that max_match exceeding the configured limit throws an exception
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=100 | fields ENAME, letter";
+    getRelNode(ppl);
+  }
+
+  @Test
+  public void testRexWithMaxMatchWithinLimit() {
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=5 | fields ENAME, letter";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 5)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 5) `letter`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+
+  @Test
+  public void testRexWithMaxMatchAtLimit() {
+    // Test that max_match exactly at the limit works
+    String ppl =
+        "source=EMP | rex field=ENAME '(?<letter>[A-Z])' max_match=10 | fields ENAME, letter";
+    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(ENAME=[$1], letter=[REX_EXTRACT_MULTI($1, '(?<letter>[A-Z])', 1, 10)])\n"
+            + "  LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT `ENAME`, `REX_EXTRACT_MULTI`(`ENAME`, '(?<letter>[A-Z])', 1, 10) `letter`\n"
+            + "FROM `scott`.`EMP`";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
+  }
+}
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java
index 9b661ce3088..3085a4f8185 100644
--- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java
@@ -1077,6 +1077,6 @@ private String mappingTable(String indexName) {
   @Test(expected = IllegalArgumentException.class)
   public void testBinCommandDuplicateParameter() {
     // Test that duplicate parameters throw an exception
-    plan("search source=test | bin field span=10 span=20");
+    plan("search source=test | bin index_field span=10 span=20");
   }
 }
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
index 0fe0926b462..af25ea9b7be 100644
--- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
@@ -544,13 +544,32 @@ public void testPatterns() {
 
   @Test
   public void testRegex() {
-    assertEquals("source=t | regex field=***", anonymize("source=t | regex field='pattern'"));
-    assertEquals("source=t | regex field!=***", anonymize("source=t
| regex field!='pattern'"));
+    assertEquals(
+        "source=t | regex fieldname=***", anonymize("source=t | regex fieldname='pattern'"));
+    assertEquals(
+        "source=t | regex fieldname!=***", anonymize("source=t | regex fieldname!='pattern'"));
     assertEquals(
         "source=t | regex email=*** | fields + email",
         anonymize("source=t | regex email='.*@domain.com' | fields email"));
   }
 
+  @Test
+  public void testRexCommand() { // NOTE(review): "(?[" below looks like capture-group names lost in extraction - confirm
+    when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10); // stub the max_match limit setting to 10
+
+    assertEquals( // no max_match in the query: the expected output shows max_match=1
+        "source=t | rex field=message mode=extract \"(?[A-Z]+)\" max_match=1",
+        anonymize("source=t | rex field=message \"(?[A-Z]+)\""));
+    assertEquals( // a following fields command is rendered after the rex command
+        "source=t | rex field=lastname mode=extract \"(?^[A-Z])\" max_match=1 | fields +"
+            + " lastname,initial",
+        anonymize(
+            "source=t | rex field=lastname \"(?^[A-Z])\" | fields lastname, initial"));
+    assertEquals( // an explicit max_match is echoed back unchanged
+        "source=t | rex field=name mode=extract \"(?[A-Z])\" max_match=3",
+        anonymize("source=t | rex field=name \"(?[A-Z])\" max_match=3"));
+  }
+
   private String anonymize(String query) {
     AstBuilder astBuilder = new AstBuilder(query, settings);
     return anonymize(astBuilder.visit(parser.parse(query)));