-
Notifications
You must be signed in to change notification settings - Fork 181
[Feature] Core Implementation of rex Command In PPL
#4109
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7e5bd57
f070540
6362dc6
fe9676e
9eb69f6
8ced572
97536c2
b77aab7
63c01f6
08edb1a
8365237
f9adf39
783bfe2
422f475
bbd4f17
9c7359a
96de984
ce4b9ac
53645a8
28a24c2
de1f8a2
d15bed9
d22f733
5182e99
0ce00b2
68c2482
1c42e8d
d888d00
8162765
fa9af85
a17963c
764f400
a711d72
8c1ec27
691c0fa
0ae84d9
ab90dea
d062781
9c3f72e
47dddae
13132ef
6419982
2f4279a
8133d8c
b8a47f4
e0360a1
a8df4e7
b5e8e53
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| /* | ||
| * Copyright OpenSearch Contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| package org.opensearch.sql.ast.tree; | ||
|
|
||
| import com.google.common.collect.ImmutableList; | ||
| import java.util.List; | ||
| import java.util.Optional; | ||
| import lombok.EqualsAndHashCode; | ||
| import lombok.Getter; | ||
| import lombok.Setter; | ||
| import lombok.ToString; | ||
| import org.opensearch.sql.ast.AbstractNodeVisitor; | ||
| import org.opensearch.sql.ast.expression.Literal; | ||
| import org.opensearch.sql.ast.expression.UnresolvedExpression; | ||
|
|
||
| /** AST node represent Rex field extraction operation. */ | ||
| @Getter | ||
| @ToString | ||
| @EqualsAndHashCode(callSuper = false) | ||
| public class Rex extends UnresolvedPlan { | ||
|
|
||
| public enum RexMode { | ||
| EXTRACT | ||
| } | ||
|
|
||
| /** Field to extract from. */ | ||
| private final UnresolvedExpression field; | ||
|
|
||
| /** Pattern with named capture groups. */ | ||
| private final Literal pattern; | ||
|
|
||
| /** Rex mode (only EXTRACT supported). */ | ||
| private final RexMode mode; | ||
|
|
||
| /** Maximum number of matches (optional). */ | ||
| private final Optional<Integer> maxMatch; | ||
|
|
||
| /** Child Plan. */ | ||
| @Setter private UnresolvedPlan child; | ||
|
|
||
| public Rex(UnresolvedExpression field, Literal pattern) { | ||
| this(field, pattern, RexMode.EXTRACT, Optional.empty()); | ||
| } | ||
|
|
||
| public Rex(UnresolvedExpression field, Literal pattern, Optional<Integer> maxMatch) { | ||
| this(field, pattern, RexMode.EXTRACT, maxMatch); | ||
| } | ||
|
|
||
| public Rex( | ||
| UnresolvedExpression field, Literal pattern, RexMode mode, Optional<Integer> maxMatch) { | ||
| this.field = field; | ||
| this.pattern = pattern; | ||
| this.mode = mode; | ||
| this.maxMatch = maxMatch; | ||
| } | ||
|
|
||
| @Override | ||
| public Rex attach(UnresolvedPlan child) { | ||
| this.child = child; | ||
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public List<UnresolvedPlan> getChild() { | ||
| return ImmutableList.of(child); | ||
| } | ||
|
|
||
| @Override | ||
| public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) { | ||
| return nodeVisitor.visitRex(this, context); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| /* | ||
| * Copyright OpenSearch Contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| package org.opensearch.sql.expression.function.udf; | ||
|
|
||
| import java.util.List; | ||
| import java.util.regex.Matcher; | ||
| import java.util.regex.Pattern; | ||
| import java.util.regex.PatternSyntaxException; | ||
| import org.apache.calcite.adapter.enumerable.NotNullImplementor; | ||
| import org.apache.calcite.adapter.enumerable.NullPolicy; | ||
| import org.apache.calcite.adapter.enumerable.RexToLixTranslator; | ||
| import org.apache.calcite.linq4j.tree.Expression; | ||
| import org.apache.calcite.linq4j.tree.Expressions; | ||
| import org.apache.calcite.rex.RexCall; | ||
| import org.apache.calcite.sql.type.ReturnTypes; | ||
| import org.apache.calcite.sql.type.SqlReturnTypeInference; | ||
| import org.opensearch.sql.calcite.utils.PPLOperandTypes; | ||
| import org.opensearch.sql.expression.function.ImplementorUDF; | ||
| import org.opensearch.sql.expression.function.UDFOperandMetadata; | ||
|
|
||
| /** Custom REX_EXTRACT function for extracting regex named capture groups. */ | ||
| public final class RexExtractFunction extends ImplementorUDF { | ||
|
|
||
/**
 * Builds the UDF, passing a new {@code RexExtractImplementor} and {@code NullPolicy.ARG0} to the
 * {@code ImplementorUDF} constructor.
 */
public RexExtractFunction() {
  super(new RexExtractImplementor(), NullPolicy.ARG0);
}
|
|
||
| @Override | ||
| public SqlReturnTypeInference getReturnTypeInference() { | ||
| return ReturnTypes.VARCHAR_2000_NULLABLE; | ||
| } | ||
|
|
||
| @Override | ||
| public UDFOperandMetadata getOperandMetadata() { | ||
| return PPLOperandTypes.STRING_STRING_INTEGER; | ||
| } | ||
|
|
||
| private static class RexExtractImplementor implements NotNullImplementor { | ||
|
|
||
| @Override | ||
| public Expression implement( | ||
| RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) { | ||
| Expression field = translatedOperands.get(0); | ||
| Expression pattern = translatedOperands.get(1); | ||
| Expression groupIndex = translatedOperands.get(2); | ||
|
|
||
| return Expressions.call(RexExtractFunction.class, "extractGroup", field, pattern, groupIndex); | ||
| } | ||
| } | ||
|
|
||
| public static String extractGroup(String text, String pattern, int groupIndex) { | ||
| try { | ||
| Pattern compiledPattern = Pattern.compile(pattern); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [Low priority] I'm thinking a nice to have feature is to cache compiled pattern. Alternatively, define it as a global Linq4j expression. See Calcite example: https://github.com/apache/calcite/blob/44b57985eaeb0ef0c1eda2447aa75b5855259356/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java#L461-L475 Maybe we can create an issue to track perf improvement if it's not feasible
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1, Add issue to track it. per row pattern compile is expensive.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. follow up issue: #4235 |
||
| Matcher matcher = compiledPattern.matcher(text); | ||
|
|
||
| if (matcher.find() && groupIndex > 0 && groupIndex <= matcher.groupCount()) { | ||
| return matcher.group(groupIndex); | ||
| } | ||
| return null; | ||
RyanL1997 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } catch (PatternSyntaxException e) { | ||
| throw new IllegalArgumentException( | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @penghuo, I have rechecked the tequrement - and yes for pattern failure it should be catched. The cuurent behavior is like this: curl -X POST "localhost:9200/_plugins/_ppl" -H 'Content-Type: application/json' -d'{
"query": "source=accounts | rex field=email \"(?<invalid>[\" | fields email, invalid | head 1"
}' | jq
{
"error": {
"reason": "Invalid Query",
"details": "Error in 'rex' command: Encountered the following error while compiling the regex '(?<invalid>[': Unclosed character class near index 11\n(?<invalid>[\n ^",
"type": "IllegalArgumentException"
},
"status": 400
} |
||
| "Error in 'rex' command: Encountered the following error while compiling the regex '" | ||
| + pattern | ||
| + "': " | ||
| + e.getMessage()); | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.