Skip to content
Merged
Show file tree
Hide file tree
Changes from 43 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
e230341
implement regex cmd with calcite support by suing java library
RyanL1997 Aug 14, 2025
f215222
code hygiene fix
RyanL1997 Aug 14, 2025
64d7f7b
comment clean up
RyanL1997 Aug 14, 2025
b011b50
implement explain it
RyanL1997 Aug 15, 2025
39819f6
disable regex when calcite is disable and add a test in analyzer
RyanL1997 Aug 15, 2025
c3840d5
fix spotless check
RyanL1997 Aug 18, 2025
dbe6004
[refactor] refactor some regex fn into a util class for re-usage
RyanL1997 Aug 19, 2025
5667dcf
[refactor] revert filter query builder cuz we do not need it anymore
RyanL1997 Aug 19, 2025
2f91b40
add rst docs for regex cmd
RyanL1997 Aug 20, 2025
6698eb3
add IT for regex cmd
RyanL1997 Aug 20, 2025
f5fe7c2
add IT for calcite no pushdown
RyanL1997 Aug 20, 2025
dda1872
fix regex exp behavior for non string val
RyanL1997 Aug 20, 2025
64d5820
style - remove some verbose comments
RyanL1997 Aug 20, 2025
63c9e9d
remove string convertion
RyanL1997 Aug 20, 2025
cfd37a8
use existing operator of REGEXP_CONTAINS
RyanL1997 Aug 20, 2025
4aa435f
fix integ test of rgex with pushdown after operator commit
RyanL1997 Aug 20, 2025
470ccbc
remove some verbose comments and fix some style
RyanL1997 Aug 21, 2025
31b0c15
fix explain it in no pushdown
RyanL1997 Aug 21, 2025
8eea7e6
comment - remove unused fn for string converting
RyanL1997 Aug 21, 2025
281cb52
remove duplicated regex match operator alias
RyanL1997 Aug 22, 2025
b398932
unit test - initail commit
RyanL1997 Aug 23, 2025
f75ed98
anonymizer with test
RyanL1997 Aug 23, 2025
d1b4d81
fix spotlessApply
RyanL1997 Aug 23, 2025
88cce00
add cross cluster IT
RyanL1997 Aug 23, 2025
b6b1b9d
fix spotless apply
RyanL1997 Aug 26, 2025
0431c64
tomo - fix operator constant
RyanL1997 Aug 27, 2025
8203a63
tomo - fix regex java doc
RyanL1997 Aug 27, 2025
ed1380d
tomo - field and pattern handling fix
RyanL1997 Aug 27, 2025
c881a75
tomo - fix LRUCache
RyanL1997 Aug 27, 2025
34052b7
tomo - remove unnecessary delegation layer
RyanL1997 Aug 27, 2025
29d6afa
rst doc fix
RyanL1997 Aug 27, 2025
83594ae
tomo - fix comments
RyanL1997 Aug 28, 2025
fc92846
DEFAULT FIELD related change
RyanL1997 Aug 28, 2025
30f32f4
DEFAULT FIELD - fix anonymizer tests
RyanL1997 Aug 28, 2025
9fa8aaf
tomo - add unit test for regex util class
RyanL1997 Aug 28, 2025
9ee6818
chen - remove code for legacy engine
RyanL1997 Aug 29, 2025
d9e29a5
chen - remove stalled logic for spcified field
RyanL1997 Aug 29, 2025
5723a9f
chen - merge into 1 grammar in parser
RyanL1997 Aug 29, 2025
9061987
properly handle non-string field
RyanL1997 Aug 29, 2025
ee2f10c
remove verbose comments
RyanL1997 Aug 29, 2025
531600a
remove verbose comments
RyanL1997 Aug 29, 2025
64432e8
address commetns
RyanL1997 Aug 29, 2025
aebacdb
fix doc test for regex
RyanL1997 Aug 29, 2025
7b7f717
fix doc
RyanL1997 Aug 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
import org.opensearch.sql.ast.tree.Patterns;
import org.opensearch.sql.ast.tree.Project;
import org.opensearch.sql.ast.tree.RareTopN;
import org.opensearch.sql.ast.tree.Regex;
import org.opensearch.sql.ast.tree.Relation;
import org.opensearch.sql.ast.tree.RelationSubquery;
import org.opensearch.sql.ast.tree.Rename;
Expand Down Expand Up @@ -743,6 +744,12 @@ public LogicalPlan visitReverse(Reverse node, AnalysisContext context) {
"REVERSE is supported only when " + CALCITE_ENGINE_ENABLED.getKeyValue() + "=true");
}

/**
 * Rejects the regex command in this analyzer. The method never returns a plan: it always throws,
 * and the message names the Calcite engine setting that must be enabled.
 *
 * @param node the regex AST node (not inspected)
 * @param context the analysis context (not inspected)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public LogicalPlan visitRegex(Regex node, AnalysisContext context) {
  final String message =
      "REGEX is supported only when " + CALCITE_ENGINE_ENABLED.getKeyValue() + "=true";
  throw new UnsupportedOperationException(message);
}

@Override
public LogicalPlan visitPaginate(Paginate paginate, AnalysisContext context) {
LogicalPlan child = paginate.getChild().get(0).accept(this, context);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
import org.opensearch.sql.ast.tree.Patterns;
import org.opensearch.sql.ast.tree.Project;
import org.opensearch.sql.ast.tree.RareTopN;
import org.opensearch.sql.ast.tree.Regex;
import org.opensearch.sql.ast.tree.Relation;
import org.opensearch.sql.ast.tree.RelationSubquery;
import org.opensearch.sql.ast.tree.Rename;
Expand Down Expand Up @@ -259,6 +260,10 @@ public T visitReverse(Reverse node, C context) {
return visitChildren(node, context);
}

/**
 * Visit a {@link Regex} node. The default implementation simply delegates to {@code
 * visitChildren}; subclasses override this to give the regex command specific handling.
 *
 * @param node the regex node being visited
 * @param context the visitor context passed through to {@code visitChildren}
 * @return whatever {@code visitChildren} returns for this node
 */
public T visitRegex(Regex node, C context) {
return visitChildren(node, context);
}

/**
 * Visit a {@link LambdaFunction} node. The default implementation simply delegates to {@code
 * visitChildren}.
 *
 * @param node the lambda function node being visited
 * @param context the visitor context passed through to {@code visitChildren}
 * @return whatever {@code visitChildren} returns for this node
 */
public T visitLambdaFunction(LambdaFunction node, C context) {
return visitChildren(node, context);
}
Expand Down
55 changes: 55 additions & 0 deletions core/src/main/java/org/opensearch/sql/ast/tree/Regex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.ast.tree;

import com.google.common.collect.ImmutableList;
import java.util.List;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
import org.opensearch.sql.ast.AbstractNodeVisitor;
import org.opensearch.sql.ast.expression.Literal;
import org.opensearch.sql.ast.expression.UnresolvedExpression;

/**
 * AST node for the regex command. It carries the field expression to test, the literal pattern to
 * test it against, and a flag telling whether the match is negated. At most one child plan is
 * attached to the node.
 */
@Getter
@ToString
@EqualsAndHashCode(callSuper = false)
public class Regex extends UnresolvedPlan {
  /** Operator text for a positive match. */
  public static final String EQUALS_OPERATOR = "=";

  /** Operator text for a negated match. */
  public static final String NOT_EQUALS_OPERATOR = "!=";

  /** Expression identifying the field the pattern is applied to. */
  private final UnresolvedExpression field;

  /** {@code true} when the command was written in its negated form. */
  private final boolean negated;

  /** Literal holding the regular expression. */
  private final Literal pattern;

  /** Child plan; stays {@code null} until {@link #attach(UnresolvedPlan)} or the setter is used. */
  @Setter private UnresolvedPlan child;

  /**
   * Creates a regex node without a child plan.
   *
   * @param field expression identifying the field to match
   * @param negated whether the match result is negated
   * @param pattern literal holding the regular expression
   */
  public Regex(UnresolvedExpression field, boolean negated, Literal pattern) {
    this.field = field;
    this.negated = negated;
    this.pattern = pattern;
  }

  /**
   * Stores the given plan as this node's child.
   *
   * @param child the plan to attach
   * @return this node, so calls can be chained
   */
  @Override
  public Regex attach(UnresolvedPlan child) {
    this.child = child;
    return this;
  }

  /**
   * Returns the child as a list.
   *
   * @return an empty list when no child is attached, otherwise a single-element list
   */
  @Override
  public List<UnresolvedPlan> getChild() {
    if (this.child == null) {
      return ImmutableList.of();
    }
    return ImmutableList.of(this.child);
  }

  /** Dispatches to {@link AbstractNodeVisitor#visitRegex(Regex, Object)}. */
  @Override
  public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
    return nodeVisitor.visitRegex(this, context);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexWindowBounds;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.calcite.sql.type.SqlTypeFamily;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.tools.RelBuilder;
import org.apache.calcite.tools.RelBuilder.AggCall;
Expand Down Expand Up @@ -99,6 +100,7 @@
import org.opensearch.sql.ast.tree.Patterns;
import org.opensearch.sql.ast.tree.Project;
import org.opensearch.sql.ast.tree.RareTopN;
import org.opensearch.sql.ast.tree.Regex;
import org.opensearch.sql.ast.tree.Relation;
import org.opensearch.sql.ast.tree.Rename;
import org.opensearch.sql.ast.tree.SPath;
Expand Down Expand Up @@ -174,6 +176,32 @@ public RelNode visitFilter(Filter node, CalcitePlanContext context) {
return context.relBuilder.peek();
}

/**
 * Plans the regex command as a filter over the already-visited child plan.
 *
 * <p>The field and the pattern are both analyzed into {@link RexNode}s, the field is required to
 * belong to the character type family, and the resulting {@code REGEXP_CONTAINS} call (wrapped in
 * {@code NOT} for a negated command) is pushed onto the builder as a filter condition.
 *
 * @param node the regex AST node
 * @param context the Calcite planning context holding the rel and rex builders
 * @return the node returned by {@code relBuilder.peek()} after the filter has been added
 * @throws IllegalArgumentException if the analyzed field is not of a character type
 */
@Override
public RelNode visitRegex(Regex node, CalcitePlanContext context) {
  visitChildren(node, context);

  RexNode target = rexVisitor.analyze(node.getField(), context);
  RexNode regexLiteral = rexVisitor.analyze(node.getPattern(), context);

  boolean isCharacterField = SqlTypeFamily.CHARACTER.contains(target.getType());
  if (!isCharacterField) {
    String message =
        String.format(
            "Regex command requires field of string type, but got %s for field '%s'",
            target.getType().getSqlTypeName(), node.getField().toString());
    throw new IllegalArgumentException(message);
  }

  RexNode containsCall =
      context.rexBuilder.makeCall(
          org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS, target, regexLiteral);

  // A negated command keeps the rows for which the containment test is false.
  RexNode predicate =
      node.isNegated()
          ? context.rexBuilder.makeCall(SqlStdOperatorTable.NOT, containsCall)
          : containsCall;

  context.relBuilder.filter(predicate);
  return context.relBuilder.peek();
}

private boolean containsSubqueryExpression(Node expr) {
if (expr == null) {
return false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.expression.parse;

import com.google.common.collect.ImmutableList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * Common utilities for regex operations. Provides pattern caching and consistent matching behavior.
 *
 * <p>All members are static, so the class cannot be instantiated or extended. Compiled patterns
 * are kept in a bounded, access-ordered cache wrapped by {@link Collections#synchronizedMap}, so
 * the cache may be used from multiple threads.
 */
public final class RegexCommonUtils {

  /** Matches the opening of a named capturing group, e.g. {@code (?<name>}, capturing the name. */
  private static final Pattern NAMED_GROUP_PATTERN =
      Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");

  /** Upper bound on the number of compiled patterns kept in the cache. */
  private static final int MAX_CACHE_SIZE = 1000;

  /**
   * Cache of compiled patterns keyed by regex string. The backing map is access-ordered and drops
   * its eldest (least recently accessed) entry once the size exceeds {@link #MAX_CACHE_SIZE}.
   */
  private static final Map<String, Pattern> patternCache =
      Collections.synchronizedMap(
          new LinkedHashMap<>(MAX_CACHE_SIZE + 1, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, Pattern> eldest) {
              return size() > MAX_CACHE_SIZE;
            }
          });

  /** Static utility holder; not instantiable. */
  private RegexCommonUtils() {}

  /**
   * Get compiled pattern from cache or compile and cache it.
   *
   * <p>The lookup and the insertion happen as one atomic step on the synchronized map, so
   * concurrent callers asking for the same uncached regex compile it only once. An invalid regex
   * is never cached because the compile failure propagates before any entry is stored.
   *
   * @param regex The regex pattern string
   * @return Compiled Pattern object
   * @throws PatternSyntaxException if the regex is invalid
   */
  public static Pattern getCompiledPattern(String regex) {
    return patternCache.computeIfAbsent(regex, Pattern::compile);
  }

  /**
   * Extract list of named group candidates from a regex pattern.
   *
   * @param pattern The regex pattern string
   * @return List of named group names found in the pattern, in order of appearance
   */
  public static List<String> getNamedGroupCandidates(String pattern) {
    ImmutableList.Builder<String> namedGroups = ImmutableList.builder();
    Matcher m = NAMED_GROUP_PATTERN.matcher(pattern);
    while (m.find()) {
      namedGroups.add(m.group(1));
    }
    return namedGroups.build();
  }

  /**
   * Match using find() for partial match semantics with string pattern.
   *
   * @param text The text to match against
   * @param patternStr The pattern string
   * @return true if pattern is found anywhere in the text; false if either argument is null
   * @throws PatternSyntaxException if the regex is invalid
   */
  public static boolean matchesPartial(String text, String patternStr) {
    if (text == null || patternStr == null) {
      return false;
    }
    Pattern pattern = getCompiledPattern(patternStr);
    return pattern.matcher(text).find();
  }

  /**
   * Extract a specific named group from text using the pattern. Used by parse command regex method.
   *
   * <p>The whole text must match the pattern (full-match semantics via {@code matches()}), unlike
   * {@link #matchesPartial(String, String)}.
   *
   * @param text The text to extract from
   * @param pattern The compiled pattern with named groups
   * @param groupName The name of the group to extract
   * @return The extracted value, or null if any argument is null, the text does not fully match,
   *     or the pattern has no group with the given name
   */
  public static String extractNamedGroup(String text, Pattern pattern, String groupName) {
    if (text == null || pattern == null || groupName == null) {
      return null;
    }

    Matcher matcher = pattern.matcher(text);

    if (matcher.matches()) {
      try {
        return matcher.group(groupName);
      } catch (IllegalArgumentException ignored) {
        // The pattern defines no group with this name; report it as "not found".
        return null;
      }
    }

    return null;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@

package org.opensearch.sql.expression.parse;

import com.google.common.collect.ImmutableList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.EqualsAndHashCode;
import lombok.Getter;
Expand All @@ -24,7 +21,6 @@
@ToString
public class RegexExpression extends ParseExpression {
private static final Logger log = LogManager.getLogger(RegexExpression.class);
private static final Pattern GROUP_PATTERN = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
@Getter @EqualsAndHashCode.Exclude private final Pattern regexPattern;

/**
Expand All @@ -36,32 +32,19 @@ public class RegexExpression extends ParseExpression {
*/
public RegexExpression(Expression sourceField, Expression pattern, Expression identifier) {
super("regex", sourceField, pattern, identifier);
this.regexPattern = Pattern.compile(pattern.valueOf().stringValue());
this.regexPattern = RegexCommonUtils.getCompiledPattern(pattern.valueOf().stringValue());
}

@Override
ExprValue parseValue(ExprValue value) throws ExpressionEvaluationException {
String rawString = value.stringValue();
Matcher matcher = regexPattern.matcher(rawString);
if (matcher.matches()) {
return new ExprStringValue(matcher.group(identifierStr));

String extracted = RegexCommonUtils.extractNamedGroup(rawString, regexPattern, identifierStr);

if (extracted != null) {
return new ExprStringValue(extracted);
}
log.debug("failed to extract pattern {} from input ***", regexPattern.pattern());
return new ExprStringValue("");
}

/**
* Get list of derived fields based on parse pattern.
*
* @param pattern pattern used for parsing
* @return list of names of the derived fields
*/
public static List<String> getNamedGroupCandidates(String pattern) {
ImmutableList.Builder<String> namedGroups = ImmutableList.builder();
Matcher m = GROUP_PATTERN.matcher(pattern);
while (m.find()) {
namedGroups.add(m.group(1));
}
return namedGroups.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.opensearch.sql.expression.parse.GrokExpression;
import org.opensearch.sql.expression.parse.ParseExpression;
import org.opensearch.sql.expression.parse.PatternsExpression;
import org.opensearch.sql.expression.parse.RegexCommonUtils;
import org.opensearch.sql.expression.parse.RegexExpression;

/** Utils for {@link ParseExpression}. */
Expand Down Expand Up @@ -57,7 +58,7 @@ public static List<String> getNamedGroupCandidates(
ParseMethod parseMethod, String pattern, Map<String, Literal> arguments) {
switch (parseMethod) {
case REGEX:
return RegexExpression.getNamedGroupCandidates(pattern);
return RegexCommonUtils.getNamedGroupCandidates(pattern);
case GROK:
return GrokExpression.getNamedGroupCandidates(pattern);
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1929,4 +1929,18 @@ public void brain_patterns_command() {

assertAnalyzeEqual(expectedPlan, patterns);
}

/** Analyzing a regex plan must fail with the message that names the Calcite setting. */
@Test
public void regex_command_throws_unsupported_exception_with_legacy_engine() {
  final String expectedMessage = "REGEX is supported only when plugins.calcite.enabled=true";

  UnsupportedOperationException thrown =
      assertThrows(
          UnsupportedOperationException.class,
          () -> {
            org.opensearch.sql.ast.tree.Regex regexPlan =
                new org.opensearch.sql.ast.tree.Regex(
                    field("lastname"), false, stringLiteral("^[A-Z][a-z]+$"));
            analyze(regexPlan.attach(relation("schema")));
          });

  assertEquals(expectedMessage, thrown.getMessage());
}
}
Loading
Loading