diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index a6358a4d5cb..7d5decf136d 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -75,6 +75,7 @@ public enum BuiltinFunctionName { MVAPPEND(FunctionName.of("mvappend")), MVJOIN(FunctionName.of("mvjoin")), MVINDEX(FunctionName.of("mvindex")), + SPLIT(FunctionName.of("split")), MVDEDUP(FunctionName.of("mvdedup")), FORALL(FunctionName.of("forall")), EXISTS(FunctionName.of("exists")), diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index 7727d1bd85d..83aa8f26d9b 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -195,6 +195,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP; import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP; @@ -998,6 +999,34 @@ void populate() { builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter), PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER)); + // Register SPLIT with custom logic for empty delimiter + // Case 1: Delimiter is not empty string, use SPLIT + // Case 2: Delimiter is empty 
string, use REGEXP_EXTRACT_ALL with '.' pattern + register( + SPLIT, + (FunctionImp2) + (builder, str, delimiter) -> { + // Create condition: delimiter = '' + RexNode emptyString = builder.makeLiteral(""); + RexNode isEmptyDelimiter = + builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString); + + // For empty delimiter: split into characters using REGEXP_EXTRACT_ALL with the '.' + // pattern, which matches one character at a time (NOTE(review): '.' may skip newlines) + RexNode dotPattern = builder.makeLiteral("."); + RexNode splitChars = + builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern); + + // For non-empty delimiter: use standard SPLIT + RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter); + + // Use CASE to choose between the two approaches + // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END + return builder.makeCall( + SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit); + }, + PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER)); + // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization register( MVINDEX, diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst index 34c02074641..fdea75d3e81 100644 --- a/docs/user/ppl/functions/collection.rst +++ b/docs/user/ppl/functions/collection.rst @@ -186,6 +186,60 @@ Example:: | 120 | +--------+ +SPLIT +----- + +Description +>>>>>>>>>>> + +Usage: split(str, delimiter) splits the string on the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array. 
+ +Argument type: str: STRING, delimiter: STRING + +Return type: ARRAY of STRING + +Example:: + + os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1 + fetched rows / total rows = 1/1 + +------------------------------------+ + | result | + |------------------------------------| + | [buttercup,rarity,tenderhoof,dash] | + +------------------------------------+ + + os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1 + fetched rows / total rows = 1/1 + +------------------+ + | result | + |------------------| + | [1a2b3c4,567890] | + +------------------+ + + os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1 + fetched rows / total rows = 1/1 + +-----------+ + | result | + |-----------| + | [a,b,c,d] | + +-----------+ + + os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1 + fetched rows / total rows = 1/1 + +--------------+ + | result | + |--------------| + | [name,value] | + +--------------+ + + os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1 + fetched rows / total rows = 1/1 + +---------+ + | result | + |---------| + | [hello] | + +---------+ + MVJOIN ------ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java index 52a6e181e20..31556e518b9 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java @@ -567,4 +567,43 @@ public void testMvdedupPreservesOrder() throws IOException { // Should preserve first occurrence order: z, a, b, c verifyDataRows(actual, rows(List.of("z", "a", "b", "c"))); } + + @Test + public void testSplitWithSemicolonDelimiter() throws 
IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result =" + + " split(test, ';') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh"))); + } + + @Test + public void testSplitWithMultiCharDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |" + + " fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("1a2b3c4", "567890"))); + } + + @Test + public void testSplitWithEmptyDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + // An empty delimiter yields one array element per input character: 'abcd' -> [a, b, c, d] + verifyDataRows(actual, rows(List.of("a", "b", "c", "d"))); + } } diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 3818ba6a7df..694aabf43ab 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -448,6 +448,7 @@ MVAPPEND: 'MVAPPEND'; MVJOIN: 'MVJOIN'; MVINDEX: 'MVINDEX'; MVDEDUP: 'MVDEDUP'; +SPLIT: 'SPLIT'; FORALL: 'FORALL'; FILTER: 'FILTER'; TRANSFORM: 'TRANSFORM'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 6524adf87a6..1cc33cd7f5d 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -1104,6 +1104,7 @@ collectionFunctionName | MVJOIN | MVINDEX | MVDEDUP + | SPLIT | FORALL | EXISTS | FILTER diff --git 
a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java index 176fb534f37..96529adea24 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java @@ -290,4 +290,81 @@ public void testMvdedupPreservesOrder() { + "LIMIT 1"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + @Test + public void testSplitWithSemicolonDelimiter() { + String ppl = + "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head" + + " 1 | fields result"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR]," + + " result=[CASE(=(';', '')," + + " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.')," + + " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', " + + "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END " + + "`result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testSplitWithMultiCharDelimiter() { + String ppl = + "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |" + + " fields result"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], 
test=['1a2b3c4def567890':VARCHAR]," + + " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR," + + " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', " + + "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testSplitWithEmptyDelimiter() { + String ppl = + "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result"; + RelNode root = getRelNode(ppl); + + // The plan keeps both CASE branches; the '' = '' condition selects REGEXP_EXTRACT_ALL + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR]," + + " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR," + + " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') " + + "ELSE SPLIT('abcd', '') END `result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 63d6eaf86d9..ec166c81b7e 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -845,6 +845,22 @@ public void testMvindex() { anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | 
fields result")); } + + @Test + public void testSplit() { + // Both arguments are literals, so each one is masked as *** + assertEquals( + "source=table | eval identifier=split(***,***) | fields + identifier", + anonymize("source=t | eval result=split('a;b;c', ';') | fields result")); + // A field argument is rewritten to `identifier`; the literal delimiter is masked as *** + assertEquals( + "source=table | eval identifier=split(identifier,***) | fields + identifier", + anonymize("source=t | eval result=split(text, ',') | fields result")); + // An empty-string delimiter is still a literal and is masked as *** like any other + assertEquals( + "source=table | eval identifier=split(***,***) | fields + identifier", + anonymize("source=t | eval result=split('abcd', '') | fields result")); + } + @Test public void testMvdedup() { // Test mvdedup with array containing duplicates