From 21eb6f036ef3eac2d9f2468ec8f96ac5693af878 Mon Sep 17 00:00:00 2001 From: Rebecca Schlussel Date: Thu, 8 Aug 2024 11:51:04 -0400 Subject: [PATCH] Fix like matching with newlines and no wildcard LIKE expressions were not looking past newlines when the pattern contained no wildcard. As a result, some expressions incorrectly returned a match even though the input after the newline did not match the pattern. This change fixes that behavior. For example: SELECT 'foo\nbar' LIKE 'foo' (where \n stands for a newline character). Previously that query would return true. Now it returns false. --- .../src/main/sphinx/functions/comparison.rst | 5 ++++- .../facebook/presto/type/LikeFunctions.java | 5 ++++- .../presto/sql/TestLikeFunctions.java | 20 +++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/presto-docs/src/main/sphinx/functions/comparison.rst b/presto-docs/src/main/sphinx/functions/comparison.rst index 0c32ee6eb089c..62da5d7936ba8 100644 --- a/presto-docs/src/main/sphinx/functions/comparison.rst +++ b/presto-docs/src/main/sphinx/functions/comparison.rst @@ -158,7 +158,7 @@ LIKE ---- The LIKE operator is used to match a specified character pattern in a string. Patterns can contain regular characters as well as wildcards. Wildcard characters can be escaped using the single character -specified for the ESCAPE parameter. Matching is case sensitive. +specified for the ESCAPE parameter. Matching is case sensitive, and the pattern must match the whole string. 
Syntax: @@ -199,6 +199,9 @@ Examples:: WHERE name LIKE '%#%%' ESCAPE '#' --returns 'a%c' and '%cd' + SELECT 'ab' || chr(10) || 'c' LIKE 'ab' --chr(10) is a newline character + --returns 'false' + Row comparison: IN ------------------ diff --git a/presto-main/src/main/java/com/facebook/presto/type/LikeFunctions.java b/presto-main/src/main/java/com/facebook/presto/type/LikeFunctions.java index 84c01c5b7327d..1673d2f9a8964 100644 --- a/presto-main/src/main/java/com/facebook/presto/type/LikeFunctions.java +++ b/presto-main/src/main/java/com/facebook/presto/type/LikeFunctions.java @@ -212,7 +212,10 @@ private static Regex likePattern(String patternString, char escapeChar, boolean regex.append('$'); byte[] bytes = regex.toString().getBytes(UTF_8); - return new Regex(bytes, 0, bytes.length, Option.MULTILINE, NonStrictUTF8Encoding.INSTANCE, SYNTAX); + // Option.MULTILINE specifies that wildcard characters (. and *) should match newlines + // Option.SINGLELINE specifies that anchors (^ and $) should match the beginning and end of + // input rather than the beginning and end of the line + return new Regex(bytes, 0, bytes.length, Option.MULTILINE | Option.SINGLELINE, NonStrictUTF8Encoding.INSTANCE, SYNTAX); } @SuppressWarnings("NumericCastThatLosesPrecision") diff --git a/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java b/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java index 496cd578519bf..b0d75d23a12a1 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java @@ -96,12 +96,32 @@ public void testLikeNewlineBeforeMatch() assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); } + @Test + public void testLikeNewlineNoWildcard() + { + Regex regex = likePattern(utf8Slice("foo\nbar")); + assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); + } + + @Test + public void testLikeNoMatchAfterNewline() + { + Regex regex = 
likePattern(utf8Slice("foo")); + assertFalse(likeVarchar(utf8Slice("foo\nbar"), regex)); + } + @Test public void testLikeNewlineInMatch() { Regex regex = likePattern(utf8Slice("f%b%")); assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); } + @Test + public void testLikeNewlineInSingleWildcardMatch() + { + Regex regex = likePattern(utf8Slice("foo_bar")); + assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); + } @Test(timeOut = 1000) public void testLikeUtf8Pattern()