From eb950f65e660a45c7e123f3c6fba9f2c86b4a256 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 16 Oct 2023 10:20:25 -0400 Subject: [PATCH] automata/meta: revert broadening of reverse suffix optimization This reverts commit 8a8d599f9d2f2d78e9ad84e4084788c2d563afa5 and includes a regression test, as well as a tweak to a log message. Essentially, the broadening was improper. We have to be careful when dealing with suffixes as opposed to prefixes. Namely, my logic previously was that the broadening was okay because we were already doing it for the reverse inner optimization. But the reverse inner optimization works with prefixes, not suffixes. So the comparison wasn't quite correct. This goes back to only applying the reverse suffix optimization when there is a non-empty single common suffix. Fixes #1110 Ref https://github.com/astral-sh/ruff/pull/7980 --- regex-automata/src/meta/strategy.rs | 39 +++++++++++++++++++---------- testdata/regression.toml | 15 +++++++++++ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 4cb3b29b9..04f2ba3c3 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1167,21 +1167,34 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixseq = crate::util::prefilter::suffixes(kind, hirs); - let Some(suffixes) = suffixseq.literals() else { - debug!( - "skipping reverse suffix optimization because \ - the extract suffix sequence is not finite", - ); - return Err(core); + let suffixes = crate::util::prefilter::suffixes(kind, hirs); + let lcs = match suffixes.longest_common_suffix() { + None => { + debug!( + "skipping reverse suffix optimization because \ + a longest common suffix could not be found", + ); + return Err(core); + } + Some(lcs) if lcs.is_empty() => { + debug!( + "skipping reverse suffix optimization because \ + the longest common suffix is the empty string", + ); + return Err(core); + } + Some(lcs) => lcs, }; - let Some(pre) = Prefilter::new(kind, suffixes) else { - debug!( - "skipping reverse suffix optimization because \ + let pre = match Prefilter::new(kind, &[lcs]) { + Some(pre) => pre, + None => { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); + ); + return Err(core); + } }; if !pre.is_fast() { debug!( @@ -1268,7 +1281,7 @@ impl ReverseSuffix { e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( - "using lazy DFA for reverse inner search at {:?}, \ + "using lazy DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, diff --git a/testdata/regression.toml b/testdata/regression.toml index 2954c9118..53b0701a3 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -813,3 +813,18 @@ name = "hir-optimization-out-of-order-class" regex = '^[[:alnum:]./-]+$' haystack = "a-b" matches = [[0, 3]] + +# This is a regression test for an improper reverse suffix optimization. This +# occurred when I "broadened" the applicability of the optimization to include +# multiple possible literal suffixes instead of only sticking to a non-empty +# longest common suffix. It turns out that, at least given how the reverse +# suffix optimization works, we need to stick to the longest common suffix for +# now. +# +# See: https://github.com/rust-lang/regex/issues/1110 +# See also: https://github.com/astral-sh/ruff/pull/7980 +[[test]] +name = 'improper-reverse-suffix-optimization' +regex = '(\\N\{[^}]+})|([{}])' +haystack = 'hiya \N{snowman} bye' +matches = [[[5, 16], [5, 16], []]]