From de0339959b491ae0a26e6f96c0b0dc1635bc0f94 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Aug 2023 08:48:26 -0400 Subject: [PATCH] automata: fix incorrect use of Aho-Corasick's "standard" semantics This fixes a bug in how prefilters were applied for multi-regexes compiled with "all" semantics. It turns out that this corresponds to the regex crate's RegexSet API, but only its `is_match` routine. See the comment on the regression test added in this PR for an explanation of what happened. Basically, it came down to incorrectly using Aho-Corasick's "standard" semantics, which doesn't necessarily report leftmost matches. Since the regex crate is really all about leftmost matching, this can lead to skipping over parts of the haystack and thus lead to missing matches. Fixes #1070 --- .../src/util/prefilter/aho_corasick.rs | 13 ++++++++-- regex-automata/src/util/prefilter/mod.rs | 9 ------- regex-automata/src/util/prefilter/teddy.rs | 9 +++++-- testdata/regression.toml | 26 +++++++++++++++++++ 4 files changed, 44 insertions(+), 13 deletions(-) diff --git a/regex-automata/src/util/prefilter/aho_corasick.rs b/regex-automata/src/util/prefilter/aho_corasick.rs index a7474d29a..50cce827e 100644 --- a/regex-automata/src/util/prefilter/aho_corasick.rs +++ b/regex-automata/src/util/prefilter/aho_corasick.rs @@ -22,11 +22,20 @@ impl AhoCorasick { } #[cfg(feature = "perf-literal-multisubstring")] { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc," it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. let ac_match_kind = match kind { - MatchKind::LeftmostFirst => { + MatchKind::LeftmostFirst | MatchKind::All => { aho_corasick::MatchKind::LeftmostFirst } - MatchKind::All => aho_corasick::MatchKind::Standard, }; // This is kind of just an arbitrary number, but basically, if we // have a small enough set of literals, then we try to use the VERY diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index ea3eb73d8..51fc92233 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -195,15 +195,6 @@ impl Prefilter { /// Some(Span::from(6..9)), /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), /// ); - /// // Now we put 'samwise' back before 'sam', but change the match - /// // semantics to 'All'. In this case, there is no preference - /// // order semantics and the first match detected is returned. - /// let pre = Prefilter::new(MatchKind::All, &["samwise", "sam"]) - /// .expect("a prefilter"); - /// assert_eq!( - /// Some(Span::from(6..9)), - /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), - /// ); /// /// # Ok::<(), Box>(()) /// ``` diff --git a/regex-automata/src/util/prefilter/teddy.rs b/regex-automata/src/util/prefilter/teddy.rs index 02210a5ec..fc79f2b2f 100644 --- a/regex-automata/src/util/prefilter/teddy.rs +++ b/regex-automata/src/util/prefilter/teddy.rs @@ -50,12 +50,17 @@ impl Teddy { // theory we could at least support leftmost-longest, as the // aho-corasick crate does, but regex-automata doesn't know about // leftmost-longest currently. + // + // And like the aho-corasick prefilter, if we're using `All` + // semantics, then we can still use leftmost semantics for a + // prefilter. (This might be a suspicious choice for the literal + // engine, which uses a prefilter as a regex engine directly, but + // that only happens when using leftmost-first semantics.) let (packed_match_kind, ac_match_kind) = match kind { - MatchKind::LeftmostFirst => ( + MatchKind::LeftmostFirst | MatchKind::All => ( aho_corasick::packed::MatchKind::LeftmostFirst, aho_corasick::MatchKind::LeftmostFirst, ), - _ => return None, }; let minimum_len = needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); diff --git a/testdata/regression.toml b/testdata/regression.toml index a2efa2ad3..03b15d6d5 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -756,3 +756,29 @@ name = "reverse-inner-short" regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])' haystack = '102:12:39' matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]] + +# This regression test was found via the RegexSet APIs. It triggered a +# particular code path where a regex was compiled with 'All' match semantics +# (to support overlapping search), but got funneled down into a standard +# leftmost search when calling 'is_match'. This is fine on its own, but the +# leftmost search will use a prefilter and that's where this went awry. +# +# Namely, since 'All' semantics were used, the aho-corasick prefilter was +# incorrectly compiled with 'Standard' semantics. This was wrong because +# 'Standard' immediately attempts to report a match at every position, even if +# that would mean reporting a match past the leftmost match before reporting +# the leftmost match. This breaks the prefilter contract of never having false +# negatives and leads overall to the engine not finding a match. +# +# See: https://github.com/rust-lang/regex/issues/1070 +[[test]] +name = "prefilter-with-aho-corasick-standard-semantics" +regex = '(?m)^ *v [0-9]' +haystack = 'v 0' +matches = [ + { id = 0, spans = [[0, 3]] }, +] +match-kind = "all" +search-kind = "overlapping" +unicode = true +utf8 = true