From e7392ef1f453cfa368b239310d00f91ebf4b44b4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:39:14 -0400 Subject: [PATCH] automata: rejigger DFA start state computation It turns out that requiring callers to provide an `Input` (and thus a `&[u8]` haystack) is a bit onerous for all cases. Namely, part of the point of `regex-automata` was to expose enough guts to make it tractable to write a streaming regex engine. A streaming regex engine, especially one that does a byte-at-a-time loop, is somewhat antithetical to having a haystack in a single `&[u8]` slice. This made computing start states possible but very awkward and quite unclear in terms of what the implementation would actually do with the haystack. This commit fixes that by exposing a lower level `start_state` method on both of the DFAs that can be called without materializing an `Input`. Instead, callers must create a new `start::Config` value which provides all of the information necessary for the DFA to compute the correct start state. This in turn also exposes the `crate::util::start` module. This is ultimately a breaking change because it adds a new required method to the `Automaton` trait. It also makes `start_state_forward` and `start_state_reverse` optional. It isn't really expected for callers to implement the `Automaton` trait themselves (and perhaps I will seal it so we can do such changes in the future without it being breaking), but still, this is technically breaking. Callers using `start_state_forward` or `start_state_reverse` with either DFA remain unchanged and unaffected. Closes #1031 --- regex-automata/src/dfa/automaton.rs | 188 ++++++++++++++++++--- regex-automata/src/dfa/dense.rs | 95 +++++------ regex-automata/src/dfa/mod.rs | 2 +- regex-automata/src/dfa/sparse.rs | 59 +++---- regex-automata/src/hybrid/dfa.rs | 179 +++++++++++--------- regex-automata/src/hybrid/error.rs | 115 ++++++++++++- regex-automata/src/hybrid/mod.rs | 2 +- regex-automata/src/util/mod.rs | 2 +- regex-automata/src/util/start.rs | 243 ++++++++++++++++++++++++---- 9 files changed, 655 insertions(+), 230 deletions(-) diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a15..cd597947e 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -7,6 +7,7 @@ use crate::{ prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, + start, }, }; @@ -226,8 +227,8 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this DFA for the given starting + /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -235,12 +236,41 @@ pub unsafe trait Automaton { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it may + /// be more succinct to use [`Automaton::start_state_forward`] or + /// [`Automaton::start_state_reverse`]. Note, for example, that the + /// convenience routines return a [`MatchError`] on failure where as this + /// routine returns a [`StartError`]. + /// + /// # Errors + /// + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte). + /// This can also return an error if the given configuration contains an + /// unsupported [`Anchored`] configuration. + fn start_state( + &self, + config: &start::Config, + ) -> Result; + + /// Return the ID of the start state for this DFA when executing a forward + /// search. + /// + /// This is a convenience routine for calling [`Automaton::start_state`] + /// that converts the given [`Input`] to a [start + /// configuration](start::Config). Additionally, if an error occurs, it is + /// converted from a [`StartError`] to a [`MatchError`] using the offset + /// information in the given [`Input`]. /// /// # Errors /// @@ -251,23 +281,30 @@ pub unsafe trait Automaton { fn start_state_forward( &self, input: &Input<'_>, - ) -> Result; + ) -> Result { + let config = start::Config::from_input_forward(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } - /// Return the ID of the start state for this lazy DFA when executing a - /// reverse search. + /// Return the ID of the start state for this DFA when executing a reverse + /// search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`Automaton::start_state`] + /// that converts the given [`Input`] to a [start + /// configuration](start::Config). Additionally, if an error occurs, it is + /// converted from a [`StartError`] to a [`MatchError`] using the offset + /// information in the given [`Input`]. /// /// # Errors /// @@ -278,7 +315,18 @@ pub unsafe trait Automaton { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result; + ) -> Result { + let config = start::Config::from_input_reverse(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that @@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).next_eoi_state(current) } + #[inline] + fn start_state( + &self, + config: &start::Config, + ) -> Result { + (**self).start_state(config) + } + #[inline] fn start_state_forward( &self, @@ -2015,6 +2071,90 @@ impl OverlappingState { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either based on +/// incorrect configuration or even based on whether the look-behind byte +/// triggers a quit state. Typically one does not need to handle this error +/// if you're using [`Automaton::start_state_forward`] (or its reverse +/// counterpart), as that routine automatically converts `StartError` to a +/// [`MatchError`] for you. +/// +/// This error may be returned by the [`Automaton::start_state`] routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError {} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. /// diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 6da865f97..7af38b546 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -30,7 +30,7 @@ use crate::{ use crate::{ dfa::{ accel::Accels, - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, @@ -40,8 +40,8 @@ use crate::{ int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -2885,31 +2885,33 @@ impl OwnedDFA { fn set_universal_starts(&mut self) { assert_eq!(6, Start::len(), "expected 6 start configurations"); - let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + let start_id = |dfa: &mut OwnedDFA, + anchored: Anchored, + start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. - dfa.st.start(inp, start).expect("valid Input configuration") + dfa.st.start(anchored, start).expect("valid Input configuration") }; if self.start_kind().has_unanchored() { - let inp = Input::new("").anchored(Anchored::No); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::No; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_unanchored = Some(sid); } } if self.start_kind().has_anchored() { - let inp = Input::new("").anchored(Anchored::Yes); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::Yes; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_anchored = Some(sid); } @@ -3216,35 +3218,21 @@ unsafe impl> Automaton for DFA { } #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -4180,28 +4168,27 @@ impl> StartTable { #[cfg_attr(feature = "perf-inline", inline(always))] fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; @@ -5086,6 +5073,8 @@ impl core::fmt::Display for BuildError { #[cfg(all(test, feature = "syntax", feature = "dfa-build"))] mod tests { + use crate::{Input, MatchError}; + use super::*; #[test] diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index 4bb870435..fd58cac23 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -320,7 +320,7 @@ dramatically. #[cfg(feature = "dfa-search")] pub use crate::dfa::{ - automaton::{Automaton, OverlappingState}, + automaton::{Automaton, OverlappingState, StartError}, start::StartKind, }; diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 5d8ec2340..a5ccf9add 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -52,7 +52,7 @@ use alloc::{vec, vec::Vec}; use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, @@ -63,8 +63,8 @@ use crate::{ int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -1207,35 +1207,21 @@ unsafe impl> Automaton for DFA { } #[inline] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[inline] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[inline] @@ -2145,28 +2131,27 @@ impl> StartTable { /// panics. fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 67261c1a3..102cfb6fe 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -13,7 +13,7 @@ use alloc::vec::Vec; use crate::{ hybrid::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::{LazyStateID, LazyStateIDError}, search, }, @@ -28,7 +28,7 @@ use crate::{ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, }, sparse_set::SparseSets, - start::{Start, StartByteMap}, + start::{self, Start, StartByteMap}, }, }; @@ -1518,8 +1518,8 @@ impl DFA { Lazy::new(self, cache).cache_next_state(current, unit) } - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this lazy DFA for the given + /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -1527,85 +1527,122 @@ impl DFA { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it + /// may be more succinct to use [`DFA::start_state_forward`] or + /// [`DFA::start_state_reverse`]. Note, for example, that the convenience + /// routines return a [`MatchError`] on failure where as this routine + /// returns a [`StartError`]. + /// + /// # Errors + /// + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte + /// or if the cache has become inefficient). This can also return an + /// error if the given configuration contains an unsupported [`Anchored`] + /// configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state( + &self, + cache: &mut Cache, + config: &start::Config, + ) -> Result { + let lazy = LazyRef::new(self, cache); + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.start_map.get(byte) + } + }; + let start_id = lazy.get_cached_start_id(anchored, start)?; + if !start_id.is_unknown() { + return Ok(start_id); + } + Lazy::new(self, cache).cache_start_group(anchored, start) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_forward(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.start()), + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2122,16 +2159,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2178,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2610,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2678,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index 604daf3c3..d134e7ec9 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either +/// based on incorrect configuration or even based on whether +/// the look-behind byte triggers a quit state. Typically +/// one does not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache inefficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/regex-automata/src/hybrid/mod.rs b/regex-automata/src/hybrid/mod.rs index 44e67e129..2feb839d1 100644 --- a/regex-automata/src/hybrid/mod.rs +++ b/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index bb739df1d..b3eef64e6 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -40,6 +40,7 @@ pub mod look; pub mod pool; pub mod prefilter; pub mod primitives; +pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; @@ -52,6 +53,5 @@ pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; -pub(crate) mod start; pub(crate) mod unicode_data; pub(crate) mod utf8; diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083..de62a319d 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -1,17 +1,195 @@ /*! -Provides some helpers for dealing with start state configurations in DFAs. - -[`Start`] represents the possible starting configurations, while -[`StartByteMap`] represents a way to retrieve the `Start` configuration for a -given position in a haystack. +Provides helpers for dealing with start state configurations in DFAs. */ use crate::util::{ look::LookMatcher, - search::Input, + search::{Anchored, Input}, wire::{self, DeserializeError, SerializeError}, }; +/// The configuration used to determine a DFA's start state for a search. +/// +/// A DFA has a single starting state in the typical textbook description. That +/// is, it corresponds to the set of all starting states for the NFA that built +/// it, along with their espsilon closures. In this crate, however, DFAs have +/// many possible start states due to a few factors: +/// +/// * DFAs supports the ability to run either anchored or unanchored searches. +/// Each type of search needs its own start state. For example, an unanchored +/// search requires starting at a state corresponding to the regex `(?s-u:.)*?` +/// which will match through anything. +/// * DFAs also optionally support starting an anchored search for any one +/// specific pattern. Each such pattern requires its own start state. +/// * If a look-behind assertion like `^` or `\b` is used in the regex, then +/// the DFA will need to inspect a single byte immediately before the start of +/// the search to choose the correct start state. +/// +/// Indeed, this configuration precisely encapsulates all of the above factors. +/// The [`Config::anchored`] method sets which kind of anchored search to +/// perform while the [`Config::look_behind`] method provides a way to set +/// the byte that occurs immediately before the start of the search. +/// +/// Generally speaking, this type is only useful when you want to run searches +/// without using an [`Input`](crate::Input). In particular, an `Input` wants a +/// haystack slice, but callers may not have a contiguous sequence of bytes as +/// a haystack in all cases. This type provides a lower level of control such +/// that callers can provide their own anchored configuration and look-behind +/// byte explicitly. +/// +/// # Example +/// +/// This shows basic usage that permits running a search with a DFA without +/// using the `Input` abstraction. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter() { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// This example shows how to correctly run a search that doesn't begin at +/// the start of a haystack. Notice how we set the look-behind byte, and as +/// a result, the `\b` assertion does not match. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new() +/// .anchored(Anchored::Yes) +/// .look_behind(Some(b'q')); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // No match! +/// assert!(!dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// If we had instead not set a look-behind byte, then the DFA would assume +/// that it was starting at the beginning of the haystack, and thus `\b` should +/// match. This in turn would result in erroneously reporting a match: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// // Whoops, forgot the look-behind byte... +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // And now we get a match unexpectedly. +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + look_behind: Option, + anchored: Anchored, +} + +impl Config { + /// Create a new default start configuration. + /// + /// The default is an unanchored search that starts at the beginning of the + /// haystack. + pub fn new() -> Config { + Config { anchored: Anchored::No, look_behind: None } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a forward search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// preceding the start of the search. If the start of the search is at + /// offset `0`, then no look-behind byte is set. + pub fn from_input_forward(input: &Input<'_>) -> Config { + let look_behind = input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i).copied()); + Config { look_behind, anchored: input.get_anchored() } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a reverse search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// following the end of the search. If the end of the search is at + /// offset `haystack.len()`, then no look-behind byte is set. + pub fn from_input_reverse(input: &Input<'_>) -> Config { + let look_behind = input.haystack().get(input.end()).copied(); + Config { look_behind, anchored: input.get_anchored() } + } + + /// Set the look-behind byte at the start of a search. + /// + /// Unless the search is intended to logically start at the beginning of a + /// haystack, this should _always_ be set to the byte immediately preceding + /// the start of the search. If no look-behind byte is set, then the start + /// configuration will assume it is at the beginning of the haystack. For + /// example, the anchor `^` will match. + /// + /// The default is that no look-behind byte is set. + pub fn look_behind(mut self, byte: Option) -> Config { + self.look_behind = byte; + self + } + + /// Set the anchored mode of a search. + /// + /// The default is an unanchored search. + pub fn anchored(mut self, mode: Anchored) -> Config { + self.anchored = mode; + self + } + + /// Return the look-behind byte in this configuration, if one exists. + pub fn get_look_behind(&self) -> Option { + self.look_behind + } + + /// Return the anchored mode in this configuration. + pub fn get_anchored(&self) -> Anchored { + self.anchored + } +} + /// A map from every possible byte value to its corresponding starting /// configuration. /// @@ -71,30 +249,11 @@ impl StartByteMap { StartByteMap { map } } - /// Return the forward starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0));