Skip to content

Commit

Permalink
allow starting DFA in noncontiguous bytes
Browse files Browse the repository at this point in the history
regex-automata already supports traversing the DFA one byte at a time with
`next_state`. This is potentially very useful when scanning noncontiguous data
such as a network stream or a rope data structure, as commonly used in editors.

However, starting the DFA with `start_state_forward`/`start_state_reverse`
currently requires an `Input`, and will look one byte behind/ahead of the
span boundaries. A streaming use case cannot easily provide such a haystack
(especially when using prefilters/literal optimizations); it can be worked
around with a temporary array and copying one byte over, but that is
extremely brittle/hacky.

This commit adds the `start_state_forward_with`/`start_state_reverse_with`
functions, which allow passing the information extracted from the `Input` directly.
  • Loading branch information
pascalkuthe committed Jul 9, 2023
1 parent 7c3463d commit bbf274e
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 71 deletions.
37 changes: 37 additions & 0 deletions regex-automata/src/dfa/automaton.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::{
primitives::{PatternID, StateID},
search::{Anchored, HalfMatch, Input, MatchError},
},
Span,
};

/// A trait describing the interface of a deterministic finite automaton (DFA).
Expand Down Expand Up @@ -253,6 +254,14 @@ pub unsafe trait Automaton {
input: &Input<'_>,
) -> Result<StateID, MatchError>;

/// Return the ID of the start state for a forward search, using context
/// supplied directly by the caller instead of an [`Input`].
///
/// This is the haystack-free counterpart of
/// [`Automaton::start_state_forward`]. It exists for callers that feed the
/// DFA non-contiguous data one byte at a time and therefore cannot easily
/// build an `Input` covering the byte before the search span.
///
/// * `mode` is the anchored mode of the search.
/// * `look_behind` is the byte immediately preceding `span.start`, or
///   `None` when the search begins at offset `0`.
/// * `span` is the span being searched.
///
/// # Errors
///
/// The dense and sparse DFAs return a quit error when `look_behind` is one
/// of their quit bytes. Any further error conditions depend on the
/// implementation.
fn start_state_forward_with(
&self,
mode: Anchored,
look_behind: Option<u8>,
span: Span,
) -> Result<StateID, MatchError>;

/// Return the ID of the start state for this lazy DFA when executing a
/// reverse search.
///
Expand Down Expand Up @@ -280,6 +289,14 @@ pub unsafe trait Automaton {
input: &Input<'_>,
) -> Result<StateID, MatchError>;

/// Return the ID of the start state for a reverse search, using context
/// supplied directly by the caller instead of an [`Input`].
///
/// This is the haystack-free counterpart of
/// [`Automaton::start_state_reverse`]. It exists for callers that feed the
/// DFA non-contiguous data one byte at a time and therefore cannot easily
/// build an `Input` covering the byte after the search span.
///
/// * `mode` is the anchored mode of the search.
/// * `look_ahead` is the byte located at offset `span.end`, or `None` when
///   there is no byte at that offset.
/// * `span` is the span being searched.
///
/// # Errors
///
/// The dense and sparse DFAs return a quit error when `look_ahead` is one
/// of their quit bytes. Any further error conditions depend on the
/// implementation.
fn start_state_reverse_with(
&self,
mode: Anchored,
look_ahead: Option<u8>,
span: Span,
) -> Result<StateID, MatchError>;

/// If this DFA has a universal starting state for the given anchor mode
/// and the DFA supports universal starting states, then this returns that
/// state's identifier.
Expand Down Expand Up @@ -1806,6 +1823,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
(**self).start_state_forward(input)
}

#[inline]
fn start_state_forward_with(
&self,
mode: Anchored,
look_behind: Option<u8>,
span: Span,
) -> Result<StateID, MatchError> {
(**self).start_state_forward_with(mode, look_behind, span)
}

#[inline]
fn start_state_reverse(
&self,
Expand All @@ -1814,6 +1841,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
(**self).start_state_reverse(input)
}

/// Forwards to the `start_state_reverse_with` implementation of the
/// automaton behind this reference, passing every argument through unchanged.
///
/// `look_ahead` is the byte following the end of `span` (if any); the name
/// matches the trait declaration and the dense/sparse implementations.
#[inline]
fn start_state_reverse_with(
    &self,
    mode: Anchored,
    look_ahead: Option<u8>,
    span: Span,
) -> Result<StateID, MatchError> {
    (**self).start_state_reverse_with(mode, look_ahead, span)
}

#[inline]
fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
(**self).universal_start_state(mode)
Expand Down
68 changes: 51 additions & 17 deletions regex-automata/src/dfa/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ use crate::{
start::{Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
Span,
};

/// The label that is pre-pended to a serialized DFA.
Expand Down Expand Up @@ -2883,7 +2884,9 @@ impl OwnedDFA {
let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
// This OK because we only call 'start' under conditions
// in which we know it will succeed.
dfa.st.start(inp, start).expect("valid Input configuration")
dfa.st
.start(inp.get_anchored(), start)
.expect("valid Input configuration")
};
if self.start_kind().has_unanchored() {
let inp = Input::new("").anchored(Anchored::No);
Expand Down Expand Up @@ -3214,31 +3217,63 @@ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
if !self.quitset.is_empty() && input.start() > 0 {
let offset = input.start() - 1;
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
self.start_state_forward_with(
input.get_anchored(),
input.start().checked_sub(1).map(|i| input.haystack()[i]),
input.get_span(),
)
}
/// Computes the forward-search start state from caller-supplied context
/// rather than from an `Input`.
///
/// `mode` is the anchored mode, `look_behind` is the byte immediately
/// before `span.start` (`None` when the span starts at offset 0), and
/// `span` is the search span; only `span.start` is read here.
///
/// Returns a quit error positioned at `span.start - 1` when `look_behind`
/// is a byte in this DFA's quit set. Otherwise returns the result of the
/// start table lookup for `mode` and the start configuration derived from
/// `look_behind`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward_with(
&self,
mode: Anchored,
look_behind: Option<u8>,
span: Span,
) -> Result<StateID, MatchError> {
// Debug-only sanity check: a look-behind byte must be present exactly
// when the span does not begin at offset 0.
debug_assert_eq!(
span.start != 0,
look_behind.is_some(),
"look_behind should be provided if and only if the DFA starts at an offset"
);
// The look-behind byte is only inspected when a quit set is configured.
if !self.quitset.is_empty() {
if let Some(byte) = look_behind {
if self.quitset.contains(byte) {
// `span.start - 1` is the offset of the look-behind byte.
// NOTE(review): this subtraction relies on the invariant asserted
// above, which is not checked in release builds.
return Err(MatchError::quit(byte, span.start - 1));
}
}
}
// NOTE(review): the next two lines reference `input`, which is not a
// parameter of this function. They look like pre-change lines kept by the
// diff rendering and superseded by the two lines after them; confirm.
let start = self.st.start_map.fwd(&input);
self.st.start(input, start)
let start = self.st.start_map.fwd_with(look_behind);
self.st.start(mode, start)
}

/// Computes the reverse-search start state for `input`.
///
/// The delegating form extracts the anchored mode, the byte at
/// `input.end()` (if the haystack has one) and the span from `input`, and
/// passes them to `start_state_reverse_with`.
///
/// NOTE(review): this hunk is rendered without +/- markers, so the inline
/// quit-set check and the delegating call are both shown. The inline check
/// appears to be the pre-change body that the delegation replaces; confirm
/// against the actual file.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse(
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
// Inline quit-byte check on the byte just past the end of the span.
if !self.quitset.is_empty() && input.end() < input.haystack().len() {
let offset = input.end();
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
// Delegation: `get(..).copied()` yields `None` when `input.end()` is not
// a valid index into the haystack.
self.start_state_reverse_with(
input.get_anchored(),
input.haystack().get(input.end()).copied(),
input.get_span(),
)
}

/// Computes the reverse-search start state from caller-supplied context
/// rather than from an `Input`.
///
/// `mode` is the anchored mode, `look_ahead` is the byte at offset
/// `span.end` (`None` when there is no such byte), and `span` is the search
/// span; only `span.end` is read here.
///
/// Returns a quit error positioned at `span.end` when `look_ahead` is a
/// byte in this DFA's quit set. Otherwise returns the result of the start
/// table lookup for `mode` and the start configuration derived from
/// `look_ahead`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse_with(
&self,
mode: Anchored,
look_ahead: Option<u8>,
span: Span,
) -> Result<StateID, MatchError> {
// The look-ahead byte is only inspected when a quit set is configured.
if !self.quitset.is_empty() {
if let Some(byte) = look_ahead {
if self.quitset.contains(byte) {
// The look-ahead byte sits at offset `span.end`.
return Err(MatchError::quit(byte, span.end));
}
}
}
// NOTE(review): the next two lines reference `input`, which is not a
// parameter of this function. They look like pre-change lines kept by the
// diff rendering and superseded by the two lines after them; confirm.
let start = self.st.start_map.rev(&input);
self.st.start(input, start)
let start = self.st.start_map.rev_with(look_ahead);
self.st.start(mode, start)
}

#[cfg_attr(feature = "perf-inline", inline(always))]
Expand Down Expand Up @@ -4174,11 +4209,10 @@ impl<T: AsRef<[u32]>> StartTable<T> {
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start(
&self,
input: &Input<'_>,
mode: Anchored,
start: Start,
) -> Result<StateID, MatchError> {
let start_index = start.as_usize();
let mode = input.get_anchored();
let index = match mode {
Anchored::No => {
if !self.kind.has_unanchored() {
Expand Down
69 changes: 51 additions & 18 deletions regex-automata/src/dfa/sparse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ use crate::{
start::{Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
Span,
};

const LABEL: &str = "rust-regex-automata-dfa-sparse";
Expand Down Expand Up @@ -1206,36 +1207,69 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
self.flags.is_always_start_anchored
}

/// Computes the forward-search start state for `input`.
///
/// The delegating form extracts the anchored mode, the byte before
/// `input.start()` (if the start is not offset 0) and the span from
/// `input`, and passes them to `start_state_forward_with`.
///
/// NOTE(review): this hunk is rendered without +/- markers, so two inline
/// attributes, the inline quit-set check and the delegating call are all
/// shown. The bare `#[inline]` and the inline check appear to be pre-change
/// lines; confirm against the actual file.
#[inline]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward(
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
// Inline quit-byte check on the byte just before the start of the span.
if !self.quitset.is_empty() && input.start() > 0 {
let offset = input.start() - 1;
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
// Delegation: `checked_sub(1)` yields `None` when the search starts at
// offset 0, so no look-behind byte is read in that case.
self.start_state_forward_with(
input.get_anchored(),
input.start().checked_sub(1).map(|i| input.haystack()[i]),
input.get_span(),
)
}

/// Computes the forward-search start state from caller-supplied context
/// rather than from an `Input`.
///
/// `mode` is the anchored mode, `look_behind` is the byte immediately
/// before `span.start` (`None` when the span starts at offset 0), and
/// `span` is the search span; only `span.start` is read here.
///
/// Returns a quit error positioned at `span.start - 1` when `look_behind`
/// is a byte in this DFA's quit set. Otherwise returns the result of the
/// start table lookup for `mode` and the start configuration derived from
/// `look_behind`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward_with(
&self,
mode: Anchored,
look_behind: Option<u8>,
span: Span,
) -> Result<StateID, MatchError> {
// Debug-only sanity check: a look-behind byte must be present exactly
// when the span does not begin at offset 0.
debug_assert_eq!(
span.start != 0,
look_behind.is_some(),
"look_behind should be provided if and only if the DFA starts at an offset"
);
// The look-behind byte is only inspected when a quit set is configured.
if !self.quitset.is_empty() {
if let Some(byte) = look_behind {
if self.quitset.contains(byte) {
// `span.start - 1` is the offset of the look-behind byte.
// NOTE(review): this subtraction relies on the invariant asserted
// above, which is not checked in release builds.
return Err(MatchError::quit(byte, span.start - 1));
}
}
}
// NOTE(review): the next two lines reference `input`, which is not a
// parameter of this function. They look like pre-change lines kept by the
// diff rendering and superseded by the two lines after them; confirm.
let start = self.st.start_map.fwd(&input);
self.st.start(input, start)
let start = self.st.start_map.fwd_with(look_behind);
self.st.start(mode, start)
}

/// Computes the reverse-search start state for `input`.
///
/// The delegating form extracts the anchored mode, the byte at
/// `input.end()` (if the haystack has one) and the span from `input`, and
/// passes them to `start_state_reverse_with`.
///
/// NOTE(review): this hunk is rendered without +/- markers, so two inline
/// attributes, the inline quit-set check and the delegating call are all
/// shown. The bare `#[inline]` and the inline check appear to be pre-change
/// lines; confirm against the actual file.
#[inline]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse(
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
// Inline quit-byte check on the byte just past the end of the span.
if !self.quitset.is_empty() && input.end() < input.haystack().len() {
let offset = input.end();
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
// Delegation: `get(..).copied()` yields `None` when `input.end()` is not
// a valid index into the haystack.
self.start_state_reverse_with(
input.get_anchored(),
input.haystack().get(input.end()).copied(),
input.get_span(),
)
}

/// Computes the reverse-search start state from caller-supplied context
/// rather than from an `Input`.
///
/// `mode` is the anchored mode, `look_ahead` is the byte at offset
/// `span.end` (`None` when there is no such byte), and `span` is the search
/// span; only `span.end` is read here.
///
/// Returns a quit error positioned at `span.end` when `look_ahead` is a
/// byte in this DFA's quit set. Otherwise returns the result of the start
/// table lookup for `mode` and the start configuration derived from
/// `look_ahead`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse_with(
&self,
mode: Anchored,
look_ahead: Option<u8>,
span: Span,
) -> Result<StateID, MatchError> {
// The look-ahead byte is only inspected when a quit set is configured.
if !self.quitset.is_empty() {
if let Some(byte) = look_ahead {
if self.quitset.contains(byte) {
// The look-ahead byte sits at offset `span.end`.
return Err(MatchError::quit(byte, span.end));
}
}
}
// NOTE(review): the next two lines reference `input`, which is not a
// parameter of this function. They look like pre-change lines kept by the
// diff rendering and superseded by the two lines after them; confirm.
let start = self.st.start_map.rev(&input);
self.st.start(input, start)
let start = self.st.start_map.rev_with(look_ahead);
self.st.start(mode, start)
}

#[inline]
Expand Down Expand Up @@ -2145,11 +2179,10 @@ impl<T: AsRef<[u8]>> StartTable<T> {
/// panics.
fn start(
&self,
input: &Input<'_>,
mode: Anchored,
start: Start,
) -> Result<StateID, MatchError> {
let start_index = start.as_usize();
let mode = input.get_anchored();
let index = match mode {
Anchored::No => {
if !self.kind.has_unanchored() {
Expand Down
Loading

0 comments on commit bbf274e

Please sign in to comment.