diff --git a/src/backtrack.rs b/src/backtrack.rs index 78bc39e93e..e734542c9d 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -56,6 +56,8 @@ pub struct Bounded<'a, 'm, 'r, 's, I> { input: I, matches: &'m mut [bool], slots: &'s mut [Slot], + /// Stores input positions for the Progress instruction. + progress_slots: Vec, m: &'a mut Cache, } @@ -84,6 +86,8 @@ impl Cache { enum Job { Inst { ip: InstPtr, at: InputAt }, SaveRestore { slot: usize, old_pos: Option }, + /// Restores a saved progress_slot. + SaveRestoreProgress { slot: usize, old_pos: Option }, } impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { @@ -102,11 +106,16 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { let mut cache = cache.borrow_mut(); let mut cache = &mut cache.backtrack; let start = input.at(start); + let mut progress_slots = Vec::new(); + for _ in 0..prog.num_progress_slots { + progress_slots.push(None); + } let mut b = Bounded { prog: prog, input: input, matches: matches, slots: slots, + progress_slots: progress_slots, m: cache, }; b.exec_(start) @@ -204,6 +213,11 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { self.slots[slot] = old_pos; } } + Job::SaveRestoreProgress { slot, old_pos } => { + if slot < self.slots.len() { + self.progress_slots[slot] = old_pos; + } + } } } matched @@ -274,6 +288,25 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { } return false; } + Progress(ref inst) => { + if let Some(&old_pos) = self.progress_slots.get(inst.slot) { + if let Some(p) = old_pos { + if p >= at.pos() { + return false; + } + } + // If this path doesn't work out, then we save the old + // progress index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old progress index is restored. + self.m.jobs.push(Job::SaveRestoreProgress { + slot: inst.slot, + old_pos: old_pos, + }); + self.progress_slots[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } } if self.has_visited(ip, at) { return false; diff --git a/src/compile.rs b/src/compile.rs index e4cedc3558..d932be67e0 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -22,6 +22,7 @@ use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences}; use prog::{ Program, Inst, InstPtr, EmptyLook, InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes, + InstProgress, }; use Error; @@ -45,6 +46,8 @@ pub struct Compiler { suffix_cache: SuffixCache, utf8_seqs: Option, byte_classes: ByteClassSet, + /// Number of progress slots used for the Progress instruction. + progress_count: usize, } impl Compiler { @@ -61,6 +64,7 @@ impl Compiler { suffix_cache: SuffixCache::new(1000), utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), byte_classes: ByteClassSet::new(), + progress_count: 0, } } @@ -535,13 +539,17 @@ impl Compiler { ) -> Result { let split_entry = self.insts.len(); let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr)); + let begin = self.insts.len(); + if !greedy && matches_empty(expr) { + self.push_progress(); + } + let Patch { hole: hole_rep, entry: _ } = try!(self.c(expr)); self.fill(hole_rep, split_entry); let split_hole = if greedy { - self.fill_split(split, Some(entry_rep), None) + self.fill_split(split, Some(begin), None) } else { - self.fill_split(split, None, Some(entry_rep)) + self.fill_split(split, None, Some(begin)) }; Ok(Patch { hole: split_hole, entry: split_entry }) } @@ -553,6 +561,9 @@ impl Compiler { ) -> Result { let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr)); self.fill_to_next(hole_rep); + if !greedy && matches_empty(expr) { + self.push_progress(); + } let split = self.push_split_hole(); let split_hole = if greedy { @@ -702,6 +713,14 @@ impl Compiler { Hole::One(hole) } + /// Pushes a Progress instruction onto the program. + fn push_progress(&mut self) { + let slot = self.progress_count; + self.progress_count += 1; + let next = self.insts.len() + 1; + self.push_compiled(Inst::Progress(InstProgress{goto: next, slot: slot})); + } + fn check_size(&self) -> result::Result<(), Error> { use std::mem::size_of; @@ -1059,6 +1078,30 @@ fn u32_to_usize(n: u32) -> usize { n as usize } +/// Returns true if and only if this expression can match without consuming +/// characters or bytes. +fn matches_empty(expr: &Expr) -> bool { + match expr { + &Expr::Empty | &Expr::StartLine | &Expr::EndLine | &Expr::StartText + | &Expr::EndText | &Expr::WordBoundary | &Expr::NotWordBoundary + | &Expr::WordBoundaryAscii | &Expr::NotWordBoundaryAscii => true, + &Expr::AnyChar | &Expr::AnyCharNoNL | &Expr::AnyByte + | &Expr::AnyByteNoNL | &Expr::Class(_) | &Expr::ClassBytes(_) + => false, + &Expr::Literal { ref chars, casei: _ } => chars.len() == 0, + &Expr::LiteralBytes { ref bytes, casei: _ } => bytes.len() == 0, + &Expr::Group { ref e, i: _, name: _ } => matches_empty(&e), + &Expr::Repeat { ref e, r, greedy: _ } => (match r { + Repeater::ZeroOrOne => true, + Repeater::ZeroOrMore => true, + Repeater::OneOrMore => false, + Repeater::Range { min, .. } => min == 0, + } || matches_empty(&e)), + &Expr::Concat(ref v) => v.iter().all(|e| matches_empty(e)), + &Expr::Alternate(ref v) => v.iter().any(|e| matches_empty(e)), + } +} + #[cfg(test)] mod tests { use super::ByteClassSet; diff --git a/src/dfa.rs b/src/dfa.rs index 7d3142c7a3..e8ffedf20a 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -77,7 +77,7 @@ pub fn can_exec(insts: &Program) -> bool { for inst in insts { match *inst { Char(_) | Ranges(_) => return false, - EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {} + EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) | Progress(_) => {} } } true @@ -980,7 +980,7 @@ impl<'a> Fsm<'a> { // These states never happen in a byte-based program. Char(_) | Ranges(_) => unreachable!(), // These states are handled when following epsilon transitions. - Save(_) | Split(_) | EmptyLook(_) => {} + Save(_) | Split(_) | EmptyLook(_) | Progress(_) => {} Match(_) => { state_flags.set_match(); if !self.continue_past_first_match() { @@ -1125,6 +1125,7 @@ impl<'a> Fsm<'a> { } } Save(ref inst) => self.cache.stack.push(inst.goto as InstPtr), + Progress(ref inst) => self.cache.stack.push(inst.goto as InstPtr), Split(ref inst) => { self.cache.stack.push(inst.goto2 as InstPtr); self.cache.stack.push(inst.goto1 as InstPtr); @@ -1217,6 +1218,7 @@ impl<'a> Fsm<'a> { match self.prog[ip as usize] { Char(_) | Ranges(_) => unreachable!(), Save(_) | Split(_) => {} + Progress(_) => {} // TODO: Verify that the DFA doesn't need Progress Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip), EmptyLook(_) => { state_flags.set_empty(); diff --git a/src/pikevm.rs b/src/pikevm.rs index b96f0e7588..874f199836 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -275,7 +275,7 @@ impl<'r, I: Input> Fsm<'r, I> { } false } - EmptyLook(_) | Save(_) | Split(_) => false, + EmptyLook(_) | Save(_) | Split(_) | Progress(_) => false, } } @@ -347,6 +347,7 @@ impl<'r, I: Input> Fsm<'r, I> { } return; } + Progress(_) => unreachable!(), // TODO } } } diff --git a/src/prog.rs b/src/prog.rs index 6ae49f6a12..cf41a4f0fe 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -72,6 +72,8 @@ pub struct Program { /// simultaneously, then the DFA cache is not shared. Instead, copies are /// made. pub dfa_size_limit: usize, + /// The number of progress slots. + pub num_progress_slots: usize, } impl Program { @@ -94,6 +96,7 @@ impl Program { has_unicode_word_boundary: false, prefixes: LiteralSearcher::empty(), dfa_size_limit: 2 * (1<<20), + num_progress_slots: 0, } } @@ -224,6 +227,10 @@ impl fmt::Debug for Program { try!(write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))); } + Progress(ref inst) => { + let s = format!("{:04} Progress({})", pc, inst.slot); + try!(write!(f, "{}", with_goto(pc, inst.goto, s))); + } } if pc == self.start { try!(write!(f, " (start)")); @@ -286,6 +293,12 @@ pub enum Inst { /// used in conjunction with Split instructions to implement multi-byte /// character classes. Bytes(InstBytes), + /// Progress marks a location that a program is required to make progress. + /// If the same progress instruction is encountered multiple times, each + /// encounter only matches if more of the input has been matched than at the + /// previous encounter. This is useful for preventing infinite loops in + /// regexes like `(x*)*`. + Progress(InstProgress), } impl Inst { @@ -423,3 +436,13 @@ impl InstBytes { self.start <= byte && byte <= self.end } } + +/// Representation of the Progress instruction. +#[derive(Clone, Debug)] +pub struct InstProgress { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The progress slot to store the minimum index of the next input to read. + pub slot: usize, +} diff --git a/tests/crazy.rs b/tests/crazy.rs index ade839ade1..c7c5723084 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -47,6 +47,20 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3))); mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3))); mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2))); +// Test that repeated empty expressions don't loop forever. +mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2))); +mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2))); +mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2))); +mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2))); +mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2))); +mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2))); +mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2))); +mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2))); +mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2))); +mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2))); +mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2))); +mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2))); + // Test that the DFA can handle pathological cases. // (This should result in the DFA's cache being flushed too frequently, which // should cause it to quit and fall back to the NFA algorithm.)