Skip to content

Commit

Permalink
Add Progress instruction to avoid infinite loops in repeated empty ex…
Browse files Browse the repository at this point in the history
…pressions

With certain repeated empty expressions similar to (x*)*?, the backtracker can
go into an infinite loop. This change adds the Progress instruction which
requires the engine to make progress to continue matching a repeated
subexpression.

Fixes rust-lang#375
  • Loading branch information
adamcrume committed Jun 19, 2017
1 parent 2f18730 commit 6d23735
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 6 deletions.
33 changes: 33 additions & 0 deletions src/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ pub struct Bounded<'a, 'm, 'r, 's, I> {
input: I,
matches: &'m mut [bool],
slots: &'s mut [Slot],
/// Stores input positions for the Progress instruction.
progress_slots: Vec<Slot>,
m: &'a mut Cache,
}

Expand Down Expand Up @@ -84,6 +86,8 @@ impl Cache {
enum Job {
Inst { ip: InstPtr, at: InputAt },
SaveRestore { slot: usize, old_pos: Option<usize> },
/// Restores a saved progress_slot.
SaveRestoreProgress { slot: usize, old_pos: Option<usize> },
}

impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
Expand All @@ -102,11 +106,16 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
let mut cache = cache.borrow_mut();
let mut cache = &mut cache.backtrack;
let start = input.at(start);
let mut progress_slots = Vec::new();
for _ in 0..prog.num_progress_slots {
progress_slots.push(None);
}
let mut b = Bounded {
prog: prog,
input: input,
matches: matches,
slots: slots,
progress_slots: progress_slots,
m: cache,
};
b.exec_(start)
Expand Down Expand Up @@ -204,6 +213,11 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
self.slots[slot] = old_pos;
}
}
Job::SaveRestoreProgress { slot, old_pos } => {
if slot < self.slots.len() {
self.progress_slots[slot] = old_pos;
}
}
}
}
matched
Expand Down Expand Up @@ -274,6 +288,25 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
}
return false;
}
Progress(ref inst) => {
if let Some(&old_pos) = self.progress_slots.get(inst.slot) {
if let Some(p) = old_pos {
if p >= at.pos() {
return false;
}
}
// If this path doesn't work out, then we save the old
// progress index (if one exists) in an alternate
// job. If the next path fails, then the alternate
// job is popped and the old progress index is restored.
self.m.jobs.push(Job::SaveRestoreProgress {
slot: inst.slot,
old_pos: old_pos,
});
self.progress_slots[inst.slot] = Some(at.pos());
}
ip = inst.goto;
}
}
if self.has_visited(ip, at) {
return false;
Expand Down
49 changes: 46 additions & 3 deletions src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
use prog::{
Program, Inst, InstPtr, EmptyLook,
InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
InstProgress,
};

use Error;
Expand All @@ -45,6 +46,8 @@ pub struct Compiler {
suffix_cache: SuffixCache,
utf8_seqs: Option<Utf8Sequences>,
byte_classes: ByteClassSet,
/// Number of progress slots used for the Progress instruction.
progress_count: usize,
}

impl Compiler {
Expand All @@ -61,6 +64,7 @@ impl Compiler {
suffix_cache: SuffixCache::new(1000),
utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
byte_classes: ByteClassSet::new(),
progress_count: 0,
}
}

Expand Down Expand Up @@ -535,13 +539,17 @@ impl Compiler {
) -> Result {
let split_entry = self.insts.len();
let split = self.push_split_hole();
let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
let begin = self.insts.len();
if !greedy && matches_empty(expr) {
self.push_progress();
}
let Patch { hole: hole_rep, entry: _ } = try!(self.c(expr));

self.fill(hole_rep, split_entry);
let split_hole = if greedy {
self.fill_split(split, Some(entry_rep), None)
self.fill_split(split, Some(begin), None)
} else {
self.fill_split(split, None, Some(entry_rep))
self.fill_split(split, None, Some(begin))
};
Ok(Patch { hole: split_hole, entry: split_entry })
}
Expand All @@ -553,6 +561,9 @@ impl Compiler {
) -> Result {
let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
self.fill_to_next(hole_rep);
if !greedy && matches_empty(expr) {
self.push_progress();
}
let split = self.push_split_hole();

let split_hole = if greedy {
Expand Down Expand Up @@ -702,6 +713,14 @@ impl Compiler {
Hole::One(hole)
}

/// Pushes a Progress instruction onto the program.
fn push_progress(&mut self) {
let slot = self.progress_count;
self.progress_count += 1;
let next = self.insts.len() + 1;
self.push_compiled(Inst::Progress(InstProgress{goto: next, slot: slot}));
}

fn check_size(&self) -> result::Result<(), Error> {
use std::mem::size_of;

Expand Down Expand Up @@ -1059,6 +1078,30 @@ fn u32_to_usize(n: u32) -> usize {
n as usize
}

/// Returns true if and only if this expression can match without consuming
/// characters or bytes.
fn matches_empty(expr: &Expr) -> bool {
match expr {
&Expr::Empty | &Expr::StartLine | &Expr::EndLine | &Expr::StartText
| &Expr::EndText | &Expr::WordBoundary | &Expr::NotWordBoundary
| &Expr::WordBoundaryAscii | &Expr::NotWordBoundaryAscii => true,
&Expr::AnyChar | &Expr::AnyCharNoNL | &Expr::AnyByte
| &Expr::AnyByteNoNL | &Expr::Class(_) | &Expr::ClassBytes(_)
=> false,
&Expr::Literal { ref chars, casei: _ } => chars.len() == 0,
&Expr::LiteralBytes { ref bytes, casei: _ } => bytes.len() == 0,
&Expr::Group { ref e, i: _, name: _ } => matches_empty(&e),
&Expr::Repeat { ref e, r, greedy: _ } => (match r {
Repeater::ZeroOrOne => true,
Repeater::ZeroOrMore => true,
Repeater::OneOrMore => false,
Repeater::Range { min, .. } => min == 0,
} || matches_empty(&e)),
&Expr::Concat(ref v) => v.iter().all(|e| matches_empty(e)),
&Expr::Alternate(ref v) => v.iter().any(|e| matches_empty(e)),
}
}

#[cfg(test)]
mod tests {
use super::ByteClassSet;
Expand Down
6 changes: 4 additions & 2 deletions src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ pub fn can_exec(insts: &Program) -> bool {
for inst in insts {
match *inst {
Char(_) | Ranges(_) => return false,
EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) | Progress(_) => {}
}
}
true
Expand Down Expand Up @@ -980,7 +980,7 @@ impl<'a> Fsm<'a> {
// These states never happen in a byte-based program.
Char(_) | Ranges(_) => unreachable!(),
// These states are handled when following epsilon transitions.
Save(_) | Split(_) | EmptyLook(_) => {}
Save(_) | Split(_) | EmptyLook(_) | Progress(_) => {}
Match(_) => {
state_flags.set_match();
if !self.continue_past_first_match() {
Expand Down Expand Up @@ -1125,6 +1125,7 @@ impl<'a> Fsm<'a> {
}
}
Save(ref inst) => self.cache.stack.push(inst.goto as InstPtr),
Progress(ref inst) => self.cache.stack.push(inst.goto as InstPtr),
Split(ref inst) => {
self.cache.stack.push(inst.goto2 as InstPtr);
self.cache.stack.push(inst.goto1 as InstPtr);
Expand Down Expand Up @@ -1217,6 +1218,7 @@ impl<'a> Fsm<'a> {
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) | Split(_) => {}
Progress(_) => {} // TODO: Verify that the DFA doesn't need Progress
Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
EmptyLook(_) => {
state_flags.set_empty();
Expand Down
3 changes: 2 additions & 1 deletion src/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ impl<'r, I: Input> Fsm<'r, I> {
}
false
}
EmptyLook(_) | Save(_) | Split(_) => false,
EmptyLook(_) | Save(_) | Split(_) | Progress(_) => false,
}
}

Expand Down Expand Up @@ -347,6 +347,7 @@ impl<'r, I: Input> Fsm<'r, I> {
}
return;
}
Progress(_) => unreachable!(), // TODO
}
}
}
Expand Down
23 changes: 23 additions & 0 deletions src/prog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ pub struct Program {
/// simultaneously, then the DFA cache is not shared. Instead, copies are
/// made.
pub dfa_size_limit: usize,
/// The number of progress slots.
pub num_progress_slots: usize,
}

impl Program {
Expand All @@ -94,6 +96,7 @@ impl Program {
has_unicode_word_boundary: false,
prefixes: LiteralSearcher::empty(),
dfa_size_limit: 2 * (1<<20),
num_progress_slots: 0,
}
}

Expand Down Expand Up @@ -224,6 +227,10 @@ impl fmt::Debug for Program {
try!(write!(f, "{:04} {}",
pc, with_goto(pc, inst.goto, s)));
}
Progress(ref inst) => {
let s = format!("{:04} Progress({})", pc, inst.slot);
try!(write!(f, "{}", with_goto(pc, inst.goto, s)));
}
}
if pc == self.start {
try!(write!(f, " (start)"));
Expand Down Expand Up @@ -286,6 +293,12 @@ pub enum Inst {
/// used in conjunction with Split instructions to implement multi-byte
/// character classes.
Bytes(InstBytes),
/// Progress marks a location that a program is required to make progress.
/// If the same progress instruction is encountered multiple times, each
/// encounter only matches if more of the input has been matched than at the
/// previous encounter. This is useful for preventing infinite loops in
/// regexes like `(x*)*`.
Progress(InstProgress),
}

impl Inst {
Expand Down Expand Up @@ -423,3 +436,13 @@ impl InstBytes {
self.start <= byte && byte <= self.end
}
}

/// Representation of the Progress instruction.
#[derive(Clone, Debug)]
pub struct InstProgress {
/// The next location to execute in the program if this instruction
/// succeeds.
pub goto: InstPtr,
/// The progress slot to store the minimum index of the next input to read.
pub slot: usize,
}
14 changes: 14 additions & 0 deletions tests/crazy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));

// Test that repeated empty expressions don't loop forever.
mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2)));
mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2)));
mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2)));
mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2)));
mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2)));
mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2)));
mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2)));
mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2)));
mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2)));
mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2)));
mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2)));
mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2)));

// Test that the DFA can handle pathological cases.
// (This should result in the DFA's cache being flushed too frequently, which
// should cause it to quit and fall back to the NFA algorithm.)
Expand Down

0 comments on commit 6d23735

Please sign in to comment.