From 3feff9e10e028eed26336ff1934d6b89fc6c74e9 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Mon, 9 Oct 2023 14:55:59 -0400
Subject: [PATCH] automata: improve sparse DFA validation

This rejiggers some code so that we can more reliably check whether
start state IDs are valid or not.

Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62726
---
 ...ta_deserialize_sparse_dfa-4903112680538112 | Bin 0 -> 953 bytes
 regex-automata/src/dfa/sparse.rs              | 159 +++++++++---------
 2 files changed, 83 insertions(+), 76 deletions(-)
 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112
diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112
new file mode 100644
index 0000000000000000000000000000000000000000..3056bca2f335559837ff22c307040e7d200693b5
GIT binary patch
literal 953
zcmcgrzb^z)5dPkCdx%D_SrUavRGOo46+(5LlBjgl;_fdHC6V|8G}<dQ{0AZtiBh9b
z_!(};H}B1E_GINclgxfI-@KWfnYTN6c-B2%Szh1U-dYop_tS|69)YDN-~xq+m*XLN
z4tL7cH-@uBjkZYkrUs^|UXPUI!0i1J=Sy}pX9+DA%AtnQNcISwXoly=FQ3sghTUaA
z7rg<`8K7xLM7~7E#sN;fDPs9v7D)%u?i1`H>v1iCd3X&#o?ZHHI$Q{9vE`2)E1aXc
z4dV*ibs}S9a$sF5@Ts&8Nm;m!h_LYvQgx@O&0-~*MqvXc*1mdu8%`|_4(ibjUcp(+
zUaNf^t&($zmCZ!j&`hg&V+sEZeFMdJm&UDZ#tk}9nN`%b_4cnaUQ`}Jy`sc(1@{;A
cVu5+$3H6G4Prat{2Q-<>jd&&e<ZLqk1EykL>Hq)$

literal 0
HcmV?d00001

diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs
index 7862d48a2..38096d994 100644
--- a/regex-automata/src/dfa/sparse.rs
+++ b/regex-automata/src/dfa/sparse.rs
@@ -992,8 +992,8 @@ impl<'a> DFA<&'a [u8]> {
         // (by trying to decode every state) and start state ID list below. If
         // either validation fails, then we return an error.
         let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
-        dfa.tt.validate(&dfa.special)?;
-        dfa.st.validate(&dfa.special, &dfa.tt)?;
+        let seen = dfa.tt.validate(&dfa.special)?;
+        dfa.st.validate(&dfa.special, &seen)?;
         // N.B. dfa.special doesn't have a way to do unchecked deserialization,
         // so it has already been validated.
         Ok((dfa, nread))
@@ -1388,63 +1388,8 @@ impl<T: AsRef<[u8]>> Transitions<T> {
     ///
     /// That is, every state ID can be used to correctly index a state in this
     /// table.
-    fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
-        // In order to validate everything, we not only need to make sure we
-        // can decode every state, but that every transition in every state
-        // points to a valid state. There are many duplicative transitions, so
-        // we record state IDs that we've verified so that we don't redo the
-        // decoding work.
-        //
-        // Except, when in no_std mode, we don't have dynamic memory allocation
-        // available to us, so we skip this optimization. It's not clear
-        // whether doing something more clever is worth it just yet. If you're
-        // profiling this code and need it to run faster, please file an issue.
-        //
-        // OK, so we also use this to record the set of valid state IDs. Since
-        // it is possible for a transition to point to an invalid state ID that
-        // still (somehow) deserializes to a valid state. So we need to make
-        // sure our transitions are limited to actually correct state IDs.
-        // The problem is, I'm not sure how to do this verification step in
-        // no-std no-alloc mode. I think we'd *have* to store the set of valid
-        // state IDs in the DFA itself. For now, we don't do this verification
-        // in no-std no-alloc mode. The worst thing that can happen is an
-        // incorrect result. But no panics or memory safety problems should
-        // result. Because we still do validate that the state itself is
-        // "valid" in the sense that everything it points to actually exists.
-        //
-        // ---AG
-        struct Seen {
-            #[cfg(feature = "alloc")]
-            set: alloc::collections::BTreeSet<StateID>,
-            #[cfg(not(feature = "alloc"))]
-            set: core::marker::PhantomData<StateID>,
-        }
-
-        #[cfg(feature = "alloc")]
-        impl Seen {
-            fn new() -> Seen {
-                Seen { set: alloc::collections::BTreeSet::new() }
-            }
-            fn insert(&mut self, id: StateID) {
-                self.set.insert(id);
-            }
-            fn contains(&self, id: &StateID) -> bool {
-                self.set.contains(id)
-            }
-        }
-
-        #[cfg(not(feature = "alloc"))]
-        impl Seen {
-            fn new() -> Seen {
-                Seen { set: core::marker::PhantomData }
-            }
-            fn insert(&mut self, _id: StateID) {}
-            fn contains(&self, _id: &StateID) -> bool {
-                false
-            }
-        }
-
-        let mut verified: Seen = Seen::new();
+    fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
+        let mut verified = Seen::new();
         // We need to make sure that we decode the correct number of states.
         // Otherwise, an empty set of transitions would validate even if the
         // recorded state length is non-empty.
@@ -1521,7 +1466,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
                 "mismatching sparse state length",
             ));
         }
-        Ok(())
+        Ok(verified)
     }
 
     /// Converts these transitions to a borrowed value.
@@ -1659,7 +1604,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
             let state = &state[nr..];
             if npats == 0 {
                 return Err(DeserializeError::generic(
-                    "state marked as a match, but has no pattern IDs",
+                    "state marked as a match, but pattern length is zero",
                 ));
             }
 
@@ -1681,6 +1626,21 @@ impl<T: AsRef<[u8]>> Transitions<T> {
         } else {
             (&[][..], state)
         };
+        if is_match && pattern_ids.is_empty() {
+            return Err(DeserializeError::generic(
+                "state marked as a match, but has no pattern IDs",
+            ));
+        }
+        if sp.is_match_state(id) && pattern_ids.is_empty() {
+            return Err(DeserializeError::generic(
+                "state marked special as a match, but has no pattern IDs",
+            ));
+        }
+        if sp.is_match_state(id) != is_match {
+            return Err(DeserializeError::generic(
+                "whether state is a match or not is inconsistent",
+            ));
+        }
 
         // Now read this state's accelerator info. The first byte is the length
         // of the accelerator, which is typically 0 (for no acceleration) but
@@ -2061,28 +2021,19 @@ impl<T: AsRef<[u8]>> StartTable<T> {
     fn validate(
         &self,
         sp: &Special,
-        trans: &Transitions<T>,
+        seen: &Seen,
     ) -> Result<(), DeserializeError> {
         for (id, _, _) in self.iter() {
+            if !seen.contains(&id) {
+                return Err(DeserializeError::generic(
+                    "found invalid start state ID",
+                ));
+            }
             if sp.is_match_state(id) {
                 return Err(DeserializeError::generic(
                     "start states cannot be match states",
                 ));
             }
-            // Confirm that the start state points to a valid state.
-            let state = trans.try_state(sp, id)?;
-            // And like for the transition table, confirm that the transitions
-            // on all start states themselves point to a valid state.
-            //
-            // It'd probably be better to integrate this validation with the
-            // transition table, or otherwise store a sorted sequence of all
-            // valid state IDs in the sparse DFA itself. That way, we could
-            // check that every pointer to a state corresponds precisely to a
-            // correct and valid state.
-            for i in 0..state.ntrans {
-                let to = state.next_at(i);
-                let _ = trans.try_state(sp, to)?;
-            }
         }
         Ok(())
     }
@@ -2537,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> {
     }
 }
 
+// In order to validate everything, we not only need to make sure we
+// can decode every state, but that every transition in every state
+// points to a valid state. There are many duplicative transitions, so
+// we record state IDs that we've verified so that we don't redo the
+// decoding work.
+//
+// Except, when in no-std no-alloc mode, we don't have dynamic memory
+// allocation available to us, so we skip this optimization. It's not
+// clear whether doing something more clever is worth it just yet. If
+// you're profiling this code and need it to run faster, please file an issue.
+//
+// OK, so we also use this to record the set of valid state IDs, since
+// it is possible for a transition to point to an invalid state ID that
+// still (somehow) deserializes to a valid state. So we need to make
+// sure our transitions are limited to actually correct state IDs.
+// The problem is, I'm not sure how to do this verification step in
+// no-std no-alloc mode. I think we'd *have* to store the set of valid
+// state IDs in the DFA itself. For now, we don't do this verification
+// in no-std no-alloc mode. The worst thing that can happen is an
+// incorrect result. But no panics or memory safety problems should
+// result. Because we still do validate that the state itself is
+// "valid" in the sense that everything it points to actually exists.
+//
+// ---AG
+#[derive(Debug)]
+struct Seen {
+    #[cfg(feature = "alloc")]
+    set: alloc::collections::BTreeSet<StateID>,
+    #[cfg(not(feature = "alloc"))]
+    set: core::marker::PhantomData<StateID>,
+}
+
+#[cfg(feature = "alloc")]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: alloc::collections::BTreeSet::new() }
+    }
+    fn insert(&mut self, id: StateID) {
+        self.set.insert(id);
+    }
+    fn contains(&self, id: &StateID) -> bool {
+        self.set.contains(id)
+    }
+}
+
+#[cfg(not(feature = "alloc"))]
+impl Seen {
+    fn new() -> Seen {
+        Seen { set: core::marker::PhantomData }
+    }
+    fn insert(&mut self, _id: StateID) {}
+    fn contains(&self, _id: &StateID) -> bool {
+        true // nothing is recorded without alloc, so never reject an ID
+    }
+}
+
 /*
 /// A binary search routine specialized specifically to a sparse DFA state's
 /// transitions. Specifically, the transitions are defined as a set of pairs