kivikakk · kivikakk · Jun 7, 2023 · Jun 1, 2023 · Jun 1, 2023 · Jun 6, 2023
diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -54,3 +54,9 @@ name = "gfm_sourcepos"
 path = "fuzz_targets/gfm_sourcepos.rs"
 test = false
 doc = false
+
+[[bin]]
+name = "gfm_footnotes"
+path = "fuzz_targets/gfm_footnotes.rs"
+test = false
+doc = false
diff --git a/fuzz/fuzz_targets/gfm_footnotes.rs b/fuzz/fuzz_targets/gfm_footnotes.rs
@@ -0,0 +1,32 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+use comrak::{markdown_to_html, ComrakExtensionOptions, ComrakOptions, ComrakRenderOptions};
+
+// Note that what I'm targeting here isn't exactly the same
+// as --gfm, but rather an approximation of the cmark-gfm
+// options that are routinely used by Commonmarker users.
+
+fuzz_target!(|s: &str| {
+    markdown_to_html(
+        s,
+        &ComrakOptions {
+            extension: ComrakExtensionOptions {
+                strikethrough: true,
+                tagfilter: true,
+                table: true,
+                autolink: true,
+                footnotes: true,
+                ..Default::default()
+            },
+            parse: Default::default(),
+            render: ComrakRenderOptions {
+                hardbreaks: true,
+                github_pre_lang: true,
+                unsafe_: true,
+                ..Default::default()
+            },
+        },
+    );
+});
diff --git a/src/arena_tree.rs b/src/arena_tree.rs
@@ -7,7 +7,7 @@ A DOM-like tree data structure based on `&Node` references.
 Any non-trivial tree involves reference cycles
 (e.g. if a node has a first child, the parent of the child is that node).
 To enable this, nodes need to live in an arena allocator
-such as `arena::TypedArena` distrubuted with rustc (which is `#[unstable]` as of this writing)
+such as `arena::TypedArena` distributed with rustc (which is `#[unstable]` as of this writing)
 or [`typed_arena::Arena`](https://crates.io/crates/typed-arena).
 
 If you need mutability in the node’s `data`,
@@ -33,7 +33,7 @@ pub struct Node<'a, T: 'a> {
 }
 
 /// A simple Debug implementation that prints the children as a tree, without
-/// ilooping through the various interior pointer cycles.
+/// looping through the various interior pointer cycles.
 impl<'a, T: 'a> fmt::Debug for Node<'a, T>
 where
     T: fmt::Debug,
@@ -95,7 +95,7 @@ impl<'a, T> Node<'a, T> {
         self.previous_sibling.get()
     }
 
-    /// Return a reference to the previous sibling of this node, unless it is a last child.
+    /// Return a reference to the next sibling of this node, unless it is a last child.
     pub fn next_sibling(&self) -> Option<&'a Node<'a, T>> {
         self.next_sibling.get()
     }

diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs
@@ -9,6 +9,7 @@ use crate::parser::{
 };
 use crate::scanners;
 use crate::strings;
+use crate::strings::Case;
 use std::cell::{Cell, RefCell};
 use std::collections::HashMap;
 use std::convert::TryFrom;
@@ -462,12 +463,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
         // At this point the entire delimiter stack from `stack_bottom` up has
         // been scanned for matches, everything left is just text. Pop it all
         // off.
-        while self
-            .last_delimiter
-            .map_or(false, |d| d.position >= stack_bottom)
-        {
-            self.remove_delimiter(self.last_delimiter.unwrap());
-        }
+        self.remove_delimiters(stack_bottom);
     }
 
     fn remove_delimiter(&mut self, delimiter: &'d Delimiter<'a, 'd>) {
@@ -482,6 +478,15 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
         }
     }
 
+    /// Repeatedly removes the most recent delimiter (`last_delimiter`) for as
+    /// long as its `position` is at or beyond `stack_bottom`; stops as soon as
+    /// no delimiter is left or the most recent one lies below that bound.
+    fn remove_delimiters(&mut self, stack_bottom: usize) {
+        while let Some(top) = self.last_delimiter.filter(|d| d.position >= stack_bottom) {
+            self.remove_delimiter(top);
+        }
+    }
+
     #[inline]
     pub fn eof(&self) -> bool {
         self.pos >= self.input.len()
@@ -1197,7 +1202,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
         }
 
         // Need to normalize both to lookup in refmap and to call callback
-        let lab = strings::normalize_label(&lab, false);
+        let lab = strings::normalize_label(&lab, Case::DontPreserve);
         let mut reff = if found_label {
             self.refmap.lookup(&lab)
         } else {
@@ -1216,18 +1221,46 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
             return None;
         }
 
-        let mut text: Option<String> = None;
+        let bracket_inl_text = self.brackets[brackets_len - 1].inl_text;
+
         if self.options.extension.footnotes
-            && match self.brackets[brackets_len - 1].inl_text.next_sibling() {
+            && match bracket_inl_text.next_sibling() {
                 Some(n) => {
-                    text = n.data.borrow().value.text().cloned();
-                    text.is_some() && n.next_sibling().is_none()
+                    if n.data.borrow().value.text().is_some() {
+                        n.data
+                            .borrow()
+                            .value
+                            .text()
+                            .unwrap()
+                            .as_bytes()
+                            .starts_with(&[b'^'])
+                    } else {
+                        false
+                    }
                 }
                 _ => false,
             }
         {
-            let text = text.unwrap();
-            if text.len() > 1 && text.as_bytes()[0] == b'^' {
+            let mut text = String::new();
+            let mut sibling_iterator = bracket_inl_text.following_siblings();
+
+            // Skip the initial node, which holds the `[`
+            sibling_iterator.next().unwrap();
+
+            // The footnote name could have been parsed into multiple text/htmlinline nodes.
+            // For example `[^_foo]` gives `^`, `_`, and `foo`. So pull them together.
+            // Since we're handling the closing bracket, the only siblings at this point are
+            // related to the footnote name.
+            for sibling in sibling_iterator {
+                match sibling.data.borrow().value {
+                    NodeValue::Text(ref literal) | NodeValue::HtmlInline(ref literal) => {
+                        text.push_str(literal);
+                    }
+                    _ => {}
+                };
+            }
+
+            if text.len() > 1 {
                 let inl = self.make_inline(
                     NodeValue::FootnoteReference(NodeFootnoteReference {
                         name: text[1..].to_string(),
@@ -1238,25 +1271,29 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
                     self.pos,
                     self.pos,
                 );
-                inl.data.borrow_mut().sourcepos.start.column = self.brackets[brackets_len - 1]
-                    .inl_text
-                    .data
-                    .borrow()
-                    .sourcepos
-                    .start
-                    .column;
+                inl.data.borrow_mut().sourcepos.start.column =
+                    bracket_inl_text.data.borrow().sourcepos.start.column;
                 inl.data.borrow_mut().sourcepos.end.column = usize::try_from(
                     self.pos as isize + self.column_offset + self.block_offset as isize,
                 )
                 .unwrap();
-                self.brackets[brackets_len - 1].inl_text.insert_before(inl);
-                self.brackets[brackets_len - 1]
-                    .inl_text
-                    .next_sibling()
-                    .unwrap()
-                    .detach();
-                self.brackets[brackets_len - 1].inl_text.detach();
-                self.process_emphasis(self.brackets[brackets_len - 1].position);
+                bracket_inl_text.insert_before(inl);
+
+                // detach the text/HTML-inline nodes from bracket_inl_text onwards (inclusive)
+                sibling_iterator = bracket_inl_text.following_siblings();
+                for sibling in sibling_iterator {
+                    match sibling.data.borrow().value {
+                        NodeValue::Text(_) | NodeValue::HtmlInline(_) => {
+                            sibling.detach();
+                        }
+                        _ => {}
+                    };
+                }
+
+                // We don't need to process emphasis for footnote names, so clean up
+                // any outstanding delimiters
+                self.remove_delimiters(self.brackets[brackets_len - 1].position);
+
                 self.brackets.pop();
                 return None;
             }

diff --git a/src/parser/mod.rs b/src/parser/mod.rs
@@ -14,7 +14,7 @@ use crate::nodes::{
     NodeHtmlBlock, NodeList, NodeValue,
 };
 use crate::scanners;
-use crate::strings::{self, split_off_front_matter};
+use crate::strings::{self, split_off_front_matter, Case};
 use std::cell::RefCell;
 use std::cmp::min;
 use std::collections::HashMap;
@@ -1785,11 +1785,11 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
             NodeValue::FootnoteDefinition(ref nfd) => {
                 node.detach();
                 map.insert(
-                    strings::normalize_label(&nfd.name, false),
+                    strings::normalize_label(&nfd.name, Case::DontPreserve),
                     FootnoteDefinition {
                         ix: None,
                         node,
-                        name: strings::normalize_label(&nfd.name, true),
+                        name: strings::normalize_label(&nfd.name, Case::Preserve),
                         total_references: 0,
                     },
                 );
@@ -1811,7 +1811,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
         let mut replace = None;
         match ast.value {
             NodeValue::FootnoteReference(ref mut nfr) => {
-                let normalized = strings::normalize_label(&nfr.name, false);
+                let normalized = strings::normalize_label(&nfr.name, Case::DontPreserve);
                 if let Some(ref mut footnote) = map.get_mut(&normalized) {
                     let ix = match footnote.ix {
                         Some(ix) => ix,
@@ -1824,7 +1824,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
                     footnote.total_references += 1;
                     nfr.ref_num = footnote.total_references;
                     nfr.ix = ix;
-                    nfr.name = strings::normalize_label(&footnote.name, true);
+                    nfr.name = strings::normalize_label(&footnote.name, Case::Preserve);
                 } else {
                     replace = Some(nfr.name.clone());
                 }
@@ -2025,7 +2025,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
             }
         }
 
-        lab = strings::normalize_label(&lab, false);
+        lab = strings::normalize_label(&lab, Case::DontPreserve);
         if !lab.is_empty() {
             subj.refmap.map.entry(lab).or_insert(Reference {
                 url: String::from_utf8(strings::clean_url(url)).unwrap(),

diff --git a/src/strings.rs b/src/strings.rs
@@ -4,6 +4,12 @@ use crate::parser::AutolinkType;
 use std::ptr;
 use std::str;
 
+#[derive(PartialEq, Eq)]
+pub enum Case {
+    Preserve,
+    DontPreserve,
+}
+
 pub fn unescape(v: &mut Vec<u8>) {
     let mut r = 0;
     let mut prev = None;
@@ -237,7 +243,7 @@ pub fn is_blank(s: &[u8]) -> bool {
     true
 }
 
-pub fn normalize_label(i: &str, preserve_case: bool) -> String {
+pub fn normalize_label(i: &str, casing: Case) -> String {
     // trim_slice only removes bytes from start and end that match isspace();
     // result is UTF-8.
     let i = unsafe { str::from_utf8_unchecked(trim_slice(i.as_bytes())) };
@@ -252,10 +258,9 @@ pub fn normalize_label(i: &str, preserve_case: bool) -> String {
             }
         } else {
             last_was_whitespace = false;
-            if preserve_case {
-                v.push(c);
-            } else {
-                v.push_str(&c.to_lowercase().to_string());
+            match casing {
+                Case::Preserve => v.push(c),
+                Case::DontPreserve => v.push_str(&c.to_lowercase().to_string()),
             }
         }
     }
@@ -311,6 +316,7 @@ pub fn trim_start_match<'s>(s: &'s str, pat: &str) -> &'s str {
 #[cfg(test)]
 pub mod tests {
     use super::{normalize_code, normalize_label, split_off_front_matter};
+    use crate::strings::Case;
 
     #[test]
     fn normalize_code_handles_lone_newline() {
@@ -346,13 +352,22 @@ pub mod tests {
 
     #[test]
     fn normalize_label_lowercase() {
-        assert_eq!(normalize_label("  Foo\u{A0}BAR  ", false), "foo bar");
-        assert_eq!(normalize_label("  FooİBAR  ", false), "fooi\u{307}bar");
+        assert_eq!(
+            normalize_label("  Foo\u{A0}BAR  ", Case::DontPreserve),
+            "foo bar"
+        );
+        assert_eq!(
+            normalize_label("  FooİBAR  ", Case::DontPreserve),
+            "fooi\u{307}bar"
+        );
     }
 
     #[test]
     fn normalize_label_preserve() {
-        assert_eq!(normalize_label("  Foo\u{A0}BAR  ", true), "Foo BAR");
-        assert_eq!(normalize_label("  FooİBAR  ", true), "FooİBAR");
+        assert_eq!(
+            normalize_label("  Foo\u{A0}BAR  ", Case::Preserve),
+            "Foo BAR"
+        );
+        assert_eq!(normalize_label("  FooİBAR  ", Case::Preserve), "FooİBAR");
     }
 }