Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle footnote names that have been parsed into multiple nodes #311

Merged
merged 5 commits into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fuzz/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,9 @@ name = "gfm_sourcepos"
path = "fuzz_targets/gfm_sourcepos.rs"
test = false
doc = false

[[bin]]
name = "gfm_footnotes"
path = "fuzz_targets/gfm_footnotes.rs"
test = false
doc = false
32 changes: 32 additions & 0 deletions fuzz/fuzz_targets/gfm_footnotes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#![no_main]

use libfuzzer_sys::fuzz_target;

use comrak::{markdown_to_html, ComrakExtensionOptions, ComrakOptions, ComrakRenderOptions};

// This target does not reproduce `--gfm` exactly. It approximates the set of
// cmark-gfm options that Commonmarker users routinely enable, with the
// footnotes extension switched on as well.

fuzz_target!(|s: &str| {
    // Extensions exercised by this target; everything else keeps its default.
    let extension = ComrakExtensionOptions {
        strikethrough: true,
        tagfilter: true,
        table: true,
        autolink: true,
        footnotes: true,
        ..Default::default()
    };

    // Render settings; everything else keeps its default.
    let render = ComrakRenderOptions {
        hardbreaks: true,
        github_pre_lang: true,
        unsafe_: true,
        ..Default::default()
    };

    let options = ComrakOptions {
        extension,
        parse: Default::default(),
        render,
    };

    // The rendered HTML is discarded; the call is made only to exercise the
    // parser and renderer on the fuzzer-supplied input.
    markdown_to_html(s, &options);
});
6 changes: 3 additions & 3 deletions src/arena_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ A DOM-like tree data structure based on `&Node` references.
Any non-trivial tree involves reference cycles
(e.g. if a node has a first child, the parent of the child is that node).
To enable this, nodes need to live in an arena allocator
such as `arena::TypedArena` distrubuted with rustc (which is `#[unstable]` as of this writing)
such as `arena::TypedArena` distributed with rustc (which is `#[unstable]` as of this writing)
or [`typed_arena::Arena`](https://crates.io/crates/typed-arena).

If you need mutability in the node’s `data`,
Expand All @@ -33,7 +33,7 @@ pub struct Node<'a, T: 'a> {
}

/// A simple Debug implementation that prints the children as a tree, without
/// ilooping through the various interior pointer cycles.
/// looping through the various interior pointer cycles.
impl<'a, T: 'a> fmt::Debug for Node<'a, T>
where
T: fmt::Debug,
Expand Down Expand Up @@ -95,7 +95,7 @@ impl<'a, T> Node<'a, T> {
self.previous_sibling.get()
}

/// Return a reference to the previous sibling of this node, unless it is a last child.
/// Return a reference to the next sibling of this node, unless it is a last child.
pub fn next_sibling(&self) -> Option<&'a Node<'a, T>> {
self.next_sibling.get()
}
Expand Down
93 changes: 65 additions & 28 deletions src/parser/inlines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::parser::{
};
use crate::scanners;
use crate::strings;
use crate::strings::Case;
use std::cell::{Cell, RefCell};
use std::collections::HashMap;
use std::convert::TryFrom;
Expand Down Expand Up @@ -462,12 +463,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
// At this point the entire delimiter stack from `stack_bottom` up has
// been scanned for matches, everything left is just text. Pop it all
// off.
while self
.last_delimiter
.map_or(false, |d| d.position >= stack_bottom)
{
self.remove_delimiter(self.last_delimiter.unwrap());
}
self.remove_delimiters(stack_bottom);
}

fn remove_delimiter(&mut self, delimiter: &'d Delimiter<'a, 'd>) {
Expand All @@ -482,6 +478,15 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
}
}

/// Removes delimiters from the top of the delimiter stack for as long as the
/// topmost one has a `position` at or above `stack_bottom`.
///
/// Stops as soon as the stack is empty or the topmost delimiter sits below
/// `stack_bottom`.
fn remove_delimiters(&mut self, stack_bottom: usize) {
    // NOTE(review): termination relies on `remove_delimiter` updating
    // `self.last_delimiter` — confirm against its body.
    while let Some(topmost) = self.last_delimiter {
        if topmost.position < stack_bottom {
            break;
        }
        self.remove_delimiter(topmost);
    }
}

#[inline]
pub fn eof(&self) -> bool {
self.pos >= self.input.len()
Expand Down Expand Up @@ -1197,7 +1202,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
}

// Need to normalize both to lookup in refmap and to call callback
let lab = strings::normalize_label(&lab, false);
let lab = strings::normalize_label(&lab, Case::DontPreserve);
let mut reff = if found_label {
self.refmap.lookup(&lab)
} else {
Expand All @@ -1216,18 +1221,46 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
return None;
}

let mut text: Option<String> = None;
let bracket_inl_text = self.brackets[brackets_len - 1].inl_text;

if self.options.extension.footnotes
&& match self.brackets[brackets_len - 1].inl_text.next_sibling() {
&& match bracket_inl_text.next_sibling() {
Some(n) => {
text = n.data.borrow().value.text().cloned();
text.is_some() && n.next_sibling().is_none()
if n.data.borrow().value.text().is_some() {
n.data
.borrow()
.value
.text()
.unwrap()
.as_bytes()
.starts_with(&[b'^'])
} else {
false
}
}
_ => false,
}
{
let text = text.unwrap();
if text.len() > 1 && text.as_bytes()[0] == b'^' {
let mut text = String::new();
let mut sibling_iterator = bracket_inl_text.following_siblings();

// Skip the initial node, which holds the `[`
sibling_iterator.next().unwrap();

// The footnote name could have been parsed into multiple text/htmlinline nodes.
// For example `[^_foo]` gives `^`, `_`, and `foo`. So pull them together.
// Since we're handling the closing bracket, the only siblings at this point are
// related to the footnote name.
for sibling in sibling_iterator {
match sibling.data.borrow().value {
NodeValue::Text(ref literal) | NodeValue::HtmlInline(ref literal) => {
text.push_str(literal);
}
_ => {}
};
}

if text.len() > 1 {
let inl = self.make_inline(
NodeValue::FootnoteReference(NodeFootnoteReference {
name: text[1..].to_string(),
Expand All @@ -1238,25 +1271,29 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
self.pos,
self.pos,
);
inl.data.borrow_mut().sourcepos.start.column = self.brackets[brackets_len - 1]
.inl_text
.data
.borrow()
.sourcepos
.start
.column;
inl.data.borrow_mut().sourcepos.start.column =
bracket_inl_text.data.borrow().sourcepos.start.column;
inl.data.borrow_mut().sourcepos.end.column = usize::try_from(
self.pos as isize + self.column_offset + self.block_offset as isize,
)
.unwrap();
self.brackets[brackets_len - 1].inl_text.insert_before(inl);
self.brackets[brackets_len - 1]
.inl_text
.next_sibling()
.unwrap()
.detach();
self.brackets[brackets_len - 1].inl_text.detach();
self.process_emphasis(self.brackets[brackets_len - 1].position);
bracket_inl_text.insert_before(inl);

// detach all the nodes, including bracket_inl_text
sibling_iterator = bracket_inl_text.following_siblings();
for sibling in sibling_iterator {
match sibling.data.borrow().value {
NodeValue::Text(_) | NodeValue::HtmlInline(_) => {
sibling.detach();
}
_ => {}
};
}

// We don't need to process emphasis for footnote names, so clean up
// any outstanding delimiters
self.remove_delimiters(self.brackets[brackets_len - 1].position);

self.brackets.pop();
return None;
}
Expand Down
12 changes: 6 additions & 6 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use crate::nodes::{
NodeHtmlBlock, NodeList, NodeValue,
};
use crate::scanners;
use crate::strings::{self, split_off_front_matter};
use crate::strings::{self, split_off_front_matter, Case};
use std::cell::RefCell;
use std::cmp::min;
use std::collections::HashMap;
Expand Down Expand Up @@ -1785,11 +1785,11 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
NodeValue::FootnoteDefinition(ref nfd) => {
node.detach();
map.insert(
strings::normalize_label(&nfd.name, false),
strings::normalize_label(&nfd.name, Case::DontPreserve),
FootnoteDefinition {
ix: None,
node,
name: strings::normalize_label(&nfd.name, true),
name: strings::normalize_label(&nfd.name, Case::Preserve),
total_references: 0,
},
);
Expand All @@ -1811,7 +1811,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
let mut replace = None;
match ast.value {
NodeValue::FootnoteReference(ref mut nfr) => {
let normalized = strings::normalize_label(&nfr.name, false);
let normalized = strings::normalize_label(&nfr.name, Case::DontPreserve);
if let Some(ref mut footnote) = map.get_mut(&normalized) {
let ix = match footnote.ix {
Some(ix) => ix,
Expand All @@ -1824,7 +1824,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
footnote.total_references += 1;
nfr.ref_num = footnote.total_references;
nfr.ix = ix;
nfr.name = strings::normalize_label(&footnote.name, true);
nfr.name = strings::normalize_label(&footnote.name, Case::Preserve);
} else {
replace = Some(nfr.name.clone());
}
Expand Down Expand Up @@ -2025,7 +2025,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
}
}

lab = strings::normalize_label(&lab, false);
lab = strings::normalize_label(&lab, Case::DontPreserve);
if !lab.is_empty() {
subj.refmap.map.entry(lab).or_insert(Reference {
url: String::from_utf8(strings::clean_url(url)).unwrap(),
Expand Down
33 changes: 24 additions & 9 deletions src/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ use crate::parser::AutolinkType;
use std::ptr;
use std::str;

/// Selects how `normalize_label` treats letter case.
///
/// The enum is a plain, field-less flag, so it is `Copy` and can be passed by
/// value repeatedly; `Debug` lets it appear in assertions and log output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Case {
    /// Keep every character exactly as written.
    Preserve,
    /// Lowercase every non-whitespace character.
    DontPreserve,
}

pub fn unescape(v: &mut Vec<u8>) {
let mut r = 0;
let mut prev = None;
Expand Down Expand Up @@ -237,7 +243,7 @@ pub fn is_blank(s: &[u8]) -> bool {
true
}

pub fn normalize_label(i: &str, preserve_case: bool) -> String {
pub fn normalize_label(i: &str, casing: Case) -> String {
// trim_slice only removes bytes from start and end that match isspace();
// result is UTF-8.
let i = unsafe { str::from_utf8_unchecked(trim_slice(i.as_bytes())) };
Expand All @@ -252,10 +258,9 @@ pub fn normalize_label(i: &str, preserve_case: bool) -> String {
}
} else {
last_was_whitespace = false;
if preserve_case {
v.push(c);
} else {
v.push_str(&c.to_lowercase().to_string());
match casing {
Case::Preserve => v.push(c),
Case::DontPreserve => v.push_str(&c.to_lowercase().to_string()),
}
}
}
Expand Down Expand Up @@ -311,6 +316,7 @@ pub fn trim_start_match<'s>(s: &'s str, pat: &str) -> &'s str {
#[cfg(test)]
pub mod tests {
use super::{normalize_code, normalize_label, split_off_front_matter};
use crate::strings::Case;

#[test]
fn normalize_code_handles_lone_newline() {
Expand Down Expand Up @@ -346,13 +352,22 @@ pub mod tests {

#[test]
fn normalize_label_lowercase() {
assert_eq!(normalize_label(" Foo\u{A0}BAR ", false), "foo bar");
assert_eq!(normalize_label(" FooİBAR ", false), "fooi\u{307}bar");
assert_eq!(
normalize_label(" Foo\u{A0}BAR ", Case::DontPreserve),
"foo bar"
);
assert_eq!(
normalize_label(" FooİBAR ", Case::DontPreserve),
"fooi\u{307}bar"
);
}

#[test]
fn normalize_label_preserve() {
assert_eq!(normalize_label(" Foo\u{A0}BAR ", true), "Foo BAR");
assert_eq!(normalize_label(" FooİBAR ", true), "FooİBAR");
assert_eq!(
normalize_label(" Foo\u{A0}BAR ", Case::Preserve),
"Foo BAR"
);
assert_eq!(normalize_label(" FooİBAR ", Case::Preserve), "FooİBAR");
}
}
Loading