From 266a7bea5c5251e96beccfd61e2b03ea644f4584 Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Sat, 18 Oct 2025 14:43:50 +1100 Subject: [PATCH 1/4] pull across some improvements from #617. --- examples/custom_formatter_alt_text.rs | 2 +- examples/iterator_replace.rs | 7 +- src/cm.rs | 16 +- src/html.rs | 7 +- src/lib.rs | 16 +- src/nodes.rs | 280 +++++++++++++------------- src/parser/mod.rs | 8 +- src/parser/table.rs | 6 +- 8 files changed, 173 insertions(+), 169 deletions(-) diff --git a/examples/custom_formatter_alt_text.rs b/examples/custom_formatter_alt_text.rs index 9ba0168fc..14c1cc705 100644 --- a/examples/custom_formatter_alt_text.rs +++ b/examples/custom_formatter_alt_text.rs @@ -56,7 +56,7 @@ fn main() { formatter, (), ) - .unwrap_or_else(|_| unreachable!("writing to Vec cannot fail")); + .unwrap_or_else(|_| unreachable!("writing to String cannot fail")); println!("{out}"); } diff --git a/examples/iterator_replace.rs b/examples/iterator_replace.rs index 65454ba86..bf4e64b71 100644 --- a/examples/iterator_replace.rs +++ b/examples/iterator_replace.rs @@ -24,9 +24,10 @@ fn replace_text(document: &str, orig_string: &str, replacement: &str) -> String } fn main() { - let doc = "This is my input.\n\n1. Also [my](#) input.\n2. Certainly *my* input.\n"; - let orig = "my"; - let repl = "your"; + let doc = + "Hello, pretty world!\n\n1. Do you like [pretty](#) paintings?\n2. Or *pretty* music?\n"; + let orig = "pretty"; + let repl = "beautiful"; let html = replace_text(doc, orig, repl); println!("{}", html); diff --git a/src/cm.rs b/src/cm.rs index 1af2972b4..8c8f5f704 100644 --- a/src/cm.rs +++ b/src/cm.rs @@ -1,3 +1,9 @@ +use std::cmp::max; +use std::fmt; +use std::io::{self, Write}; +use std::str; +pub use typed_arena::Arena; + use crate::ctype::{isalpha, isdigit, ispunct, isspace}; use crate::nodes::{ AstNode, ListDelimType, ListType, NodeAlert, NodeCodeBlock, NodeHeading, NodeHtmlBlock, @@ -8,13 +14,7 @@ use crate::parser::shortcodes::NodeShortCode; use crate::parser::{Options, WikiLinksMode}; use crate::scanners; use crate::strings::trim_start_match; -use crate::{node_matches, nodes, Plugins}; -pub use typed_arena::Arena; - -use std::cmp::max; -use std::fmt; -use std::io::{self, Write}; -use std::str; +use crate::{node_matches, Plugins}; /// Formats an AST as CommonMark, modified by the given options. pub fn format_document<'a>( @@ -292,7 +292,7 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> { } fn get_in_tight_list_item(&self, node: &'a AstNode<'a>) -> bool { - let tmp = match nodes::containing_block(node) { + let tmp = match node.containing_block() { Some(tmp) => tmp, None => return false, }; diff --git a/src/html.rs b/src/html.rs index 52006afd8..f1de22818 100644 --- a/src/html.rs +++ b/src/html.rs @@ -7,6 +7,10 @@ mod anchorizer; mod context; +use std::collections::HashMap; +use std::fmt::{self, Write}; +use std::str; + use crate::adapters::HeadingMeta; use crate::character_set::character_set; use crate::ctype::isspace; @@ -16,9 +20,6 @@ use crate::nodes::{ }; use crate::parser::{Options, Plugins}; use crate::{node_matches, scanners}; -use std::collections::HashMap; -use std::fmt::{self, Write}; -use std::str; #[doc(hidden)] pub use anchorizer::Anchorizer; diff --git a/src/lib.rs b/src/lib.rs index b017727f4..690b43109 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,8 +7,10 @@ //! //! ``` //! use comrak::{markdown_to_html, Options}; -//! assert_eq!(markdown_to_html("Hello, **世界**!", &Options::default()), -//! "

Hello, 世界!

\n"); +//! assert_eq!( +//! markdown_to_html("Olá, **世界**!", &Options::default()), +//! "

Olá, 世界!

\n" +//! ); //! ``` //! //! Or you can parse the input into an AST yourself, manipulate it, and then use your desired @@ -23,12 +25,12 @@ //! //! let root = parse_document( //! &arena, -//! "This is my input.\n\n1. Also [my](#) input.\n2. Certainly *my* input.\n", +//! "Hello, pretty world!\n\n1. Do you like [pretty](#) paintings?\n2. Or *pretty* music?\n", //! &Options::default()); //! //! for node in root.descendants() { //! if let NodeValue::Text(ref mut text) = node.data.borrow_mut().value { -//! *text = text.replace("my", "your"); +//! *text = text.replace("pretty", "beautiful"); //! } //! } //! @@ -37,10 +39,10 @@ //! //! assert_eq!( //! &html, -//! "

This is your input.

\n\ +//! "

Hello, beautiful world!

\n\ //!
    \n\ -//!
  1. Also your input.
  2. \n\ -//!
  3. Certainly your input.
  4. \n\ +//!
  5. Do you like beautiful paintings?
  6. \n\ +//!
  7. Or beautiful music?
  8. \n\ //!
\n"); //! # } //! ``` diff --git a/src/nodes.rs b/src/nodes.rs index 62c5c7af4..14ce7fbdb 100644 --- a/src/nodes.rs +++ b/src/nodes.rs @@ -669,7 +669,7 @@ impl Ast { /// /// You can construct a new `AstNode` from a `NodeValue` using the `From` trait: /// -/// ```no_run +/// ``` /// # use comrak::nodes::{AstNode, NodeValue}; /// let root = AstNode::from(NodeValue::Document); /// ``` @@ -678,20 +678,18 @@ impl Ast { /// to assign sourcepos information, use the `From` trait to create an `AstNode` /// from an `Ast`: /// -/// ```no_run +/// ``` /// # use comrak::nodes::{Ast, AstNode, NodeValue}; -/// let root = AstNode::from(Ast::new( +/// let root = AstNode::from(Ast::new_with_sourcepos( /// NodeValue::Paragraph, -/// (4, 1).into(), // start_line, start_col +/// (4, 1, 4, 10).into(), /// )); /// ``` /// -/// Adjust the `end` position manually. -/// /// For practical use, you'll probably need it allocated in an `Arena`, in which /// case you can use `.into()` to simplify creation: /// -/// ```no_run +/// ``` /// # use comrak::{nodes::{AstNode, NodeValue}, Arena}; /// # let arena = Arena::::new(); /// let node_in_arena = arena.alloc(NodeValue::Document.into()); @@ -726,6 +724,117 @@ pub enum ValidationError<'a> { } impl<'a> Node<'a, RefCell> { + /// Returns true if the given node can contain a node with the given value. + pub fn can_contain_type(&self, child: &NodeValue) -> bool { + match *child { + NodeValue::Document => { + return false; + } + NodeValue::FrontMatter(_) => { + return matches!(self.data.borrow().value, NodeValue::Document); + } + _ => {} + } + + match self.data.borrow().value { + NodeValue::Document + | NodeValue::BlockQuote + | NodeValue::FootnoteDefinition(_) + | NodeValue::DescriptionTerm + | NodeValue::DescriptionDetails + | NodeValue::Item(..) + | NodeValue::TaskItem(..) => { + child.block() && !matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)) + } + + NodeValue::List(..) => matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)), + + NodeValue::DescriptionList => matches!(*child, NodeValue::DescriptionItem(_)), + + NodeValue::DescriptionItem(_) => matches!( + *child, + NodeValue::DescriptionTerm | NodeValue::DescriptionDetails + ), + + #[cfg(feature = "shortcodes")] + NodeValue::ShortCode(..) => !child.block(), + + NodeValue::Paragraph + | NodeValue::Heading(..) + | NodeValue::Emph + | NodeValue::Strong + | NodeValue::Link(..) + | NodeValue::Image(..) + | NodeValue::WikiLink(..) + | NodeValue::Strikethrough + | NodeValue::Superscript + | NodeValue::SpoileredText + | NodeValue::Underline + | NodeValue::Subscript + // XXX: this is quite a hack: the EscapedTag _contains_ whatever was + // possibly going to fall into the spoiler. This should be fixed in + // inlines. + | NodeValue::EscapedTag(_) + => !child.block(), + + NodeValue::Table(..) => matches!(*child, NodeValue::TableRow(..)), + + NodeValue::TableRow(..) => matches!(*child, NodeValue::TableCell), + + #[cfg(not(feature = "shortcodes"))] + NodeValue::TableCell => matches!( + *child, + NodeValue::Text(..) + | NodeValue::Code(..) + | NodeValue::Emph + | NodeValue::Strong + | NodeValue::Link(..) + | NodeValue::Image(..) + | NodeValue::Strikethrough + | NodeValue::HtmlInline(..) + | NodeValue::Math(..) + | NodeValue::WikiLink(..) + | NodeValue::FootnoteReference(..) + | NodeValue::Superscript + | NodeValue::SpoileredText + | NodeValue::Underline + | NodeValue::Subscript + | NodeValue::TaskItem(_) + ), + + #[cfg(feature = "shortcodes")] + NodeValue::TableCell => matches!( + *child, + NodeValue::Text(..) + | NodeValue::Code(..) + | NodeValue::Emph + | NodeValue::Strong + | NodeValue::Link(..) + | NodeValue::Image(..) + | NodeValue::Strikethrough + | NodeValue::HtmlInline(..) + | NodeValue::Math(..) + | NodeValue::WikiLink(..) + | NodeValue::FootnoteReference(..) + | NodeValue::Superscript + | NodeValue::SpoileredText + | NodeValue::Underline + | NodeValue::Subscript + | NodeValue::ShortCode(..) + | NodeValue::TaskItem(_) + ), + + NodeValue::MultilineBlockQuote(_) => { + child.block() && !matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)) + } + + NodeValue::Alert(_) => { + child.block() && !matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)) + } + _ => false, + } + } + /// The comrak representation of a markdown node in Rust isn't strict enough to rule out /// invalid trees according to the CommonMark specification. One simple example is that block /// containers, such as lists, should only contain blocks, but it's possible to put naked @@ -743,7 +852,7 @@ impl<'a> Node<'a, RefCell> { while let Some(node) = stack.pop() { // Check that this node type is valid wrt to the type of its parent. if let Some(parent) = node.parent() { - if !can_contain_type(parent, &node.data.borrow().value) { + if !parent.can_contain_type(&node.data.borrow().value) { return Err(ValidationError::InvalidChildType { parent, child: node, @@ -756,146 +865,35 @@ impl<'a> Node<'a, RefCell> { Ok(()) } -} - -pub(crate) fn last_child_is_open<'a>(node: &'a AstNode<'a>) -> bool { - node.last_child().map_or(false, |n| n.data.borrow().open) -} -/// Returns true if the given node can contain a node with the given value. -pub fn can_contain_type<'a>(node: &'a AstNode<'a>, child: &NodeValue) -> bool { - match *child { - NodeValue::Document => { - return false; - } - NodeValue::FrontMatter(_) => { - return matches!(node.data.borrow().value, NodeValue::Document); - } - _ => {} + pub(crate) fn last_child_is_open(&self) -> bool { + self.last_child().map_or(false, |n| n.data.borrow().open) } - match node.data.borrow().value { - NodeValue::Document - | NodeValue::BlockQuote - | NodeValue::FootnoteDefinition(_) - | NodeValue::DescriptionTerm - | NodeValue::DescriptionDetails - | NodeValue::Item(..) - | NodeValue::TaskItem(..) => { - child.block() && !matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)) - } - - NodeValue::List(..) => matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)), - - NodeValue::DescriptionList => matches!(*child, NodeValue::DescriptionItem(_)), - - NodeValue::DescriptionItem(_) => matches!( - *child, - NodeValue::DescriptionTerm | NodeValue::DescriptionDetails - ), - - #[cfg(feature = "shortcodes")] - NodeValue::ShortCode(..) => !child.block(), - - NodeValue::Paragraph - | NodeValue::Heading(..) - | NodeValue::Emph - | NodeValue::Strong - | NodeValue::Link(..) - | NodeValue::Image(..) - | NodeValue::WikiLink(..) - | NodeValue::Strikethrough - | NodeValue::Superscript - | NodeValue::SpoileredText - | NodeValue::Underline - | NodeValue::Subscript - // XXX: this is quite a hack: the EscapedTag _contains_ whatever was - // possibly going to fall into the spoiler. This should be fixed in - // inlines. - | NodeValue::EscapedTag(_) - => !child.block(), - - NodeValue::Table(..) => matches!(*child, NodeValue::TableRow(..)), - - NodeValue::TableRow(..) => matches!(*child, NodeValue::TableCell), - - #[cfg(not(feature = "shortcodes"))] - NodeValue::TableCell => matches!( - *child, - NodeValue::Text(..) - | NodeValue::Code(..) - | NodeValue::Emph - | NodeValue::Strong - | NodeValue::Link(..) - | NodeValue::Image(..) - | NodeValue::Strikethrough - | NodeValue::HtmlInline(..) - | NodeValue::Math(..) - | NodeValue::WikiLink(..) - | NodeValue::FootnoteReference(..) - | NodeValue::Superscript - | NodeValue::SpoileredText - | NodeValue::Underline - | NodeValue::Subscript - | NodeValue::TaskItem(_) - ), - - #[cfg(feature = "shortcodes")] - NodeValue::TableCell => matches!( - *child, - NodeValue::Text(..) - | NodeValue::Code(..) - | NodeValue::Emph - | NodeValue::Strong - | NodeValue::Link(..) - | NodeValue::Image(..) - | NodeValue::Strikethrough - | NodeValue::HtmlInline(..) - | NodeValue::Math(..) - | NodeValue::WikiLink(..) - | NodeValue::FootnoteReference(..) - | NodeValue::Superscript - | NodeValue::SpoileredText - | NodeValue::Underline - | NodeValue::Subscript - | NodeValue::ShortCode(..) - | NodeValue::TaskItem(_) - ), - - NodeValue::MultilineBlockQuote(_) => { - child.block() && !matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)) - } - - NodeValue::Alert(_) => { - child.block() && !matches!(*child, NodeValue::Item(..) | NodeValue::TaskItem(..)) + pub(crate) fn ends_with_blank_line(&self) -> bool { + let mut it = Some(self); + while let Some(cur) = it { + if cur.data.borrow().last_line_blank { + return true; + } + match cur.data.borrow().value { + NodeValue::List(..) | NodeValue::Item(..) | NodeValue::TaskItem(..) => { + it = cur.last_child() + } + _ => it = None, + }; } - _ => false, + false } -} -pub(crate) fn ends_with_blank_line<'a>(node: &'a AstNode<'a>) -> bool { - let mut it = Some(node); - while let Some(cur) = it { - if cur.data.borrow().last_line_blank { - return true; - } - match cur.data.borrow().value { - NodeValue::List(..) | NodeValue::Item(..) | NodeValue::TaskItem(..) => { - it = cur.last_child() + pub(crate) fn containing_block(&'a self) -> Option<&'a AstNode<'a>> { + let mut ch = Some(self); + while let Some(n) = ch { + if n.data.borrow().value.block() { + return Some(n); } - _ => it = None, - }; - } - false -} - -pub(crate) fn containing_block<'a>(node: &'a AstNode<'a>) -> Option<&'a AstNode<'a>> { - let mut ch = Some(node); - while let Some(n) = ch { - if n.data.borrow().value.block() { - return Some(n); + ch = n.parent(); } - ch = n.parent(); + None } - None } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 6d224ef18..7caaad69a 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1508,7 +1508,7 @@ where ) -> (bool, &'a AstNode<'a>, bool) { let mut should_continue = true; - while nodes::last_child_is_open(container) { + while container.last_child_is_open() { container = container.last_child().unwrap(); let ast = &mut *container.data.borrow_mut(); @@ -2480,7 +2480,7 @@ where // The last child, like an indented codeblock, could be left open. // Make sure it's finalized. - if nodes::last_child_is_open(container) { + if container.last_child_is_open() { let child = container.last_child().unwrap(); let child_ast = &mut *child.data.borrow_mut(); @@ -2505,7 +2505,7 @@ where value: NodeValue, start_column: usize, ) -> &'a AstNode<'a> { - while !nodes::can_contain_type(parent, &value) { + while !parent.can_contain_type(&value) { parent = self.finalize(parent).unwrap(); } @@ -2811,7 +2811,7 @@ where let mut subch = item.first_child(); while let Some(subitem) = subch { if (item.next_sibling().is_some() || subitem.next_sibling().is_some()) - && nodes::ends_with_blank_line(subitem) + && subitem.ends_with_blank_line() { nl.tight = false; break; diff --git a/src/parser/table.rs b/src/parser/table.rs index 7df04c4d2..9bf846272 100644 --- a/src/parser/table.rs +++ b/src/parser/table.rs @@ -1,5 +1,4 @@ use crate::arena_tree::Node; -use crate::nodes; use crate::nodes::{Ast, AstNode, NodeTable, NodeValue, TableAlignment}; use crate::parser::Parser; use crate::scanners; @@ -288,7 +287,10 @@ fn try_inserting_table_header_paragraph<'a>( trim(&mut paragraph_content); if container.parent().is_none() - || !nodes::can_contain_type(container.parent().unwrap(), &NodeValue::Paragraph) + || !container + .parent() + .unwrap() + .can_contain_type(&NodeValue::Paragraph) { return; } From 7ff9cc9ee5f5d87fefdee42705616d0c3cdb9c7d Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Sat, 18 Oct 2025 15:00:06 +1100 Subject: [PATCH 2/4] cont'd. --- src/parser/autolink.rs | 32 +++++++------ src/parser/inlines.rs | 52 ++++++++++----------- src/parser/mod.rs | 104 ++++++++++++++++------------------------- 3 files changed, 82 insertions(+), 106 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 51822dbf6..8badca323 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -1,10 +1,12 @@ +use std::str; +use typed_arena::Arena; +use unicode_categories::UnicodeCategories; + use crate::character_set::character_set; use crate::ctype::{isalnum, isalpha, isspace}; use crate::nodes::{AstNode, NodeLink, NodeValue, Sourcepos}; +use crate::parser::inlines::Subject; use crate::parser::{inlines::make_inline, Spx}; -use std::str; -use typed_arena::Arena; -use unicode_categories::UnicodeCategories; pub(crate) fn process_email_autolinks<'a>( arena: &'a Arena>, @@ -226,12 +228,12 @@ fn validate_protocol(protocol: &str, contents: &[u8], cursor: usize) -> bool { } pub fn www_match<'a>( - arena: &'a Arena>, - contents: &[u8], - i: usize, - relaxed_autolinks: bool, + subject: &mut Subject<'a, '_, '_, '_, '_, '_, '_>, ) -> Option<(&'a AstNode<'a>, usize, usize)> { const WWW_DELIMS: [bool; 256] = character_set!(b"*_~(["); + let i = subject.pos; + let relaxed_autolinks = subject.options.parse.relaxed_autolinks; + let contents = subject.input; if i > 0 && !isspace(contents[i - 1]) && !WWW_DELIMS[contents[i - 1] as usize] { return None; @@ -259,7 +261,7 @@ pub fn www_match<'a>( url.push_str(str::from_utf8(&contents[i..link_end + i]).unwrap()); let inl = make_inline( - arena, + subject.arena, NodeValue::Link(NodeLink { url, title: String::new(), @@ -268,7 +270,7 @@ pub fn www_match<'a>( ); inl.append(make_inline( - arena, + subject.arena, NodeValue::Text( str::from_utf8(&contents[i..link_end + i]) .unwrap() @@ -384,13 +386,13 @@ fn autolink_delim(data: &[u8], mut link_end: usize, relaxed_autolinks: bool) -> } pub fn url_match<'a>( - arena: &'a Arena>, - contents: &[u8], - i: usize, - relaxed_autolinks: bool, + subject: &mut Subject<'a, '_, '_, '_, '_, '_, '_>, ) -> Option<(&'a AstNode<'a>, usize, usize)> { const SCHEMES: [&[u8]; 3] = [b"http", b"https", b"ftp"]; + let i = subject.pos; + let relaxed_autolinks = subject.options.parse.relaxed_autolinks; + let contents = subject.input; let size = contents.len(); if size - i < 4 || contents[i + 1] != b'/' || contents[i + 2] != b'/' { @@ -429,7 +431,7 @@ pub fn url_match<'a>( .unwrap() .to_string(); let inl = make_inline( - arena, + subject.arena, NodeValue::Link(NodeLink { url: url.clone(), title: String::new(), @@ -438,7 +440,7 @@ pub fn url_match<'a>( ); inl.append(make_inline( - arena, + subject.arena, NodeValue::Text(url), (0, 1, 0, 1).into(), )); diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 86c73b546..645901128 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -1,3 +1,11 @@ +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::convert::TryFrom; +use std::ptr; +use std::str; +use typed_arena::Arena; +use unicode_categories::UnicodeCategories; + use crate::arena_tree::Node; use crate::ctype::{isdigit, ispunct, isspace}; use crate::entity; @@ -5,23 +13,14 @@ use crate::nodes::{ Ast, AstNode, NodeCode, NodeFootnoteDefinition, NodeFootnoteReference, NodeLink, NodeMath, NodeValue, NodeWikiLink, Sourcepos, }; -use crate::parser::autolink; #[cfg(feature = "shortcodes")] use crate::parser::shortcodes::NodeShortCode; use crate::parser::{ - unwrap_into_2, unwrap_into_copy, AutolinkType, BrokenLinkReference, Options, ResolvedReference, + autolink, unwrap_into_2, unwrap_into_copy, AutolinkType, BrokenLinkReference, Options, + ResolvedReference, WikiLinksMode, }; use crate::scanners; use crate::strings::{self, is_blank, Case}; -use std::cell::{Cell, RefCell}; -use std::collections::HashMap; -use std::convert::TryFrom; -use std::ptr; -use std::str; -use typed_arena::Arena; -use unicode_categories::UnicodeCategories; - -use super::WikiLinksMode; const MAXBACKTICKS: usize = 80; const MAX_LINK_LABEL_LENGTH: usize = 1000; @@ -133,7 +132,7 @@ impl FlankingCheckHelper for char { pub struct Subject<'a: 'd, 'r, 'o, 'd, 'i, 'c, 'p> { pub arena: &'a Arena>, - options: &'o Options<'c>, + pub options: &'o Options<'c>, pub input: &'i [u8], line: usize, pub pos: usize, @@ -242,7 +241,13 @@ impl<'a: 'd, 'd> std::fmt::Debug for Delimiter<'a, 'd> { self.delim_char, self.can_open, self.can_close, - self.inl.data.borrow().sourcepos + self.inl + .data + .try_borrow() + .map_or("".to_string(), |d| format!( + "{}", + d.sourcepos + )) ) } } @@ -1488,25 +1493,16 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'p> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'p> { )) } - fn handle_autolink_with(&mut self, node: &'a AstNode<'a>, f: F) -> Option<&'a AstNode<'a>> - where - F: Fn( - &'a Arena>, - &[u8], - usize, - bool, - ) -> Option<(&'a AstNode<'a>, usize, usize)>, - { + fn handle_autolink_with( + &mut self, + node: &'a AstNode<'a>, + f: fn(&mut Subject<'a, '_, '_, '_, '_, '_, '_>) -> Option<(&'a AstNode<'a>, usize, usize)>, + ) -> Option<&'a AstNode<'a>> { if !self.options.parse.relaxed_autolinks && self.within_brackets { return None; } let startpos = self.pos; - let (post, need_reverse, skip) = f( - self.arena, - self.input, - self.pos, - self.options.parse.relaxed_autolinks, - )?; + let (post, need_reverse, skip) = f(self)?; self.pos += skip - need_reverse; diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 7caaad69a..e89d2fb2f 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2799,37 +2799,36 @@ where mem::swap(&mut nhb.literal, content); } NodeValue::List(ref mut nl) => { - nl.tight = true; - let mut ch = node.first_child(); + nl.tight = self.determine_list_tight(node); + } + _ => (), + } - while let Some(item) = ch { - if item.data.borrow().last_line_blank && item.next_sibling().is_some() { - nl.tight = false; - break; - } + parent + } - let mut subch = item.first_child(); - while let Some(subitem) = subch { - if (item.next_sibling().is_some() || subitem.next_sibling().is_some()) - && subitem.ends_with_blank_line() - { - nl.tight = false; - break; - } - subch = subitem.next_sibling(); - } + fn determine_list_tight(&self, node: &'a AstNode<'a>) -> bool { + let mut ch = node.first_child(); - if !nl.tight { - break; - } + while let Some(item) = ch { + if item.data.borrow().last_line_blank && item.next_sibling().is_some() { + return false; + } - ch = item.next_sibling(); + let mut subch = item.first_child(); + while let Some(subitem) = subch { + if (item.next_sibling().is_some() || subitem.next_sibling().is_some()) + && subitem.ends_with_blank_line() + { + return false; } + subch = subitem.next_sibling(); } - _ => (), + + ch = item.next_sibling(); } - parent + true } fn process_inlines(&mut self) { @@ -2866,33 +2865,25 @@ where } fn process_footnotes(&mut self) { - let mut map = HashMap::new(); - Self::find_footnote_definitions(self.root, &mut map); - - let mut ix = 0; - Self::find_footnote_references(self.root, &mut map, &mut ix); - - if !map.is_empty() { - // In order for references to be found inside footnote definitions, - // such as `[^1]: another reference[^2]`, - // the node needed to remain in the AST. Now we can remove them. - Self::cleanup_footnote_definitions(self.root); - } - - if ix > 0 { - let mut v = map.into_values().collect::>(); - v.sort_unstable_by(|a, b| a.ix.cmp(&b.ix)); - for f in v { - if f.ix.is_some() { - match f.node.data.borrow_mut().value { - NodeValue::FootnoteDefinition(ref mut nfd) => { - nfd.name = f.name.to_string(); - nfd.total_references = f.total_references; - } - _ => unreachable!(), - } - self.root.append(f.node); - } + let mut fd_map = HashMap::new(); + Self::find_footnote_definitions(self.root, &mut fd_map); + + let mut next_ix = 0; + Self::find_footnote_references(self.root, &mut fd_map, &mut next_ix); + + let mut fds = fd_map.into_values().collect::>(); + fds.sort_unstable_by(|a, b| a.ix.cmp(&b.ix)); + for fd in fds { + if fd.ix.is_some() { + let NodeValue::FootnoteDefinition(ref mut nfd) = fd.node.data.borrow_mut().value + else { + unreachable!() + }; + nfd.name = fd.name.to_string(); + nfd.total_references = fd.total_references; + self.root.append(fd.node); + } else { + fd.node.detach(); } } } @@ -2962,19 +2953,6 @@ where } } - fn cleanup_footnote_definitions(node: &'a AstNode<'a>) { - match node.data.borrow().value { - NodeValue::FootnoteDefinition(_) => { - node.detach(); - } - _ => { - for n in node.children() { - Self::cleanup_footnote_definitions(n); - } - } - } - } - fn postprocess_text_nodes(&mut self, node: &'a AstNode<'a>) { self.postprocess_text_nodes_with_context(node, false); } From ca012153da884d0ea5131b420aeff76e5165ea8f Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Sat, 18 Oct 2025 15:19:17 +1100 Subject: [PATCH 3/4] cont'd. --- src/parser/mod.rs | 111 ++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 47 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index e89d2fb2f..ba07da401 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -8,17 +8,8 @@ pub mod alert; pub mod math; pub mod multiline_block_quote; -use crate::adapters::SyntaxHighlighterAdapter; -use crate::arena_tree::Node; -use crate::ctype::{isdigit, isspace}; -use crate::entity; -use crate::nodes::{self, NodeFootnoteDefinition, Sourcepos}; -use crate::nodes::{ - Ast, AstNode, ListDelimType, ListType, NodeCodeBlock, NodeDescriptionItem, NodeHeading, - NodeHtmlBlock, NodeList, NodeValue, -}; -use crate::scanners::{self, SetextChar}; -use crate::strings::{self, split_off_front_matter, Case}; +#[cfg(feature = "bon")] +use bon::Builder; use std::cell::RefCell; use std::cmp::{min, Ordering}; use std::collections::{HashMap, VecDeque}; @@ -29,14 +20,19 @@ use std::str; use std::sync::Arc; use typed_arena::Arena; -use crate::adapters::HeadingAdapter; +use crate::adapters::{HeadingAdapter, SyntaxHighlighterAdapter}; +use crate::arena_tree::Node; +use crate::ctype::{isdigit, isspace}; +use crate::entity; +use crate::nodes::{ + self, Ast, AstNode, ListDelimType, ListType, NodeCodeBlock, NodeDescriptionItem, + NodeFootnoteDefinition, NodeHeading, NodeHtmlBlock, NodeList, NodeValue, Sourcepos, +}; use crate::parser::alert::{AlertType, NodeAlert}; +use crate::parser::inlines::RefMap; use crate::parser::multiline_block_quote::NodeMultilineBlockQuote; - -#[cfg(feature = "bon")] -use bon::Builder; - -use self::inlines::RefMap; +use crate::scanners::{self, SetextChar}; +use crate::strings::{self, split_off_front_matter, Case}; const TAB_STOP: usize = 4; const CODE_INDENT: usize = 4; @@ -2972,36 +2968,18 @@ where let mut child_in_bracket_context = in_bracket_context; let mut emptied = false; let n_ast = &mut n.data.borrow_mut(); - let mut sourcepos = n_ast.sourcepos; + let sourcepos = n_ast.sourcepos; match n_ast.value { NodeValue::Text(ref mut root) => { - // Join adjacent text nodes together, then post-process. - // Record the original list of sourcepos and bytecounts - // for the post-processing step. - let mut spxv = VecDeque::new(); - spxv.push_back((sourcepos, root.len())); - while let Some(ns) = n.next_sibling() { - match ns.data.borrow().value { - NodeValue::Text(ref adj) => { - root.push_str(adj); - let sp = ns.data.borrow().sourcepos; - spxv.push_back((sp, adj.len())); - sourcepos.end.column = sp.end.column; - ns.detach(); - } - _ => break, - } - } - - self.postprocess_text_node_with_context( + let sourcepos = self.postprocess_text_node_with_context( n, + sourcepos, root, - &mut sourcepos, - spxv, in_bracket_context, ); emptied = root.is_empty(); + n_ast.sourcepos = sourcepos; } NodeValue::Link(..) | NodeValue::Image(..) | NodeValue::WikiLink(..) => { // Recurse into links, images, and wikilinks to join adjacent text nodes, @@ -3011,8 +2989,6 @@ where _ => {} } - n_ast.sourcepos = sourcepos; - if !emptied { children.push((n, child_in_bracket_context)); } @@ -3031,6 +3007,43 @@ where } fn postprocess_text_node_with_context( + &mut self, + node: &'a AstNode<'a>, + mut sourcepos: Sourcepos, + root: &mut String, + in_bracket_context: bool, + ) -> Sourcepos { + // Join adjacent text nodes together, then post-process. + // Record the original list of sourcepos and bytecounts + // for the post-processing step. + + let mut spxv = VecDeque::new(); + spxv.push_back((sourcepos, root.len())); + while let Some(ns) = node.next_sibling() { + match ns.data.borrow().value { + NodeValue::Text(ref adj) => { + root.push_str(adj); + let sp = ns.data.borrow().sourcepos; + spxv.push_back((sp, adj.len())); + sourcepos.end.column = sp.end.column; + ns.detach(); + } + _ => break, + } + } + + self.postprocess_text_node_with_context_inner( + node, + root, + &mut sourcepos, + spxv, + in_bracket_context, + ); + + sourcepos + } + + fn postprocess_text_node_with_context_inner( &mut self, node: &'a AstNode<'a>, text: &mut String, @@ -3154,15 +3167,19 @@ where fn parse_reference_inline(&mut self, content: &[u8]) -> Option { // In this case reference inlines rarely have delimiters // so we often just need the minimal case - let delimiter_arena = Arena::with_capacity(0); + let unused_node_arena = Arena::new(); + let unused_footnote_defs = inlines::FootnoteDefs::new(); + let unused_delimiter_arena = Arena::with_capacity(0); + let mut unused_refmap = inlines::RefMap::new(); + let mut subj = inlines::Subject::new( - self.arena, + &unused_node_arena, self.options, content, 0, // XXX -1 in upstream; never used? - &mut self.refmap, - &self.footnote_defs, - &delimiter_arena, + &mut unused_refmap, + &unused_footnote_defs, + &unused_delimiter_arena, ); let mut lab: String = match subj.link_label() { @@ -3216,7 +3233,7 @@ where lab = strings::normalize_label(&lab, Case::Fold); if !lab.is_empty() { - subj.refmap.map.entry(lab).or_insert(ResolvedReference { + self.refmap.map.entry(lab).or_insert(ResolvedReference { url: String::from_utf8(strings::clean_url(url)).unwrap(), title: String::from_utf8(strings::clean_title(&title)).unwrap(), }); From 8a13eee90448bad9a6763cbdd92bb35d6323a25b Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Sat, 18 Oct 2025 16:14:33 +1100 Subject: [PATCH 4/4] update bench submodules; build markdown-it without syntect. --- Makefile | 2 +- vendor/markdown-it | 2 +- vendor/pulldown-cmark | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 68d80e717..ac380d31c 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ build-cmark-gfm: build-markdown-it: cd ${ROOT}/vendor/markdown-it && \ - cargo build --release && \ + cargo build --release --no-default-features && \ cp target/release/markdown-it ${ROOT}/benches/markdown-it build-pulldown-cmark: diff --git a/vendor/markdown-it b/vendor/markdown-it index 537116980..2d7c08504 160000 --- a/vendor/markdown-it +++ b/vendor/markdown-it @@ -1 +1 @@ -Subproject commit 537116980ae6414a5dfa945149ae3a13f27dd4ae +Subproject commit 2d7c085046a144d221490331b25ca565ecddbb1b diff --git a/vendor/pulldown-cmark b/vendor/pulldown-cmark index dadad5f92..f4a326d22 160000 --- a/vendor/pulldown-cmark +++ b/vendor/pulldown-cmark @@ -1 +1 @@ -Subproject commit dadad5f92de6d1140b4ce39e8d05f0cabff01a9a +Subproject commit f4a326d225e79412b5ecabd1c241c851e8160815