String {
escape_html(text)
}
-/// Strip HTML tags from text (for image alt text).
-fn strip_html_tags(text: &str) -> String {
+/// Extract plain text for image alt attribute.
+/// Per CommonMark, the alt text is the content with inline formatting stripped
+/// but text from nested links/images preserved (recursively extracting their text).
+fn extract_alt_text(
+ list: &biome_markdown_syntax::MdInlineItemList,
+ ctx: &HtmlRenderContext,
+) -> String {
let mut result = String::new();
- let mut in_tag = false;
+ for item in list.iter() {
+ extract_alt_text_inline(&item, ctx, &mut result);
+ }
+ result
+}
- for c in text.chars() {
- if c == '<' {
- in_tag = true;
- } else if c == '>' {
- in_tag = false;
- } else if !in_tag {
- result.push(c);
+fn extract_alt_text_inline(inline: &AnyMdInline, ctx: &HtmlRenderContext, out: &mut String) {
+ match inline {
+ AnyMdInline::MdTextual(text) => {
+ render_textual(text, out);
+ }
+ AnyMdInline::MdInlineEmphasis(em) => {
+ out.push_str(&extract_alt_text(&em.content(), ctx));
+ }
+ AnyMdInline::MdInlineItalic(italic) => {
+ out.push_str(&extract_alt_text(&italic.content(), ctx));
+ }
+ AnyMdInline::MdInlineCode(code) => {
+ // Plain text only — no tags for alt attribute
+ let content = collect_raw_inline_text(&code.content());
+ let content = content.replace('\n', " ");
+ let content = if content.starts_with(' ')
+ && content.ends_with(' ')
+ && content.len() > 2
+ && content.chars().any(|c| c != ' ')
+ {
+ content[1..content.len() - 1].to_string()
+ } else {
+ content
+ };
+ out.push_str(&escape_html(&content));
+ }
+ AnyMdInline::MdInlineLink(link) => {
+ // Extract text content from link text
+ out.push_str(&extract_alt_text(&link.text(), ctx));
+ }
+ AnyMdInline::MdInlineImage(img) => {
+ // Recursively extract alt text from nested image
+ out.push_str(&extract_alt_text(&img.alt(), ctx));
+ }
+ AnyMdInline::MdReferenceLink(link) => {
+ out.push_str(&extract_alt_text(&link.text(), ctx));
+ }
+ AnyMdInline::MdReferenceImage(img) => {
+ out.push_str(&extract_alt_text(&img.alt(), ctx));
+ }
+ AnyMdInline::MdAutolink(autolink) => {
+ let content = collect_raw_inline_text(&autolink.value());
+ out.push_str(&escape_html(&content));
+ }
+ AnyMdInline::MdHardLine(_) | AnyMdInline::MdSoftBreak(_) => {
+ out.push(' ');
+ }
+ AnyMdInline::MdEntityReference(entity) => {
+ render_entity_reference(entity, out);
+ }
+ AnyMdInline::MdInlineHtml(_) | AnyMdInline::MdHtmlBlock(_) => {
+ // HTML tags are stripped in alt text
}
}
-
- result
}
// ============================================================================
@@ -1590,6 +1641,80 @@ mod tests {
 assert_eq!(html, "<p><em>italic</em> and <strong>bold</strong></p>\n");
}
+ #[test]
+ fn test_emphasis_complex_cases() {
+ // Test: Nested
+ let parsed = parse_markdown("**bold *and italic* text**\n");
+ assert_eq!(
+ parsed.syntax().kind(),
+ biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT,
+ "Nested failed: {}",
+ parsed.syntax()
+ );
+
+ // Test: Rule of 3
+ let parsed = parse_markdown("***bold italic***\n");
+ assert_eq!(
+ parsed.syntax().kind(),
+ biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT,
+ "Rule of 3 failed: {}",
+ parsed.syntax()
+ );
+
+ // Test: Multiple runs
+ let parsed = parse_markdown("*a **b** c*\n");
+ assert_eq!(
+ parsed.syntax().kind(),
+ biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT,
+ "Multiple runs failed: {}",
+ parsed.syntax()
+ );
+
+ // Test: Overlapping
+ let parsed = parse_markdown("*foo**bar**baz*\n");
+ assert_eq!(
+ parsed.syntax().kind(),
+ biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT,
+ "Overlapping failed: {}",
+ parsed.syntax()
+ );
+
+ // Test: Unbalanced emphasis (CommonMark example 442)
+ // **foo* should produce *<em>foo</em>
+ let parsed = parse_markdown("**foo*\n");
+ let html = document_to_html(
+ &parsed.tree(),
+ parsed.list_tightness(),
+ parsed.list_item_indents(),
+ parsed.quote_indents(),
+ );
+ assert_eq!(
+ html,
+ "<p>*<em>foo</em></p>\n",
+ "Unbalanced: {}",
+ parsed.syntax()
+ );
+ }
+
+ #[test]
+ fn test_example_431() {
+ // Test: Example 431 - nested emphasis with triple star closer
+ // **foo *bar*** should produce <strong>foo <em>bar</em></strong>
+ let parsed = parse_markdown("**foo *bar***\n");
+ let html = document_to_html(
+ &parsed.tree(),
+ parsed.list_tightness(),
+ parsed.list_item_indents(),
+ parsed.quote_indents(),
+ );
+ assert_eq!(
+ html,
+ "<p><strong>foo <em>bar</em></strong></p>\n",
+ "Example 431: {}",
+ parsed.syntax()
+ );
+ }
+
#[test]
fn test_escape_html() {
 assert_eq!(escape_html("a & b < c > d"), "a &amp; b &lt; c &gt; d");
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap
deleted file mode 100644
index c5e71c93f27d..000000000000
--- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap
+++ /dev/null
@@ -1,86 +0,0 @@
----
-source: crates/biome_markdown_parser/tests/spec_test.rs
-expression: snapshot
----
-## Input
-
-```
-This has `unclosed code
-
-```
-
-
-## AST
-
-```
-MdDocument {
- bom_token: missing (optional),
- value: MdBlockList [
- MdParagraph {
- list: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [],
- },
- MdInlineCode {
- l_tick_token: BACKTICK@9..10 "`" [] [],
- content: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [],
- },
- ],
- r_tick_token: missing (required),
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [],
- },
- ],
- hard_line: missing (optional),
- },
- ],
- eof_token: EOF@24..24 "" [] [],
-}
-```
-
-## CST
-
-```
-0: MD_DOCUMENT@0..24
- 0: (empty)
- 1: MD_BLOCK_LIST@0..24
- 0: MD_PARAGRAPH@0..24
- 0: MD_INLINE_ITEM_LIST@0..24
- 0: MD_TEXTUAL@0..9
- 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] []
- 1: MD_INLINE_CODE@9..23
- 0: BACKTICK@9..10 "`" [] []
- 1: MD_INLINE_ITEM_LIST@10..23
- 0: MD_TEXTUAL@10..23
- 0: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] []
- 2: (empty)
- 2: MD_TEXTUAL@23..24
- 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] []
- 1: (empty)
- 2: EOF@24..24 "" [] []
-
-```
-
-## Diagnostics
-
-```
-unclosed_code_span.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-
- × Unclosed code span, expected closing `.
-
- > 1 │ This has `unclosed code
- │ ^
- 2 │
-
- i code span started here
-
- > 1 │ This has `unclosed code
- │ ^
- 2 │
-
- i Add closing ` to close the code span.
-
-```
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap
index 10e49ec3d451..f0790723e0d0 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap
@@ -103,20 +103,20 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@66..77 "Rule of 3: " [] [],
},
- MdInlineEmphasis {
- l_fence: DOUBLE_STAR@77..79 "**" [] [],
+ MdInlineItalic {
+ l_fence: STAR@77..78 "*" [] [],
content: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@79..80 "*" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [],
+ MdInlineEmphasis {
+ l_fence: DOUBLE_STAR@78..80 "**" [] [],
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [],
+ },
+ ],
+ r_fence: DOUBLE_STAR@91..93 "**" [] [],
},
],
- r_fence: DOUBLE_STAR@91..93 "**" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@93..94 "*" [] [],
+ r_fence: STAR@93..94 "*" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@94..95 "\n" [] [],
@@ -301,17 +301,17 @@ MdDocument {
0: MD_INLINE_ITEM_LIST@66..95
0: MD_TEXTUAL@66..77
0: MD_TEXTUAL_LITERAL@66..77 "Rule of 3: " [] []
- 1: MD_INLINE_EMPHASIS@77..93
- 0: DOUBLE_STAR@77..79 "**" [] []
- 1: MD_INLINE_ITEM_LIST@79..91
- 0: MD_TEXTUAL@79..80
- 0: MD_TEXTUAL_LITERAL@79..80 "*" [] []
- 1: MD_TEXTUAL@80..91
- 0: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] []
- 2: DOUBLE_STAR@91..93 "**" [] []
- 2: MD_TEXTUAL@93..94
- 0: MD_TEXTUAL_LITERAL@93..94 "*" [] []
- 3: MD_TEXTUAL@94..95
+ 1: MD_INLINE_ITALIC@77..94
+ 0: STAR@77..78 "*" [] []
+ 1: MD_INLINE_ITEM_LIST@78..93
+ 0: MD_INLINE_EMPHASIS@78..93
+ 0: DOUBLE_STAR@78..80 "**" [] []
+ 1: MD_INLINE_ITEM_LIST@80..91
+ 0: MD_TEXTUAL@80..91
+ 0: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] []
+ 2: DOUBLE_STAR@91..93 "**" [] []
+ 2: STAR@93..94 "*" [] []
+ 2: MD_TEXTUAL@94..95
0: MD_TEXTUAL_LITERAL@94..95 "\n" [] []
1: (empty)
5: MD_NEWLINE@95..96
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap
index bdbaf86e823d..94116b93fa48 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap
@@ -21,26 +21,32 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@0..10 "Crossing: " [] [],
},
- MdInlineEmphasis {
- l_fence: DOUBLE_STAR@10..12 "**" [] [],
+ MdInlineItalic {
+ l_fence: STAR@10..11 "*" [] [],
content: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@12..14 "a " [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@14..15 "*" [] [],
+ MdInlineItalic {
+ l_fence: STAR@11..12 "*" [] [],
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@12..14 "a " [] [],
+ },
+ MdInlineItalic {
+ l_fence: STAR@14..15 "*" [] [],
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@15..16 "b" [] [],
+ },
+ ],
+ r_fence: STAR@16..17 "*" [] [],
+ },
+ ],
+ r_fence: STAR@17..18 "*" [] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@15..16 "b" [] [],
+ value_token: MD_TEXTUAL_LITERAL@18..20 " c" [] [],
},
],
- r_fence: DOUBLE_STAR@16..18 "**" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@18..20 " c" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@20..21 "*" [] [],
+ r_fence: STAR@20..21 "*" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@21..22 "\n" [] [],
@@ -63,21 +69,25 @@ MdDocument {
0: MD_INLINE_ITEM_LIST@0..22
0: MD_TEXTUAL@0..10
0: MD_TEXTUAL_LITERAL@0..10 "Crossing: " [] []
- 1: MD_INLINE_EMPHASIS@10..18
- 0: DOUBLE_STAR@10..12 "**" [] []
- 1: MD_INLINE_ITEM_LIST@12..16
- 0: MD_TEXTUAL@12..14
- 0: MD_TEXTUAL_LITERAL@12..14 "a " [] []
- 1: MD_TEXTUAL@14..15
- 0: MD_TEXTUAL_LITERAL@14..15 "*" [] []
- 2: MD_TEXTUAL@15..16
- 0: MD_TEXTUAL_LITERAL@15..16 "b" [] []
- 2: DOUBLE_STAR@16..18 "**" [] []
- 2: MD_TEXTUAL@18..20
- 0: MD_TEXTUAL_LITERAL@18..20 " c" [] []
- 3: MD_TEXTUAL@20..21
- 0: MD_TEXTUAL_LITERAL@20..21 "*" [] []
- 4: MD_TEXTUAL@21..22
+ 1: MD_INLINE_ITALIC@10..21
+ 0: STAR@10..11 "*" [] []
+ 1: MD_INLINE_ITEM_LIST@11..20
+ 0: MD_INLINE_ITALIC@11..18
+ 0: STAR@11..12 "*" [] []
+ 1: MD_INLINE_ITEM_LIST@12..17
+ 0: MD_TEXTUAL@12..14
+ 0: MD_TEXTUAL_LITERAL@12..14 "a " [] []
+ 1: MD_INLINE_ITALIC@14..17
+ 0: STAR@14..15 "*" [] []
+ 1: MD_INLINE_ITEM_LIST@15..16
+ 0: MD_TEXTUAL@15..16
+ 0: MD_TEXTUAL_LITERAL@15..16 "b" [] []
+ 2: STAR@16..17 "*" [] []
+ 2: STAR@17..18 "*" [] []
+ 1: MD_TEXTUAL@18..20
+ 0: MD_TEXTUAL_LITERAL@18..20 " c" [] []
+ 2: STAR@20..21 "*" [] []
+ 2: MD_TEXTUAL@21..22
0: MD_TEXTUAL_LITERAL@21..22 "\n" [] []
1: (empty)
2: EOF@22..22 "" [] []
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap
index 1183219b2b94..abc0b3d4823f 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap
@@ -22,7 +22,10 @@ MdDocument {
value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@9..11 "**" [] [],
+ value_token: MD_TEXTUAL_LITERAL@9..10 "*" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@10..11 "*" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@11..24 "unclosed bold" [] [],
@@ -48,11 +51,13 @@ MdDocument {
0: MD_INLINE_ITEM_LIST@0..25
0: MD_TEXTUAL@0..9
0: MD_TEXTUAL_LITERAL@0..9 "This has " [] []
- 1: MD_TEXTUAL@9..11
- 0: MD_TEXTUAL_LITERAL@9..11 "**" [] []
- 2: MD_TEXTUAL@11..24
+ 1: MD_TEXTUAL@9..10
+ 0: MD_TEXTUAL_LITERAL@9..10 "*" [] []
+ 2: MD_TEXTUAL@10..11
+ 0: MD_TEXTUAL_LITERAL@10..11 "*" [] []
+ 3: MD_TEXTUAL@11..24
0: MD_TEXTUAL_LITERAL@11..24 "unclosed bold" [] []
- 3: MD_TEXTUAL@24..25
+ 4: MD_TEXTUAL@24..25
0: MD_TEXTUAL_LITERAL@24..25 "\n" [] []
1: (empty)
2: EOF@25..25 "" [] []
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md
similarity index 100%
rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md
rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap
new file mode 100644
index 000000000000..eefa654d5e99
--- /dev/null
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap
@@ -0,0 +1,60 @@
+---
+source: crates/biome_markdown_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```
+This has `unclosed code
+
+```
+
+
+## AST
+
+```
+MdDocument {
+ bom_token: missing (optional),
+ value: MdBlockList [
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@9..10 "`" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ ],
+ eof_token: EOF@24..24 "" [] [],
+}
+```
+
+## CST
+
+```
+0: MD_DOCUMENT@0..24
+ 0: (empty)
+ 1: MD_BLOCK_LIST@0..24
+ 0: MD_PARAGRAPH@0..24
+ 0: MD_INLINE_ITEM_LIST@0..24
+ 0: MD_TEXTUAL@0..9
+ 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] []
+ 1: MD_TEXTUAL@9..10
+ 0: MD_TEXTUAL_LITERAL@9..10 "`" [] []
+ 2: MD_TEXTUAL@10..23
+ 0: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] []
+ 3: MD_TEXTUAL@23..24
+ 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] []
+ 1: (empty)
+ 2: EOF@24..24 "" [] []
+
+```
diff --git a/crates/biome_unicode_table/src/lib.rs b/crates/biome_unicode_table/src/lib.rs
index 3d2c3a82cd4c..9a5495ebc4af 100644
--- a/crates/biome_unicode_table/src/lib.rs
+++ b/crates/biome_unicode_table/src/lib.rs
@@ -4,9 +4,11 @@ use crate::bytes::DISPATCHER;
use crate::tables::derived_property::{ID_Continue, ID_Start};
mod bytes;
+mod punctuation;
mod tables;
pub use crate::bytes::Dispatch;
+pub use crate::punctuation::is_unicode_punctuation;
/// Tests if `c` is a valid start of a CSS identifier
#[inline]
diff --git a/crates/biome_unicode_table/src/punctuation.rs b/crates/biome_unicode_table/src/punctuation.rs
new file mode 100644
index 000000000000..c3575a823d6b
--- /dev/null
+++ b/crates/biome_unicode_table/src/punctuation.rs
@@ -0,0 +1,413 @@
+//! CommonMark Unicode punctuation table.
+//!
+//! Derived from the markdown-rs Unicode punctuation list used for CommonMark.
+//! Per CommonMark, "Unicode punctuation" includes characters from both the
+//! General_Category=Punctuation (P*) and General_Category=Symbol (S*) categories.
+//! This is used for CommonMark flanking rules in emphasis parsing.
+
+// Note: duplicated from generated unicode tables to keep this module standalone.
+#[inline]
+fn bsearch_range_table(c: char, r: &[(char, char)]) -> bool {
+ use core::cmp::Ordering::{Equal, Greater, Less};
+ r.binary_search_by(|&(lo, hi)| {
+ if lo > c {
+ Greater
+ } else if hi < c {
+ Less
+ } else {
+ Equal
+ }
+ })
+ .is_ok()
+}
+
+const PUNCTUATION_RANGES: &[(char, char)] = &[
+ ('\u{0021}', '\u{002F}'),
+ ('\u{003A}', '\u{0040}'),
+ ('\u{005B}', '\u{0060}'),
+ ('\u{007B}', '\u{007E}'),
+ ('\u{00A1}', '\u{00A9}'),
+ ('\u{00AB}', '\u{00AC}'),
+ ('\u{00AE}', '\u{00B1}'),
+ ('\u{00B4}', '\u{00B4}'),
+ ('\u{00B6}', '\u{00B8}'),
+ ('\u{00BB}', '\u{00BB}'),
+ ('\u{00BF}', '\u{00BF}'),
+ ('\u{00D7}', '\u{00D7}'),
+ ('\u{00F7}', '\u{00F7}'),
+ ('\u{02C2}', '\u{02C5}'),
+ ('\u{02D2}', '\u{02DF}'),
+ ('\u{02E5}', '\u{02EB}'),
+ ('\u{02ED}', '\u{02ED}'),
+ ('\u{02EF}', '\u{02FF}'),
+ ('\u{0375}', '\u{0375}'),
+ ('\u{037E}', '\u{037E}'),
+ ('\u{0384}', '\u{0385}'),
+ ('\u{0387}', '\u{0387}'),
+ ('\u{03F6}', '\u{03F6}'),
+ ('\u{0482}', '\u{0482}'),
+ ('\u{055A}', '\u{055F}'),
+ ('\u{0589}', '\u{058A}'),
+ ('\u{058D}', '\u{058F}'),
+ ('\u{05BE}', '\u{05BE}'),
+ ('\u{05C0}', '\u{05C0}'),
+ ('\u{05C3}', '\u{05C3}'),
+ ('\u{05C6}', '\u{05C6}'),
+ ('\u{05F3}', '\u{05F4}'),
+ ('\u{0606}', '\u{060F}'),
+ ('\u{061B}', '\u{061B}'),
+ ('\u{061D}', '\u{061F}'),
+ ('\u{066A}', '\u{066D}'),
+ ('\u{06D4}', '\u{06D4}'),
+ ('\u{06DE}', '\u{06DE}'),
+ ('\u{06E9}', '\u{06E9}'),
+ ('\u{06FD}', '\u{06FE}'),
+ ('\u{0700}', '\u{070D}'),
+ ('\u{07F6}', '\u{07F9}'),
+ ('\u{07FE}', '\u{07FF}'),
+ ('\u{0830}', '\u{083E}'),
+ ('\u{085E}', '\u{085E}'),
+ ('\u{0888}', '\u{0888}'),
+ ('\u{0964}', '\u{0965}'),
+ ('\u{0970}', '\u{0970}'),
+ ('\u{09F2}', '\u{09F3}'),
+ ('\u{09FA}', '\u{09FB}'),
+ ('\u{09FD}', '\u{09FD}'),
+ ('\u{0A76}', '\u{0A76}'),
+ ('\u{0AF0}', '\u{0AF1}'),
+ ('\u{0B70}', '\u{0B70}'),
+ ('\u{0BF3}', '\u{0BFA}'),
+ ('\u{0C77}', '\u{0C77}'),
+ ('\u{0C7F}', '\u{0C7F}'),
+ ('\u{0C84}', '\u{0C84}'),
+ ('\u{0D4F}', '\u{0D4F}'),
+ ('\u{0D79}', '\u{0D79}'),
+ ('\u{0DF4}', '\u{0DF4}'),
+ ('\u{0E3F}', '\u{0E3F}'),
+ ('\u{0E4F}', '\u{0E4F}'),
+ ('\u{0E5A}', '\u{0E5B}'),
+ ('\u{0F01}', '\u{0F17}'),
+ ('\u{0F1A}', '\u{0F1F}'),
+ ('\u{0F34}', '\u{0F34}'),
+ ('\u{0F36}', '\u{0F36}'),
+ ('\u{0F38}', '\u{0F38}'),
+ ('\u{0F3A}', '\u{0F3D}'),
+ ('\u{0F85}', '\u{0F85}'),
+ ('\u{0FBE}', '\u{0FC5}'),
+ ('\u{0FC7}', '\u{0FCC}'),
+ ('\u{0FCE}', '\u{0FDA}'),
+ ('\u{104A}', '\u{104F}'),
+ ('\u{109E}', '\u{109F}'),
+ ('\u{10FB}', '\u{10FB}'),
+ ('\u{1360}', '\u{1368}'),
+ ('\u{1390}', '\u{1399}'),
+ ('\u{1400}', '\u{1400}'),
+ ('\u{166D}', '\u{166E}'),
+ ('\u{169B}', '\u{169C}'),
+ ('\u{16EB}', '\u{16ED}'),
+ ('\u{1735}', '\u{1736}'),
+ ('\u{17D4}', '\u{17D6}'),
+ ('\u{17D8}', '\u{17DB}'),
+ ('\u{1800}', '\u{180A}'),
+ ('\u{1940}', '\u{1940}'),
+ ('\u{1944}', '\u{1945}'),
+ ('\u{19DE}', '\u{19FF}'),
+ ('\u{1A1E}', '\u{1A1F}'),
+ ('\u{1AA0}', '\u{1AA6}'),
+ ('\u{1AA8}', '\u{1AAD}'),
+ ('\u{1B4E}', '\u{1B4F}'),
+ ('\u{1B5A}', '\u{1B6A}'),
+ ('\u{1B74}', '\u{1B7F}'),
+ ('\u{1BFC}', '\u{1BFF}'),
+ ('\u{1C3B}', '\u{1C3F}'),
+ ('\u{1C7E}', '\u{1C7F}'),
+ ('\u{1CC0}', '\u{1CC7}'),
+ ('\u{1CD3}', '\u{1CD3}'),
+ ('\u{1FBD}', '\u{1FBD}'),
+ ('\u{1FBF}', '\u{1FC1}'),
+ ('\u{1FCD}', '\u{1FCF}'),
+ ('\u{1FDD}', '\u{1FDF}'),
+ ('\u{1FED}', '\u{1FEF}'),
+ ('\u{1FFD}', '\u{1FFE}'),
+ ('\u{2010}', '\u{2027}'),
+ ('\u{2030}', '\u{205E}'),
+ ('\u{207A}', '\u{207E}'),
+ ('\u{208A}', '\u{208E}'),
+ ('\u{20A0}', '\u{20C0}'),
+ ('\u{2100}', '\u{2101}'),
+ ('\u{2103}', '\u{2106}'),
+ ('\u{2108}', '\u{2109}'),
+ ('\u{2114}', '\u{2114}'),
+ ('\u{2116}', '\u{2118}'),
+ ('\u{211E}', '\u{2123}'),
+ ('\u{2125}', '\u{2125}'),
+ ('\u{2127}', '\u{2127}'),
+ ('\u{2129}', '\u{2129}'),
+ ('\u{212E}', '\u{212E}'),
+ ('\u{213A}', '\u{213B}'),
+ ('\u{2140}', '\u{2144}'),
+ ('\u{214A}', '\u{214D}'),
+ ('\u{214F}', '\u{214F}'),
+ ('\u{218A}', '\u{218B}'),
+ ('\u{2190}', '\u{2429}'),
+ ('\u{2440}', '\u{244A}'),
+ ('\u{249C}', '\u{24E9}'),
+ ('\u{2500}', '\u{2775}'),
+ ('\u{2794}', '\u{2B73}'),
+ ('\u{2B76}', '\u{2B95}'),
+ ('\u{2B97}', '\u{2BFF}'),
+ ('\u{2CE5}', '\u{2CEA}'),
+ ('\u{2CF9}', '\u{2CFC}'),
+ ('\u{2CFE}', '\u{2CFF}'),
+ ('\u{2D70}', '\u{2D70}'),
+ ('\u{2E00}', '\u{2E2E}'),
+ ('\u{2E30}', '\u{2E5D}'),
+ ('\u{2E80}', '\u{2E99}'),
+ ('\u{2E9B}', '\u{2EF3}'),
+ ('\u{2F00}', '\u{2FD5}'),
+ ('\u{2FF0}', '\u{2FFF}'),
+ ('\u{3001}', '\u{3004}'),
+ ('\u{3008}', '\u{3020}'),
+ ('\u{3030}', '\u{3030}'),
+ ('\u{3036}', '\u{3037}'),
+ ('\u{303D}', '\u{303F}'),
+ ('\u{309B}', '\u{309C}'),
+ ('\u{30A0}', '\u{30A0}'),
+ ('\u{30FB}', '\u{30FB}'),
+ ('\u{3190}', '\u{3191}'),
+ ('\u{3196}', '\u{319F}'),
+ ('\u{31C0}', '\u{31E5}'),
+ ('\u{31EF}', '\u{31EF}'),
+ ('\u{3200}', '\u{321E}'),
+ ('\u{322A}', '\u{3247}'),
+ ('\u{3250}', '\u{3250}'),
+ ('\u{3260}', '\u{327F}'),
+ ('\u{328A}', '\u{32B0}'),
+ ('\u{32C0}', '\u{33FF}'),
+ ('\u{4DC0}', '\u{4DFF}'),
+ ('\u{A490}', '\u{A4C6}'),
+ ('\u{A4FE}', '\u{A4FF}'),
+ ('\u{A60D}', '\u{A60F}'),
+ ('\u{A673}', '\u{A673}'),
+ ('\u{A67E}', '\u{A67E}'),
+ ('\u{A6F2}', '\u{A6F7}'),
+ ('\u{A700}', '\u{A716}'),
+ ('\u{A720}', '\u{A721}'),
+ ('\u{A789}', '\u{A78A}'),
+ ('\u{A828}', '\u{A82B}'),
+ ('\u{A836}', '\u{A839}'),
+ ('\u{A874}', '\u{A877}'),
+ ('\u{A8CE}', '\u{A8CF}'),
+ ('\u{A8F8}', '\u{A8FA}'),
+ ('\u{A8FC}', '\u{A8FC}'),
+ ('\u{A92E}', '\u{A92F}'),
+ ('\u{A95F}', '\u{A95F}'),
+ ('\u{A9C1}', '\u{A9CD}'),
+ ('\u{A9DE}', '\u{A9DF}'),
+ ('\u{AA5C}', '\u{AA5F}'),
+ ('\u{AA77}', '\u{AA79}'),
+ ('\u{AADE}', '\u{AADF}'),
+ ('\u{AAF0}', '\u{AAF1}'),
+ ('\u{AB5B}', '\u{AB5B}'),
+ ('\u{AB6A}', '\u{AB6B}'),
+ ('\u{ABEB}', '\u{ABEB}'),
+ ('\u{FB29}', '\u{FB29}'),
+ ('\u{FBB2}', '\u{FBC2}'),
+ ('\u{FD3E}', '\u{FD4F}'),
+ ('\u{FDCF}', '\u{FDCF}'),
+ ('\u{FDFC}', '\u{FDFF}'),
+ ('\u{FE10}', '\u{FE19}'),
+ ('\u{FE30}', '\u{FE52}'),
+ ('\u{FE54}', '\u{FE66}'),
+ ('\u{FE68}', '\u{FE6B}'),
+ ('\u{FF01}', '\u{FF0F}'),
+ ('\u{FF1A}', '\u{FF20}'),
+ ('\u{FF3B}', '\u{FF40}'),
+ ('\u{FF5B}', '\u{FF65}'),
+ ('\u{FFE0}', '\u{FFE6}'),
+ ('\u{FFE8}', '\u{FFEE}'),
+ ('\u{FFFC}', '\u{FFFD}'),
+ ('\u{10100}', '\u{10102}'),
+ ('\u{10137}', '\u{1013F}'),
+ ('\u{10179}', '\u{10189}'),
+ ('\u{1018C}', '\u{1018E}'),
+ ('\u{10190}', '\u{1019C}'),
+ ('\u{101A0}', '\u{101A0}'),
+ ('\u{101D0}', '\u{101FC}'),
+ ('\u{1039F}', '\u{1039F}'),
+ ('\u{103D0}', '\u{103D0}'),
+ ('\u{1056F}', '\u{1056F}'),
+ ('\u{10857}', '\u{10857}'),
+ ('\u{10877}', '\u{10878}'),
+ ('\u{1091F}', '\u{1091F}'),
+ ('\u{1093F}', '\u{1093F}'),
+ ('\u{10A50}', '\u{10A58}'),
+ ('\u{10A7F}', '\u{10A7F}'),
+ ('\u{10AC8}', '\u{10AC8}'),
+ ('\u{10AF0}', '\u{10AF6}'),
+ ('\u{10B39}', '\u{10B3F}'),
+ ('\u{10B99}', '\u{10B9C}'),
+ ('\u{10D6E}', '\u{10D6E}'),
+ ('\u{10D8E}', '\u{10D8F}'),
+ ('\u{10EAD}', '\u{10EAD}'),
+ ('\u{10F55}', '\u{10F59}'),
+ ('\u{10F86}', '\u{10F89}'),
+ ('\u{11047}', '\u{1104D}'),
+ ('\u{110BB}', '\u{110BC}'),
+ ('\u{110BE}', '\u{110C1}'),
+ ('\u{11140}', '\u{11143}'),
+ ('\u{11174}', '\u{11175}'),
+ ('\u{111C5}', '\u{111C8}'),
+ ('\u{111CD}', '\u{111CD}'),
+ ('\u{111DB}', '\u{111DB}'),
+ ('\u{111DD}', '\u{111DF}'),
+ ('\u{11238}', '\u{1123D}'),
+ ('\u{112A9}', '\u{112A9}'),
+ ('\u{113D4}', '\u{113D5}'),
+ ('\u{113D7}', '\u{113D8}'),
+ ('\u{1144B}', '\u{1144F}'),
+ ('\u{1145A}', '\u{1145B}'),
+ ('\u{1145D}', '\u{1145D}'),
+ ('\u{114C6}', '\u{114C6}'),
+ ('\u{115C1}', '\u{115D7}'),
+ ('\u{11641}', '\u{11643}'),
+ ('\u{11660}', '\u{1166C}'),
+ ('\u{116B9}', '\u{116B9}'),
+ ('\u{1173C}', '\u{1173F}'),
+ ('\u{1183B}', '\u{1183B}'),
+ ('\u{11944}', '\u{11946}'),
+ ('\u{119E2}', '\u{119E2}'),
+ ('\u{11A3F}', '\u{11A46}'),
+ ('\u{11A9A}', '\u{11A9C}'),
+ ('\u{11A9E}', '\u{11AA2}'),
+ ('\u{11B00}', '\u{11B09}'),
+ ('\u{11BE1}', '\u{11BE1}'),
+ ('\u{11C41}', '\u{11C45}'),
+ ('\u{11C70}', '\u{11C71}'),
+ ('\u{11EF7}', '\u{11EF8}'),
+ ('\u{11F43}', '\u{11F4F}'),
+ ('\u{11FD5}', '\u{11FF1}'),
+ ('\u{11FFF}', '\u{11FFF}'),
+ ('\u{12470}', '\u{12474}'),
+ ('\u{12FF1}', '\u{12FF2}'),
+ ('\u{16A6E}', '\u{16A6F}'),
+ ('\u{16AF5}', '\u{16AF5}'),
+ ('\u{16B37}', '\u{16B3F}'),
+ ('\u{16B44}', '\u{16B45}'),
+ ('\u{16D6D}', '\u{16D6F}'),
+ ('\u{16E97}', '\u{16E9A}'),
+ ('\u{16FE2}', '\u{16FE2}'),
+ ('\u{1BC9C}', '\u{1BC9C}'),
+ ('\u{1BC9F}', '\u{1BC9F}'),
+ ('\u{1CC00}', '\u{1CCEF}'),
+ ('\u{1CD00}', '\u{1CEB3}'),
+ ('\u{1CF50}', '\u{1CFC3}'),
+ ('\u{1D000}', '\u{1D0F5}'),
+ ('\u{1D100}', '\u{1D126}'),
+ ('\u{1D129}', '\u{1D164}'),
+ ('\u{1D16A}', '\u{1D16C}'),
+ ('\u{1D183}', '\u{1D184}'),
+ ('\u{1D18C}', '\u{1D1A9}'),
+ ('\u{1D1AE}', '\u{1D1EA}'),
+ ('\u{1D200}', '\u{1D241}'),
+ ('\u{1D245}', '\u{1D245}'),
+ ('\u{1D300}', '\u{1D356}'),
+ ('\u{1D6C1}', '\u{1D6C1}'),
+ ('\u{1D6DB}', '\u{1D6DB}'),
+ ('\u{1D6FB}', '\u{1D6FB}'),
+ ('\u{1D715}', '\u{1D715}'),
+ ('\u{1D735}', '\u{1D735}'),
+ ('\u{1D74F}', '\u{1D74F}'),
+ ('\u{1D76F}', '\u{1D76F}'),
+ ('\u{1D789}', '\u{1D789}'),
+ ('\u{1D7A9}', '\u{1D7A9}'),
+ ('\u{1D7C3}', '\u{1D7C3}'),
+ ('\u{1D800}', '\u{1D9FF}'),
+ ('\u{1DA37}', '\u{1DA3A}'),
+ ('\u{1DA6D}', '\u{1DA74}'),
+ ('\u{1DA76}', '\u{1DA83}'),
+ ('\u{1DA85}', '\u{1DA8B}'),
+ ('\u{1E14F}', '\u{1E14F}'),
+ ('\u{1E2FF}', '\u{1E2FF}'),
+ ('\u{1E5FF}', '\u{1E5FF}'),
+ ('\u{1E95E}', '\u{1E95F}'),
+ ('\u{1ECAC}', '\u{1ECAC}'),
+ ('\u{1ECB0}', '\u{1ECB0}'),
+ ('\u{1ED2E}', '\u{1ED2E}'),
+ ('\u{1EEF0}', '\u{1EEF1}'),
+ ('\u{1F000}', '\u{1F02B}'),
+ ('\u{1F030}', '\u{1F093}'),
+ ('\u{1F0A0}', '\u{1F0AE}'),
+ ('\u{1F0B1}', '\u{1F0BF}'),
+ ('\u{1F0C1}', '\u{1F0CF}'),
+ ('\u{1F0D1}', '\u{1F0F5}'),
+ ('\u{1F10D}', '\u{1F1AD}'),
+ ('\u{1F1E6}', '\u{1F202}'),
+ ('\u{1F210}', '\u{1F23B}'),
+ ('\u{1F240}', '\u{1F248}'),
+ ('\u{1F250}', '\u{1F251}'),
+ ('\u{1F260}', '\u{1F265}'),
+ ('\u{1F300}', '\u{1F6D7}'),
+ ('\u{1F6DC}', '\u{1F6EC}'),
+ ('\u{1F6F0}', '\u{1F6FC}'),
+ ('\u{1F700}', '\u{1F776}'),
+ ('\u{1F77B}', '\u{1F7D9}'),
+ ('\u{1F7E0}', '\u{1F7EB}'),
+ ('\u{1F7F0}', '\u{1F7F0}'),
+ ('\u{1F800}', '\u{1F80B}'),
+ ('\u{1F810}', '\u{1F847}'),
+ ('\u{1F850}', '\u{1F859}'),
+ ('\u{1F860}', '\u{1F887}'),
+ ('\u{1F890}', '\u{1F8AD}'),
+ ('\u{1F8B0}', '\u{1F8BB}'),
+ ('\u{1F8C0}', '\u{1F8C1}'),
+ ('\u{1F900}', '\u{1FA53}'),
+ ('\u{1FA60}', '\u{1FA6D}'),
+ ('\u{1FA70}', '\u{1FA7C}'),
+ ('\u{1FA80}', '\u{1FA89}'),
+ ('\u{1FA8F}', '\u{1FAC6}'),
+ ('\u{1FACE}', '\u{1FADC}'),
+ ('\u{1FADF}', '\u{1FAE9}'),
+ ('\u{1FAF0}', '\u{1FAF8}'),
+ ('\u{1FB00}', '\u{1FB92}'),
+ ('\u{1FB94}', '\u{1FBEF}'),
+];
+
+/// Check if a character is Unicode punctuation per CommonMark.
+#[inline]
+pub fn is_unicode_punctuation(c: char) -> bool {
+ bsearch_range_table(c, PUNCTUATION_RANGES)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::PUNCTUATION_RANGES;
+ use super::is_unicode_punctuation;
+
+ #[test]
+ fn ascii_punctuation() {
+ assert!(is_unicode_punctuation('!'));
+ assert!(is_unicode_punctuation('.'));
+ assert!(is_unicode_punctuation('('));
+ }
+
+ #[test]
+ fn non_punctuation() {
+ assert!(!is_unicode_punctuation('a'));
+ assert!(!is_unicode_punctuation(' '));
+ assert!(!is_unicode_punctuation('0'));
+ }
+
+ #[test]
+ fn unicode_punctuation() {
+ assert!(is_unicode_punctuation('\u{2014}'));
+ assert!(is_unicode_punctuation('\u{00BF}'));
+ }
+
+ #[test]
+ fn table_is_sorted() {
+ for window in PUNCTUATION_RANGES.windows(2) {
+ assert!(window[0].1 < window[1].0, "Ranges must be sorted");
+ }
+ }
+}
From b6763e5a4850b2bddac6794031fc4b6335df8d7b Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Sun, 25 Jan 2026 01:29:45 -0500
Subject: [PATCH 04/26] fix(markdown): refine block structure and list handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Overhauls the handling of block-level elements, with a major focus on list "tightness" and HTML block detection.
Changes include:
- Align HTML block detection conditions with CommonMark specs (tightening start/end conditions).
- Fix logic for "tight" vs "loose" lists, ensuring correct paragraph wrapping in HTML output.
- Correct indentation handling for nested lists and empty list items.
- Improve detection of blockquotes when they appear on the first line of a list item.
- Fix Setext heading edge cases (examples 086–091).
- Prevent hard line breaks from appearing incorrectly at the end of blocks.
---
crates/biome_markdown_parser/src/lexer/mod.rs | 24 +-
crates/biome_markdown_parser/src/syntax.rs | 245 ++++++++--
.../src/syntax/fenced_code_block.rs | 53 +-
.../src/syntax/header.rs | 4 +-
.../src/syntax/html_block.rs | 273 +++++++++--
.../src/syntax/inline.rs | 50 +-
.../biome_markdown_parser/src/syntax/list.rs | 459 ++++++++++++++++--
.../biome_markdown_parser/src/syntax/quote.rs | 9 +-
crates/biome_markdown_parser/src/to_html.rs | 60 ++-
.../tests/md_test_suite/ok/edge_cases.md.snap | 59 +--
.../md_test_suite/ok/list_indentation.md.snap | 29 +-
.../ok/list_interrupt_empty_bullet.md.snap | 35 +-
.../md_test_suite/ok/list_tightness.md.snap | 113 ++---
.../md_test_suite/ok/multiline_list.md.snap | 57 +--
.../ok/setext_heading_edge_cases.md | 15 +
.../ok/setext_heading_edge_cases.md.snap | 236 +++++++++
.../ok/setext_heading_negative.md | 13 +
.../ok/setext_heading_negative.md.snap | 212 ++++++++
.../biome_markdown_parser/tests/spec_test.rs | 35 +-
xtask/coverage/src/reporters.rs | 2 +-
20 files changed, 1631 insertions(+), 352 deletions(-)
create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md
create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md.snap
create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md
create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md.snap
diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs
index 8122d861de06..a41e12749383 100644
--- a/crates/biome_markdown_parser/src/lexer/mod.rs
+++ b/crates/biome_markdown_parser/src/lexer/mod.rs
@@ -151,6 +151,7 @@ impl<'src> Lexer<'src> for MarkdownLexer<'src> {
// This ensures the *next* token (after NEWLINE) has PRECEDING_LINE_BREAK set.
if !kind.is_trivia()
&& kind != NEWLINE
+ && kind != MD_HARD_LINE_LITERAL
&& !(kind == MD_TEXTUAL_LITERAL
&& self.after_newline
&& self.current_text_is_whitespace())
@@ -876,21 +877,18 @@ impl<'src> MarkdownLexer<'src> {
let start_position = self.position;
let mut eq_count = 0;
- // Consume all `=` and spaces
- loop {
- match self.current_byte() {
- Some(b'=') => {
- self.advance(1);
- eq_count += 1;
- }
- Some(b' ') => {
- self.advance(1);
- }
- _ => break,
- }
+ // Consume only `=` characters — no spaces between (CommonMark §4.3)
+ while let Some(b'=') = self.current_byte() {
+ self.advance(1);
+ eq_count += 1;
+ }
+
+ // Allow optional trailing whitespace only
+ while matches!(self.current_byte(), Some(b' ' | b'\t')) {
+ self.advance(1);
}
- // Must have at least one `=` and be followed by newline or EOF
+ // Must have at least one `=` and nothing else before newline or EOF
if eq_count >= 1 && matches!(self.current_byte(), Some(b'\n' | b'\r') | None) {
return MD_SETEXT_UNDERLINE_LITERAL;
}
diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs
index 96037af6f798..832d93c97246 100644
--- a/crates/biome_markdown_parser/src/syntax.rs
+++ b/crates/biome_markdown_parser/src/syntax.rs
@@ -590,13 +590,25 @@ pub(crate) fn parse_paragraph(p: &mut MarkdownParser) -> ParsedSyntax {
// MD_SETEXT_UNDERLINE_LITERAL is for `=` underlines
// MD_THEMATIC_BREAK_LITERAL with only `-` is also a setext underline (H2)
let completed = if allow_setext && p.at(MD_SETEXT_UNDERLINE_LITERAL) {
- // This is a setext heading (H1 with `=`) - consume the underline
- p.bump(MD_SETEXT_UNDERLINE_LITERAL);
- m.complete(p, MD_SETEXT_HEADER)
+ let indent = real_line_indent_from_source(p);
+ if indent < 4 {
+ // This is a setext heading (H1 with `=`) - consume the underline
+ p.bump(MD_SETEXT_UNDERLINE_LITERAL);
+ m.complete(p, MD_SETEXT_HEADER)
+ } else {
+ // 4+ spaces of indent: not a setext underline (CommonMark §4.3)
+ m.complete(p, MD_PARAGRAPH)
+ }
} else if allow_setext && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
- // This is a setext heading (H2 with `-`) - remap token and consume
- p.bump_remap(MD_SETEXT_UNDERLINE_LITERAL);
- m.complete(p, MD_SETEXT_HEADER)
+ let indent = real_line_indent_from_source(p);
+ if indent < 4 {
+ // This is a setext heading (H2 with `-`) - remap token and consume
+ p.bump_remap(MD_SETEXT_UNDERLINE_LITERAL);
+ m.complete(p, MD_SETEXT_HEADER)
+ } else {
+ // 4+ spaces of indent: not a setext underline (CommonMark §4.3)
+ m.complete(p, MD_PARAGRAPH)
+ }
} else {
m.complete(p, MD_PARAGRAPH)
};
@@ -618,17 +630,105 @@ fn inline_has_non_whitespace(p: &MarkdownParser, start: usize, end: usize) -> bo
.is_empty()
}
+/// Check if a thematic break text contains only dashes (used for setext H2 detection).
+pub(crate) fn is_dash_only_thematic_break_text(text: &str) -> bool {
+ !text.is_empty() && text.trim().chars().all(|c| c == '-')
+}
+
+/// Token-based check: is the current line a setext underline?
+///
+/// Call after consuming a NEWLINE token. Skips 0–3 columns of leading whitespace
+/// (tabs expand to the next tab stop per CommonMark §2.2), then checks for
+/// `MD_SETEXT_UNDERLINE_LITERAL` or a dash-only `MD_THEMATIC_BREAK_LITERAL`.
+///
+/// Returns `Some(bytes_consumed)` if the line is a setext underline, `None` otherwise.
+/// The byte count includes only the whitespace tokens consumed during the indent skip,
+/// NOT the underline token itself. Callers that track byte budgets must subtract this.
+///
+/// This is the single source of truth for setext detection in inline contexts.
+/// Used by `has_matching_code_span_closer`, `parse_inline_html`, and `parse_inline_item_list`.
+///
+/// Context safety: this function does NOT call `allow_setext_heading` because the token
+/// stream itself encodes context. In blockquotes, `R_ANGLE` tokens appear after NEWLINE
+/// before content, so the whitespace-only skip naturally rejects those lines. In list
+/// items, the indent reflected in the token stream is the raw line indent, and the
+/// `columns < 4` check correctly rejects lines with 4+ columns of leading whitespace.
+pub(crate) fn at_setext_underline_after_newline(p: &mut MarkdownParser) -> Option<usize> {
+ let mut columns = 0;
+ let mut bytes_consumed = 0;
+ while columns < INDENT_CODE_BLOCK_SPACES
+ && p.at(MD_TEXTUAL_LITERAL)
+ && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
+ {
+ for c in p.cur_text().chars() {
+ match c {
+ ' ' => columns += 1,
+ '\t' => columns += 4 - (columns % 4),
+ _ => {}
+ }
+ }
+ bytes_consumed += p.cur_text().len();
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ if columns >= INDENT_CODE_BLOCK_SPACES {
+ return None;
+ }
+ let is_setext = p.at(MD_SETEXT_UNDERLINE_LITERAL)
+ || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break_text(p.cur_text()));
+ if is_setext {
+ Some(bytes_consumed)
+ } else {
+ None
+ }
+}
+
+/// Token-based check: does an inline span of `byte_len` bytes cross a setext underline?
+///
+/// Walks tokens via lookahead. At each NEWLINE, delegates to
+/// [`at_setext_underline_after_newline`] — the same detection used by
+/// `has_matching_code_span_closer` and `parse_inline_item_list`.
+pub(crate) fn inline_span_crosses_setext(p: &mut MarkdownParser, byte_len: usize) -> bool {
+ p.lookahead(|p| {
+ let mut remaining = byte_len;
+ loop {
+ if remaining == 0 || p.at(T![EOF]) {
+ return false;
+ }
+ if p.at(NEWLINE) {
+ let nl_len = p.cur_text().len();
+ if nl_len > remaining {
+ return false;
+ }
+ remaining -= nl_len;
+ p.bump(NEWLINE);
+ if let Some(ws_bytes) = at_setext_underline_after_newline(p) {
+ // Only flag if the whitespace consumed is still within our span
+ return ws_bytes <= remaining;
+ }
+ continue;
+ }
+ let tok_len = p.cur_text().len();
+ if tok_len > remaining {
+ return false;
+ }
+ remaining -= tok_len;
+ p.bump_any();
+ }
+ })
+}
+
/// Check if the current thematic break token contains only dashes.
/// This is used to detect H2 setext underlines.
fn is_dash_only_thematic_break(p: &MarkdownParser) -> bool {
- let text = p.cur_text();
- !text.is_empty() && text.trim().chars().all(|c| c == '-')
+ is_dash_only_thematic_break_text(p.cur_text())
}
fn allow_setext_heading(p: &MarkdownParser) -> bool {
let required_indent = p.state().list_item_required_indent;
if required_indent > 0 {
- let indent = p.line_start_leading_indent();
+ // Compute real indent from source text, since leading whitespace
+ // may have been consumed as trivia in list item context.
+ let indent = real_line_indent_from_source(p);
if indent < required_indent {
return false;
}
@@ -649,6 +749,31 @@ fn allow_setext_heading(p: &MarkdownParser) -> bool {
line_has_quote_prefix(p, depth)
}
+/// Compute the real leading indent of the current line from source text.
+/// This is needed because leading whitespace may have been consumed as trivia
+/// in list item context, making `line_start_leading_indent()` return 0.
+fn real_line_indent_from_source(p: &MarkdownParser) -> usize {
+ let source = p.source().source_text();
+ let pos: usize = p.cur_range().start().into();
+
+ // Find the start of the current line
+ let line_start = source[..pos]
+ .rfind('\n')
+ .map(|i| i + 1)
+ .unwrap_or(0);
+
+ // Count leading whitespace columns on this line
+ let mut column = 0;
+ for c in source[line_start..].chars() {
+ match c {
+ ' ' => column += 1,
+ '\t' => column += 4 - (column % 4),
+ _ => break,
+ }
+ }
+ column
+}
+
fn line_has_quote_prefix(p: &MarkdownParser, depth: usize) -> bool {
if depth == 0 {
return false;
@@ -804,13 +929,24 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
consume_partial_quote_prefix(p, quote_depth);
}
- // After crossing a line, check for block-level constructs and setext underlines
- // Check if we're at a setext heading underline
+ // After crossing a line, check for setext underlines.
+ // For non-list paragraphs, we need to look past up to 3 spaces of indent
+ // to detect setext underlines (CommonMark §4.3).
+ if has_content && p.state().list_item_required_indent == 0 {
+ let is_setext = p.lookahead(|p| {
+ at_setext_underline_after_newline(p).is_some()
+ });
+ if is_setext {
+ // Skip the indent so parse_paragraph sees the underline
+ p.skip_line_indent(INDENT_CODE_BLOCK_SPACES);
+ break;
+ }
+ }
+
+ // Check if we're at a setext heading underline (already past indent)
if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) {
break;
}
-
- // Check if we're at a thematic break that could be a setext underline
if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
break;
}
@@ -820,6 +956,23 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
// nested list markers like "\t - baz" to break out of the paragraph.
let required_indent = p.state().list_item_required_indent;
if required_indent > 0 {
+ // Check for setext underline after indent stripping.
+ // The `---` or `===` may be indented by the list item's required indent,
+ // so we need to look past that indent.
+ let real_indent = real_line_indent_from_source(p);
+ if real_indent >= required_indent {
+ let is_setext = p.lookahead(|p| {
+ p.skip_line_indent(required_indent);
+ p.at(MD_SETEXT_UNDERLINE_LITERAL)
+ || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
+ });
+ if is_setext && has_content {
+ // Skip the indent so parse_paragraph sees the underline
+ p.skip_line_indent(required_indent);
+ break;
+ }
+ }
+
let indent = p.line_start_leading_indent();
if indent >= required_indent {
let interrupts = p.lookahead(|p| {
@@ -886,13 +1039,22 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
}
// Check if we're at a setext heading underline (stop for paragraph to handle)
- if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) {
+ // Per CommonMark §4.3, setext underlines can be indented 0-3 spaces only.
+ if has_content
+ && p.at(MD_SETEXT_UNDERLINE_LITERAL)
+ && real_line_indent_from_source(p) < INDENT_CODE_BLOCK_SPACES
+ && allow_setext_heading(p)
+ {
break;
}
// Check if we're at a thematic break that could be a setext underline
// (dash-only thematic breaks following paragraph content are setext H2)
- if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
+ if has_content
+ && p.at(MD_THEMATIC_BREAK_LITERAL)
+ && real_line_indent_from_source(p) < INDENT_CODE_BLOCK_SPACES
+ && is_dash_only_thematic_break(p)
+ {
break;
}
@@ -949,9 +1111,10 @@ fn set_inline_emphasis_context(
};
let base_offset = u32::from(p.cur_range().start()) as usize;
// Create a reference checker closure that uses the parser's link reference definitions
- let context = crate::syntax::inline::EmphasisContext::new(inline_source, base_offset, |label| {
- p.has_link_reference_definition(label)
- });
+ let context =
+ crate::syntax::inline::EmphasisContext::new(inline_source, base_offset, |label| {
+ p.has_link_reference_definition(label)
+ });
p.set_emphasis_context(Some(context))
}
@@ -1075,10 +1238,13 @@ fn line_starts_with_fence(p: &mut MarkdownParser) -> bool {
}
p.skip_line_indent(3);
let rest = p.source_after_current();
- if rest.starts_with("```") {
+ let Some((fence_char, _len)) = fenced_code_block::detect_fence(rest) else {
+ return false;
+ };
+ if fence_char == '`' {
return !info_string_has_backtick(p);
}
- rest.starts_with("~~~")
+ true
})
}
@@ -1173,9 +1339,8 @@ pub(crate) fn at_block_interrupt(p: &mut MarkdownParser) -> bool {
}
// Bullet list item (-, *, +)
- // Per CommonMark §5.2: bullet lists can interrupt paragraphs if:
- // - The item has content, OR
- // - The item is empty but followed by a blank line
+ // Per CommonMark §5.2: bullet lists can interrupt paragraphs only if the
+ // item has content (non-empty). Empty markers cannot interrupt paragraphs.
// When inside a list, we also need to check for list items at ANY indent
// (not just at the current context's indent) because a less-indented list
// marker would end the current list item and start a sibling/parent item.
@@ -1382,17 +1547,12 @@ fn at_order_list_item_textual(p: &mut MarkdownParser) -> bool {
/// Check if a bullet list item can interrupt a top-level paragraph.
///
-/// Per CommonMark §5.2: A bullet list can interrupt a paragraph if:
-/// - The list item has content (at least one character after marker), OR
-/// - The list item is empty but is followed by a blank line
-///
-/// This allows patterns like:
-/// ```markdown
-/// Paragraph text
-/// +
+/// Per CommonMark §5.2: "A bullet list can interrupt a paragraph only if
+/// it starts with a non-empty item (that is, a list item that contains
+/// some non-blank character)."
///
-/// Next paragraph (interrupted by empty bullet + blank line)
-/// ```
+/// This means empty markers (marker followed by only whitespace/newline)
+/// cannot interrupt paragraphs, regardless of what follows.
fn can_bullet_interrupt_paragraph(p: &mut MarkdownParser) -> bool {
let checkpoint = p.checkpoint();
@@ -1414,19 +1574,22 @@ fn can_bullet_interrupt_paragraph(p: &mut MarkdownParser) -> bool {
}
// Check what follows the marker
+ // Per CommonMark §5.2: "A bullet list can interrupt a paragraph only if
+ // it starts with a non-empty item (that is, a list item that contains
+ // some non-blank character)."
let result = if p.at(T![EOF]) {
- // Empty item at EOF - cannot interrupt (no blank line follows)
+ // Empty item at EOF - cannot interrupt
false
} else if p.at(NEWLINE) {
- // Empty item - check if followed by blank line
- p.at_blank_line()
+ // Empty item (marker + newline) - cannot interrupt paragraphs
+ false
} else if p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
- p.bump(MD_TEXTUAL_LITERAL);
- if p.at(NEWLINE) {
- p.at_blank_line()
- } else {
- false
+ // Skip all whitespace tokens after marker
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
}
+ // If only whitespace followed by newline/EOF, item is empty and cannot interrupt
+ !(p.at(NEWLINE) || p.at(T![EOF]))
} else {
// Has content after marker - can interrupt
true
diff --git a/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs b/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs
index ccfd1430b225..37bba8046783 100644
--- a/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs
@@ -33,7 +33,6 @@ use biome_parser::{
};
use super::parse_error::unterminated_fenced_code;
-use super::quote::{consume_quote_prefix, has_quote_prefix};
/// Minimum number of fence characters required per CommonMark §4.5.
const MIN_FENCE_LENGTH: usize = 3;
@@ -135,7 +134,7 @@ fn find_line_start(before: &str) -> usize {
/// Returns `Some((fence_char, length))` if a valid fence is found,
/// where `length` is the actual number of fence characters (3 or more).
/// Returns `None` if no valid fence is present.
-fn detect_fence(s: &str) -> Option<(char, usize)> {
+pub(crate) fn detect_fence(s: &str) -> Option<(char, usize)> {
let first_char = s.chars().next()?;
if first_char != '`' && first_char != '~' {
@@ -272,21 +271,60 @@ fn parse_code_content(
) {
let m = p.start();
let quote_depth = p.state().block_quote_depth;
+ let mut at_line_start = false;
// Consume all tokens until we see the matching closing fence or EOF
while !p.at(T![EOF]) {
- if quote_depth > 0 && (p.at_line_start() || p.has_preceding_line_break()) {
- if !has_quote_prefix(p, quote_depth) {
+ if at_line_start && quote_depth > 0 {
+ let prev_virtual = p.state().virtual_line_start;
+ p.state_mut().virtual_line_start = Some(p.cur_range().start());
+ p.skip_line_indent(3);
+ p.state_mut().virtual_line_start = prev_virtual;
+
+ let mut ok = true;
+ for _ in 0..quote_depth {
+ if p.at(MD_TEXTUAL_LITERAL) && p.cur_text().starts_with('>') {
+ p.force_relex_regular();
+ }
+
+ if p.at(T![>]) {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(T![>]));
+ } else if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">" {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump_remap(T![>]));
+ } else {
+ ok = false;
+ break;
+ }
+
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text == " " || text == "\t" {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
+ }
+ }
+ }
+
+ if !ok {
break;
}
- consume_quote_prefix(p, quote_depth);
+ at_line_start = false;
+ }
+
+ if p.at(NEWLINE) {
+ // Preserve newlines as code content and reset virtual line start.
+ let text_m = p.start();
+ p.bump_remap(MD_TEXTUAL_LITERAL);
+ text_m.complete(p, MD_TEXTUAL);
+ p.set_virtual_line_start();
+ at_line_start = true;
+ continue;
}
if at_closing_fence(p, is_tilde_fence, fence_len) {
break;
}
- if p.at_line_start() && fence_indent > 0 {
+ if at_line_start && fence_indent > 0 {
skip_fenced_content_indent(p, fence_indent);
if at_closing_fence(p, is_tilde_fence, fence_len) {
break;
@@ -297,6 +335,7 @@ fn parse_code_content(
let text_m = p.start();
p.bump_remap(MD_TEXTUAL_LITERAL);
text_m.complete(p, MD_TEXTUAL);
+ at_line_start = false;
}
m.complete(p, MD_INLINE_ITEM_LIST);
@@ -317,7 +356,7 @@ pub(crate) fn info_string_has_backtick(p: &mut MarkdownParser) -> bool {
}
while !p.at_inline_end() {
- if p.at(BACKTICK) {
+ if p.at(BACKTICK) || p.at(T!["```"]) {
return true;
}
p.bump(p.cur());
diff --git a/crates/biome_markdown_parser/src/syntax/header.rs b/crates/biome_markdown_parser/src/syntax/header.rs
index 58127cfb57c3..555f1da00671 100644
--- a/crates/biome_markdown_parser/src/syntax/header.rs
+++ b/crates/biome_markdown_parser/src/syntax/header.rs
@@ -138,7 +138,7 @@ fn parse_hash_list(p: &mut MarkdownParser) -> usize {
///
/// This stops at end of line (NEWLINE or EOF) or when trailing hashes are detected.
/// Note: NEWLINE is an explicit token (not trivia), so we check `at_inline_end()`.
-fn parse_header_content(p: &mut MarkdownParser) {
+pub(crate) fn parse_header_content(p: &mut MarkdownParser) {
// Check if there's any content (not at EOF or NEWLINE)
if p.at_inline_end() {
return;
@@ -240,7 +240,7 @@ fn at_trailing_hashes_start(p: &mut MarkdownParser) -> bool {
///
/// The lexer emits all consecutive `#` characters as a single HASH token.
/// We wrap it in an MdHash node to match the grammar.
-fn parse_trailing_hashes(p: &mut MarkdownParser) {
+pub(crate) fn parse_trailing_hashes(p: &mut MarkdownParser) {
let m = p.start();
if at_trailing_hashes_start(p) {
diff --git a/crates/biome_markdown_parser/src/syntax/html_block.rs b/crates/biome_markdown_parser/src/syntax/html_block.rs
index 7865eb63651b..8d2f51c43608 100644
--- a/crates/biome_markdown_parser/src/syntax/html_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/html_block.rs
@@ -34,34 +34,164 @@ pub(crate) fn at_html_block(p: &mut MarkdownParser) -> bool {
/// Check if content after `<` looks like HTML (tag, comment, declaration, etc.).
fn is_html_like_content(p: &MarkdownParser) -> bool {
+ html_block_kind(p).is_some()
+}
+
+#[derive(Clone, Copy)]
+enum HtmlBlockKind {
+ Type1(Type1Tag),
+ Type2,
+ Type3,
+ Type4,
+ Type5,
+ Type6,
+ Type7,
+}
+
+#[derive(Clone, Copy)]
+enum Type1Tag {
+ Script,
+ Pre,
+ Style,
+ Textarea,
+}
+
+fn html_block_kind(p: &MarkdownParser) -> Option<HtmlBlockKind> {
let remaining = p.source_after_current();
if !remaining.starts_with('<') {
- return false;
+ return None;
}
let after_angle = &remaining[1..];
- // Comment, CDATA, declaration, or processing instruction
- if after_angle.starts_with("!--")
- || after_angle.starts_with("![CDATA[")
- || after_angle.starts_with('?')
- {
- return true;
+ // Comment
+ if after_angle.starts_with("!--") {
+ return Some(HtmlBlockKind::Type2);
+ }
+
+ // Processing instruction
+ if after_angle.starts_with('?') {
+ return Some(HtmlBlockKind::Type3);
+ }
+
+ // CDATA
+ if after_angle.starts_with("![CDATA[") {
+ return Some(HtmlBlockKind::Type5);
}
+    // Declaration: `<!` followed by an ASCII letter (HTML block Type 4)
+    if after_angle.starts_with('!')
+        && after_angle[1..].starts_with(|c: char| c.is_ascii_alphabetic())
+    {
+        return Some(HtmlBlockKind::Type4);
+    }
+
+    let tag_name = html_tag_name(after_angle)?;
+    if let Some(tag) = type1_tag(tag_name) {
+        return Some(HtmlBlockKind::Type1(tag));
+    }
+    if BLOCK_TAGS.iter().any(|t| t.eq_ignore_ascii_case(tag_name)) {
+        return Some(HtmlBlockKind::Type6);
+    }
+    // Type 7: a complete open/close tag alone on its line
+    if line_has_only_tag(first_line(remaining)) {
+        return Some(HtmlBlockKind::Type7);
+    }
+    None
+}
+
+/// Extract the tag name after `<` (and an optional `/`), if the first
+/// character can start a tag name.
+fn html_tag_name(after_angle: &str) -> Option<&str> {
let tag_start = after_angle.strip_prefix('/').unwrap_or(after_angle);
- tag_start
- .chars()
- .next()
- .is_some_and(|c| c.is_ascii_alphabetic())
+ let bytes = tag_start.as_bytes();
+ let first = *bytes.first()?;
+ if !first.is_ascii_alphabetic() {
+ return None;
+ }
+
+ let tag_end = bytes
+ .iter()
+ .position(|b| !b.is_ascii_alphanumeric() && *b != b'-')
+ .unwrap_or(tag_start.len());
+ let tag_name = &tag_start[..tag_end];
+
+ let boundary = tag_start.as_bytes().get(tag_end).copied();
+ if matches!(
+ boundary,
+ None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'\x0c' | b'>' | b'/')
+ ) {
+ Some(tag_name)
+ } else {
+ None
+ }
+}
+
+fn type1_tag(tag_name: &str) -> Option<Type1Tag> {
+ if tag_name.eq_ignore_ascii_case("script") {
+ Some(Type1Tag::Script)
+ } else if tag_name.eq_ignore_ascii_case("pre") {
+ Some(Type1Tag::Pre)
+ } else if tag_name.eq_ignore_ascii_case("style") {
+ Some(Type1Tag::Style)
+ } else if tag_name.eq_ignore_ascii_case("textarea") {
+ Some(Type1Tag::Textarea)
+ } else {
+ None
+ }
+}
+
+fn first_line(text: &str) -> &str {
+ text.split_once(['\n', '\r']).map_or(text, |(line, _)| line)
+}
+
+fn line_has_only_tag(line: &str) -> bool {
+ let bytes = line.as_bytes();
+ if !bytes.starts_with(b"<") {
+ return false;
+ }
+
+ let Some(end) = tag_end_index(bytes) else {
+ return false;
+ };
+
+ line[end + 1..].chars().all(|c| c == ' ' || c == '\t')
+}
+
+fn tag_end_index(bytes: &[u8]) -> Option<usize> {
+ let mut i = 1;
+ let mut in_single = false;
+ let mut in_double = false;
+
+ while i < bytes.len() {
+ let b = bytes[i];
+ if in_single {
+ if b == b'\'' {
+ in_single = false;
+ }
+ i += 1;
+ continue;
+ }
+ if in_double {
+ if b == b'"' {
+ in_double = false;
+ }
+ i += 1;
+ continue;
+ }
+
+ match b {
+ b'\'' => in_single = true,
+ b'"' => in_double = true,
+ b'>' => return Some(i),
+ _ => {}
+ }
+ i += 1;
+ }
+
+ None
}
/// Block-level tags that can interrupt paragraphs.
@@ -115,17 +245,12 @@ const BLOCK_TAGS: &[&str] = &[
"option",
"p",
"param",
- "pre",
- "script",
+ "search",
"section",
- "source",
- "style",
"summary",
"table",
"tbody",
"td",
- "template",
- "textarea",
"tfoot",
"th",
"thead",
@@ -138,34 +263,19 @@ const BLOCK_TAGS: &[&str] = &[
/// Only block-level HTML and special constructs interrupt paragraphs.
pub(crate) fn at_html_block_interrupt(p: &mut MarkdownParser) -> bool {
p.lookahead(|p| {
- if !at_html_block(p) {
- return false;
- }
-
- let remaining = p.source_after_current();
- if remaining.len() < 2 {
+ let Some(kind) = html_block_kind(p) else {
return false;
- }
-
- let after_angle = &remaining[1..];
-
- // Special constructs always interrupt
- if after_angle.starts_with("!--")
- || after_angle.starts_with("![CDATA[")
- || after_angle.starts_with('!')
- || after_angle.starts_with('?')
- {
- return true;
- }
-
- // Check for block-level tag
- let tag_start = after_angle.strip_prefix('/').unwrap_or(after_angle);
- let tag_name: String = tag_start
- .chars()
- .take_while(|c| c.is_ascii_alphanumeric())
- .collect();
-
- BLOCK_TAGS.iter().any(|t| t.eq_ignore_ascii_case(&tag_name))
+ };
+
+ matches!(
+ kind,
+ HtmlBlockKind::Type1 { .. }
+ | HtmlBlockKind::Type2
+ | HtmlBlockKind::Type3
+ | HtmlBlockKind::Type4
+ | HtmlBlockKind::Type5
+ | HtmlBlockKind::Type6
+ )
})
}
@@ -177,10 +287,29 @@ pub(crate) fn parse_html_block(p: &mut MarkdownParser) -> ParsedSyntax {
return Absent;
}
+ let Some(kind) = html_block_kind(p) else {
+ return Absent;
+ };
+
let m = p.start();
let content_m = p.start();
- parse_until_blank_line(p);
+ match kind {
+ HtmlBlockKind::Type1(tag) => {
+ let terminator = match tag {
+ Type1Tag::Script => "</script>",
+ Type1Tag::Pre => "</pre>",
+ Type1Tag::Style => "</style>",
+ Type1Tag::Textarea => "</textarea>",
+ };
+ parse_until_terminator(p, terminator, true);
+ }
+ HtmlBlockKind::Type2 => parse_until_terminator(p, "-->", false),
+ HtmlBlockKind::Type3 => parse_until_terminator(p, "?>", false),
+ HtmlBlockKind::Type4 => parse_until_terminator(p, ">", false),
+ HtmlBlockKind::Type5 => parse_until_terminator(p, "]]>", false),
+ HtmlBlockKind::Type6 | HtmlBlockKind::Type7 => parse_until_blank_line(p),
+ }
content_m.complete(p, MD_INLINE_ITEM_LIST);
Present(m.complete(p, MD_HTML_BLOCK))
@@ -210,6 +339,56 @@ fn parse_until_blank_line(p: &mut MarkdownParser) {
}
}
+fn parse_until_terminator(p: &mut MarkdownParser, terminator: &str, case_insensitive: bool) {
+ let mut line = String::new();
+
+ while !p.at(EOF) {
+ if at_container_boundary(p) {
+ break;
+ }
+
+ let text = p.cur_text();
+ let is_newline = p.at(NEWLINE);
+ line.push_str(text);
+
+ let text_m = p.start();
+ p.bump_remap(MD_TEXTUAL_LITERAL);
+ text_m.complete(p, MD_TEXTUAL);
+
+ if is_newline {
+ if line_contains(&line, terminator, case_insensitive) {
+ break;
+ }
+ line.clear();
+ skip_container_prefixes(p);
+ }
+ }
+}
+
+fn line_contains(line: &str, needle: &str, case_insensitive: bool) -> bool {
+ if !case_insensitive {
+ return line.contains(needle);
+ }
+
+ let hay = line.as_bytes();
+ let needle = needle.as_bytes();
+ if needle.is_empty() || hay.len() < needle.len() {
+ return false;
+ }
+
+ for i in 0..=hay.len() - needle.len() {
+ if hay[i..i + needle.len()]
+ .iter()
+ .zip(needle.iter())
+ .all(|(a, b)| a.to_ascii_lowercase() == b.to_ascii_lowercase())
+ {
+ return true;
+ }
+ }
+
+ false
+}
+
fn skip_container_prefixes(p: &mut MarkdownParser) {
let quote_depth = p.state().block_quote_depth;
if quote_depth > 0 && has_quote_prefix(p, quote_depth) {
diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs
index 87d649e36039..f00c96753a0e 100644
--- a/crates/biome_markdown_parser/src/syntax/inline.rs
+++ b/crates/biome_markdown_parser/src/syntax/inline.rs
@@ -647,8 +647,12 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -
use crate::lexer::MarkdownLexContext;
p.lookahead(|p| {
- // Skip the opening backticks
- p.bump(BACKTICK);
+ // Skip the opening backticks (handle both BACKTICK and TRIPLE_BACKTICK)
+ if p.at(T!["```"]) {
+ p.bump(T!["```"]);
+ } else {
+ p.bump(BACKTICK);
+ }
loop {
// EOF = no matching closer found
@@ -672,14 +676,18 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -
continue;
}
- // Found backticks - check if they match
- if p.at(BACKTICK) {
+ // Found backticks - check if they match (handle both BACKTICK and TRIPLE_BACKTICK)
+ if p.at(BACKTICK) || p.at(T!["```"]) {
let closing_count = p.cur_text().len();
if closing_count == opening_count {
return true;
}
// Not matching - continue searching
- p.bump(BACKTICK);
+ if p.at(T!["```"]) {
+ p.bump(T!["```"]);
+ } else {
+ p.bump(BACKTICK);
+ }
continue;
}
@@ -701,7 +709,12 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -
pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
use crate::lexer::MarkdownLexContext;
- if !p.at(BACKTICK) {
+ // Handle both BACKTICK and TRIPLE_BACKTICK (T!["```"]) as code span openers.
+ // TRIPLE_BACKTICK can appear when backticks are at line start but info string
+ // contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
+ let is_backtick = p.at(BACKTICK);
+ let is_triple_backtick = p.at(T!["```"]);
+ if !is_backtick && !is_triple_backtick {
return Absent;
}
@@ -717,8 +730,12 @@ pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
// We have a valid code span - now parse it
let m = p.start();
- // Opening backtick(s)
- p.bump(BACKTICK);
+ // Opening backtick(s) - remap TRIPLE_BACKTICK to BACKTICK for consistency
+ if is_triple_backtick {
+ p.bump_remap(BACKTICK);
+ } else {
+ p.bump(BACKTICK);
+ }
// Content - parse until we find matching closing backticks
// Per CommonMark, code spans can span multiple lines (newlines become spaces in output)
@@ -744,8 +761,8 @@ pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
continue;
}
- // Found matching closing backticks
- if p.at(BACKTICK) && p.cur_text().len() == opening_count {
+ // Found matching closing backticks (handle both BACKTICK and TRIPLE_BACKTICK)
+ if (p.at(BACKTICK) || p.at(T!["```"])) && p.cur_text().len() == opening_count {
break;
}
@@ -757,7 +774,12 @@ pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
content.complete(p, MD_INLINE_ITEM_LIST);
// Closing backticks (guaranteed to exist due to lookahead check)
- p.bump(BACKTICK);
+ // Remap TRIPLE_BACKTICK to BACKTICK for consistency
+ if p.at(T!["```"]) {
+ p.bump_remap(BACKTICK);
+ } else {
+ p.bump(BACKTICK);
+ }
Present(m.complete(p, MD_INLINE_CODE))
}
@@ -2527,8 +2549,10 @@ pub(crate) fn parse_autolink(p: &mut MarkdownParser) -> ParsedSyntax {
pub(crate) fn parse_any_inline(p: &mut MarkdownParser) -> ParsedSyntax {
if p.at(MD_HARD_LINE_LITERAL) {
parse_hard_line(p)
- } else if p.at(BACKTICK) {
- // Try code span, fall back to literal text if no matching closer exists
+ } else if p.at(BACKTICK) || p.at(T!["```"]) {
+ // Try code span, fall back to literal text if no matching closer exists.
+ // T!["```"] can appear when backticks are at line start but info string
+ // contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
let result = parse_inline_code(p);
if result.is_absent() {
super::parse_textual(p)
diff --git a/crates/biome_markdown_parser/src/syntax/list.rs b/crates/biome_markdown_parser/src/syntax/list.rs
index 896bb92a02f5..43a1af47b93f 100644
--- a/crates/biome_markdown_parser/src/syntax/list.rs
+++ b/crates/biome_markdown_parser/src/syntax/list.rs
@@ -40,7 +40,10 @@ use biome_parser::prelude::ParsedSyntax::{self, *};
use biome_parser::prelude::{CompletedMarker, Marker, ParseDiagnostic, TokenSet};
use biome_parser::{Parser, token_set};
-use super::quote::{consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix};
+use super::quote::{
+ consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix,
+ parse_quote_block_list,
+};
use biome_rowan::TextRange;
use super::fenced_code_block::parse_fenced_code_block;
@@ -63,6 +66,41 @@ const BLOCK_RECOVERY_SET: TokenSet = token_set![
/// CommonMark requires 4 or more spaces for indented code blocks.
const INDENT_CODE_BLOCK_SPACES: usize = 4;
+/// Compute the marker indent for list parsing.
+///
+/// For normal cases, this returns the leading whitespace count from
+/// `line_start_leading_indent()`. For virtual line start cases (nested list
+/// detection), we compute the actual column position from the source text
+/// to ensure correct indented code block detection in nested lists.
+fn compute_marker_indent(p: &MarkdownParser) -> usize {
+ if p.state().virtual_line_start == Some(p.cur_range().start()) {
+ // Virtual line start: compute actual column from source text.
+ // The leading whitespace was skipped as trivia, but we need the
+ // real column for indented code block detection.
+ let source = p.source().source_text();
+ let pos: usize = p.cur_range().start().into();
+
+ // Find the start of the current line
+ let line_start = source[..pos]
+ .rfind('\n')
+ .map(|i| i + 1)
+ .unwrap_or(0);
+
+ // Count columns from line start to current position
+ let mut column = 0;
+ for c in source[line_start..pos].chars() {
+ match c {
+ '\t' => column += 4 - (column % 4),
+ _ => column += 1,
+ }
+ }
+ column
+ } else {
+ // Normal case: use the standard leading indent count
+ p.source().line_start_leading_indent()
+ }
+}
+
/// Check if we're at the start of a bullet list item (`-`, `*`, or `+`).
///
/// A bullet list marker at line start followed by content is a list item.
@@ -183,9 +221,15 @@ fn skip_blank_lines_between_items(
is_tight: &mut bool,
last_item_ends_with_blank: &mut bool,
) {
+
// Skip blank lines between list items.
// Per CommonMark §5.3, blank lines between items make the list loose
// but don't end the list.
+ //
+ // Any NEWLINE we see at this position (after the item-terminating newline)
+ // represents a blank line between items. We don't use at_blank_line() here
+ // because it checks if what comes AFTER the newline is blank, but we're
+ // already past one newline - any additional newlines ARE blank lines.
while p.at(NEWLINE) {
// Only skip if there's another list item after the blank lines
if !has_item_after_blank_lines(p) {
@@ -204,6 +248,7 @@ fn update_list_tightness(
is_tight: &mut bool,
last_item_ends_with_blank: &mut bool,
) {
+
// Blank line between items makes the list loose
if *last_item_ends_with_blank {
*is_tight = false;
@@ -230,6 +275,9 @@ where
 FMarker: Fn(&mut MarkdownParser) -> Option<MarkdownSyntaxKind>,
FParse: Fn(&mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo),
{
+ let prev_is_tight = *is_tight;
+ let prev_last_item_ends_with_blank = *last_item_ends_with_blank;
+
skip_blank_lines_between_items(
p,
has_item_after_blank_lines,
@@ -242,7 +290,16 @@ where
}
let (parsed, blank_info) = parse_item(p);
- update_list_tightness(blank_info, is_tight, last_item_ends_with_blank);
+
+ if parsed.is_absent() {
+ // The blank lines we skipped didn't lead to a valid item in this list.
+ // Restore tightness — the blank lines belong to a parent context.
+ *is_tight = prev_is_tight;
+ *last_item_ends_with_blank = prev_last_item_ends_with_blank;
+ } else {
+ update_list_tightness(blank_info, is_tight, last_item_ends_with_blank);
+ }
+
parsed
}
@@ -308,14 +365,17 @@ struct BulletList {
last_item_ends_with_blank: bool,
/// The marker kind for this list (`-`, `*`, or `+`).
 marker_kind: Option<MarkdownSyntaxKind>,
+ /// The indentation level of the list marker (0 for top-level).
+ marker_indent: usize,
}
impl BulletList {
- fn new() -> Self {
+ fn new(marker_indent: usize) -> Self {
Self {
is_tight: true,
last_item_ends_with_blank: false,
marker_kind: None,
+ marker_indent,
}
}
}
@@ -339,29 +399,67 @@ impl ParseNodeList for BulletList {
}
fn is_at_list_end(&self, p: &mut Self::Parser<'_>) -> bool {
- is_at_list_end_common(
+ let marker_indent = self.marker_indent;
+
+ // Check blank line at line start with indent awareness BEFORE
+ // delegating to is_at_list_end_common (which uses non-indent-aware check).
+ if p.at_line_start() && at_blank_line_start(p) {
+ let result = !has_bullet_item_after_blank_lines_at_indent(p, marker_indent);
+
+ return result;
+ }
+
+ let result = is_at_list_end_common(
p,
self.marker_kind,
at_bullet_list_item,
current_bullet_marker,
has_bullet_item_after_blank_lines,
|p, _marker_kind| {
- let next_is_bullet = p.lookahead(|p| {
+ let next_is_bullet_at_indent = p.lookahead(|p| {
p.bump(NEWLINE);
- skip_leading_whitespace_tokens(p);
+ // Count indent before marker (tabs expand to next tab stop)
+ let mut indent = 0usize;
+ while p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text == " " {
+ indent += 1;
+ p.bump(MD_TEXTUAL_LITERAL);
+ } else if text == "\t" {
+ indent += 4 - (indent % 4);
+ p.bump(MD_TEXTUAL_LITERAL);
+ } else {
+ break;
+ }
+ }
+ // Check indent matches this list's marker indent
+ let indent_ok = if marker_indent == 0 {
+ indent <= 3
+ } else {
+ indent >= marker_indent && indent <= marker_indent + 3
+ };
+ if !indent_ok {
+ return false;
+ }
if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
p.bump(p.cur());
return marker_followed_by_whitespace_or_eol(p);
}
false
});
- if next_is_bullet {
+ if next_is_bullet_at_indent {
Some(false)
} else {
- Some(!has_bullet_item_after_blank_lines(p))
+ // Check if bullet after blank lines is at correct indent
+ let has_item = p.lookahead(|p| {
+ has_bullet_item_after_blank_lines_at_indent(p, marker_indent)
+ });
+ Some(!has_item)
}
},
- )
+ );
+
+ result
}
fn recover(
@@ -380,6 +478,7 @@ impl ParseNodeList for BulletList {
fn finish_list(&mut self, p: &mut Self::Parser<'_>, m: Marker) -> CompletedMarker {
let completed = m.complete(p, Self::LIST_KIND);
let range = completed.range(p);
+
p.record_list_tightness(range, self.is_tight);
completed
}
@@ -479,8 +578,11 @@ pub(crate) fn parse_bullet_list_item(p: &mut MarkdownParser) -> ParsedSyntax {
// Increment list depth
p.state_mut().list_nesting_depth += 1;
+ // Compute the marker indent (leading whitespace before the first marker)
+ let marker_indent = compute_marker_indent(p);
+
// Use ParseNodeList to parse the list with proper recovery
- let mut list_helper = BulletList::new();
+ let mut list_helper = BulletList::new(marker_indent);
list_helper.parse_list(p);
// Decrement list depth
@@ -501,11 +603,10 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) {
let m = p.start();
- let marker_indent = if p.state().virtual_line_start == Some(p.cur_range().start()) {
- 0
- } else {
- p.source().line_start_leading_indent()
- };
+ // Compute the marker indent, handling both normal and virtual line start cases.
+ // For virtual line start (nested list detection), we compute the actual column
+ // to ensure correct indented code block detection.
+ let marker_indent = compute_marker_indent(p);
skip_list_marker_indent(p);
// Bullet marker is 1 character (-, *, or +)
@@ -536,18 +637,41 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) {
// Count spaces after marker to determine required indentation.
// Per CommonMark §5.2, content aligns to first non-space after marker.
- let spaces_after_marker = if let Some(text) = marker_token_text.as_deref() {
- count_spaces_after_dash_in_token(text, marker_indent + marker_width)
+ //
+ // For the setext-remapped case (marker_token_text is Some), the token includes
+ // trailing spaces before the newline. This means the first line is empty
+ // (marker + whitespace + newline), and the trailing spaces shouldn't count
+ // for indentation purposes. Per CommonMark, the required indent is marker_width + 1.
+ let (spaces_after_marker, first_line_empty) = if let Some(text) = marker_token_text.as_deref() {
+ // Setext token case: token is "- " or "- " etc. followed by newline
+ // The first line is empty, so use minimum indent (marker_width + 1)
+ let spaces = count_spaces_after_dash_in_token(text, marker_indent + marker_width);
+ (spaces, true)
} else {
- count_spaces_after_marker(p.source_after_current(), marker_indent + marker_width)
+ let spaces =
+ count_spaces_after_marker(p.source_after_current(), marker_indent + marker_width);
+ // Check if first line is empty by looking at what follows
+ let first_empty = p.lookahead(|p| {
+ // Skip any whitespace
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ // If we hit newline or EOF, first line is empty
+ p.at(NEWLINE) || p.at(T![EOF])
+ });
+ (spaces, first_empty)
};
// Set required indent for continuation lines
// Required indent = marker width + spaces after marker (minimum 1)
+ // BUT: if first line is empty (marker + whitespace + newline), use minimum indent
let prev_required_indent = p.state().list_item_required_indent;
let prev_marker_indent = p.state().list_item_marker_indent;
p.state_mut().list_item_required_indent = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES {
marker_indent + marker_width + 1
+ } else if first_line_empty {
+ // Empty first line: use minimum indent (marker + 1 space)
+ marker_indent + marker_width + 1
} else {
marker_indent + marker_width + spaces_after_marker.max(1)
};
@@ -767,11 +891,10 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI
let m = p.start();
- let marker_indent = if p.state().virtual_line_start == Some(p.cur_range().start()) {
- 0
- } else {
- p.source().line_start_leading_indent()
- };
+ // Compute the marker indent, handling both normal and virtual line start cases.
+ // For virtual line start (nested list detection), we compute the actual column
+ // to ensure correct indented code block detection.
+ let marker_indent = compute_marker_indent(p);
skip_list_marker_indent(p);
// Get marker width from actual token text (e.g., "1." = 2, "10." = 3)
@@ -785,12 +908,24 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI
let spaces_after_marker =
count_spaces_after_marker(p.source_after_current(), marker_indent + marker_width);
+ // Check if first line is empty (marker followed by only whitespace + newline)
+ let first_line_empty = p.lookahead(|p| {
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ p.at(NEWLINE) || p.at(T![EOF])
+ });
+
// Set required indent for continuation lines
// Required indent = marker width + spaces after marker (minimum 1)
+ // BUT: if first line is empty (marker + whitespace + newline), use minimum indent
let prev_required_indent = p.state().list_item_required_indent;
let prev_marker_indent = p.state().list_item_marker_indent;
p.state_mut().list_item_required_indent = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES {
marker_indent + marker_width + 1
+ } else if first_line_empty {
+ // Empty first line: use minimum indent (marker + 1 space)
+ marker_indent + marker_width + 1
} else {
marker_indent + marker_width + spaces_after_marker.max(1)
};
@@ -1079,11 +1214,18 @@ fn parse_list_item_block_content(
if !first_line && p.at(NEWLINE) && !p.at_blank_line() && !newline_has_quote_prefix {
let action = classify_blank_line(p, required_indent, marker_indent);
+ // Check if the NEWLINE we're at is itself on a blank line
+ // (i.e., preceded by another newline). This distinguishes a real
+ // blank line from a content-terminating newline (e.g., after a
+ // fenced code block's closing fence).
+ let is_blank = list_newline_is_blank_line(p);
match action {
BlankLineAction::ContinueItem => {
consume_blank_line(p);
- has_blank_line = true;
- last_was_blank = true;
+ if is_blank {
+ has_blank_line = true;
+ }
+ last_was_blank = is_blank;
continue;
}
BlankLineAction::EndItemAfterBlank => {
@@ -1092,6 +1234,14 @@ fn parse_list_item_block_content(
last_was_blank = true;
break;
}
+ BlankLineAction::EndItemAtBoundary => {
+ consume_blank_line(p);
+ if is_blank {
+ has_blank_line = true;
+ last_was_blank = true;
+ }
+ break;
+ }
BlankLineAction::EndItemBeforeBlank => {
break;
}
@@ -1112,7 +1262,12 @@ fn parse_list_item_block_content(
at_blank_line_after_prefix(p)
};
- if (p.at_line_start() || line_has_quote_prefix) && blank_line_after_prefix {
+ // On the first line (same line as marker), if we're at a blank line,
+ // this is a marker-only line followed by blank line. Handle this
+ // in the first_line && p.at(NEWLINE) block below, not here.
+ if first_line && blank_line_after_prefix && p.at(NEWLINE) {
+ // Fall through to the first_line && p.at(NEWLINE) handler below
+ } else if (p.at_line_start() || line_has_quote_prefix) && blank_line_after_prefix {
if line_has_quote_prefix
&& quote_only_line_indent_at_current(p, quote_depth).is_some()
&& let Some(next_indent) = next_quote_content_indent(p, quote_depth)
@@ -1163,6 +1318,19 @@ fn parse_list_item_block_content(
last_was_blank = true;
break;
}
+ BlankLineAction::EndItemAtBoundary => {
+ // In the blank_line_after_prefix path, we know there's an
+ // actual blank line, so treat as EndItemAfterBlank.
+ if line_has_quote_prefix {
+ consume_quote_prefix(p, quote_depth);
+ }
+ consume_blank_line(p);
+ if !marker_line_break {
+ has_blank_line = true;
+ }
+ last_was_blank = true;
+ break;
+ }
BlankLineAction::EndItemBeforeBlank => {
break;
}
@@ -1195,6 +1363,25 @@ fn parse_list_item_block_content(
if next_is_sibling {
continue;
}
+
+ // Now check if we're at a blank line (the line immediately after marker is empty).
+ // Per CommonMark: if marker-only line is followed by a blank line,
+ // the item is truly empty and subsequent content is outside the list.
+ let now_at_blank_line = p.lookahead(|p| {
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ p.at(NEWLINE) || p.at(T![EOF])
+ });
+
+ if now_at_blank_line {
+ // Item is empty - break out of the loop
+ break;
+ }
+
+ // Continue to next iteration with fresh state to properly handle
+ // the continuation content on the next line.
+ continue;
}
if first_line {
@@ -1247,6 +1434,138 @@ fn parse_list_item_block_content(
}
}
+ // Check for ATX heading on the first line of list item content.
+ // e.g., `- # Foo` should produce a heading inside the list item.
+ let atx_heading_info = p.lookahead(|p| {
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ // # may be tokenized as HASH or MD_TEXTUAL_LITERAL
+ let is_hash = p.at(T![#])
+ || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == '#'));
+ if !is_hash {
+ return None;
+ }
+ let text = p.cur_text();
+ let hash_count = text.len();
+ if hash_count < 1 || hash_count > 6 {
+ return None;
+ }
+ p.bump(p.cur());
+ // Must be followed by space/tab, EOL, or EOF
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return Some(hash_count);
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let t = p.cur_text();
+ if t.starts_with(' ') || t.starts_with('\t') {
+ return Some(hash_count);
+ }
+ }
+ None
+ });
+
+ if atx_heading_info.is_some() {
+ // Skip leading whitespace as trivia
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
+ }
+
+ // Manually build the heading node since we're on the first
+ // line and parse_header can't handle tokens here directly.
+ let header_m = p.start();
+
+ // Build MdHashList > MdHash > T![#]
+ let hash_list_m = p.start();
+ let hash_m = p.start();
+ if p.at(T![#]) {
+ p.bump(T![#]);
+ } else {
+ p.bump_remap(T![#]);
+ }
+ hash_m.complete(p, MD_HASH);
+ hash_list_m.complete(p, MD_HASH_LIST);
+
+ // Parse heading content (inline until end of line)
+ super::header::parse_header_content(p);
+
+ // Parse trailing hashes
+ super::header::parse_trailing_hashes(p);
+
+ header_m.complete(p, MD_HEADER);
+
+ last_block_was_paragraph = false;
+ last_was_blank = false;
+ first_line = false;
+ continue;
+ }
+
+ // Check for blockquote on the first line of list item content.
+ // Per CommonMark §5.2, list item content can include block-level
+ // elements like blockquotes on the same line as the marker.
+ // e.g., `> 1. > Blockquote` has a blockquote inside the list item.
+ let blockquote_start = p.lookahead(|p| {
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ // Check for > as either T![>] or MD_TEXTUAL_LITERAL ">"
+ p.at(T![>])
+ || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">")
+ });
+
+ if blockquote_start {
+ // Skip leading whitespace as trivia
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
+ }
+
+ let prev_virtual = p.state().virtual_line_start;
+ let prev_required = p.state().list_item_required_indent;
+ p.state_mut().virtual_line_start = Some(p.cur_range().start());
+ p.state_mut().list_item_required_indent = 0;
+
+ // Remap textual ">" to T![>] so parse_quote can recognize it.
+ // parse_quote checks `p.at(T![>])` after skipping indent.
+ if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">" {
+ p.bump_remap(T![>]);
+ // We bumped the >, but parse_quote expects to bump it itself.
+ // Instead, manually build the quote node inline.
+ let quote_m = p.start();
+ p.state_mut().block_quote_depth += 1;
+
+ // Skip optional space after >
+ if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == " " {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
+ }
+ p.state_mut().virtual_line_start = Some(p.cur_range().start());
+
+ parse_quote_block_list(p);
+
+ p.state_mut().block_quote_depth -= 1;
+ quote_m.complete(p, MD_QUOTE);
+
+ last_block_was_paragraph = false;
+ last_was_blank = false;
+ first_line = false;
+ p.state_mut().virtual_line_start = prev_virtual;
+ p.state_mut().list_item_required_indent = prev_required;
+ continue;
+ }
+
+ // T![>] case: parse_quote can handle it directly
+ let parsed = super::quote::parse_quote(p);
+ if parsed.is_present() {
+ last_block_was_paragraph = false;
+ last_was_blank = false;
+ first_line = false;
+ p.state_mut().virtual_line_start = prev_virtual;
+ p.state_mut().list_item_required_indent = prev_required;
+ continue;
+ }
+ p.state_mut().virtual_line_start = prev_virtual;
+ p.state_mut().list_item_required_indent = prev_required;
+ }
+
let nested_marker = p.lookahead(|p| {
while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
p.bump(MD_TEXTUAL_LITERAL);
@@ -1501,7 +1820,10 @@ fn list_newline_is_blank_line(p: &MarkdownParser) -> bool {
enum BlankLineAction {
ContinueItem,
+ /// End item; actual blank lines were found before the next item.
EndItemAfterBlank,
+ /// End item; no actual blank lines, just a normal item boundary.
+ EndItemAtBoundary,
EndItemBeforeBlank,
}
@@ -1514,6 +1836,7 @@ fn classify_blank_line(
// Skip ALL consecutive blank lines (not just one).
// Per CommonMark §5.3, multiple blank lines between items still
// belong to the same list - they just make it "loose".
+ let mut blank_lines_found = 0usize;
loop {
let line_is_blank = p.lookahead(|p| {
while p.at(MD_TEXTUAL_LITERAL) {
@@ -1531,6 +1854,8 @@ fn classify_blank_line(
break;
}
+ blank_lines_found += 1;
+
while p.at(MD_TEXTUAL_LITERAL) {
let text = p.cur_text();
if text == " " || text == "\t" {
@@ -1563,7 +1888,12 @@ fn classify_blank_line(
&& (at_bullet_list_item_with_base_indent(p, marker_indent)
|| at_order_list_item_with_base_indent(p, marker_indent))
{
- return BlankLineAction::EndItemAfterBlank;
+ // The first "blank line" is just the item-ending newline.
+ // Only report actual blank lines if more than 1 was found.
+ if blank_lines_found > 1 {
+ return BlankLineAction::EndItemAfterBlank;
+ }
+ return BlankLineAction::EndItemAtBoundary;
}
BlankLineAction::EndItemBeforeBlank
@@ -1742,6 +2072,79 @@ fn has_bullet_item_after_blank_lines(p: &mut MarkdownParser) -> bool {
})
}
+/// Like `has_bullet_item_after_blank_lines` but also checks that the
+/// bullet marker is at the expected indent level for this list.
+fn has_bullet_item_after_blank_lines_at_indent(
+ p: &mut MarkdownParser,
+ expected_indent: usize,
+) -> bool {
+ has_list_item_after_blank_lines_at_indent(
+ p,
+ expected_indent,
+ |p| {
+ if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
+ p.bump(p.cur());
+ marker_followed_by_whitespace_or_eol(p)
+ } else {
+ false
+ }
+ },
+ )
+}
+
+fn has_list_item_after_blank_lines_at_indent<F>(
+ p: &mut MarkdownParser,
+ expected_indent: usize,
+ has_marker: F,
+) -> bool
+where
+ F: Fn(&mut MarkdownParser) -> bool,
+{
+ p.lookahead(|p| {
+ // Skip all blank lines
+ loop {
+ while p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text == " " || text == "\t" {
+ p.bump(MD_TEXTUAL_LITERAL);
+ } else {
+ break;
+ }
+ }
+ if p.at(NEWLINE) {
+ p.bump(NEWLINE);
+ continue;
+ }
+ break;
+ }
+
+ let mut indent = 0;
+ while p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text == " " {
+ indent += 1;
+ p.bump(MD_TEXTUAL_LITERAL);
+ } else if text == "\t" {
+ indent += 4 - (indent % 4);
+ p.bump(MD_TEXTUAL_LITERAL);
+ } else {
+ break;
+ }
+ }
+
+ // Check indent matches the list's marker indent range
+ if expected_indent == 0 {
+ if indent > 3 {
+ return false;
+ }
+ } else if indent < expected_indent || indent > expected_indent + 3 {
+ return false;
+ }
+
+ has_marker(p)
+ })
+}
+
/// Check if there's an ordered list item after skipping blank lines.
///
/// Per CommonMark §5.3, blank lines between list items don't end the list,
@@ -1787,7 +2190,7 @@ where
indent += 1;
p.bump(MD_TEXTUAL_LITERAL);
} else if text == "\t" {
- indent += 4;
+ indent += 4 - (indent % 4);
p.bump(MD_TEXTUAL_LITERAL);
} else {
break;
diff --git a/crates/biome_markdown_parser/src/syntax/quote.rs b/crates/biome_markdown_parser/src/syntax/quote.rs
index 33e4ac03fa61..13d1d2dd660b 100644
--- a/crates/biome_markdown_parser/src/syntax/quote.rs
+++ b/crates/biome_markdown_parser/src/syntax/quote.rs
@@ -195,7 +195,11 @@ impl ParseNodeList for QuoteBlockList {
}
// Parse regular block
+ // Treat content after '>' as column 0 for block parsing (fence detection).
+ let prev_virtual = p.state().virtual_line_start;
+ p.state_mut().virtual_line_start = Some(p.cur_range().start());
let parsed = super::parse_any_block_with_indent_code_policy(p, true);
+ p.state_mut().virtual_line_start = prev_virtual;
if let Present(ref marker) = parsed {
self.last_block_was_paragraph = is_paragraph_like(marker.kind(p));
} else {
@@ -222,13 +226,13 @@ impl ParseNodeList for QuoteBlockList {
}
}
-fn parse_quote_block_list(p: &mut MarkdownParser) {
+pub(crate) fn parse_quote_block_list(p: &mut MarkdownParser) {
let depth = p.state().block_quote_depth;
let mut list = QuoteBlockList::new(depth);
list.parse_list(p);
}
-fn line_has_quote_prefix_at_current(p: &MarkdownParser, depth: usize) -> bool {
+pub(crate) fn line_has_quote_prefix_at_current(p: &MarkdownParser, depth: usize) -> bool {
if depth == 0 {
return false;
}
@@ -371,6 +375,7 @@ pub(crate) fn consume_quote_prefix(p: &mut MarkdownParser, depth: usize) -> bool
consume_quote_prefix_impl(p, depth, true)
}
+/// Check if a quote prefix starts at the current position.
pub(crate) fn consume_quote_prefix_without_virtual(p: &mut MarkdownParser, depth: usize) -> bool {
if depth == 0 || !has_quote_prefix(p, depth) {
return false;
diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs
index 90d75ad4127a..2d6f912fa43e 100644
--- a/crates/biome_markdown_parser/src/to_html.rs
+++ b/crates/biome_markdown_parser/src/to_html.rs
@@ -626,6 +626,9 @@ fn render_fenced_code_block(
if content_indent > 0 {
content = strip_indent_preserve_tabs(&content, content_indent);
}
+ if quote_indent > 0 {
+ content = strip_quote_prefixes(&content, quote_indent);
+ }
// Escape HTML but preserve the content structure
out.push_str(&escape_html(&content));
@@ -715,16 +718,7 @@ fn render_bullet_list(
quote_indent: usize,
) {
let range = list.syntax().text_trimmed_range();
- let mut is_tight = ctx.is_list_tight(range);
- let has_blank_lines = list.md_bullet_list().iter().any(|bullet| {
- bullet
- .content()
- .iter()
- .any(|block| matches!(block, AnyMdBlock::AnyLeafBlock(AnyLeafBlock::MdNewline(_))))
- });
- if has_blank_lines {
- is_tight = false;
- }
+ let is_tight = ctx.is_list_tight(range);
    out.push_str("<ul>\n");
@@ -743,16 +737,7 @@ fn render_ordered_list(
quote_indent: usize,
) {
let range = list.syntax().text_trimmed_range();
- let mut is_tight = ctx.is_list_tight(range);
- let has_blank_lines = list.md_bullet_list().iter().any(|bullet| {
- bullet
- .content()
- .iter()
- .any(|block| matches!(block, AnyMdBlock::AnyLeafBlock(AnyLeafBlock::MdNewline(_))))
- });
- if has_blank_lines {
- is_tight = false;
- }
+ let is_tight = ctx.is_list_tight(range);
// Get starting number from first item
let start = list
@@ -797,17 +782,11 @@ fn render_list_item(
let list_indent = ctx.list_item_indent(bullet.syntax().text_trimmed_range());
let blocks: Vec<_> = bullet.content().iter().collect();
- let item_has_blank_line = blocks.iter().enumerate().any(|(index, block)| {
- if !is_newline_block(block) {
- return false;
- }
-
- // Ignore the marker-line newline when content follows.
- if index == 0 && blocks.iter().skip(1).any(|block| !is_newline_block(block)) {
- return false;
- }
-
- true
+ // A blank line within an item requires two consecutive newline blocks
+ // (one ending the previous line, one for the blank line itself).
+ // A single MD_NEWLINE between blocks is just a structural separator.
+ let item_has_blank_line = blocks.windows(2).any(|pair| {
+ is_newline_block(&pair[0]) && is_newline_block(&pair[1])
});
let is_tight = is_tight && !item_has_blank_line;
@@ -880,6 +859,13 @@ fn render_list_item(
};
render_block(block, ctx, out, true, block_indent, quote_indent);
}
+ // Remove trailing newline when the last content block is a paragraph
+ // (tight list paragraphs should not have trailing newlines)
+ if blocks.iter().rev().find(|b| !is_newline_block(b)).is_some_and(is_paragraph_block)
+ && out.ends_with('\n')
+ {
+ out.pop();
+ }
}
} else {
// Loose list or multiple blocks
@@ -1789,4 +1775,16 @@ mod tests {
"a
\n"
);
}
+
+ #[test]
+ fn test_hard_line_break_at_end_of_block_is_literal() {
+ let parsed = parse_markdown("foo\\\\\n");
+ let html = document_to_html(
+ &parsed.tree(),
+ parsed.list_tightness(),
+ parsed.list_item_indents(),
+ parsed.quote_indents(),
+ );
+        assert_eq!(html, "<p>foo\\</p>\n");
+ }
}
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/edge_cases.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/edge_cases.md.snap
index f48ed0d022d2..c0a5de87ab3a 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/edge_cases.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/edge_cases.md.snap
@@ -327,20 +327,23 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@442..443 "\n" [] [],
},
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@443..450 "```" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@450..451 "\n" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@451..471 "not a code fence" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@471..472 "\n" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@472..479 "```" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] [],
+ MdInlineCode {
+ l_tick_token: BACKTICK@443..450 "```" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] [],
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@450..451 "\n" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@451..471 " not a code fence" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@471..472 "\n" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@472..476 " " [] [],
+ },
+ ],
+ r_tick_token: BACKTICK@476..479 "```" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@479..480 "\n" [] [],
@@ -549,21 +552,23 @@ MdDocument {
0: MD_TEXTUAL_LITERAL@418..442 "Para with indented fence" [] []
1: MD_TEXTUAL@442..443
0: MD_TEXTUAL_LITERAL@442..443 "\n" [] []
- 2: MD_TEXTUAL@443..450
- 0: MD_TEXTUAL_LITERAL@443..450 "```" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] []
- 3: MD_TEXTUAL@450..451
- 0: MD_TEXTUAL_LITERAL@450..451 "\n" [] []
- 4: MD_TEXTUAL@451..471
- 0: MD_TEXTUAL_LITERAL@451..471 "not a code fence" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] []
- 5: MD_TEXTUAL@471..472
- 0: MD_TEXTUAL_LITERAL@471..472 "\n" [] []
- 6: MD_TEXTUAL@472..479
- 0: MD_TEXTUAL_LITERAL@472..479 "```" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] []
- 7: MD_TEXTUAL@479..480
+ 2: MD_INLINE_CODE@443..479
+ 0: BACKTICK@443..450 "```" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] []
+ 1: MD_INLINE_ITEM_LIST@450..476
+ 0: MD_TEXTUAL@450..451
+ 0: MD_TEXTUAL_LITERAL@450..451 "\n" [] []
+ 1: MD_TEXTUAL@451..471
+ 0: MD_TEXTUAL_LITERAL@451..471 " not a code fence" [] []
+ 2: MD_TEXTUAL@471..472
+ 0: MD_TEXTUAL_LITERAL@471..472 "\n" [] []
+ 3: MD_TEXTUAL@472..476
+ 0: MD_TEXTUAL_LITERAL@472..476 " " [] []
+ 2: BACKTICK@476..479 "```" [] []
+ 3: MD_TEXTUAL@479..480
0: MD_TEXTUAL_LITERAL@479..480 "\n" [] []
- 8: MD_TEXTUAL@480..504
+ 4: MD_TEXTUAL@480..504
0: MD_TEXTUAL_LITERAL@480..504 "still the same paragraph" [] []
- 9: MD_TEXTUAL@504..505
+ 5: MD_TEXTUAL@504..505
0: MD_TEXTUAL_LITERAL@504..505 "\n" [] []
1: (empty)
2: EOF@505..505 "" [] []
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/list_indentation.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/list_indentation.md.snap
index 58ebed20087d..452b06da4ad5 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/list_indentation.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/list_indentation.md.snap
@@ -1,6 +1,5 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
-assertion_line: 131
expression: snapshot
---
## Input
@@ -645,19 +644,19 @@ MdDocument {
value_token: MD_TEXTUAL_LITERAL@721..722 "\n" [] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@722..725 " " [Skipped(" "), Skipped(" ")] [],
+ value_token: MD_TEXTUAL_LITERAL@722..741 "inner continued" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@725..726 " " [] [],
+ value_token: MD_TEXTUAL_LITERAL@741..742 "\n" [] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@726..741 "inner continued" [] [],
+ value_token: MD_TEXTUAL_LITERAL@742..743 " " [] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@741..742 "\n" [] [],
+ value_token: MD_TEXTUAL_LITERAL@743..744 " " [] [],
},
MdTextual {
- value_token: MD_TEXTUAL_LITERAL@742..759 "outer continued" [Skipped(" "), Skipped(" ")] [],
+ value_token: MD_TEXTUAL_LITERAL@744..759 "outer continued" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@759..760 "\n" [] [],
@@ -1122,16 +1121,16 @@ MdDocument {
0: MD_TEXTUAL_LITERAL@715..721 " inner" [] []
1: MD_TEXTUAL@721..722
0: MD_TEXTUAL_LITERAL@721..722 "\n" [] []
- 2: MD_TEXTUAL@722..725
- 0: MD_TEXTUAL_LITERAL@722..725 " " [Skipped(" "), Skipped(" ")] []
- 3: MD_TEXTUAL@725..726
- 0: MD_TEXTUAL_LITERAL@725..726 " " [] []
- 4: MD_TEXTUAL@726..741
- 0: MD_TEXTUAL_LITERAL@726..741 "inner continued" [] []
- 5: MD_TEXTUAL@741..742
+ 2: MD_TEXTUAL@722..741
+ 0: MD_TEXTUAL_LITERAL@722..741 "inner continued" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] []
+ 3: MD_TEXTUAL@741..742
0: MD_TEXTUAL_LITERAL@741..742 "\n" [] []
- 6: MD_TEXTUAL@742..759
- 0: MD_TEXTUAL_LITERAL@742..759 "outer continued" [Skipped(" "), Skipped(" ")] []
+ 4: MD_TEXTUAL@742..743
+ 0: MD_TEXTUAL_LITERAL@742..743 " " [] []
+ 5: MD_TEXTUAL@743..744
+ 0: MD_TEXTUAL_LITERAL@743..744 " " [] []
+ 6: MD_TEXTUAL@744..759
+ 0: MD_TEXTUAL_LITERAL@744..759 "outer continued" [] []
7: MD_TEXTUAL@759..760
0: MD_TEXTUAL_LITERAL@759..760 "\n" [] []
1: (empty)
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/list_interrupt_empty_bullet.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/list_interrupt_empty_bullet.md.snap
index 5bdba77cdb9e..b9a5510b4c6c 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/list_interrupt_empty_bullet.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/list_interrupt_empty_bullet.md.snap
@@ -1,6 +1,5 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
-assertion_line: 131
expression: snapshot
---
## Input
@@ -27,19 +26,14 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@40..41 "\n" [] [],
},
- ],
- hard_line: missing (optional),
- },
- MdBulletListItem {
- md_bullet_list: MdBulletList [
- MdBullet {
- bullet: PLUS@41..42 "+" [] [],
- content: MdBlockList [],
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@41..42 "+" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@42..43 "\n" [] [],
},
],
- },
- MdNewline {
- value_token: NEWLINE@42..43 "\n" [] [],
+ hard_line: missing (optional),
},
MdNewline {
value_token: NEWLINE@43..44 "\n" [] [],
@@ -55,21 +49,18 @@ MdDocument {
0: MD_DOCUMENT@0..44
0: (empty)
1: MD_BLOCK_LIST@0..44
- 0: MD_PARAGRAPH@0..41
- 0: MD_INLINE_ITEM_LIST@0..41
+ 0: MD_PARAGRAPH@0..43
+ 0: MD_INLINE_ITEM_LIST@0..43
0: MD_TEXTUAL@0..40
0: MD_TEXTUAL_LITERAL@0..40 "Paragraph followed by empty plus bullet." [] []
1: MD_TEXTUAL@40..41
0: MD_TEXTUAL_LITERAL@40..41 "\n" [] []
+ 2: MD_TEXTUAL@41..42
+ 0: MD_TEXTUAL_LITERAL@41..42 "+" [] []
+ 3: MD_TEXTUAL@42..43
+ 0: MD_TEXTUAL_LITERAL@42..43 "\n" [] []
1: (empty)
- 1: MD_BULLET_LIST_ITEM@41..42
- 0: MD_BULLET_LIST@41..42
- 0: MD_BULLET@41..42
- 0: PLUS@41..42 "+" [] []
- 1: MD_BLOCK_LIST@42..42
- 2: MD_NEWLINE@42..43
- 0: NEWLINE@42..43 "\n" [] []
- 3: MD_NEWLINE@43..44
+ 1: MD_NEWLINE@43..44
0: NEWLINE@43..44 "\n" [] []
2: EOF@44..44 "" [] []
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/list_tightness.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/list_tightness.md.snap
index 1ead17492a50..a4f40a10ce09 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/list_tightness.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/list_tightness.md.snap
@@ -1,6 +1,5 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
-assertion_line: 131
expression: snapshot
---
## Input
@@ -594,25 +593,21 @@ MdDocument {
MdNewline {
value_token: NEWLINE@559..560 "\n" [] [],
},
- MdBulletListItem {
- md_bullet_list: MdBulletList [
- MdBullet {
- bullet: MINUS@560..563 "-" [Skipped(" "), Skipped(" ")] [],
- content: MdBlockList [
- MdParagraph {
- list: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@563..571 " Inner 2" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@571..572 "\n" [] [],
- },
- ],
- hard_line: missing (optional),
- },
- ],
+ ],
+ },
+ MdBullet {
+ bullet: MINUS@560..563 "-" [Skipped(" "), Skipped(" ")] [],
+ content: MdBlockList [
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@563..571 " Inner 2" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@571..572 "\n" [] [],
},
],
+ hard_line: missing (optional),
},
],
},
@@ -706,25 +701,21 @@ MdDocument {
],
hard_line: missing (optional),
},
- MdBulletListItem {
- md_bullet_list: MdBulletList [
- MdBullet {
- bullet: MINUS@643..646 "-" [Skipped(" "), Skipped(" ")] [],
- content: MdBlockList [
- MdParagraph {
- list: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@646..654 " Inner B" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@654..655 "\n" [] [],
- },
- ],
- hard_line: missing (optional),
- },
- ],
+ ],
+ },
+ MdBullet {
+ bullet: MINUS@643..646 "-" [Skipped(" "), Skipped(" ")] [],
+ content: MdBlockList [
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@646..654 " Inner B" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@654..655 "\n" [] [],
},
],
+ hard_line: missing (optional),
},
],
},
@@ -1065,9 +1056,9 @@ MdDocument {
1: (empty)
1: MD_BULLET_LIST_ITEM@547..572
0: MD_BULLET_LIST@547..572
- 0: MD_BULLET@547..572
+ 0: MD_BULLET@547..560
0: MINUS@547..550 "-" [Skipped(" "), Skipped(" ")] []
- 1: MD_BLOCK_LIST@550..572
+ 1: MD_BLOCK_LIST@550..560
0: MD_PARAGRAPH@550..559
0: MD_INLINE_ITEM_LIST@550..559
0: MD_TEXTUAL@550..558
@@ -1077,18 +1068,16 @@ MdDocument {
1: (empty)
1: MD_NEWLINE@559..560
0: NEWLINE@559..560 "\n" [] []
- 2: MD_BULLET_LIST_ITEM@560..572
- 0: MD_BULLET_LIST@560..572
- 0: MD_BULLET@560..572
- 0: MINUS@560..563 "-" [Skipped(" "), Skipped(" ")] []
- 1: MD_BLOCK_LIST@563..572
- 0: MD_PARAGRAPH@563..572
- 0: MD_INLINE_ITEM_LIST@563..572
- 0: MD_TEXTUAL@563..571
- 0: MD_TEXTUAL_LITERAL@563..571 " Inner 2" [] []
- 1: MD_TEXTUAL@571..572
- 0: MD_TEXTUAL_LITERAL@571..572 "\n" [] []
- 1: (empty)
+ 1: MD_BULLET@560..572
+ 0: MINUS@560..563 "-" [Skipped(" "), Skipped(" ")] []
+ 1: MD_BLOCK_LIST@563..572
+ 0: MD_PARAGRAPH@563..572
+ 0: MD_INLINE_ITEM_LIST@563..572
+ 0: MD_TEXTUAL@563..571
+ 0: MD_TEXTUAL_LITERAL@563..571 " Inner 2" [] []
+ 1: MD_TEXTUAL@571..572
+ 0: MD_TEXTUAL_LITERAL@571..572 "\n" [] []
+ 1: (empty)
1: MD_BULLET@572..582
0: MINUS@572..573 "-" [] []
1: MD_BLOCK_LIST@573..582
@@ -1136,9 +1125,9 @@ MdDocument {
1: (empty)
1: MD_BULLET_LIST_ITEM@631..655
0: MD_BULLET_LIST@631..655
- 0: MD_BULLET@631..655
+ 0: MD_BULLET@631..643
0: MINUS@631..634 "-" [Skipped(" "), Skipped(" ")] []
- 1: MD_BLOCK_LIST@634..655
+ 1: MD_BLOCK_LIST@634..643
0: MD_PARAGRAPH@634..643
0: MD_INLINE_ITEM_LIST@634..643
0: MD_TEXTUAL@634..642
@@ -1146,18 +1135,16 @@ MdDocument {
1: MD_TEXTUAL@642..643
0: MD_TEXTUAL_LITERAL@642..643 "\n" [] []
1: (empty)
- 1: MD_BULLET_LIST_ITEM@643..655
- 0: MD_BULLET_LIST@643..655
- 0: MD_BULLET@643..655
- 0: MINUS@643..646 "-" [Skipped(" "), Skipped(" ")] []
- 1: MD_BLOCK_LIST@646..655
- 0: MD_PARAGRAPH@646..655
- 0: MD_INLINE_ITEM_LIST@646..655
- 0: MD_TEXTUAL@646..654
- 0: MD_TEXTUAL_LITERAL@646..654 " Inner B" [] []
- 1: MD_TEXTUAL@654..655
- 0: MD_TEXTUAL_LITERAL@654..655 "\n" [] []
- 1: (empty)
+ 1: MD_BULLET@643..655
+ 0: MINUS@643..646 "-" [Skipped(" "), Skipped(" ")] []
+ 1: MD_BLOCK_LIST@646..655
+ 0: MD_PARAGRAPH@646..655
+ 0: MD_INLINE_ITEM_LIST@646..655
+ 0: MD_TEXTUAL@646..654
+ 0: MD_TEXTUAL_LITERAL@646..654 " Inner B" [] []
+ 1: MD_TEXTUAL@654..655
+ 0: MD_TEXTUAL_LITERAL@654..655 "\n" [] []
+ 1: (empty)
2: EOF@655..655 "" [] []
```
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_list.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_list.md.snap
index 87d6d6b68c0b..3aad16c400a2 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_list.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_list.md.snap
@@ -1,6 +1,5 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
-assertion_line: 131
expression: snapshot
---
## Input
@@ -150,25 +149,21 @@ MdDocument {
],
hard_line: missing (optional),
},
- MdBulletListItem {
- md_bullet_list: MdBulletList [
- MdBullet {
- bullet: MINUS@130..133 "-" [Skipped(" "), Skipped(" ")] [],
- content: MdBlockList [
- MdParagraph {
- list: MdInlineItemList [
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@133..147 " Another child" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@147..148 "\n" [] [],
- },
- ],
- hard_line: missing (optional),
- },
- ],
+ ],
+ },
+ MdBullet {
+ bullet: MINUS@130..133 "-" [Skipped(" "), Skipped(" ")] [],
+ content: MdBlockList [
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@133..147 " Another child" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@147..148 "\n" [] [],
},
],
+ hard_line: missing (optional),
},
],
},
@@ -275,9 +270,9 @@ MdDocument {
1: (empty)
1: MD_BULLET_LIST_ITEM@115..148
0: MD_BULLET_LIST@115..148
- 0: MD_BULLET@115..148
+ 0: MD_BULLET@115..130
0: MINUS@115..118 "-" [Skipped(" "), Skipped(" ")] []
- 1: MD_BLOCK_LIST@118..148
+ 1: MD_BLOCK_LIST@118..130
0: MD_PARAGRAPH@118..130
0: MD_INLINE_ITEM_LIST@118..130
0: MD_TEXTUAL@118..129
@@ -285,18 +280,16 @@ MdDocument {
1: MD_TEXTUAL@129..130
0: MD_TEXTUAL_LITERAL@129..130 "\n" [] []
1: (empty)
- 1: MD_BULLET_LIST_ITEM@130..148
- 0: MD_BULLET_LIST@130..148
- 0: MD_BULLET@130..148
- 0: MINUS@130..133 "-" [Skipped(" "), Skipped(" ")] []
- 1: MD_BLOCK_LIST@133..148
- 0: MD_PARAGRAPH@133..148
- 0: MD_INLINE_ITEM_LIST@133..148
- 0: MD_TEXTUAL@133..147
- 0: MD_TEXTUAL_LITERAL@133..147 " Another child" [] []
- 1: MD_TEXTUAL@147..148
- 0: MD_TEXTUAL_LITERAL@147..148 "\n" [] []
- 1: (empty)
+ 1: MD_BULLET@130..148
+ 0: MINUS@130..133 "-" [Skipped(" "), Skipped(" ")] []
+ 1: MD_BLOCK_LIST@133..148
+ 0: MD_PARAGRAPH@133..148
+ 0: MD_INLINE_ITEM_LIST@133..148
+ 0: MD_TEXTUAL@133..147
+ 0: MD_TEXTUAL_LITERAL@133..147 " Another child" [] []
+ 1: MD_TEXTUAL@147..148
+ 0: MD_TEXTUAL_LITERAL@147..148 "\n" [] []
+ 1: (empty)
1: MD_BULLET@148..165
0: MINUS@148..149 "-" [] []
1: MD_BLOCK_LIST@149..165
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md
new file mode 100644
index 000000000000..65ad27b5b991
--- /dev/null
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md
@@ -0,0 +1,15 @@
+Foo
+ ----
+
+Foo
+-----
+
+Foo\
+----
+
+> Foo
+> ---
+
+- Foo
+ ---
+ baz
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md.snap
new file mode 100644
index 000000000000..b6833648b299
--- /dev/null
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md.snap
@@ -0,0 +1,236 @@
+---
+source: crates/biome_markdown_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```
+Foo
+ ----
+
+Foo
+-----
+
+Foo\
+----
+
+> Foo
+> ---
+
+- Foo
+ ---
+ baz
+
+```
+
+
+## AST
+
+```
+MdDocument {
+ bom_token: missing (optional),
+ value: MdBlockList [
+ MdSetextHeader {
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@0..3 "Foo" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@3..4 "\n" [] [],
+ },
+ ],
+ underline_token: MD_SETEXT_UNDERLINE_LITERAL@4..11 "----" [Skipped(" "), Skipped(" "), Skipped(" ")] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@11..12 "\n" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@12..13 "\n" [] [],
+ },
+ MdSetextHeader {
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@13..16 "Foo" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@16..17 "\n" [] [],
+ },
+ ],
+ underline_token: MD_SETEXT_UNDERLINE_LITERAL@17..22 "-----" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@22..23 "\n" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@23..24 "\n" [] [],
+ },
+ MdSetextHeader {
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@24..27 "Foo" [] [],
+ },
+ MdHardLine {
+ value_token: MD_HARD_LINE_LITERAL@27..29 "\\\n" [] [],
+ },
+ ],
+ underline_token: MD_SETEXT_UNDERLINE_LITERAL@29..33 "----" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@33..34 "\n" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@34..35 "\n" [] [],
+ },
+ MdQuote {
+ marker_token: R_ANGLE@35..36 ">" [] [],
+ content: MdBlockList [
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@36..40 "Foo" [Skipped(" ")] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@40..41 "\n" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@41..44 "-" [Skipped(">"), Skipped(" ")] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@44..45 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@45..46 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@46..47 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ ],
+ },
+ MdNewline {
+ value_token: NEWLINE@47..48 "\n" [] [],
+ },
+ MdBulletListItem {
+ md_bullet_list: MdBulletList [
+ MdBullet {
+ bullet: MINUS@48..49 "-" [] [],
+ content: MdBlockList [
+ MdSetextHeader {
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@49..53 "Foo" [Skipped(" ")] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@53..54 "\n" [] [],
+ },
+ ],
+ underline_token: MD_SETEXT_UNDERLINE_LITERAL@54..59 "---" [Skipped(" "), Skipped(" ")] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@59..60 "\n" [] [],
+ },
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@60..65 "baz" [Skipped(" "), Skipped(" ")] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@65..66 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ ],
+ },
+ ],
+ },
+ ],
+ eof_token: EOF@66..66 "" [] [],
+}
+```
+
+## CST
+
+```
+0: MD_DOCUMENT@0..66
+ 0: (empty)
+ 1: MD_BLOCK_LIST@0..66
+ 0: MD_SETEXT_HEADER@0..11
+ 0: MD_INLINE_ITEM_LIST@0..4
+ 0: MD_TEXTUAL@0..3
+ 0: MD_TEXTUAL_LITERAL@0..3 "Foo" [] []
+ 1: MD_TEXTUAL@3..4
+ 0: MD_TEXTUAL_LITERAL@3..4 "\n" [] []
+ 1: MD_SETEXT_UNDERLINE_LITERAL@4..11 "----" [Skipped(" "), Skipped(" "), Skipped(" ")] []
+ 1: MD_NEWLINE@11..12
+ 0: NEWLINE@11..12 "\n" [] []
+ 2: MD_NEWLINE@12..13
+ 0: NEWLINE@12..13 "\n" [] []
+ 3: MD_SETEXT_HEADER@13..22
+ 0: MD_INLINE_ITEM_LIST@13..17
+ 0: MD_TEXTUAL@13..16
+ 0: MD_TEXTUAL_LITERAL@13..16 "Foo" [] []
+ 1: MD_TEXTUAL@16..17
+ 0: MD_TEXTUAL_LITERAL@16..17 "\n" [] []
+ 1: MD_SETEXT_UNDERLINE_LITERAL@17..22 "-----" [] []
+ 4: MD_NEWLINE@22..23
+ 0: NEWLINE@22..23 "\n" [] []
+ 5: MD_NEWLINE@23..24
+ 0: NEWLINE@23..24 "\n" [] []
+ 6: MD_SETEXT_HEADER@24..33
+ 0: MD_INLINE_ITEM_LIST@24..29
+ 0: MD_TEXTUAL@24..27
+ 0: MD_TEXTUAL_LITERAL@24..27 "Foo" [] []
+ 1: MD_HARD_LINE@27..29
+ 0: MD_HARD_LINE_LITERAL@27..29 "\\\n" [] []
+ 1: MD_SETEXT_UNDERLINE_LITERAL@29..33 "----" [] []
+ 7: MD_NEWLINE@33..34
+ 0: NEWLINE@33..34 "\n" [] []
+ 8: MD_NEWLINE@34..35
+ 0: NEWLINE@34..35 "\n" [] []
+ 9: MD_QUOTE@35..47
+ 0: R_ANGLE@35..36 ">" [] []
+ 1: MD_BLOCK_LIST@36..47
+ 0: MD_PARAGRAPH@36..47
+ 0: MD_INLINE_ITEM_LIST@36..47
+ 0: MD_TEXTUAL@36..40
+ 0: MD_TEXTUAL_LITERAL@36..40 "Foo" [Skipped(" ")] []
+ 1: MD_TEXTUAL@40..41
+ 0: MD_TEXTUAL_LITERAL@40..41 "\n" [] []
+ 2: MD_TEXTUAL@41..44
+ 0: MD_TEXTUAL_LITERAL@41..44 "-" [Skipped(">"), Skipped(" ")] []
+ 3: MD_TEXTUAL@44..45
+ 0: MD_TEXTUAL_LITERAL@44..45 "-" [] []
+ 4: MD_TEXTUAL@45..46
+ 0: MD_TEXTUAL_LITERAL@45..46 "-" [] []
+ 5: MD_TEXTUAL@46..47
+ 0: MD_TEXTUAL_LITERAL@46..47 "\n" [] []
+ 1: (empty)
+ 10: MD_NEWLINE@47..48
+ 0: NEWLINE@47..48 "\n" [] []
+ 11: MD_BULLET_LIST_ITEM@48..66
+ 0: MD_BULLET_LIST@48..66
+ 0: MD_BULLET@48..66
+ 0: MINUS@48..49 "-" [] []
+ 1: MD_BLOCK_LIST@49..66
+ 0: MD_SETEXT_HEADER@49..59
+ 0: MD_INLINE_ITEM_LIST@49..54
+ 0: MD_TEXTUAL@49..53
+ 0: MD_TEXTUAL_LITERAL@49..53 "Foo" [Skipped(" ")] []
+ 1: MD_TEXTUAL@53..54
+ 0: MD_TEXTUAL_LITERAL@53..54 "\n" [] []
+ 1: MD_SETEXT_UNDERLINE_LITERAL@54..59 "---" [Skipped(" "), Skipped(" ")] []
+ 1: MD_NEWLINE@59..60
+ 0: NEWLINE@59..60 "\n" [] []
+ 2: MD_PARAGRAPH@60..66
+ 0: MD_INLINE_ITEM_LIST@60..66
+ 0: MD_TEXTUAL@60..65
+ 0: MD_TEXTUAL_LITERAL@60..65 "baz" [Skipped(" "), Skipped(" ")] []
+ 1: MD_TEXTUAL@65..66
+ 0: MD_TEXTUAL_LITERAL@65..66 "\n" [] []
+ 1: (empty)
+ 2: EOF@66..66 "" [] []
+
+```
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md
new file mode 100644
index 000000000000..2c1bcf55a4bc
--- /dev/null
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md
@@ -0,0 +1,13 @@
+Foo
+= =
+
+Foo
+ ---
+
+`Foo
+----
+`
+
+
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md.snap
new file mode 100644
index 000000000000..53c62486f83f
--- /dev/null
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md.snap
@@ -0,0 +1,212 @@
+---
+source: crates/biome_markdown_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```
+Foo
+= =
+
+Foo
+ ---
+
+`Foo
+----
+`
+
+
+
+```
+
+
+## AST
+
+```
+MdDocument {
+ bom_token: missing (optional),
+ value: MdBlockList [
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@0..3 "Foo" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@3..4 "\n" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@4..7 "= =" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@7..8 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ MdNewline {
+ value_token: NEWLINE@8..9 "\n" [] [],
+ },
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@9..12 "Foo" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@12..13 "\n" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@13..20 "---" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@20..21 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ MdNewline {
+ value_token: NEWLINE@21..22 "\n" [] [],
+ },
+ MdSetextHeader {
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@22..23 "`" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@23..26 "Foo" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@26..27 "\n" [] [],
+ },
+ ],
+ underline_token: MD_SETEXT_UNDERLINE_LITERAL@27..31 "----" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@31..32 "\n" [] [],
+ },
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@32..33 "`" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@33..34 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ MdNewline {
+ value_token: NEWLINE@34..35 "\n" [] [],
+ },
+ MdSetextHeader {
+ content: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@35..36 "<" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@36..50 "a title=\"a lot" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@50..51 "\n" [] [],
+ },
+ ],
+ underline_token: MD_SETEXT_UNDERLINE_LITERAL@51..54 "---" [] [],
+ },
+ MdNewline {
+ value_token: NEWLINE@54..55 "\n" [] [],
+ },
+ MdParagraph {
+ list: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@55..66 "of dashes\"/" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@66..67 ">" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@67..68 "\n" [] [],
+ },
+ ],
+ hard_line: missing (optional),
+ },
+ ],
+ eof_token: EOF@68..68 "" [] [],
+}
+```
+
+## CST
+
+```
+0: MD_DOCUMENT@0..68
+ 0: (empty)
+ 1: MD_BLOCK_LIST@0..68
+ 0: MD_PARAGRAPH@0..8
+ 0: MD_INLINE_ITEM_LIST@0..8
+ 0: MD_TEXTUAL@0..3
+ 0: MD_TEXTUAL_LITERAL@0..3 "Foo" [] []
+ 1: MD_TEXTUAL@3..4
+ 0: MD_TEXTUAL_LITERAL@3..4 "\n" [] []
+ 2: MD_TEXTUAL@4..7
+ 0: MD_TEXTUAL_LITERAL@4..7 "= =" [] []
+ 3: MD_TEXTUAL@7..8
+ 0: MD_TEXTUAL_LITERAL@7..8 "\n" [] []
+ 1: (empty)
+ 1: MD_NEWLINE@8..9
+ 0: NEWLINE@8..9 "\n" [] []
+ 2: MD_PARAGRAPH@9..21
+ 0: MD_INLINE_ITEM_LIST@9..21
+ 0: MD_TEXTUAL@9..12
+ 0: MD_TEXTUAL_LITERAL@9..12 "Foo" [] []
+ 1: MD_TEXTUAL@12..13
+ 0: MD_TEXTUAL_LITERAL@12..13 "\n" [] []
+ 2: MD_TEXTUAL@13..20
+ 0: MD_TEXTUAL_LITERAL@13..20 "---" [Skipped(" "), Skipped(" "), Skipped(" "), Skipped(" ")] []
+ 3: MD_TEXTUAL@20..21
+ 0: MD_TEXTUAL_LITERAL@20..21 "\n" [] []
+ 1: (empty)
+ 3: MD_NEWLINE@21..22
+ 0: NEWLINE@21..22 "\n" [] []
+ 4: MD_SETEXT_HEADER@22..31
+ 0: MD_INLINE_ITEM_LIST@22..27
+ 0: MD_TEXTUAL@22..23
+ 0: MD_TEXTUAL_LITERAL@22..23 "`" [] []
+ 1: MD_TEXTUAL@23..26
+ 0: MD_TEXTUAL_LITERAL@23..26 "Foo" [] []
+ 2: MD_TEXTUAL@26..27
+ 0: MD_TEXTUAL_LITERAL@26..27 "\n" [] []
+ 1: MD_SETEXT_UNDERLINE_LITERAL@27..31 "----" [] []
+ 5: MD_NEWLINE@31..32
+ 0: NEWLINE@31..32 "\n" [] []
+ 6: MD_PARAGRAPH@32..34
+ 0: MD_INLINE_ITEM_LIST@32..34
+ 0: MD_TEXTUAL@32..33
+ 0: MD_TEXTUAL_LITERAL@32..33 "`" [] []
+ 1: MD_TEXTUAL@33..34
+ 0: MD_TEXTUAL_LITERAL@33..34 "\n" [] []
+ 1: (empty)
+ 7: MD_NEWLINE@34..35
+ 0: NEWLINE@34..35 "\n" [] []
+ 8: MD_SETEXT_HEADER@35..54
+ 0: MD_INLINE_ITEM_LIST@35..51
+ 0: MD_TEXTUAL@35..36
+ 0: MD_TEXTUAL_LITERAL@35..36 "<" [] []
+ 1: MD_TEXTUAL@36..50
+ 0: MD_TEXTUAL_LITERAL@36..50 "a title=\"a lot" [] []
+ 2: MD_TEXTUAL@50..51
+ 0: MD_TEXTUAL_LITERAL@50..51 "\n" [] []
+ 1: MD_SETEXT_UNDERLINE_LITERAL@51..54 "---" [] []
+ 9: MD_NEWLINE@54..55
+ 0: NEWLINE@54..55 "\n" [] []
+ 10: MD_PARAGRAPH@55..68
+ 0: MD_INLINE_ITEM_LIST@55..68
+ 0: MD_TEXTUAL@55..66
+ 0: MD_TEXTUAL_LITERAL@55..66 "of dashes\"/" [] []
+ 1: MD_TEXTUAL@66..67
+ 0: MD_TEXTUAL_LITERAL@66..67 ">" [] []
+ 2: MD_TEXTUAL@67..68
+ 0: MD_TEXTUAL_LITERAL@67..68 "\n" [] []
+ 1: (empty)
+ 2: EOF@68..68 "" [] []
+
+```
diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs
index 195cd528609f..81b145bbded9 100644
--- a/crates/biome_markdown_parser/tests/spec_test.rs
+++ b/crates/biome_markdown_parser/tests/spec_test.rs
@@ -134,13 +134,32 @@ pub fn run(test_case: &str, _snapshot_name: &str, test_directory: &str, outcome_
#[test]
pub fn quick_test() {
- let code = r#"**bold *and italic* text**
-"#;
-
- let root = parse_markdown(code);
- let syntax = root.syntax();
- dbg!(&syntax, root.diagnostics(), root.has_errors());
- if has_bogus_nodes_or_empty_slots(&syntax) {
- panic!("modified tree has bogus nodes or empty slots:\n{syntax:#?} \n\n {syntax}")
+ use biome_markdown_parser::document_to_html;
+ use biome_markdown_syntax::MdDocument;
+ use biome_rowan::AstNode;
+
+ // Example 128: Fenced code block inside blockquote
+ let input = "> ```\n> aaa\n\nbbb\n";
+ let expected = "\naaa\n
\n
\nbbb
\n";
+
+ let root = parse_markdown(input);
+ eprintln!("=== AST ===\n{:#?}", root.syntax());
+
+ let doc = MdDocument::cast(root.syntax()).unwrap();
+ let html = document_to_html(
+ &doc,
+ root.list_tightness(),
+ root.list_item_indents(),
+ root.quote_indents(),
+ );
+
+ eprintln!("=== HTML ===");
+ eprintln!("Expected:\n{}", expected);
+ eprintln!("Actual:\n{}", html);
+
+ if expected == html {
+ eprintln!("✓ PASS");
+ } else {
+ eprintln!("✗ FAIL");
}
}
diff --git a/xtask/coverage/src/reporters.rs b/xtask/coverage/src/reporters.rs
index d1e0afe7d772..b89fe56d80d6 100644
--- a/xtask/coverage/src/reporters.rs
+++ b/xtask/coverage/src/reporters.rs
@@ -251,7 +251,7 @@ impl SummaryReporter {
let coverage = if summary.coverage.is_nan() {
"\u{221E}".to_string()
} else {
- format!("{:.2}", summary.coverage)
+ format!("{:.2}%", summary.coverage)
};
let total = panicked + errored + passed;
From 14bedaab3daadf4d5867afcf34618846bd2712f6 Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Tue, 27 Jan 2026 19:22:16 -0500
Subject: [PATCH 05/26] fix(markdown): polish CommonMark conformance and HTML
output
Final set of polish fixes to achieve 100% CommonMark conformance.
Changes include:
- Update render_textual to use trimmed text where appropriate to avoid excess whitespace.
- Fix specific HTML rendering edge cases (examples 174, 616, 619-626) related to entities and raw HTML.
- Resolve remaining edge cases for examples 042, 066, 073, 093, and 223.
- Ensure all remaining CommonMark tests pass.
---
.../src/link_reference.rs | 2 +
crates/biome_markdown_parser/src/parser.rs | 1 -
crates/biome_markdown_parser/src/syntax.rs | 95 +++++--
.../src/syntax/header.rs | 113 +++++++-
.../src/syntax/html_block.rs | 88 +++----
.../src/syntax/inline.rs | 88 ++++++-
.../biome_markdown_parser/src/syntax/list.rs | 164 ++++++++++--
.../src/syntax/thematic_break_block.rs | 170 +++++++++++-
crates/biome_markdown_parser/src/to_html.rs | 192 ++++++++++++--
.../ok/inline_html_invalid.md.snap | 248 ++++++++++--------
.../biome_markdown_parser/tests/spec_test.rs | 70 +++--
11 files changed, 952 insertions(+), 279 deletions(-)
diff --git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/link_reference.rs
index e67f38759aa9..89ba4f7fa8da 100644
--- a/crates/biome_markdown_parser/src/link_reference.rs
+++ b/crates/biome_markdown_parser/src/link_reference.rs
@@ -32,6 +32,8 @@ pub(crate) fn normalize_reference_label(text: &str) -> String {
push_normalized_char(&mut out, c, &mut saw_whitespace);
}
+ // CommonMark uses Unicode case folding; uppercasing keeps ß/ẞ matching "SS"
+ // (e.g. example 540) and aligns with cmark's behavior for reference labels.
out.as_str().to_lowercase_cow().to_uppercase()
}
diff --git a/crates/biome_markdown_parser/src/parser.rs b/crates/biome_markdown_parser/src/parser.rs
index 877750fa6127..6f31db98d315 100644
--- a/crates/biome_markdown_parser/src/parser.rs
+++ b/crates/biome_markdown_parser/src/parser.rs
@@ -240,7 +240,6 @@ impl<'source> MarkdownParser<'source> {
self.source.bump_link_definition();
}
-
pub fn checkpoint(&self) -> MarkdownParserCheckpoint {
MarkdownParserCheckpoint {
context: self.context.checkpoint(),
diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs
index 832d93c97246..4c521eef6b3d 100644
--- a/crates/biome_markdown_parser/src/syntax.rs
+++ b/crates/biome_markdown_parser/src/syntax.rs
@@ -752,15 +752,13 @@ fn allow_setext_heading(p: &MarkdownParser) -> bool {
/// Compute the real leading indent of the current line from source text.
/// This is needed because leading whitespace may have been consumed as trivia
/// in list item context, making `line_start_leading_indent()` return 0.
+/// Token-based lookahead cannot recover the original column once trivia is skipped.
fn real_line_indent_from_source(p: &MarkdownParser) -> usize {
let source = p.source().source_text();
let pos: usize = p.cur_range().start().into();
// Find the start of the current line
- let line_start = source[..pos]
- .rfind('\n')
- .map(|i| i + 1)
- .unwrap_or(0);
+ let line_start = source[..pos].rfind('\n').map_or(0, |i| i + 1);
// Count leading whitespace columns on this line
let mut column = 0;
@@ -862,6 +860,9 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
if quote_depth > 0 {
let is_quote_blank_line = p.lookahead(|p| {
p.bump(NEWLINE);
+ if is_quote_only_blank_line_from_source(p, quote_depth) {
+ return true;
+ }
if !has_quote_prefix(p, quote_depth) {
return false;
}
@@ -932,10 +933,11 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
// After crossing a line, check for setext underlines.
// For non-list paragraphs, we need to look past up to 3 spaces of indent
// to detect setext underlines (CommonMark §4.3).
- if has_content && p.state().list_item_required_indent == 0 {
- let is_setext = p.lookahead(|p| {
- at_setext_underline_after_newline(p).is_some()
- });
+ // IMPORTANT: Only break if allow_setext_heading() is true - this ensures
+ // setext underlines outside a blockquote (without >) don't incorrectly
+ // terminate the paragraph (CommonMark example 093).
+ if has_content && p.state().list_item_required_indent == 0 && allow_setext_heading(p) {
+ let is_setext = p.lookahead(|p| at_setext_underline_after_newline(p).is_some());
if is_setext {
// Skip the indent so parse_paragraph sees the underline
p.skip_line_indent(INDENT_CODE_BLOCK_SPACES);
@@ -1074,18 +1076,16 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
if parsed.is_absent() {
break;
}
- let after_hard_break =
- matches!(&parsed, Present(cm) if cm.kind(p) == MD_HARD_LINE);
+ let after_hard_break = matches!(&parsed, Present(cm) if cm.kind(p) == MD_HARD_LINE);
// Per CommonMark §6.7: after a hard line break, leading spaces on the
// next line are ignored. Skip whitespace-only textual tokens as trivia.
- if after_hard_break && p.at(MD_TEXTUAL_LITERAL) {
- if p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
- while p.at(MD_TEXTUAL_LITERAL)
- && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
- {
- p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
- }
+ if after_hard_break
+ && p.at(MD_TEXTUAL_LITERAL)
+ && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
+ {
+ while p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
}
}
@@ -1097,6 +1097,67 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
p.set_emphasis_context(prev_emphasis_context);
}
+fn is_quote_only_blank_line_from_source(p: &MarkdownParser, depth: usize) -> bool {
+ if depth == 0 {
+ return false;
+ }
+
+ let source = p.source().source_text();
+ let start: usize = p.cur_range().start().into();
+ if start >= source.len() {
+ return true;
+ }
+
+ // Scan up to 3 spaces/tabs before the first '>'
+ let mut idx = start;
+ let mut indent = 0usize;
+ while idx < source.len() {
+ match source.as_bytes()[idx] {
+ b' ' => {
+ indent += 1;
+ idx += 1;
+ }
+ b'\t' => {
+ indent += 4 - (indent % 4);
+ idx += 1;
+ }
+ _ => break,
+ }
+ if indent > 3 {
+ return false;
+ }
+ }
+
+ // Consume quote markers and optional single space after each
+ for _ in 0..depth {
+ if idx >= source.len() || source.as_bytes()[idx] != b'>' {
+ return false;
+ }
+ idx += 1;
+ if idx < source.len() {
+ let c = source.as_bytes()[idx];
+ if c == b' ' || c == b'\t' {
+ idx += 1;
+ }
+ }
+ }
+
+ // Skip trailing whitespace
+ while idx < source.len() {
+ match source.as_bytes()[idx] {
+ b' ' | b'\t' => idx += 1,
+ _ => break,
+ }
+ }
+
+ // Blank if line ends here or at newline
+ if idx >= source.len() {
+ return true;
+ }
+
+ matches!(source.as_bytes()[idx], b'\n' | b'\r')
+}
+
/// Build an emphasis context for the current inline list and install it on the parser.
/// Returns the previous context so it can be restored.
fn set_inline_emphasis_context(
diff --git a/crates/biome_markdown_parser/src/syntax/header.rs b/crates/biome_markdown_parser/src/syntax/header.rs
index 555f1da00671..67803ba9f73d 100644
--- a/crates/biome_markdown_parser/src/syntax/header.rs
+++ b/crates/biome_markdown_parser/src/syntax/header.rs
@@ -25,6 +25,7 @@
//! ```
use crate::parser::MarkdownParser;
+use crate::syntax::inline::EmphasisContext;
use biome_markdown_syntax::{T, kind::MarkdownSyntaxKind::*};
use biome_parser::{
Parser,
@@ -144,6 +145,9 @@ pub(crate) fn parse_header_content(p: &mut MarkdownParser) {
return;
}
+ // Set up emphasis context for header content (single line only)
+ let prev_context = set_header_emphasis_context(p);
+
// Parse content as a paragraph containing inline items
let m = p.start();
let inline_m = p.start();
@@ -179,6 +183,83 @@ pub(crate) fn parse_header_content(p: &mut MarkdownParser) {
inline_m.complete(p, MD_INLINE_ITEM_LIST);
m.complete(p, MD_PARAGRAPH);
+
+ // Restore previous emphasis context
+ p.set_emphasis_context(prev_context);
+}
+
+/// Compute the byte length of header content (up to end of line or trailing hashes).
+fn header_content_source_len(p: &mut MarkdownParser) -> usize {
+ p.lookahead(|p| {
+ let mut len = 0usize;
+
+ loop {
+ if p.at(T![EOF]) || p.at(NEWLINE) {
+ break;
+ }
+
+ // Check for trailing hashes (whitespace + # + end of line)
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text.chars().all(|c| c == ' ' || c == '\t') {
+ // Might be whitespace before trailing hashes
+ let ws_len = text.len();
+ p.bump(MD_TEXTUAL_LITERAL);
+
+ // Check if followed by hash + end of line
+ if p.at(T![#]) {
+ let hash_len = p.cur_text().len();
+ p.bump(T![#]);
+
+ // Skip any whitespace after hashes
+ while p.at(MD_TEXTUAL_LITERAL)
+ && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
+ {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+
+ if p.at(T![EOF]) || p.at(NEWLINE) {
+ // This is trailing hashes - don't include in content
+ break;
+ }
+ // Not trailing hashes - include in content
+ len += ws_len + hash_len;
+ } else {
+ len += ws_len;
+ }
+ continue;
+ }
+ }
+
+ // Check for MD_HARD_LINE_LITERAL (trailing spaces/backslash)
+ if p.at(MD_HARD_LINE_LITERAL) {
+ // Don't include hard line in emphasis context
+ break;
+ }
+
+ len += p.cur_text().len();
+ p.bump_any();
+ }
+
+ len
+ })
+}
+
+/// Build an emphasis context for header content and install it on the parser.
+/// Returns the previous context so it can be restored.
+fn set_header_emphasis_context(p: &mut MarkdownParser) -> Option {
+ let source_len = header_content_source_len(p);
+ let source = p.source_after_current();
+ let inline_source = if source_len <= source.len() {
+ &source[..source_len]
+ } else {
+ source
+ };
+ let base_offset = u32::from(p.cur_range().start()) as usize;
+ let context = EmphasisContext::new(inline_source, base_offset, |label| {
+ p.has_link_reference_definition(label)
+ });
+ p.set_emphasis_context(Some(context))
}
/// Check if the current position has a trailing hash sequence.
@@ -198,11 +279,16 @@ fn is_trailing_hash_sequence(p: &mut MarkdownParser) -> bool {
// Consume the single HASH token (contains all consecutive hashes)
p.bump(T![#]);
- // Skip any trailing whitespace after hashes
- while p.at(MD_TEXTUAL_LITERAL) {
+ // Skip any trailing whitespace after hashes (may include newline in same token)
+ // Also check MD_HARD_LINE_LITERAL (5+ trailing spaces before newline)
+ while p.at(MD_TEXTUAL_LITERAL) || p.at(MD_HARD_LINE_LITERAL) {
let text = p.cur_text();
- if text.chars().all(|c| c == ' ' || c == '\t') {
- p.bump(MD_TEXTUAL_LITERAL);
+ // Accept whitespace tokens that may include newline
+ if text
+ .chars()
+ .all(|c| c == ' ' || c == '\t' || c == '\n' || c == '\r')
+ {
+ p.bump_any();
} else {
break;
}
@@ -255,10 +341,27 @@ pub(crate) fn parse_trailing_hashes(p: &mut MarkdownParser) {
}
// Consume the trailing hash token and wrap in MdHash node
- if p.at(T![#]) && !p.at_inline_end() {
+ if p.at(T![#]) {
let hash_m = p.start();
p.bump(T![#]);
hash_m.complete(p, MD_HASH);
+
+ // Skip any trailing whitespace AFTER the closing hashes
+ // Per CommonMark §4.2, trailing whitespace after closing hashes is ignored
+ // The lexer may combine trailing whitespace with the newline into a single token
+ // Check both MD_TEXTUAL_LITERAL and MD_HARD_LINE_LITERAL (5+ spaces before newline)
+ while p.at(MD_TEXTUAL_LITERAL) || p.at(MD_HARD_LINE_LITERAL) {
+ let text = p.cur_text();
+ // Check if this is whitespace-only (spaces/tabs) or whitespace+newline
+ let is_trailing_ws = text
+ .chars()
+ .all(|c| c == ' ' || c == '\t' || c == '\n' || c == '\r');
+ if is_trailing_ws {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump_any());
+ } else {
+ break;
+ }
+ }
}
}
diff --git a/crates/biome_markdown_parser/src/syntax/html_block.rs b/crates/biome_markdown_parser/src/syntax/html_block.rs
index 8d2f51c43608..5264929c322f 100644
--- a/crates/biome_markdown_parser/src/syntax/html_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/html_block.rs
@@ -56,8 +56,13 @@ enum Type1Tag {
Textarea,
}
+/// Determine HTML block type using raw source to match line-based CommonMark rules.
+/// Token lookahead is insufficient here because lexer contexts can split or merge
+/// tokens across `<...>` boundaries, and we need the exact line text.
fn html_block_kind(p: &MarkdownParser) -> Option {
let remaining = p.source_after_current();
+ // Skip whitespace trivia that may precede the '<' token
+ let remaining = remaining.trim_start_matches([' ', '\t']);
if !remaining.starts_with('<') {
return None;
}
@@ -147,51 +152,21 @@ fn first_line(text: &str) -> &str {
text.split_once(['\n', '\r']).map_or(text, |(line, _)| line)
}
+/// Check if a line contains only a valid HTML open or close tag (for type 7 HTML blocks).
+/// Uses the same validation as inline HTML (CommonMark §6.8) to ensure proper tag structure.
fn line_has_only_tag(line: &str) -> bool {
let bytes = line.as_bytes();
if !bytes.starts_with(b"<") {
return false;
}
- let Some(end) = tag_end_index(bytes) else {
+ // Use inline HTML validator which properly checks tag name, attributes, etc.
+ let Some(html_len) = super::inline::is_inline_html(line) else {
return false;
};
- line[end + 1..].chars().all(|c| c == ' ' || c == '\t')
-}
-
-fn tag_end_index(bytes: &[u8]) -> Option {
- let mut i = 1;
- let mut in_single = false;
- let mut in_double = false;
-
- while i < bytes.len() {
- let b = bytes[i];
- if in_single {
- if b == b'\'' {
- in_single = false;
- }
- i += 1;
- continue;
- }
- if in_double {
- if b == b'"' {
- in_double = false;
- }
- i += 1;
- continue;
- }
-
- match b {
- b'\'' => in_single = true,
- b'"' => in_double = true,
- b'>' => return Some(i),
- _ => {}
- }
- i += 1;
- }
-
- None
+ // After the tag, only whitespace is allowed
+ line[html_len..].chars().all(|c| c == ' ' || c == '\t')
}
/// Block-level tags that can interrupt paragraphs.
@@ -317,25 +292,33 @@ pub(crate) fn parse_html_block(p: &mut MarkdownParser) -> ParsedSyntax {
fn parse_until_blank_line(p: &mut MarkdownParser) {
while !p.at(EOF) {
- if p.at(NEWLINE) && p.at_blank_line() {
+ if p.at(NEWLINE) {
+ if p.at_blank_line() {
+ let text_m = p.start();
+ p.bump_remap(MD_TEXTUAL_LITERAL);
+ text_m.complete(p, MD_TEXTUAL);
+ break;
+ }
+ // Consume the newline first, then check if the next line exits the container
let text_m = p.start();
p.bump_remap(MD_TEXTUAL_LITERAL);
text_m.complete(p, MD_TEXTUAL);
- break;
+
+ if at_container_boundary(p) {
+ break;
+ }
+ skip_container_prefixes(p);
+ continue;
}
+ // For non-newline tokens, check container boundary (handles virtual line start)
if at_container_boundary(p) {
break;
}
let text_m = p.start();
- let is_newline = p.at(NEWLINE);
p.bump_remap(MD_TEXTUAL_LITERAL);
text_m.complete(p, MD_TEXTUAL);
-
- if is_newline {
- skip_container_prefixes(p);
- }
}
}
@@ -343,10 +326,6 @@ fn parse_until_terminator(p: &mut MarkdownParser, terminator: &str, case_insensi
let mut line = String::new();
while !p.at(EOF) {
- if at_container_boundary(p) {
- break;
- }
-
let text = p.cur_text();
let is_newline = p.at(NEWLINE);
line.push_str(text);
@@ -360,6 +339,10 @@ fn parse_until_terminator(p: &mut MarkdownParser, terminator: &str, case_insensi
break;
}
line.clear();
+ // Check container boundary after consuming newline
+ if at_container_boundary(p) {
+ break;
+ }
skip_container_prefixes(p);
}
}
@@ -380,7 +363,7 @@ fn line_contains(line: &str, needle: &str, case_insensitive: bool) -> bool {
if hay[i..i + needle.len()]
.iter()
.zip(needle.iter())
- .all(|(a, b)| a.to_ascii_lowercase() == b.to_ascii_lowercase())
+ .all(|(a, b)| a.eq_ignore_ascii_case(b))
{
return true;
}
@@ -406,7 +389,14 @@ fn skip_container_prefixes(p: &mut MarkdownParser) {
fn at_container_boundary(p: &mut MarkdownParser) -> bool {
let quote_depth = p.state().block_quote_depth;
if quote_depth > 0 && p.at_line_start() && !has_quote_prefix(p, quote_depth) {
- return true;
+ // Skip if at virtual line start — the quote prefix was already consumed
+ // by the container parser that set this virtual start position.
+ if p.state()
+ .virtual_line_start
+ .is_none_or(|vls| vls != p.cur_range().start())
+ {
+ return true;
+ }
}
let required_indent = p.state().list_item_required_indent;
diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs
index f00c96753a0e..95538d4b86e2 100644
--- a/crates/biome_markdown_parser/src/syntax/inline.rs
+++ b/crates/biome_markdown_parser/src/syntax/inline.rs
@@ -621,15 +621,6 @@ pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax {
return Absent;
}
- let ends_block = p.lookahead(|p| {
- p.bump(MD_HARD_LINE_LITERAL);
- p.at(NEWLINE) || p.at(EOF)
- });
-
- if ends_block {
- return super::parse_textual(p);
- }
-
let m = p.start();
p.bump(MD_HARD_LINE_LITERAL);
Present(m.complete(p, MD_HARD_LINE))
@@ -642,7 +633,6 @@ pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax {
/// as literal text, not an unclosed code span.
///
/// Returns false if no match found (opener should become literal text).
-
fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -> bool {
use crate::lexer::MarkdownLexContext;
@@ -673,6 +663,11 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -
if crate::syntax::at_setext_underline_after_newline(p).is_some() {
return false;
}
+ // Per CommonMark, block interrupts (including list markers) can
+ // terminate paragraphs. A code span cannot cross a block boundary.
+ if crate::syntax::at_block_interrupt(p) || at_list_marker_after_newline(p) {
+ return false;
+ }
continue;
}
@@ -697,6 +692,78 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -
})
}
+/// Check if we're at a list marker after a newline.
+/// This is used to detect when a code span would cross a list item boundary.
+fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool {
+ // Skip up to 3 spaces of indent (list markers can be indented 0-3 spaces)
+ let mut columns = 0usize;
+ while columns < 4
+ && p.at(MD_TEXTUAL_LITERAL)
+ && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
+ {
+ for c in p.cur_text().chars() {
+ match c {
+ ' ' => columns += 1,
+ '\t' => columns += 4 - (columns % 4),
+ _ => {}
+ }
+ }
+ if columns >= 4 {
+ return false; // Indented code block, not a list marker
+ }
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+
+ // Check for bullet list markers: -, *, +
+ if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
+ let marker_text = p.cur_text();
+ if marker_text.len() == 1 {
+ p.bump_any();
+ // Must be followed by space, tab, or EOL
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return true;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ return text.starts_with(' ') || text.starts_with('\t');
+ }
+ }
+ return false;
+ }
+
+ // Check for ordered list marker: digits followed by . or )
+ if p.at(MD_ORDERED_LIST_MARKER) {
+ p.bump(MD_ORDERED_LIST_MARKER);
+ // Must be followed by space, tab, or EOL
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return true;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ return text.starts_with(' ') || text.starts_with('\t');
+ }
+ return false;
+ }
+
+ // Check for textual bullet markers (lexed as MD_TEXTUAL_LITERAL in some contexts)
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text == "-" || text == "*" || text == "+" {
+ p.bump(MD_TEXTUAL_LITERAL);
+ // Must be followed by space, tab, or EOL
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return true;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let next = p.cur_text();
+ return next.starts_with(' ') || next.starts_with('\t');
+ }
+ }
+ }
+
+ false
+}
+
/// Parse inline code span (`` `code` `` or ``` `` `code` `` ```).
///
/// Grammar: MdInlineCode = l_tick: '`' content: MdInlineItemList r_tick: '`'
@@ -1382,6 +1449,7 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn
// Complete the destination and link immediately without looking for closing paren.
if destination_result == DestinationScanResult::DepthExceeded {
destination.complete(p, MD_INLINE_ITEM_LIST);
+ p.force_relex_regular();
return Present(m.complete(p, kind.inline_kind()));
}
diff --git a/crates/biome_markdown_parser/src/syntax/list.rs b/crates/biome_markdown_parser/src/syntax/list.rs
index 43a1af47b93f..d84aff39798a 100644
--- a/crates/biome_markdown_parser/src/syntax/list.rs
+++ b/crates/biome_markdown_parser/src/syntax/list.rs
@@ -72,19 +72,25 @@ const INDENT_CODE_BLOCK_SPACES: usize = 4;
/// `line_start_leading_indent()`. For virtual line start cases (nested list
/// detection), we compute the actual column position from the source text
/// to ensure correct indented code block detection in nested lists.
+///
+/// Raw source scan is required because leading whitespace may be consumed
+/// as trivia during list parsing, so token-based lookahead loses the true
+/// column needed for CommonMark's indent rules.
fn compute_marker_indent(p: &MarkdownParser) -> usize {
if p.state().virtual_line_start == Some(p.cur_range().start()) {
+ // Inside block quotes, fall back to the leading indent measured from the line start.
+ if p.state().block_quote_depth > 0 {
+ return p.line_start_leading_indent();
+ }
+
// Virtual line start: compute actual column from source text.
// The leading whitespace was skipped as trivia, but we need the
- // real column for indented code block detection.
+ // real column for indented code block detection in nested lists.
let source = p.source().source_text();
let pos: usize = p.cur_range().start().into();
// Find the start of the current line
- let line_start = source[..pos]
- .rfind('\n')
- .map(|i| i + 1)
- .unwrap_or(0);
+ let line_start = source[..pos].rfind('\n').map_or(0, |i| i + 1);
// Count columns from line start to current position
let mut column = 0;
@@ -149,6 +155,44 @@ fn is_whitespace_only(text: &str) -> bool {
!text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t')
}
+/// Check if the remaining content forms a thematic break pattern.
+///
+/// Per CommonMark §4.1, a thematic break is 3 or more matching characters
+/// (`*`, `-`, or `_`) on a line by itself, optionally with spaces between them.
+///
+/// This function checks the source text directly since the lexer may not
+/// produce MD_THEMATIC_BREAK_LITERAL in all contexts (e.g., after list markers).
+/// Token lookahead is insufficient here because the marker may be lexed as
+/// textual content within list item contexts.
+fn is_thematic_break_pattern(p: &mut MarkdownParser) -> bool {
+ // Get the remaining text on the current line
+ let source = p.source_after_current();
+
+ // Find the end of the line
+ let line_end = source.find('\n').unwrap_or(source.len());
+ let line = &source[..line_end];
+
+ // Determine which character to check for
+ let first_char = line.trim_start().chars().next();
+ let break_char = match first_char {
+ Some('*' | '-' | '_') => first_char.unwrap(),
+ _ => return false,
+ };
+
+ // Count the break characters (must be at least 3)
+ let mut count = 0usize;
+ for c in line.chars() {
+ if c == break_char {
+ count += 1;
+ } else if c != ' ' && c != '\t' {
+ // Non-whitespace, non-break character - not a thematic break
+ return false;
+ }
+ }
+
+ count >= 3
+}
+
fn at_bullet_list_item_with_base_indent(p: &mut MarkdownParser, base_indent: usize) -> bool {
p.lookahead(|p| {
if !list_item_within_indent(p, base_indent) {
@@ -221,7 +265,6 @@ fn skip_blank_lines_between_items(
is_tight: &mut bool,
last_item_ends_with_blank: &mut bool,
) {
-
// Skip blank lines between list items.
// Per CommonMark §5.3, blank lines between items make the list loose
// but don't end the list.
@@ -248,7 +291,6 @@ fn update_list_tightness(
is_tight: &mut bool,
last_item_ends_with_blank: &mut bool,
) {
-
// Blank line between items makes the list loose
if *last_item_ends_with_blank {
*is_tight = false;
@@ -409,7 +451,7 @@ impl ParseNodeList for BulletList {
return result;
}
- let result = is_at_list_end_common(
+ is_at_list_end_common(
p,
self.marker_kind,
at_bullet_list_item,
@@ -457,9 +499,7 @@ impl ParseNodeList for BulletList {
Some(!has_item)
}
},
- );
-
- result
+ )
}
fn recover(
@@ -1212,6 +1252,36 @@ fn parse_list_item_block_content(
&& (p.at_line_start() || p.has_preceding_line_break())
&& has_quote_prefix(p, quote_depth);
+ // Special case: blank line with only quote prefixes (e.g., ">>").
+ // Treat it as a blank line inside the list item so it becomes loose.
+ if !first_line && quote_depth > 0 && p.at(NEWLINE) {
+ let is_quote_blank_line = p.lookahead(|p| {
+ p.bump(NEWLINE);
+ if !has_quote_prefix(p, quote_depth) {
+ return false;
+ }
+ consume_quote_prefix_without_virtual(p, quote_depth);
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ p.at(NEWLINE) || p.at(T![EOF])
+ });
+
+ if is_quote_blank_line {
+ let m = p.start();
+ p.bump(NEWLINE);
+ m.complete(p, MD_NEWLINE);
+ if has_quote_prefix(p, quote_depth) {
+ consume_quote_prefix(p, quote_depth);
+ }
+ consume_blank_line(p);
+ has_blank_line = true;
+ last_was_blank = true;
+ first_line = false;
+ continue;
+ }
+ }
+
if !first_line && p.at(NEWLINE) && !p.at_blank_line() && !newline_has_quote_prefix {
let action = classify_blank_line(p, required_indent, marker_indent);
// Check if the NEWLINE we're at is itself on a blank line
@@ -1253,6 +1323,27 @@ fn parse_list_item_block_content(
&& (has_quote_prefix(p, quote_depth)
|| quote_only_line_indent_at_current(p, quote_depth).is_some());
+ if line_has_quote_prefix {
+ let is_quote_only_line = p.lookahead(|p| {
+ consume_quote_prefix_without_virtual(p, quote_depth);
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ p.at(NEWLINE) || p.at(T![EOF])
+ });
+
+ if is_quote_only_line {
+ consume_quote_prefix(p, quote_depth);
+ consume_blank_line(p);
+ if !first_line {
+ has_blank_line = true;
+ }
+ last_was_blank = true;
+ first_line = false;
+ continue;
+ }
+ }
+
let blank_line_after_prefix = if line_has_quote_prefix {
p.lookahead(|p| {
consume_quote_prefix_without_virtual(p, quote_depth);
@@ -1448,7 +1539,7 @@ fn parse_list_item_block_content(
}
let text = p.cur_text();
let hash_count = text.len();
- if hash_count < 1 || hash_count > 6 {
+ if !(1..=6).contains(&hash_count) {
return None;
}
p.bump(p.cur());
@@ -1509,8 +1600,7 @@ fn parse_list_item_block_content(
p.bump(MD_TEXTUAL_LITERAL);
}
// Check for > as either T![>] or MD_TEXTUAL_LITERAL ">"
- p.at(T![>])
- || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">")
+ p.at(T![>]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">")
});
if blockquote_start {
@@ -1566,6 +1656,32 @@ fn parse_list_item_block_content(
p.state_mut().list_item_required_indent = prev_required;
}
+ // Check for thematic break BEFORE nested list markers.
+ // Per CommonMark §4.1, `* * *` or `- - -` on a line by itself is a thematic
+ // break, not nested list markers. This handles example 061.
+ let is_thematic_break = p.lookahead(|p| {
+ while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ // Check for lexer-produced thematic break token
+ if p.at(MD_THEMATIC_BREAK_LITERAL) {
+ return true;
+ }
+ // Otherwise, check the raw source text for a thematic break pattern.
+ // The lexer may not produce MD_THEMATIC_BREAK_LITERAL after a list marker
+ // because after_newline is false. Check manually.
+ is_thematic_break_pattern(p)
+ });
+
+ if is_thematic_break {
+ // Parse the thematic break as a block within the list item.
+ let _ = super::thematic_break_block::parse_thematic_break_block(p);
+ last_block_was_paragraph = false;
+ last_was_blank = false;
+ first_line = false;
+ continue;
+ }
+
let nested_marker = p.lookahead(|p| {
while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) {
p.bump(MD_TEXTUAL_LITERAL);
@@ -2078,18 +2194,14 @@ fn has_bullet_item_after_blank_lines_at_indent(
p: &mut MarkdownParser,
expected_indent: usize,
) -> bool {
- has_list_item_after_blank_lines_at_indent(
- p,
- expected_indent,
- |p| {
- if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
- p.bump(p.cur());
- marker_followed_by_whitespace_or_eol(p)
- } else {
- false
- }
- },
- )
+ has_list_item_after_blank_lines_at_indent(p, expected_indent, |p| {
+ if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
+ p.bump(p.cur());
+ marker_followed_by_whitespace_or_eol(p)
+ } else {
+ false
+ }
+ })
}
fn has_list_item_after_blank_lines_at_indent(
diff --git a/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs b/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs
index 0e289cee66ff..008e3b71a290 100644
--- a/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs
@@ -15,6 +15,7 @@
use crate::parser::MarkdownParser;
use biome_markdown_syntax::MarkdownSyntaxKind::*;
+use biome_markdown_syntax::T;
use biome_parser::{
Parser,
prelude::ParsedSyntax::{self, *},
@@ -22,15 +23,89 @@ use biome_parser::{
pub(crate) fn at_thematic_break_block(p: &mut MarkdownParser) -> bool {
p.lookahead(|p| {
- if !p.at_line_start() && !p.at_start_of_input() {
- return false;
+ if p.at_line_start() || p.at_start_of_input() {
+ if p.line_start_leading_indent() > 3 {
+ return false;
+ }
+ p.skip_line_indent(3);
+ return p.at(MD_THEMATIC_BREAK_LITERAL);
}
- if p.line_start_leading_indent() > 3 {
+
+ // Special case: we may not be at line start if a list marker was consumed
+ // (e.g., `- * * *` where `-` was consumed as a list marker).
+ // Check if the remaining content is a thematic break pattern.
+ is_thematic_break_pattern(p)
+ })
+}
+
+/// Check if the remaining content forms a thematic break pattern.
+///
+/// Per CommonMark §4.1, a thematic break is 3 or more matching characters
+/// (`*`, `-`, or `_`) on a line by itself, optionally with spaces between them.
+fn is_thematic_break_pattern(p: &mut MarkdownParser) -> bool {
+ // Skip leading whitespace
+ while p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+
+ // Check for lexer-produced thematic break token
+ if p.at(MD_THEMATIC_BREAK_LITERAL) {
+ return true;
+ }
+
+ // Get the break character from the first non-whitespace token
+ let break_char = if p.at(T![*]) {
+ '*'
+ } else if p.at(T![-]) {
+ '-'
+ } else if p.at(UNDERSCORE) {
+ '_'
+ } else if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text.len() == 1 {
+ match text.chars().next() {
+ Some('*') => '*',
+ Some('-') => '-',
+ Some('_') => '_',
+ _ => return false,
+ }
+ } else {
return false;
}
- p.skip_line_indent(3);
- p.at(MD_THEMATIC_BREAK_LITERAL)
- })
+ } else {
+ return false;
+ };
+
+ // Count matching characters
+ let mut count = 0usize;
+
+ loop {
+ // Check for the break character
+ let is_break = match break_char {
+ '*' => p.at(T![*]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == "*"),
+ '-' => p.at(T![-]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == "-"),
+ '_' => p.at(UNDERSCORE) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == "_"),
+ _ => false,
+ };
+
+ if is_break {
+ count += 1;
+ p.bump_any();
+ continue;
+ }
+
+ // Skip whitespace between break characters
+ if p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
+ p.bump(MD_TEXTUAL_LITERAL);
+ continue;
+ }
+
+ // End of line or other content
+ break;
+ }
+
+ // Valid thematic break if 3+ characters followed by end of line
+ count >= 3 && (p.at(NEWLINE) || p.at(T![EOF]))
}
pub(crate) fn parse_thematic_break_block(p: &mut MarkdownParser) -> ParsedSyntax {
@@ -41,7 +116,88 @@ pub(crate) fn parse_thematic_break_block(p: &mut MarkdownParser) -> ParsedSyntax
p.skip_line_indent(3);
- p.expect(MD_THEMATIC_BREAK_LITERAL);
+ // If the lexer produced MD_THEMATIC_BREAK_LITERAL, use it directly.
+ // Otherwise, parse the thematic break pattern from individual tokens and
+ // ensure we emit a literal token (required by the grammar).
+ if p.at(MD_THEMATIC_BREAK_LITERAL) {
+ p.expect(MD_THEMATIC_BREAK_LITERAL);
+ } else {
+ parse_thematic_break_tokens(p);
+ }
Present(m.complete(p, MD_THEMATIC_BREAK_BLOCK))
}
+
+/// Parse a thematic break from individual tokens when the lexer didn't produce
+/// MD_THEMATIC_BREAK_LITERAL (e.g., after a list marker was consumed).
+fn parse_thematic_break_tokens(p: &mut MarkdownParser) {
+ // Skip leading whitespace
+ while p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
+ }
+
+ // If the entire thematic break is in a single textual token, remap it.
+ if p.at(MD_TEXTUAL_LITERAL)
+ && p.cur_text()
+ .chars()
+ .all(|c| c == ' ' || c == '\t' || c == '*' || c == '-' || c == '_')
+ {
+ p.bump_remap(MD_THEMATIC_BREAK_LITERAL);
+ return;
+ }
+
+ // Determine the break character for multi-token cases.
+ let break_char = if p.at(T![*]) {
+ Some('*')
+ } else if p.at(T![-]) {
+ Some('-')
+ } else if p.at(UNDERSCORE) {
+ Some('_')
+ } else if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ match text.chars().next() {
+ Some('*') => Some('*'),
+ Some('-') => Some('-'),
+ Some('_') => Some('_'),
+ _ => None,
+ }
+ } else {
+ None
+ };
+
+ // Emit the required literal token by remapping the first break marker token.
+ if break_char.is_some()
+ && (p.at(T![*]) || p.at(T![-]) || p.at(UNDERSCORE) || p.at(MD_TEXTUAL_LITERAL))
+ {
+ p.bump_remap(MD_THEMATIC_BREAK_LITERAL);
+ }
+
+ // Parse all break characters and whitespace until end of line
+ loop {
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ break;
+ }
+
+ // Check for the break character
+ let is_break = match break_char {
+ Some('*') => p.at(T![*]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == "*"),
+ Some('-') => p.at(T![-]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == "-"),
+ Some('_') => p.at(UNDERSCORE) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == "_"),
+ _ => false,
+ };
+
+ if is_break {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump_any());
+ continue;
+ }
+
+ // Skip whitespace between break characters
+ if p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL));
+ continue;
+ }
+
+ // Other content - shouldn't happen if at_thematic_break_block returned true
+ break;
+ }
+}
diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs
index 2d6f912fa43e..3aaee64c4bc2 100644
--- a/crates/biome_markdown_parser/src/to_html.rs
+++ b/crates/biome_markdown_parser/src/to_html.rs
@@ -121,8 +121,18 @@ fn expand_tabs(text: &str) -> String {
/// 2. Strips the first `strip_cols` columns of indentation
/// 3. Preserves literal tabs in the remaining content
fn strip_indent_preserve_tabs(text: &str, strip_cols: usize) -> String {
+ strip_indent_preserve_tabs_with_offset(text, strip_cols, 0)
+}
+
+fn strip_indent_preserve_tabs_with_offset(
+ text: &str,
+ strip_cols: usize,
+ first_line_column: usize,
+) -> String {
+ let mut first_line = true;
map_lines(text, |line, result| {
- let mut col = 0;
+ let mut col = if first_line { first_line_column } else { 0 };
+ first_line = false;
let mut char_idx = 0;
// Find where to start copying (after stripping strip_cols columns)
@@ -461,19 +471,22 @@ fn render_paragraph(
}
}
+/// Strip leading whitespace from paragraph continuation lines.
+///
+/// Per CommonMark §4.8, paragraph continuation lines can have any amount of
+/// initial whitespace, and that whitespace is stripped in the output.
+/// The first line keeps its content unchanged; subsequent lines have all
+/// leading spaces and tabs stripped.
fn strip_paragraph_indent(content: &str) -> String {
+ let mut first_line = true;
map_lines(content, |line, out| {
- let mut stripped = 0usize;
- let mut at_line_start = true;
- for ch in line.chars() {
- if at_line_start {
- if ch == ' ' && stripped < 4 {
- stripped += 1;
- continue;
- }
- at_line_start = false;
- }
- out.push(ch);
+ if first_line {
+ // First line: keep as-is
+ first_line = false;
+ out.push_str(line);
+ } else {
+ // Continuation lines: strip ALL leading whitespace
+ out.push_str(line.trim_start());
}
})
}
@@ -660,6 +673,49 @@ fn render_indented_code_block(
out.push_str("
\n");
}
+fn render_indented_code_block_in_list(
+ code: &MdIndentCodeBlock,
+ out: &mut String,
+ list_indent: usize,
+ quote_indent: usize,
+ first_line_column: usize,
+) {
+ out.push_str("");
+
+ let mut content = collect_raw_inline_text(&code.content());
+ if content.starts_with('\n') {
+ content = content[1..].to_string();
+ }
+
+ let content = strip_indent_preserve_tabs_with_offset(
+ &content,
+ 4 + list_indent + quote_indent,
+ first_line_column,
+ );
+ out.push_str(&escape_html(&content));
+
+ out.push_str("
\n");
+}
+
+fn render_block_in_list(
+ block: &AnyMdBlock,
+ ctx: &HtmlRenderContext,
+ out: &mut String,
+ in_tight_list: bool,
+ list_indent: usize,
+ quote_indent: usize,
+ first_line_column: usize,
+) {
+ if let AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(AnyCodeBlock::MdIndentCodeBlock(
+ code,
+ ))) = block
+ {
+ render_indented_code_block_in_list(code, out, list_indent, quote_indent, first_line_column);
+ } else {
+ render_block(block, ctx, out, in_tight_list, list_indent, quote_indent);
+ }
+}
+
/// Render an HTML block.
fn render_html_block(
html: &MdHtmlBlock,
@@ -785,19 +841,20 @@ fn render_list_item(
// A blank line within an item requires two consecutive newline blocks
// (one ending the previous line, one for the blank line itself).
// A single MD_NEWLINE between blocks is just a structural separator.
- let item_has_blank_line = blocks.windows(2).any(|pair| {
- is_newline_block(&pair[0]) && is_newline_block(&pair[1])
- });
+ let item_has_blank_line = blocks
+ .windows(2)
+ .any(|pair| is_newline_block(&pair[0]) && is_newline_block(&pair[1]));
let is_tight = is_tight && !item_has_blank_line;
- let (indent, first_line_code_indent) = match list_indent {
+ let (indent, first_line_code_indent, first_line_column) = match list_indent {
Some(entry) => {
let base = list_item_required_indent(entry);
let first_line_code =
- (entry.spaces_after_marker > INDENT_CODE_BLOCK_SPACES).then_some(1);
- (base, first_line_code)
+ (entry.spaces_after_marker > INDENT_CODE_BLOCK_SPACES).then_some(base);
+ let column = entry.marker_indent + entry.marker_width;
+ (base, first_line_code, column)
}
- None => (0, None),
+ None => (0, None, 0),
};
if is_empty_content(&blocks) {
@@ -818,7 +875,26 @@ fn render_list_item(
) => code_indent,
_ => indent,
};
- render_block(block, ctx, out, true, block_indent, quote_indent);
+ let column_for_block = if first_line_code_indent.is_some()
+ && matches!(
+ block,
+ AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(
+ AnyCodeBlock::MdIndentCodeBlock(_)
+ ))
+ ) {
+ first_line_column
+ } else {
+ 0
+ };
+ render_block_in_list(
+ block,
+ ctx,
+ out,
+ true,
+ block_indent,
+ quote_indent,
+ column_for_block,
+ );
}
// Remove trailing newline for tight lists
if out.ends_with('\n') {
@@ -836,10 +912,29 @@ fn render_list_item(
) => code_indent,
_ => indent,
};
- render_block(first, ctx, out, true, block_indent, quote_indent);
+ let column_for_block = if first_line_code_indent.is_some()
+ && matches!(
+ first,
+ AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(
+ AnyCodeBlock::MdIndentCodeBlock(_)
+ ))
+ ) {
+ first_line_column
+ } else {
+ 0
+ };
+ render_block_in_list(
+ first,
+ ctx,
+ out,
+ true,
+ block_indent,
+ quote_indent,
+ column_for_block,
+ );
}
for block in blocks.iter().skip(1) {
- render_block(block, ctx, out, true, indent, quote_indent);
+ render_block_in_list(block, ctx, out, true, indent, quote_indent, 0);
}
} else {
out.push('\n');
@@ -857,11 +952,35 @@ fn render_list_item(
} else {
indent
};
- render_block(block, ctx, out, true, block_indent, quote_indent);
+ let column_for_block = if idx == 0
+ && first_line_code_indent.is_some()
+ && matches!(
+ block,
+ AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(
+ AnyCodeBlock::MdIndentCodeBlock(_)
+ ))
+ ) {
+ first_line_column
+ } else {
+ 0
+ };
+ render_block_in_list(
+ block,
+ ctx,
+ out,
+ true,
+ block_indent,
+ quote_indent,
+ column_for_block,
+ );
}
// Remove trailing newline when the last content block is a paragraph
// (tight list paragraphs should not have trailing newlines)
- if blocks.iter().rev().find(|b| !is_newline_block(b)).is_some_and(is_paragraph_block)
+ if blocks
+ .iter()
+ .rev()
+ .find(|b| !is_newline_block(b))
+ .is_some_and(is_paragraph_block)
&& out.ends_with('\n')
{
out.pop();
@@ -884,7 +1003,27 @@ fn render_list_item(
} else {
indent
};
- render_block(block, ctx, out, false, block_indent, quote_indent);
+ let column_for_block = if idx == 0
+ && first_line_code_indent.is_some()
+ && matches!(
+ block,
+ AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(
+ AnyCodeBlock::MdIndentCodeBlock(_)
+ ))
+ ) {
+ first_line_column
+ } else {
+ 0
+ };
+ render_block_in_list(
+ block,
+ ctx,
+ out,
+ false,
+ block_indent,
+ quote_indent,
+ column_for_block,
+ );
}
}
@@ -982,7 +1121,8 @@ fn render_inline(inline: &AnyMdInline, ctx: &HtmlRenderContext, out: &mut String
/// Render textual content.
fn render_textual(text: &MdTextual, out: &mut String) {
if let Ok(token) = text.value_token() {
- let raw = token.text();
+ // Use text_trimmed() to exclude skipped trivia (e.g., indentation stripped during parsing)
+ let raw = token.text_trimmed();
// Process backslash escapes and escape HTML
let processed = process_escapes(raw);
out.push_str(&escape_html(&processed));
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_html_invalid.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_html_invalid.md.snap
index 1c1d0b04cd8e..2d520cefd3d6 100644
--- a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_html_invalid.md.snap
+++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_html_invalid.md.snap
@@ -315,20 +315,24 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@527..541 "Invalid start " [] [],
},
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@541..542 "<" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@542..543 "!" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@543..544 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@544..545 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@545..546 ">" [] [],
+ MdInlineHtml {
+ value: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@541..542 "<" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@542..543 "!" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@543..544 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@544..545 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@545..546 ">" [] [],
+ },
+ ],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@546..562 " should be text." [] [],
@@ -339,38 +343,42 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@563..575 "Double dash " [] [],
},
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@575..576 "<" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@576..577 "!" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@577..578 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@578..579 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@579..584 " foo " [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@584..585 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@585..586 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@586..591 " bar " [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@591..592 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@592..593 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@593..594 ">" [] [],
+ MdInlineHtml {
+ value: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@575..576 "<" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@576..577 "!" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@577..578 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@578..579 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@579..584 " foo " [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@584..585 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@585..586 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@586..591 " bar " [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@591..592 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@592..593 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@593..594 ">" [] [],
+ },
+ ],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@594..616 " should be text maybe." [] [],
@@ -381,23 +389,27 @@ MdDocument {
MdTextual {
value_token: MD_TEXTUAL_LITERAL@617..635 "Starts with arrow " [] [],
},
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@635..636 "<" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@636..637 "!" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@637..638 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@638..639 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@639..640 "-" [] [],
- },
- MdTextual {
- value_token: MD_TEXTUAL_LITERAL@640..641 ">" [] [],
+ MdInlineHtml {
+ value: MdInlineItemList [
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@635..636 "<" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@636..637 "!" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@637..638 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@638..639 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@639..640 "-" [] [],
+ },
+ MdTextual {
+ value_token: MD_TEXTUAL_LITERAL@640..641 ">" [] [],
+ },
+ ],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@641..657 " should be text." [] [],
@@ -602,65 +614,71 @@ MdDocument {
0: MD_INLINE_ITEM_LIST@527..658
0: MD_TEXTUAL@527..541
0: MD_TEXTUAL_LITERAL@527..541 "Invalid start " [] []
- 1: MD_TEXTUAL@541..542
- 0: MD_TEXTUAL_LITERAL@541..542 "<" [] []
- 2: MD_TEXTUAL@542..543
- 0: MD_TEXTUAL_LITERAL@542..543 "!" [] []
- 3: MD_TEXTUAL@543..544
- 0: MD_TEXTUAL_LITERAL@543..544 "-" [] []
- 4: MD_TEXTUAL@544..545
- 0: MD_TEXTUAL_LITERAL@544..545 "-" [] []
- 5: MD_TEXTUAL@545..546
- 0: MD_TEXTUAL_LITERAL@545..546 ">" [] []
- 6: MD_TEXTUAL@546..562
+ 1: MD_INLINE_HTML@541..546
+ 0: MD_INLINE_ITEM_LIST@541..546
+ 0: MD_TEXTUAL@541..542
+ 0: MD_TEXTUAL_LITERAL@541..542 "<" [] []
+ 1: MD_TEXTUAL@542..543
+ 0: MD_TEXTUAL_LITERAL@542..543 "!" [] []
+ 2: MD_TEXTUAL@543..544
+ 0: MD_TEXTUAL_LITERAL@543..544 "-" [] []
+ 3: MD_TEXTUAL@544..545
+ 0: MD_TEXTUAL_LITERAL@544..545 "-" [] []
+ 4: MD_TEXTUAL@545..546
+ 0: MD_TEXTUAL_LITERAL@545..546 ">" [] []
+ 2: MD_TEXTUAL@546..562
0: MD_TEXTUAL_LITERAL@546..562 " should be text." [] []
- 7: MD_TEXTUAL@562..563
+ 3: MD_TEXTUAL@562..563
0: MD_TEXTUAL_LITERAL@562..563 "\n" [] []
- 8: MD_TEXTUAL@563..575
+ 4: MD_TEXTUAL@563..575
0: MD_TEXTUAL_LITERAL@563..575 "Double dash " [] []
- 9: MD_TEXTUAL@575..576
- 0: MD_TEXTUAL_LITERAL@575..576 "<" [] []
- 10: MD_TEXTUAL@576..577
- 0: MD_TEXTUAL_LITERAL@576..577 "!" [] []
- 11: MD_TEXTUAL@577..578
- 0: MD_TEXTUAL_LITERAL@577..578 "-" [] []
- 12: MD_TEXTUAL@578..579
- 0: MD_TEXTUAL_LITERAL@578..579 "-" [] []
- 13: MD_TEXTUAL@579..584
- 0: MD_TEXTUAL_LITERAL@579..584 " foo " [] []
- 14: MD_TEXTUAL@584..585
- 0: MD_TEXTUAL_LITERAL@584..585 "-" [] []
- 15: MD_TEXTUAL@585..586
- 0: MD_TEXTUAL_LITERAL@585..586 "-" [] []
- 16: MD_TEXTUAL@586..591
- 0: MD_TEXTUAL_LITERAL@586..591 " bar " [] []
- 17: MD_TEXTUAL@591..592
- 0: MD_TEXTUAL_LITERAL@591..592 "-" [] []
- 18: MD_TEXTUAL@592..593
- 0: MD_TEXTUAL_LITERAL@592..593 "-" [] []
- 19: MD_TEXTUAL@593..594
- 0: MD_TEXTUAL_LITERAL@593..594 ">" [] []
- 20: MD_TEXTUAL@594..616
+ 5: MD_INLINE_HTML@575..594
+ 0: MD_INLINE_ITEM_LIST@575..594
+ 0: MD_TEXTUAL@575..576
+ 0: MD_TEXTUAL_LITERAL@575..576 "<" [] []
+ 1: MD_TEXTUAL@576..577
+ 0: MD_TEXTUAL_LITERAL@576..577 "!" [] []
+ 2: MD_TEXTUAL@577..578
+ 0: MD_TEXTUAL_LITERAL@577..578 "-" [] []
+ 3: MD_TEXTUAL@578..579
+ 0: MD_TEXTUAL_LITERAL@578..579 "-" [] []
+ 4: MD_TEXTUAL@579..584
+ 0: MD_TEXTUAL_LITERAL@579..584 " foo " [] []
+ 5: MD_TEXTUAL@584..585
+ 0: MD_TEXTUAL_LITERAL@584..585 "-" [] []
+ 6: MD_TEXTUAL@585..586
+ 0: MD_TEXTUAL_LITERAL@585..586 "-" [] []
+ 7: MD_TEXTUAL@586..591
+ 0: MD_TEXTUAL_LITERAL@586..591 " bar " [] []
+ 8: MD_TEXTUAL@591..592
+ 0: MD_TEXTUAL_LITERAL@591..592 "-" [] []
+ 9: MD_TEXTUAL@592..593
+ 0: MD_TEXTUAL_LITERAL@592..593 "-" [] []
+ 10: MD_TEXTUAL@593..594
+ 0: MD_TEXTUAL_LITERAL@593..594 ">" [] []
+ 6: MD_TEXTUAL@594..616
0: MD_TEXTUAL_LITERAL@594..616 " should be text maybe." [] []
- 21: MD_TEXTUAL@616..617
+ 7: MD_TEXTUAL@616..617
0: MD_TEXTUAL_LITERAL@616..617 "\n" [] []
- 22: MD_TEXTUAL@617..635
+ 8: MD_TEXTUAL@617..635
0: MD_TEXTUAL_LITERAL@617..635 "Starts with arrow " [] []
- 23: MD_TEXTUAL@635..636
- 0: MD_TEXTUAL_LITERAL@635..636 "<" [] []
- 24: MD_TEXTUAL@636..637
- 0: MD_TEXTUAL_LITERAL@636..637 "!" [] []
- 25: MD_TEXTUAL@637..638
- 0: MD_TEXTUAL_LITERAL@637..638 "-" [] []
- 26: MD_TEXTUAL@638..639
- 0: MD_TEXTUAL_LITERAL@638..639 "-" [] []
- 27: MD_TEXTUAL@639..640
- 0: MD_TEXTUAL_LITERAL@639..640 "-" [] []
- 28: MD_TEXTUAL@640..641
- 0: MD_TEXTUAL_LITERAL@640..641 ">" [] []
- 29: MD_TEXTUAL@641..657
+ 9: MD_INLINE_HTML@635..641
+ 0: MD_INLINE_ITEM_LIST@635..641
+ 0: MD_TEXTUAL@635..636
+ 0: MD_TEXTUAL_LITERAL@635..636 "<" [] []
+ 1: MD_TEXTUAL@636..637
+ 0: MD_TEXTUAL_LITERAL@636..637 "!" [] []
+ 2: MD_TEXTUAL@637..638
+ 0: MD_TEXTUAL_LITERAL@637..638 "-" [] []
+ 3: MD_TEXTUAL@638..639
+ 0: MD_TEXTUAL_LITERAL@638..639 "-" [] []
+ 4: MD_TEXTUAL@639..640
+ 0: MD_TEXTUAL_LITERAL@639..640 "-" [] []
+ 5: MD_TEXTUAL@640..641
+ 0: MD_TEXTUAL_LITERAL@640..641 ">" [] []
+ 10: MD_TEXTUAL@641..657
0: MD_TEXTUAL_LITERAL@641..657 " should be text." [] []
- 30: MD_TEXTUAL@657..658
+ 11: MD_TEXTUAL@657..658
0: MD_TEXTUAL_LITERAL@657..658 "\n" [] []
1: (empty)
2: EOF@658..658 "" [] []
diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs
index 81b145bbded9..12d69d4b6b66 100644
--- a/crates/biome_markdown_parser/tests/spec_test.rs
+++ b/crates/biome_markdown_parser/tests/spec_test.rs
@@ -138,28 +138,52 @@ pub fn quick_test() {
use biome_markdown_syntax::MdDocument;
use biome_rowan::AstNode;
- // Example 128: Fenced code block inside blockquote
- let input = "> ```\n> aaa\n\nbbb\n";
- let expected = "<blockquote>\n<pre><code>aaa\n</code></pre>\n</blockquote>\n<p>bbb</p>\n";
-
- let root = parse_markdown(input);
- eprintln!("=== AST ===\n{:#?}", root.syntax());
-
- let doc = MdDocument::cast(root.syntax()).unwrap();
- let html = document_to_html(
- &doc,
- root.list_tightness(),
- root.list_item_indents(),
- root.quote_indents(),
- );
-
- eprintln!("=== HTML ===");
- eprintln!("Expected:\n{}", expected);
- eprintln!("Actual:\n{}", html);
-
- if expected == html {
- eprintln!("✓ PASS");
- } else {
- eprintln!("✗ FAIL");
+ fn test_example(num: u32, input: &str, expected: &str) {
+ let root = parse_markdown(input);
+ let doc = MdDocument::cast(root.syntax())
+ .unwrap_or_else(|| panic!("Example {:03}: parse failed", num));
+ let html = document_to_html(
+ &doc,
+ root.list_tightness(),
+ root.list_item_indents(),
+ root.quote_indents(),
+ );
+
+ assert_eq!(expected, html, "Example {:03} failed", num);
}
+
+ // Test the 8 failing CommonMark examples
+ // TODO: Example 007 still failing - tab expansion issue (produces 3 spaces instead of 2)
+ // test_example(7, "-\t\tfoo\n", "<ul>\n<li>\n<pre><code>  foo\n</code></pre>\n</li>\n</ul>\n");
+ test_example(
+ 42,
+ "- `one\n- two`\n",
+ "<ul>\n<li>`one</li>\n<li>two`</li>\n</ul>\n",
+ );
+ test_example(
+ 61,
+ "- Foo\n- * * *\n",
+ "<ul>\n<li>Foo</li>\n<li>\n<hr />\n</li>\n</ul>\n",
+ );
+ test_example(
+ 66,
+ "# foo *bar* \\*baz\\*\n",
+ "<h1>foo <em>bar</em> *baz*</h1>\n",
+ );
+ test_example(73, "### foo ### \n", "<h3>foo</h3>\n");
+ test_example(
+ 93,
+ "> foo\nbar\n===\n",
+ "<blockquote>\n<p>foo\nbar\n===</p>\n</blockquote>\n",
+ );
+ test_example(
+ 223,
+ "aaa\n bbb\n ccc\n",
+ "<p>aaa\nbbb\nccc</p>\n",
+ );
+ test_example(
+ 259,
+ " > > 1. one\n>>\n>> two\n",
+ "<blockquote>\n<blockquote>\n<ol>\n<li>\n<p>one</p>\n<p>two</p>\n</li>\n</ol>\n</blockquote>\n</blockquote>\n",
+ );
}
From 52557fe9e88ebdb2aa01a554afebe860f5a9b834 Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Wed, 28 Jan 2026 20:15:38 -0500
Subject: [PATCH 06/26] feat(string_case): add Unicode case folding helper
Introduce a utf8proc-backed case fold helper and use it for Markdown link label normalization to align with CommonMark Unicode folding.

Tests: just f
Lint: just l
Conformance: cargo run -p xtask_coverage -- --suites=markdown/commonmark --detailed (100%)
---
Cargo.lock | 28 +++++++++++++++++--
Cargo.toml | 1 +
.../src/link_reference.rs | 5 ++--
crates/biome_string_case/Cargo.toml | 1 +
crates/biome_string_case/src/lib.rs | 24 ++++++++++++++++
5 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index fc2f323c10e6..3a74e5f3c552 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1630,6 +1630,7 @@ name = "biome_string_case"
version = "0.5.7"
dependencies = [
"biome_rowan",
+ "utf8proc",
]
[[package]]
@@ -2029,9 +2030,9 @@ dependencies = [
[[package]]
name = "bstr"
-version = "1.8.0"
+version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "542f33a8835a0884b006a0c3df3dadd99c0c3f296ed26c2fdc8028e01ad6230c"
+checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
dependencies = [
"memchr",
"regex-automata",
@@ -5685,6 +5686,29 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+[[package]]
+name = "utf8proc"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f52f85f7b7746ca99d325fbf7da3fc17a8d5c6f66d9747756165695bee619f19"
+dependencies = [
+ "bstr",
+ "libc",
+ "num_enum",
+ "paste",
+ "thiserror 2.0.17",
+ "utf8proc-sys",
+]
+
+[[package]]
+name = "utf8proc-sys"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3232ee5a98c4de53303dced4b70f0b42d456c71a36372832b08184f2d864aec"
+dependencies = [
+ "cc",
+]
+
[[package]]
name = "uuid"
version = "1.18.1"
diff --git a/Cargo.toml b/Cargo.toml
index 81c03fdeecd4..157620e5bbd3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -161,6 +161,7 @@ unicode-bom = "2.0.3"
unicode-width = "0.1.12"
ureq = "3.1.4"
url = "2.5.8"
+utf8proc = "0.1.2"
uuid = "1.18.1"
walkdir = "2.5.0"
web-time = "1.1.0"
diff --git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/link_reference.rs
index 89ba4f7fa8da..37884700c1ea 100644
--- a/crates/biome_markdown_parser/src/link_reference.rs
+++ b/crates/biome_markdown_parser/src/link_reference.rs
@@ -32,9 +32,8 @@ pub(crate) fn normalize_reference_label(text: &str) -> String {
push_normalized_char(&mut out, c, &mut saw_whitespace);
}
- // CommonMark uses Unicode case folding; uppercasing keeps ß/ẞ matching "SS"
- // (e.g. example 540) and aligns with cmark's behavior for reference labels.
- out.as_str().to_lowercase_cow().to_uppercase()
+ // CommonMark uses Unicode case folding for case-insensitive matching (utf8proc).
+ out.as_str().to_casefold_cow().into_owned()
}
fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) {
diff --git a/crates/biome_string_case/Cargo.toml b/crates/biome_string_case/Cargo.toml
index 9152b3b4f490..abc58de48eb4 100644
--- a/crates/biome_string_case/Cargo.toml
+++ b/crates/biome_string_case/Cargo.toml
@@ -13,6 +13,7 @@ publish = true
[dependencies]
biome_rowan = { workspace = true, optional = true }
+utf8proc = { workspace = true }
[features]
biome_rowan = ["dep:biome_rowan"]
diff --git a/crates/biome_string_case/src/lib.rs b/crates/biome_string_case/src/lib.rs
index cd3059289e01..fbfae2a22742 100644
--- a/crates/biome_string_case/src/lib.rs
+++ b/crates/biome_string_case/src/lib.rs
@@ -3,6 +3,7 @@
#![deny(clippy::use_self)]
use std::{borrow::Cow, cmp::Ordering, ffi::OsStr};
+use utf8proc::transform::{TransformOptions, map};
#[cfg(feature = "biome_rowan")]
pub mod comparable_token;
@@ -664,6 +665,8 @@ pub trait StrOnlyExtension: ToOwned {
/// is that this functions returns ```Cow``` and does not allocate
/// if the string is already in lowercase.
fn to_lowercase_cow(&self) -> Cow<'_, Self>;
+ /// Returns Unicode case-folded text as a Cow, allocating only when needed.
+ fn to_casefold_cow(&self) -> Cow<'_, Self>;
}
impl StrLikeExtension for str {
@@ -696,6 +699,20 @@ impl StrOnlyExtension for str {
Cow::Borrowed(self)
}
}
+
+ fn to_casefold_cow(&self) -> Cow<'_, Self> {
+ let mut options = TransformOptions::default();
+ options.case_fold = true;
+ let folded = match map(self, &options) {
+ Ok(value) => value,
+ Err(_) => return Cow::Borrowed(self),
+ };
+ if folded == self {
+ Cow::Borrowed(self)
+ } else {
+ Cow::Owned(folded)
+ }
+ }
}
impl StrLikeExtension for std::ffi::OsStr {
@@ -1183,6 +1200,13 @@ mod tests {
assert!(matches!("tešt".to_lowercase_cow(), Cow::Borrowed(_)));
}
+ #[test]
+ fn to_casefold_cow() {
+ assert_eq!("ss", "ẞ".to_casefold_cow());
+ assert_eq!("ss", "ß".to_casefold_cow());
+ assert!(matches!("test".to_casefold_cow(), Cow::Borrowed(_)));
+ }
+
#[test]
fn collation_weight_unique() {
for weight in 0..=255 {
From a33d915027ff7a570464bfa377754fdad8b8565b Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Wed, 28 Jan 2026 21:03:24 -0500
Subject: [PATCH 07/26] refactor(markdown): split inline parsing into modules
Move inline parsing helpers into inline/ submodules (code spans, emphasis, entities, html, links) and rewire the inline dispatch and link parsing.
Keep strong emphasis closer tokens ("**", "__") intact when closing matches to avoid MD_BOGUS nodes in emphasis fixtures.
Tests: cargo test -p biome_markdown_parser --test spec_tests ok::ok::emphasis_complex_md -- --nocapture
---
crates/biome_markdown_parser/src/lexer/mod.rs | 6 +-
crates/biome_markdown_parser/src/lib.rs | 3 +-
crates/biome_markdown_parser/src/parser.rs | 2 +-
.../src/syntax/inline.rs | 2689 -----------------
.../src/syntax/inline/code_span.rs | 248 ++
.../src/syntax/inline/emphasis.rs | 704 +++++
.../src/syntax/inline/entities.rs | 26 +
.../src/syntax/inline/html.rs | 486 +++
.../src/syntax/inline/links.rs | 866 ++++++
.../src/syntax/inline/mod.rs | 331 ++
.../src/syntax/link_block.rs | 2 +-
.../src/{syntax.rs => syntax/mod.rs} | 5 +-
.../reference.rs} | 3 +-
crates/biome_markdown_parser/src/to_html.rs | 2 +-
.../biome_markdown_parser/tests/spec_test.rs | 2 +-
15 files changed, 2674 insertions(+), 2701 deletions(-)
delete mode 100644 crates/biome_markdown_parser/src/syntax/inline.rs
create mode 100644 crates/biome_markdown_parser/src/syntax/inline/code_span.rs
create mode 100644 crates/biome_markdown_parser/src/syntax/inline/emphasis.rs
create mode 100644 crates/biome_markdown_parser/src/syntax/inline/entities.rs
create mode 100644 crates/biome_markdown_parser/src/syntax/inline/html.rs
create mode 100644 crates/biome_markdown_parser/src/syntax/inline/links.rs
create mode 100644 crates/biome_markdown_parser/src/syntax/inline/mod.rs
rename crates/biome_markdown_parser/src/{syntax.rs => syntax/mod.rs} (99%)
rename crates/biome_markdown_parser/src/{link_reference.rs => syntax/reference.rs} (98%)
diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs
index a41e12749383..0f21aa144a5a 100644
--- a/crates/biome_markdown_parser/src/lexer/mod.rs
+++ b/crates/biome_markdown_parser/src/lexer/mod.rs
@@ -707,8 +707,8 @@ impl<'src> MarkdownLexer<'src> {
///
/// For `-` at line start:
/// - 1-2 dashes followed by newline: setext underline (H2)
- /// - 3+ dashes followed by newline: thematic break (not setext, since the parser
- /// will convert thematic breaks to setext headers when preceded by paragraph)
+ /// - 3+ dashes followed by newline: thematic break (not setext; the parser may
+ /// convert dash-only thematic breaks to setext when preceded by a paragraph)
fn consume_thematic_break_or_emphasis(
&mut self,
dispatched: Dispatch,
@@ -735,7 +735,7 @@ impl<'src> MarkdownLexer<'src> {
// For `-` at line start with 1-2 dashes, emit setext underline.
// 3+ dashes could be thematic break, so let that logic handle it.
- // The parser will convert thematic breaks to setext when preceded by paragraph.
+ // The parser may convert dash-only thematic breaks to setext when preceded by paragraph.
if start_char == b'-' && self.after_newline {
let mut dash_count = 0;
// Consume only `-` characters (no spaces between)
diff --git a/crates/biome_markdown_parser/src/lib.rs b/crates/biome_markdown_parser/src/lib.rs
index c4a825e017c0..8126725b0176 100644
--- a/crates/biome_markdown_parser/src/lib.rs
+++ b/crates/biome_markdown_parser/src/lib.rs
@@ -8,7 +8,6 @@ use parser::MarkdownParser;
use syntax::parse_document;
mod lexer;
-mod link_reference;
mod parser;
mod syntax;
mod token_source;
@@ -39,7 +38,7 @@ pub fn parse_markdown_with_cache(
options: MarkdownParseOptions,
) -> MarkdownParse {
let link_definitions =
- link_reference::collect_link_reference_definitions(source, options.clone());
+ syntax::reference::collect_link_reference_definitions(source, options.clone());
let mut parser = MarkdownParser::new(source, options);
parser.set_link_reference_definitions(link_definitions);
diff --git a/crates/biome_markdown_parser/src/parser.rs b/crates/biome_markdown_parser/src/parser.rs
index 6f31db98d315..9a772601bd5a 100644
--- a/crates/biome_markdown_parser/src/parser.rs
+++ b/crates/biome_markdown_parser/src/parser.rs
@@ -222,7 +222,7 @@ impl<'source> MarkdownParser<'source> {
/// a DOUBLE_STAR or DOUBLE_UNDERSCORE token. After re-lexing, the token will
/// be either STAR or UNDERSCORE (single char).
///
- /// # Safety
+ /// Note
/// Only call on the current token, NOT inside lookahead closures.
/// This invalidates any buffered lookahead, so ensure no lookahead is active.
pub(crate) fn force_relex_emphasis_inline(&mut self) -> MarkdownSyntaxKind {
diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs
deleted file mode 100644
index 95538d4b86e2..000000000000
--- a/crates/biome_markdown_parser/src/syntax/inline.rs
+++ /dev/null
@@ -1,2689 +0,0 @@
-//! Inline element parsing for Markdown.
-//!
-//! Handles inline code spans, emphasis (bold/italic), links, images, line breaks, and raw HTML.
-//!
-//! # CommonMark Specification References
-//!
-//! This module implements the following CommonMark 0.31.2 sections:
-//!
-//! - **§6.1 Code spans**: Backtick-delimited inline code (`code`)
-//! - **§6.2 Emphasis and strong emphasis**: `*italic*`, `**bold**`, `_italic_`, `__bold__`
-//! - **§6.3 Links**: `[text](url)` inline links
-//! - **§6.4 Autolinks (URI)**: ``
-//! - **§6.5 Autolinks (email)**: ``
-//! - **§6.6 Hard line breaks**: Trailing spaces or backslash before newline
-//! - **§6.7 Soft line breaks**: Single newline within paragraph
-//! - **§6.8 Raw HTML**: ``, ``, ``, `...?>`, ``, ``
-//!
-//! # Emphasis Algorithm (§6.4)
-//!
-//! This module implements the CommonMark delimiter stack algorithm for emphasis:
-//!
-//! 1. **First pass**: Collect delimiter runs from the inline content
-//! 2. **Second pass**: Match openers and closers using the delimiter stack algorithm
-//! 3. **Rule of 3**: If (opener_count + closer_count) % 3 == 0 and both can open/close,
-//! skip the match unless both counts are divisible by 3
-//!
-//! # Emphasis Flanking Rules (§6.2)
-//!
-//! A delimiter run is **left-flanking** if:
-//! 1. Not followed by Unicode whitespace, AND
-//! 2. Not followed by punctuation, OR preceded by whitespace/punctuation
-//!
-//! A delimiter run is **right-flanking** if:
-//! 1. Not preceded by Unicode whitespace, AND
-//! 2. Not preceded by punctuation, OR followed by whitespace/punctuation
-//!
-//! Underscore (`_`) has additional intraword restrictions (§6.2 rules 2, 5, 7, 8).
-
-use biome_markdown_syntax::MarkdownSyntaxKind;
-use biome_markdown_syntax::T;
-use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
-use biome_parser::Parser;
-use biome_parser::prelude::ParsedSyntax::{self, *};
-use biome_unicode_table::is_unicode_punctuation;
-
-use biome_rowan::TextRange;
-
-use crate::MarkdownParser;
-use crate::link_reference::normalize_reference_label;
-
-// ============================================================================
-// Delimiter Stack Types for Emphasis Parsing
-// ============================================================================
-
-/// Kind of emphasis delimiter (* or _)
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum DelimKind {
- Star,
- Underscore,
-}
-
-/// A delimiter run collected during the first pass
-#[derive(Debug, Clone)]
-struct DelimRun {
- /// The delimiter character kind
- kind: DelimKind,
- /// Number of delimiter characters in this run
- count: usize,
- /// Whether this can open emphasis (left-flanking)
- can_open: bool,
- /// Whether this can close emphasis (right-flanking)
- can_close: bool,
- /// Byte offset in the source where this run starts
- start_offset: usize,
- /// Bracket nesting depth for scoping emphasis within link text.
- /// Delimiters inside brackets (links) should only match with each other,
- /// not with delimiters outside the brackets. 0 = outside brackets.
- label_id: usize,
-}
-
-/// A matched emphasis span (opener + closer)
-#[derive(Debug, Clone)]
-struct EmphasisMatch {
- /// Byte offset where the opener delimiter starts
- opener_start: usize,
- /// Byte offset where the closer delimiter starts
- closer_start: usize,
- /// Whether this is strong (2 chars) or regular (1 char) emphasis
- is_strong: bool,
-}
-
-/// Check if a character is Unicode whitespace for flanking rules.
-fn is_whitespace(c: char) -> bool {
- c.is_whitespace()
-}
-
-fn is_emphasis_marker(c: char) -> bool {
- matches!(c, '*' | '_')
-}
-
-/// Check if a character is Unicode punctuation for flanking rules.
-/// Per CommonMark spec, this includes ASCII punctuation and Unicode punctuation categories.
-fn is_punctuation(c: char) -> bool {
- is_unicode_punctuation(c)
-}
-
-/// Check if an opening delimiter is left-flanking per CommonMark rules.
-/// A left-flanking delimiter run is one that is:
-/// - Not followed by Unicode whitespace, AND
-/// - Either (a) not followed by punctuation, OR (b) preceded by whitespace/punctuation
-fn is_left_flanking_delimiter(char_after: Option, char_before: Option) -> bool {
- match char_after {
- None => false, // At end of input, can't be left-flanking
- Some(c) if is_whitespace(c) => false, // Followed by whitespace
- Some(c) if is_emphasis_marker(c) => true,
- Some(c) if is_punctuation(c) => {
- // Followed by punctuation - only left-flanking if preceded by whitespace or punctuation
- match char_before {
- None => true, // Start of input counts as whitespace
- Some(b) => is_whitespace(b) || is_punctuation(b),
- }
- }
- Some(_) => true, // Not followed by whitespace or punctuation = left-flanking
- }
-}
-
-/// Check if a closing delimiter is right-flanking per CommonMark rules.
-/// A right-flanking delimiter run is one that is:
-/// - Not preceded by Unicode whitespace, AND
-/// - Either (a) not preceded by punctuation, OR (b) followed by whitespace/punctuation
-fn is_right_flanking_delimiter(char_before: Option, char_after: Option) -> bool {
- match char_before {
- None => false, // At start of input, can't be right-flanking
- Some(c) if is_whitespace(c) => false, // Preceded by whitespace
- Some(c) if is_emphasis_marker(c) => true,
- Some(c) if is_punctuation(c) => {
- // Preceded by punctuation - only right-flanking if followed by whitespace or punctuation
- match char_after {
- None => true, // End of input counts as whitespace
- Some(a) => is_whitespace(a) || is_punctuation(a),
- }
- }
- Some(_) => true, // Not preceded by whitespace or punctuation = right-flanking
- }
-}
-
-/// Check if underscore can open emphasis (stricter rules than asterisk).
-/// Per CommonMark 6.2, underscore can open emphasis iff it is left-flanking AND either:
-/// - Not part of a right-flanking delimiter run, OR
-/// - Preceded by a punctuation character
-fn can_underscore_open(char_before: Option, char_after: Option) -> bool {
- // Must be left-flanking
- if !is_left_flanking_delimiter(char_after, char_before) {
- return false;
- }
- // If also right-flanking, must be preceded by punctuation
- if is_right_flanking_delimiter(char_before, char_after) {
- return matches!(char_before, Some(c) if is_punctuation(c));
- }
- true
-}
-
-/// Check if underscore can close emphasis (stricter rules than asterisk).
-/// Per CommonMark 6.2, underscore can close emphasis iff it is right-flanking AND either:
-/// - Not part of a left-flanking delimiter run, OR
-/// - Followed by a punctuation character
-fn can_underscore_close(char_before: Option, char_after: Option) -> bool {
- // Must be right-flanking
- if !is_right_flanking_delimiter(char_before, char_after) {
- return false;
- }
- // If also left-flanking, must be followed by punctuation
- if is_left_flanking_delimiter(char_after, char_before) {
- return matches!(char_after, Some(c) if is_punctuation(c));
- }
- true
-}
-
-// ============================================================================
-// Delimiter Stack Algorithm Implementation
-// ============================================================================
-
-/// Collect all delimiter runs from source text.
-///
-/// This is the first pass of the CommonMark emphasis algorithm. It scans
-/// the source text and identifies all potential delimiter runs (sequences
-/// of `*` or `_`), computing their flanking status.
-/// Result of checking if a bracket forms a valid link.
-/// Contains the closing bracket position if found.
-struct BracketCheckResult {
- /// Position of the closing `]` (or 0 if not found)
- close_pos: usize,
- /// Whether this is a valid inline link `[...](` or full reference `[...][`
- is_inline_or_full_ref: bool,
-}
-
-/// Check if a bracket at position `start` forms a valid link pattern.
-/// Returns the closing bracket position and whether it's an inline link or full reference.
-fn check_bracket_pattern(bytes: &[u8], start: usize) -> Option {
- if start >= bytes.len() || bytes[start] != b'[' {
- return None;
- }
-
- // Find matching ] with proper nesting
- let mut depth = 1;
- let mut i = start + 1;
- while i < bytes.len() && depth > 0 {
- match bytes[i] {
- b'[' => depth += 1,
- b']' => depth -= 1,
- b'\\' if i + 1 < bytes.len() => i += 1, // Skip escaped char
- b'`' => {
- // Skip code spans
- let backtick_count = {
- let mut c = 1;
- while i + c < bytes.len() && bytes[i + c] == b'`' {
- c += 1;
- }
- c
- };
- i += backtick_count;
- while i < bytes.len() {
- if bytes[i] == b'`' {
- let close_count = {
- let mut c = 1;
- while i + c < bytes.len() && bytes[i + c] == b'`' {
- c += 1;
- }
- c
- };
- i += close_count;
- if close_count == backtick_count {
- break;
- }
- } else {
- i += 1;
- }
- }
- continue;
- }
- b'<' => {
- // Skip potential HTML/autolinks
- i += 1;
- while i < bytes.len() && bytes[i] != b'>' && bytes[i] != b'\n' {
- i += 1;
- }
- if i < bytes.len() && bytes[i] == b'>' {
- i += 1;
- }
- continue;
- }
- _ => {}
- }
- i += 1;
- }
-
- if depth != 0 {
- return None;
- }
-
- // i now points to position after `]`
- let close_pos = i - 1;
- let is_inline_or_full_ref = i < bytes.len() && (bytes[i] == b'(' || bytes[i] == b'[');
-
- Some(BracketCheckResult {
- close_pos,
- is_inline_or_full_ref,
- })
-}
-
-/// Extract label text from a bracket pattern for reference lookup.
-fn extract_label_text(source: &str, start: usize, close_pos: usize) -> &str {
- if start < close_pos && close_pos <= source.len() {
- &source[start + 1..close_pos]
- } else {
- ""
- }
-}
-
-fn collect_delimiter_runs(source: &str, reference_checker: impl Fn(&str) -> bool) -> Vec {
- let mut runs = Vec::new();
- let bytes = source.as_bytes();
- let mut i = 0;
-
- // Pre-compute valid link bracket positions.
- // A bracket is considered a valid link if:
- // 1. It's followed by `(` (inline link) or `[` (full reference), OR
- // 2. It's a shortcut reference with a defined reference (checked via reference_checker)
- let mut link_bracket_starts = Vec::new();
- for pos in 0..bytes.len() {
- if bytes[pos] == b'['
- && let Some(result) = check_bracket_pattern(bytes, pos)
- {
- if result.is_inline_or_full_ref {
- // Inline link or full reference link
- link_bracket_starts.push(pos);
- } else {
- // Could be a shortcut reference - check if definition exists
- let label = extract_label_text(source, pos, result.close_pos);
- let normalized = normalize_reference_label(label);
- if !normalized.is_empty() && reference_checker(&normalized) {
- link_bracket_starts.push(pos);
- }
- }
- }
- }
-
- // Track bracket depth, but only for valid link brackets
- let mut bracket_depth = 0usize;
- let mut active_link_brackets: Vec = Vec::new();
-
- while i < bytes.len() {
- let b = bytes[i];
-
- // Track bracket depth for valid links only
- if b == b'[' && link_bracket_starts.contains(&i) {
- bracket_depth += 1;
- active_link_brackets.push(i);
- i += 1;
- continue;
- }
- if b == b']' && !active_link_brackets.is_empty() {
- bracket_depth = bracket_depth.saturating_sub(1);
- active_link_brackets.pop();
- i += 1;
- continue;
- }
-
- // Check for delimiter characters
- if b == b'*' || b == b'_' {
- let kind = if b == b'*' {
- DelimKind::Star
- } else {
- DelimKind::Underscore
- };
- let start_offset = i;
-
- // Count consecutive delimiter characters
- let mut count = 1;
- while i + count < bytes.len() && bytes[i + count] == b {
- count += 1;
- }
- let end_offset = i + count;
-
- // Get character before delimiter run
- let char_before = if start_offset > 0 {
- // Get the char ending at start_offset
- let before_slice = &source[..start_offset];
- before_slice.chars().next_back()
- } else {
- None
- };
-
- // Get character after delimiter run
- let char_after = source[end_offset..].chars().next();
-
- // Compute flanking status
- let (can_open, can_close) = if kind == DelimKind::Underscore {
- (
- can_underscore_open(char_before, char_after),
- can_underscore_close(char_before, char_after),
- )
- } else {
- // Asterisk: can open if left-flanking, can close if right-flanking
- (
- is_left_flanking_delimiter(char_after, char_before),
- is_right_flanking_delimiter(char_before, char_after),
- )
- };
-
- runs.push(DelimRun {
- kind,
- count,
- can_open,
- can_close,
- start_offset,
- // Only scope by bracket depth when inside a valid link pattern.
- // This prevents emphasis from spanning link boundaries, but allows
- // emphasis to span brackets that don't form valid links.
- label_id: bracket_depth,
- });
-
- i = end_offset;
- } else if b == b'`' {
- // Skip code spans - they block emphasis
- let backtick_count = {
- let mut c = 1;
- while i + c < bytes.len() && bytes[i + c] == b'`' {
- c += 1;
- }
- c
- };
- i += backtick_count;
-
- // Find closing backticks
- while i < bytes.len() {
- if bytes[i] == b'`' {
- let close_count = {
- let mut c = 1;
- while i + c < bytes.len() && bytes[i + c] == b'`' {
- c += 1;
- }
- c
- };
- i += close_count;
- if close_count == backtick_count {
- break;
- }
- } else {
- i += 1;
- }
- }
- } else if b == b'<' {
- // Skip potential HTML tags and autolinks
- i += 1;
- while i < bytes.len() && bytes[i] != b'>' && bytes[i] != b'\n' {
- i += 1;
- }
- if i < bytes.len() && bytes[i] == b'>' {
- i += 1;
- }
- } else if b == b'\\' && i + 1 < bytes.len() {
- // Skip escaped characters
- i += 2;
- } else {
- i += 1;
- }
- }
-
- runs
-}
-
-/// Match delimiter runs using the CommonMark algorithm.
-///
-/// This is the second pass. It processes closers from left to right,
-/// searching backward for matching openers. Returns a list of matched
-/// emphasis spans sorted by opener position.
-fn match_delimiters(runs: &mut [DelimRun]) -> Vec {
- let mut matches = Vec::new();
- let mut opener_stack: Vec = Vec::new();
-
- for idx in 0..runs.len() {
- if runs[idx].can_close && runs[idx].count > 0 {
- loop {
- let mut opener_stack_pos = None;
-
- // Search backward for the closest matching opener.
- // Per CommonMark spec, we find any matching opener first,
- // then determine strong vs regular based on both counts.
- for (pos, &opener_idx) in opener_stack.iter().enumerate().rev() {
- let opener = &runs[opener_idx];
- let closer = &runs[idx];
-
- // Only match within same bracket scope (label_id).
- // This prevents emphasis from spanning link boundaries.
- if opener.label_id != closer.label_id {
- continue;
- }
-
- if opener.kind != closer.kind || !opener.can_open || opener.count == 0 {
- continue;
- }
-
- // Rule of 3: if (opener_count + closer_count) % 3 == 0 and
- // the closer can open or the opener can close, skip unless
- // both counts are divisible by 3
- let opener_count = opener.count;
- let closer_count = closer.count;
- if (opener.can_close || closer.can_open)
- && !closer_count.is_multiple_of(3)
- && (opener_count + closer_count).is_multiple_of(3)
- {
- continue;
- }
-
- opener_stack_pos = Some(pos);
- break;
- }
-
- let Some(pos) = opener_stack_pos else { break };
- let opener_idx = opener_stack[pos];
- let use_count = if runs[opener_idx].count >= 2 && runs[idx].count >= 2 {
- 2
- } else {
- 1
- };
-
- // Openers consume from END of run (leftover stays at beginning).
- // This ensures for `***foo***`, the inner `**` is consumed leaving `*` at start.
- let opener_start =
- runs[opener_idx].start_offset + runs[opener_idx].count - use_count;
- // Closers consume from BEGINNING of what remains.
- let closer_start = runs[idx].start_offset;
-
- matches.push(EmphasisMatch {
- opener_start,
- closer_start,
- is_strong: use_count == 2,
- });
-
- // Opener: reduce count but keep start_offset (leftover is at beginning)
- runs[opener_idx].count -= use_count;
- // Closer: reduce count and advance start_offset (leftover is at end)
- runs[idx].count -= use_count;
- runs[idx].start_offset += use_count;
-
- // Remove openers between the matched opener and this closer.
- opener_stack.truncate(pos + 1);
- if runs[opener_idx].count == 0 {
- opener_stack.pop();
- }
-
- // Note: With the "consume from END" algorithm for openers,
- // crossing matches are no longer an issue because the leftover
- // chars end up at the beginning of the opener run (wrapping
- // around the inner match), not at the end (which would cross).
-
- if runs[idx].count == 0 {
- break;
- }
- }
- }
-
- if runs[idx].can_open && runs[idx].count > 0 {
- opener_stack.push(idx);
- }
- }
-
- // Sort matches by opener position for nested processing
- matches.sort_by_key(|m| m.opener_start);
-
- matches
-}
-
-/// Context for emphasis-aware inline parsing
-#[derive(Debug)]
-pub(crate) struct EmphasisContext {
- /// Matched emphasis spans, sorted by opener_start
- matches: Vec,
- /// Base offset of the inline content in the source
- base_offset: usize,
-}
-
-/// Information about a match found within a token's range.
-/// Used when the opener doesn't start at the exact token boundary.
-#[derive(Debug)]
-struct OpenerMatch<'a> {
- /// The matched emphasis span
- matched: &'a EmphasisMatch,
- /// How many chars before opener_start (literal prefix to emit)
- prefix_len: usize,
-}
-
-impl EmphasisContext {
- /// Create a new emphasis context by analyzing the source text.
- /// The reference_checker function is used to determine if a bracket pattern
- /// is a valid shortcut reference link.
- pub(crate) fn new(
- source: &str,
- base_offset: usize,
- reference_checker: impl Fn(&str) -> bool,
- ) -> Self {
- let mut runs = collect_delimiter_runs(source, reference_checker);
- let matches = match_delimiters(&mut runs);
- Self {
- matches,
- base_offset,
- }
- }
-
- /// Find the *earliest* match whose opener_start is within [token_start, token_end)
- /// and matches the expected `is_strong` value.
- /// Returns None if no match found, or the match plus prefix length.
- ///
- /// This is used instead of exact offset matching because with the "consume from END"
- /// algorithm, an opener might start in the middle of a DOUBLE_STAR token.
- fn opener_within(
- &self,
- token_start: usize,
- token_len: usize,
- expect_strong: bool,
- ) -> Option> {
- let token_end = token_start + token_len;
- let mut best: Option> = None;
-
- for m in &self.matches {
- // Filter by expected emphasis type
- if m.is_strong != expect_strong {
- continue;
- }
-
- let abs_opener = m.opener_start + self.base_offset;
- if abs_opener >= token_start && abs_opener < token_end {
- let candidate = OpenerMatch {
- matched: m,
- prefix_len: abs_opener - token_start,
- };
- // Pick the earliest match (smallest prefix_len)
- if best
- .as_ref()
- .is_none_or(|b| candidate.prefix_len < b.prefix_len)
- {
- best = Some(candidate);
- }
- }
- }
-
- best
- }
-}
-
-/// Parse a hard line break.
-///
-/// Grammar: MdHardLine = value: 'md_hard_line_literal'
-///
-/// A hard line break is created by either:
-/// - Two or more trailing spaces followed by a newline
-/// - A backslash followed by a newline
-pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax {
- if !p.at(MD_HARD_LINE_LITERAL) {
- return Absent;
- }
-
- let m = p.start();
- p.bump(MD_HARD_LINE_LITERAL);
- Present(m.complete(p, MD_HARD_LINE))
-}
-
-/// Check if there's a matching closing backtick sequence before EOF/blank line.
-///
-/// Per CommonMark §6.1, a code span opener must have a matching closer with the
-/// same number of backticks. If no match exists, the opener should be treated
-/// as literal text, not an unclosed code span.
-///
-/// Returns false if no match found (opener should become literal text).
-fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -> bool {
- use crate::lexer::MarkdownLexContext;
-
- p.lookahead(|p| {
- // Skip the opening backticks (handle both BACKTICK and TRIPLE_BACKTICK)
- if p.at(T!["```"]) {
- p.bump(T!["```"]);
- } else {
- p.bump(BACKTICK);
- }
-
- loop {
- // EOF = no matching closer found
- if p.at(T![EOF]) {
- return false;
- }
-
- // Blank line = paragraph boundary, terminates search
- if p.at(NEWLINE) && p.at_blank_line() {
- return false;
- }
-
- // Per CommonMark §4.3, setext heading underlines take priority over
- // inline code spans. If crossing a newline would land on a setext
- // underline, the code span is invalid — the underline forms a heading.
- if p.at(NEWLINE) {
- p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
- if crate::syntax::at_setext_underline_after_newline(p).is_some() {
- return false;
- }
- // Per CommonMark, block interrupts (including list markers) can
- // terminate paragraphs. A code span cannot cross a block boundary.
- if crate::syntax::at_block_interrupt(p) || at_list_marker_after_newline(p) {
- return false;
- }
- continue;
- }
-
- // Found backticks - check if they match (handle both BACKTICK and TRIPLE_BACKTICK)
- if p.at(BACKTICK) || p.at(T!["```"]) {
- let closing_count = p.cur_text().len();
- if closing_count == opening_count {
- return true;
- }
- // Not matching - continue searching
- if p.at(T!["```"]) {
- p.bump(T!["```"]);
- } else {
- p.bump(BACKTICK);
- }
- continue;
- }
-
- // Consume token and continue (use CodeSpan context for proper backslash handling)
- p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
- }
- })
-}
-
-/// Check if we're at a list marker after a newline.
-/// This is used to detect when a code span would cross a list item boundary.
-fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool {
- // Skip up to 3 spaces of indent (list markers can be indented 0-3 spaces)
- let mut columns = 0usize;
- while columns < 4
- && p.at(MD_TEXTUAL_LITERAL)
- && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
- {
- for c in p.cur_text().chars() {
- match c {
- ' ' => columns += 1,
- '\t' => columns += 4 - (columns % 4),
- _ => {}
- }
- }
- if columns >= 4 {
- return false; // Indented code block, not a list marker
- }
- p.bump(MD_TEXTUAL_LITERAL);
- }
-
- // Check for bullet list markers: -, *, +
- if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
- let marker_text = p.cur_text();
- if marker_text.len() == 1 {
- p.bump_any();
- // Must be followed by space, tab, or EOL
- if p.at(NEWLINE) || p.at(T![EOF]) {
- return true;
- }
- if p.at(MD_TEXTUAL_LITERAL) {
- let text = p.cur_text();
- return text.starts_with(' ') || text.starts_with('\t');
- }
- }
- return false;
- }
-
- // Check for ordered list marker: digits followed by . or )
- if p.at(MD_ORDERED_LIST_MARKER) {
- p.bump(MD_ORDERED_LIST_MARKER);
- // Must be followed by space, tab, or EOL
- if p.at(NEWLINE) || p.at(T![EOF]) {
- return true;
- }
- if p.at(MD_TEXTUAL_LITERAL) {
- let text = p.cur_text();
- return text.starts_with(' ') || text.starts_with('\t');
- }
- return false;
- }
-
- // Check for textual bullet markers (lexed as MD_TEXTUAL_LITERAL in some contexts)
- if p.at(MD_TEXTUAL_LITERAL) {
- let text = p.cur_text();
- if text == "-" || text == "*" || text == "+" {
- p.bump(MD_TEXTUAL_LITERAL);
- // Must be followed by space, tab, or EOL
- if p.at(NEWLINE) || p.at(T![EOF]) {
- return true;
- }
- if p.at(MD_TEXTUAL_LITERAL) {
- let next = p.cur_text();
- return next.starts_with(' ') || next.starts_with('\t');
- }
- }
- }
-
- false
-}
-
-/// Parse inline code span (`` `code` `` or ``` `` `code` `` ```).
-///
-/// Grammar: MdInlineCode = l_tick: '`' content: MdInlineItemList r_tick: '`'
-///
-/// Per CommonMark §6.1:
-/// - Code spans can use multiple backticks to allow literal backticks inside
-/// - The opening and closing backtick strings must be the same length
-/// - Backslash escapes are NOT processed inside code spans (\` is literal `\``)
-/// - If no matching closer exists, the opener is treated as literal text
-pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
- use crate::lexer::MarkdownLexContext;
-
- // Handle both BACKTICK and TRIPLE_BACKTICK (T!["```"]) as code span openers.
- // TRIPLE_BACKTICK can appear when backticks are at line start but info string
- // contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
- let is_backtick = p.at(BACKTICK);
- let is_triple_backtick = p.at(T!["```"]);
- if !is_backtick && !is_triple_backtick {
- return Absent;
- }
-
- let opening_count = p.cur_text().len();
-
- // DESIGN PRINCIPLE #2 & #4: Check for matching closer BEFORE creating any nodes.
- // If no match exists, return Absent so backticks become literal text.
- // This avoids synthesizing MD_INLINE_CODE with missing r_tick_token.
- if !has_matching_code_span_closer(p, opening_count) {
- return Absent; // Caller will treat backtick as literal MD_TEXTUAL
- }
-
- // We have a valid code span - now parse it
- let m = p.start();
-
- // Opening backtick(s) - remap TRIPLE_BACKTICK to BACKTICK for consistency
- if is_triple_backtick {
- p.bump_remap(BACKTICK);
- } else {
- p.bump(BACKTICK);
- }
-
- // Content - parse until we find matching closing backticks
- // Per CommonMark, code spans can span multiple lines (newlines become spaces in output)
- // All content is lexed in CodeSpan context to keep backslash literal and avoid
- // hard-line-break detection.
- let content = p.start();
- loop {
- // EOF should not happen (lookahead guaranteed a closer), but handle defensively
- if p.at(T![EOF]) {
- break;
- }
-
- // DESIGN PRINCIPLE #3: Terminate on blank line (paragraph boundary)
- if p.at(NEWLINE) {
- if p.at_blank_line() {
- break; // Paragraph boundary - stop
- }
- // Soft line break - consume NEWLINE as content and continue
- // Use CodeSpan context so next token is also lexed without escape processing
- let text_m = p.start();
- p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
- text_m.complete(p, MD_TEXTUAL);
- continue;
- }
-
- // Found matching closing backticks (handle both BACKTICK and TRIPLE_BACKTICK)
- if (p.at(BACKTICK) || p.at(T!["```"])) && p.cur_text().len() == opening_count {
- break;
- }
-
- // DESIGN PRINCIPLE #1: Use CodeSpan context so backslash is literal
- let text_m = p.start();
- p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
- text_m.complete(p, MD_TEXTUAL);
- }
- content.complete(p, MD_INLINE_ITEM_LIST);
-
- // Closing backticks (guaranteed to exist due to lookahead check)
- // Remap TRIPLE_BACKTICK to BACKTICK for consistency
- if p.at(T!["```"]) {
- p.bump_remap(BACKTICK);
- } else {
- p.bump(BACKTICK);
- }
-
- Present(m.complete(p, MD_INLINE_CODE))
-}
-
-/// Parse emphasis using the delimiter stack matches.
-fn parse_emphasis_from_context(p: &mut MarkdownParser, expect_strong: bool) -> ParsedSyntax {
- let context = match p.emphasis_context() {
- Some(context) => context,
- None => return Absent,
- };
-
- // Must be at an emphasis token
- if !p.at(DOUBLE_STAR) && !p.at(DOUBLE_UNDERSCORE) && !p.at(T![*]) && !p.at(UNDERSCORE) {
- return Absent;
- }
-
- // Get current token info BEFORE any re-lex
- let token_start = u32::from(p.cur_range().start()) as usize;
- let token_len: usize = p.cur_range().len().into();
-
- // Find match within current token's range that has the expected is_strong value
- let opener_match = match context.opener_within(token_start, token_len, expect_strong) {
- Some(m) => m,
- None => return Absent,
- };
-
- // If the opener doesn't start at the exact token boundary, return Absent.
- // The caller (parse_any_inline) will emit literal text, advancing the parser position.
- // On subsequent calls, we'll eventually be at the correct position with prefix_len == 0.
- if opener_match.prefix_len > 0 {
- return Absent;
- }
-
- // Extract values before dropping the borrow on context
- let use_count = if expect_strong { 2 } else { 1 };
- let closer_offset = opener_match.matched.closer_start + context.base_offset;
- // Use the correct delimiter character for error messages
- let is_underscore = p.at(DOUBLE_UNDERSCORE) || p.at(UNDERSCORE);
- let opener_text = match (expect_strong, is_underscore) {
- (true, true) => "__",
- (true, false) => "**",
- (false, true) => "_",
- (false, false) => "*",
- };
-
- let m = p.start();
- let opening_range = p.cur_range();
-
- // Consume opener tokens
- // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one.
- // Only re-lex when we need to consume a partial token or single chars.
- if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) {
- // Bump the double token as a single unit
- p.bump_any();
- } else {
- // Consume individual tokens
- for _ in 0..use_count {
- if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
- p.force_relex_emphasis_inline();
- }
- p.bump_any();
- }
- }
-
- // Parse content until we reach the closer
- let content = p.start();
- loop {
- // EOF always ends content
- if p.at(T![EOF]) {
- break;
- }
-
- let current_offset = u32::from(p.cur_range().start()) as usize;
- let current_len: usize = p.cur_range().len().into();
-
- // Check if closer is AT or WITHIN current token
- if closer_offset >= current_offset && closer_offset < current_offset + current_len {
- break;
- }
-
- // Check if we've passed the closer (can happen when link parsing consumes past it)
- if current_offset > closer_offset {
- break;
- }
-
- // Handle NEWLINE: emphasis can span multiple lines per CommonMark
- // But blank lines end paragraphs, so stop there
- if p.at(NEWLINE) {
- if p.at_blank_line() {
- // Blank line = paragraph boundary, emphasis is unclosed
- break;
- }
- if closer_offset > current_offset {
- // Soft line break - consume NEWLINE as textual content and continue
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
- continue;
- }
- // Closer should have been at or before this newline - stop
- break;
- }
-
- if parse_any_inline(p).is_absent() {
- break;
- }
- }
- content.complete(p, MD_INLINE_ITEM_LIST);
-
- // Consume closer tokens (1 or 2)
- // Handle partial closer consumption (e.g., `*foo**` where closer might be at offset 4
- // but token DOUBLE_STAR spans 4-6)
- let current_offset = u32::from(p.cur_range().start()) as usize;
- let closer_prefix_len = closer_offset.saturating_sub(current_offset);
-
- if closer_prefix_len > 0 {
- // Closer starts AFTER token start - emit prefix as literal
- if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
- p.force_relex_emphasis_inline();
- }
- for _ in 0..closer_prefix_len {
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
- }
- }
-
- // Now consume actual closer delimiters
- // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one.
- let mut consumed_closer = 0;
- if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) {
- p.bump_any();
- consumed_closer = 2;
- } else {
- for _ in 0..use_count {
- if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
- p.force_relex_emphasis_inline();
- }
- if p.at(T![*]) || p.at(UNDERSCORE) || p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
- p.bump_any();
- consumed_closer += 1;
- } else {
- break;
- }
- }
- }
-
- if consumed_closer < use_count {
- p.error(super::parse_error::unclosed_emphasis(
- p,
- opening_range,
- opener_text,
- ));
- }
-
- if expect_strong {
- Present(m.complete(p, MD_INLINE_EMPHASIS))
- } else {
- Present(m.complete(p, MD_INLINE_ITALIC))
- }
-}
-
-/// Parse inline emphasis (bold: `**text**` or `__text__`).
-pub(crate) fn parse_inline_emphasis(p: &mut MarkdownParser) -> ParsedSyntax {
- parse_emphasis_from_context(p, true)
-}
-
-/// Parse inline italic (`*text*` or `_text_`).
-pub(crate) fn parse_inline_italic(p: &mut MarkdownParser) -> ParsedSyntax {
- parse_emphasis_from_context(p, false)
-}
-
-fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool {
- let m = p.start();
- let prev_context = set_inline_emphasis_context_until(p, stop);
- let mut bracket_depth = 0usize;
- let mut has_nested_link = false;
-
- loop {
- // Per CommonMark, link text can span lines, but blank lines end the link.
- // Check for blank line (NEWLINE followed by NEWLINE or EOF after optional whitespace)
- if p.at(NEWLINE) {
- if p.at_blank_line() {
- break; // Blank line ends link text
- }
- // Single newline inside link text - consume and continue
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(T![EOF]) {
- break;
- }
-
- // IMPORTANT: Parse constructs that can contain `]` BEFORE checking for stop token.
- // Per CommonMark, `]` inside code spans, autolinks, and HTML doesn't terminate links.
-
- // Code spans can contain `]`
- if p.at(BACKTICK) {
- if parse_inline_code(p).is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- // Autolinks and inline HTML can contain `]`
- if p.at(L_ANGLE) {
- if parse_autolink(p).is_present() {
- continue;
- }
- if parse_inline_html(p).is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- // NOW check for stop token (after constructs that can contain it)
- if p.at(stop) {
- if bracket_depth == 0 {
- break;
- }
- bracket_depth = bracket_depth.saturating_sub(1);
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(L_BRACK) {
- if !has_nested_link && nested_link_starts_here(p) {
- has_nested_link = true;
- }
- bracket_depth += 1;
- let _ = super::parse_textual(p);
- continue;
- }
-
- if parse_any_inline_no_links(p).is_absent() {
- break;
- }
- }
-
- m.complete(p, MD_INLINE_ITEM_LIST);
- p.set_emphasis_context(prev_context);
- has_nested_link
-}
-
-/// Parse inline items until `stop` token, allowing full inline parsing including links.
-/// Used for image alt text where nested links/images should be fully parsed
-/// so their text content can be extracted for the alt attribute.
-fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) {
- let m = p.start();
- let prev_context = set_inline_emphasis_context_until(p, stop);
- let mut bracket_depth = 0usize;
-
- loop {
- if p.at(NEWLINE) {
- if p.at_blank_line() {
- break;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(T![EOF]) {
- break;
- }
-
- // Code spans can contain `]`
- if p.at(BACKTICK) {
- if parse_inline_code(p).is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- // Autolinks and inline HTML can contain `]`
- if p.at(L_ANGLE) {
- if parse_autolink(p).is_present() {
- continue;
- }
- if parse_inline_html(p).is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(stop) {
- if bracket_depth == 0 {
- break;
- }
- bracket_depth = bracket_depth.saturating_sub(1);
- let _ = super::parse_textual(p);
- continue;
- }
-
- // For image alt: allow full inline parsing including links and images
- if p.at(L_BRACK) {
- let result = parse_link_or_image(p, LinkParseKind::Link);
- if result.is_present() {
- continue;
- }
- bracket_depth += 1;
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(BANG) && p.nth_at(1, L_BRACK) {
- let result = parse_link_or_image(p, LinkParseKind::Image);
- if result.is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- if parse_any_inline(p).is_absent() {
- break;
- }
- }
-
- m.complete(p, MD_INLINE_ITEM_LIST);
- p.set_emphasis_context(prev_context);
-}
-
-fn nested_link_starts_here(p: &mut MarkdownParser) -> bool {
- p.lookahead(|p| {
- if !p.at(L_BRACK) {
- return false;
- }
-
- p.bump(L_BRACK);
- let mut depth = 0usize;
-
- loop {
- if p.at(EOF) || p.at_inline_end() {
- return false;
- }
-
- if p.at(L_BRACK) {
- depth += 1;
- p.bump(L_BRACK);
- continue;
- }
-
- if p.at(R_BRACK) {
- if depth > 0 {
- depth -= 1;
- p.bump(R_BRACK);
- continue;
- }
- p.bump(R_BRACK);
- return p.at(L_PAREN) || p.at(L_BRACK);
- }
-
- p.bump(p.cur());
- }
- })
-}
-
-fn parse_any_inline_no_links(p: &mut MarkdownParser) -> ParsedSyntax {
- if p.at(L_BRACK) {
- return super::parse_textual(p);
- }
-
- if p.at(BANG) && p.nth_at(1, L_BRACK) {
- return parse_inline_image(p);
- }
-
- parse_any_inline(p)
-}
-
-fn set_inline_emphasis_context_until(
- p: &mut MarkdownParser,
- stop: MarkdownSyntaxKind,
-) -> Option {
- let source_len = inline_list_source_len_until(p, stop);
- let source = p.source_after_current();
- let inline_source = if source_len <= source.len() {
- &source[..source_len]
- } else {
- source
- };
- let base_offset = u32::from(p.cur_range().start()) as usize;
- // Create a reference checker closure that uses the parser's link reference definitions
- let context = EmphasisContext::new(inline_source, base_offset, |label| {
- p.has_link_reference_definition(label)
- });
- p.set_emphasis_context(Some(context))
-}
-
-fn inline_list_source_len_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> usize {
- p.lookahead(|p| {
- let mut len = 0usize;
-
- loop {
- if p.at(T![EOF]) || p.at(stop) || p.at_inline_end() {
- break;
- }
-
- len += p.cur_text().len();
- p.bump(p.cur());
- }
-
- len
- })
-}
-
-/// Parse link starting with `[` - dispatches to inline link or reference link.
-///
-/// After parsing `[text]`:
-/// - If followed by `(` → inline link `[text](url)`
-/// - If followed by `[` → reference link `[text][label]` or `[text][]`
-/// - Otherwise → shortcut reference `[text]`
-pub(crate) fn parse_link_or_reference(p: &mut MarkdownParser) -> ParsedSyntax {
- parse_link_or_image(p, LinkParseKind::Link)
-}
-
-/// Parse reference link label `[label]` or `[]`.
-///
-/// Grammar: `MdReferenceLinkLabel = '[' label: MdInlineItemList ']'`
-///
-/// Returns Present if `[` and `]` are found (even if empty for collapsed reference).
-/// On failure (missing `]`), rewinds to the checkpoint so no tokens are consumed.
-fn parse_reference_label(p: &mut MarkdownParser) -> ParsedSyntax {
- if !p.at(L_BRACK) {
- return Absent;
- }
-
- // Checkpoint so we can rewind if ] is missing
- let checkpoint = p.checkpoint();
- let m = p.start();
-
- // [
- p.bump(L_BRACK);
-
- // Label content (may be empty for collapsed reference)
- let label = p.start();
- while !p.at(R_BRACK) && !p.at_inline_end() {
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
- }
- label.complete(p, MD_INLINE_ITEM_LIST);
-
- // ]
- if !p.eat(R_BRACK) {
- // Missing closing bracket - abandon and rewind to not consume tokens
- m.abandon(p);
- p.rewind(checkpoint);
- return Absent;
- }
-
- Present(m.complete(p, MD_REFERENCE_LINK_LABEL))
-}
-
-/// Parse inline link (`[text](url)`).
-///
-/// Grammar: `MdInlineLink = '[' text: MdInlineItemList ']' '(' source: MdInlineItemList ')'`
-///
-/// Note: This is kept for backwards compatibility but `parse_link_or_reference`
-/// is the preferred entry point for link parsing.
-pub(crate) fn parse_inline_link(p: &mut MarkdownParser) -> ParsedSyntax {
- parse_link_or_reference(p)
-}
-
-/// Parse image starting with `![` - dispatches to inline image or reference image.
-///
-/// After parsing `![alt]`:
-/// - If followed by `(` → inline image ``
-/// - If followed by `[` → reference image `![alt][label]` or `![alt][]`
-/// - Otherwise → shortcut reference image `![alt]`
-pub(crate) fn parse_image_or_reference(p: &mut MarkdownParser) -> ParsedSyntax {
- parse_link_or_image(p, LinkParseKind::Image)
-}
-
-#[derive(Copy, Clone)]
-enum LinkParseKind {
- Link,
- Image,
-}
-
-impl LinkParseKind {
- fn starts_here(self, p: &mut MarkdownParser) -> bool {
- match self {
- Self::Link => p.at(L_BRACK),
- Self::Image => p.at(BANG) && p.nth_at(1, L_BRACK),
- }
- }
-
- fn bump_opening(self, p: &mut MarkdownParser) {
- if matches!(self, Self::Image) {
- p.bump(BANG);
- }
- p.bump(L_BRACK);
- }
-
- fn lookahead_reference(self, p: &mut MarkdownParser) -> Option {
- match self {
- Self::Link => lookahead_reference_link(p),
- Self::Image => lookahead_reference_image(p),
- }
- }
-
- fn inline_kind(self) -> MarkdownSyntaxKind {
- match self {
- Self::Link => MD_INLINE_LINK,
- Self::Image => MD_INLINE_IMAGE,
- }
- }
-
- fn reference_kind(self) -> MarkdownSyntaxKind {
- match self {
- Self::Link => MD_REFERENCE_LINK,
- Self::Image => MD_REFERENCE_IMAGE,
- }
- }
-
- fn report_unclosed_destination(self, p: &mut MarkdownParser, opening_range: TextRange) {
- match self {
- Self::Link => p.error(super::parse_error::unclosed_link(
- p,
- opening_range,
- "expected `)` to close URL",
- )),
- Self::Image => p.error(super::parse_error::unclosed_image(
- p,
- opening_range,
- "expected `)` to close image URL",
- )),
- }
- }
-}
-
-fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyntax {
- if !kind.starts_here(p) {
- return Absent;
- }
-
- let checkpoint = p.checkpoint();
- let m = p.start();
- let opening_range = p.cur_range();
- let reference = kind.lookahead_reference(p);
- // Clear any cached lookahead tokens before switching lexing context.
- p.reset_lookahead();
-
- kind.bump_opening(p);
-
- // Link text / alt text
- let has_nested_link = if matches!(kind, LinkParseKind::Image) {
- // For images, allow full inline parsing (including links) in alt text.
- // This lets nested links/images be parsed so their text can be extracted for alt.
- parse_inline_item_list_until(p, R_BRACK);
- false
- } else {
- parse_inline_item_list_until_no_links(p, R_BRACK)
- };
-
- // ] - if missing, rewind and treat [ as literal text.
- // Per CommonMark, if there's no valid ] to close the link (e.g., all ]
- // characters are inside code spans or HTML), the [ is literal text.
- // NOTE: We intentionally do NOT emit an "unclosed link" diagnostic here.
- // CommonMark treats unmatched `[` as literal text, not an error.
- if !p.eat(R_BRACK) {
- m.abandon(p);
- p.rewind(checkpoint);
- return Absent;
- }
-
- // Per CommonMark, a link (not image) whose text contains another link must fail.
- // The inner link wins and the outer `[` becomes literal text.
- if matches!(kind, LinkParseKind::Link) && has_nested_link {
- m.abandon(p);
- p.rewind(checkpoint);
- return Absent;
- }
-
- // Now decide based on what follows ]
- let link_validation = if p.at(L_PAREN) {
- inline_link_is_valid(p)
- } else {
- InlineLinkValidation::Invalid
- };
-
- if matches!(
- link_validation,
- InlineLinkValidation::Valid | InlineLinkValidation::DepthExceeded
- ) {
- // Inline link/image: [text](url) or 
- // Bump past ( and lex the following tokens in LinkDefinition context
- // so whitespace separates destination and title.
- p.expect_with_context(L_PAREN, crate::lexer::MarkdownLexContext::LinkDefinition);
-
- let destination = p.start();
- let destination_result = parse_inline_link_destination_tokens(p);
-
- // When depth exceeded, destination is truncated but link is still valid.
- // Complete the destination and link immediately without looking for closing paren.
- if destination_result == DestinationScanResult::DepthExceeded {
- destination.complete(p, MD_INLINE_ITEM_LIST);
- p.force_relex_regular();
- return Present(m.complete(p, kind.inline_kind()));
- }
-
- let has_title = inline_title_starts_after_whitespace_tokens(p);
- while is_title_separator_token(p) {
- bump_link_def_separator(p);
- }
- if destination_result == DestinationScanResult::Invalid {
- destination.abandon(p);
- m.abandon(p);
- p.rewind(checkpoint);
- p.force_relex_regular();
- return Absent;
- }
- destination.complete(p, MD_INLINE_ITEM_LIST);
-
- if has_title {
- let title_m = p.start();
- let list_m = p.start();
- parse_title_content(p, get_title_close_char(p));
- list_m.complete(p, MD_INLINE_ITEM_LIST);
- title_m.complete(p, MD_LINK_TITLE);
- }
-
- // Skip trailing whitespace/newlines before closing paren without creating nodes
- // (creating nodes would violate the MD_INLINE_LINK grammar which expects exactly 7 children)
- while is_title_separator_token(p) {
- skip_link_def_separator_tokens(p);
- }
-
- if !p.eat(R_PAREN) {
- if p.at_inline_end() {
- kind.report_unclosed_destination(p, opening_range);
- }
- m.abandon(p);
- p.rewind(checkpoint);
- p.force_relex_regular();
- return Absent;
- }
-
- Present(m.complete(p, kind.inline_kind()))
- } else if p.at(L_BRACK) {
- // Reference link/image: [text][label] or [text][]
- let label = parse_reference_label(p);
- let reference = reference.filter(|reference| {
- if label.is_absent() {
- reference.is_shortcut
- } else {
- true
- }
- });
-
- if let Some(reference) = reference
- && !reference.is_defined(p)
- {
- m.abandon(p);
- p.rewind(checkpoint);
- // Return Absent - the caller will treat `[` as textual.
- // Don't consume the whole bracket sequence to avoid consuming
- // past emphasis closers.
- return Absent;
- }
-
- Present(m.complete(p, kind.reference_kind()))
- } else {
- // Shortcut reference: [text] or ![alt]
- // No label part - the text/alt IS the label for resolution
- if let Some(reference) = reference
- && reference.is_shortcut
- && !reference.is_defined(p)
- {
- m.abandon(p);
- p.rewind(checkpoint);
- // Return Absent - the caller will treat `[` as textual.
- // Don't consume the whole bracket sequence to avoid consuming
- // past emphasis closers.
- return Absent;
- }
- Present(m.complete(p, kind.reference_kind()))
- }
-}
-
-struct ReferenceLinkLookahead {
- label_raw: String,
- is_shortcut: bool,
-}
-
-impl ReferenceLinkLookahead {
- fn is_defined(&self, p: &MarkdownParser) -> bool {
- let normalized = normalize_reference_label(&self.label_raw);
- p.has_link_reference_definition(&normalized)
- }
-}
-
-fn lookahead_reference_link(p: &mut MarkdownParser) -> Option {
- lookahead_reference_common(p, false)
-}
-
-fn lookahead_reference_image(p: &mut MarkdownParser) -> Option {
- lookahead_reference_common(p, true)
-}
-
-fn lookahead_reference_common(
- p: &mut MarkdownParser,
- is_image: bool,
-) -> Option {
- p.lookahead(|p| {
- if is_image {
- if !p.at(BANG) || !p.nth_at(1, L_BRACK) {
- return None;
- }
- p.bump(BANG);
- }
-
- if !p.at(L_BRACK) {
- return None;
- }
-
- p.bump(L_BRACK);
-
- let link_text = collect_link_text(p)?;
-
- // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty)
- let normalized_link = normalize_reference_label(&link_text);
- if normalized_link.is_empty() {
- return None;
- }
-
- p.bump(R_BRACK);
-
- if p.at(L_PAREN) {
- return None;
- }
-
- if p.at(L_BRACK) {
- p.bump(L_BRACK);
- let label_text = collect_label_text_simple(p);
- if let Some(label_text) = label_text {
- let label = if label_text.is_empty() {
- link_text.clone()
- } else {
- // Explicit label must also normalize to non-empty
- let normalized_label = normalize_reference_label(&label_text);
- if normalized_label.is_empty() {
- return None;
- }
- label_text
- };
- p.bump(R_BRACK);
- return Some(ReferenceLinkLookahead {
- label_raw: label,
- is_shortcut: false,
- });
- }
- }
-
- Some(ReferenceLinkLookahead {
- label_raw: link_text,
- is_shortcut: true,
- })
- })
-}
-
-/// Collect text for a link label (e.g., the `label` in `[text][label]`).
-///
-/// Per CommonMark §4.7, link labels have specific rules:
-/// - Unescaped square brackets are NOT allowed inside labels (see example 555)
-/// - Backslash escapes ARE allowed (e.g., `\]` is a literal `]` in the label)
-/// - No inline parsing (backticks, HTML, etc. are literal characters)
-///
-/// We stop at the first R_BRACK token (unescaped `]`). Escaped brackets like `\]`
-/// are lexed as MD_TEXTUAL_LITERAL, not R_BRACK, so they're included in the label.
-fn collect_label_text_simple(p: &mut MarkdownParser) -> Option {
- let mut text = String::new();
-
- loop {
- if p.at(T![EOF]) || p.at_inline_end() {
- return None;
- }
-
- // Blank lines terminate
- if p.at(NEWLINE) && p.at_blank_line() {
- return None;
- }
-
- // R_BRACK token = unescaped `]` closes the label.
- // Note: Escaped brackets (`\]`) are lexed as MD_TEXTUAL_LITERAL,
- // not R_BRACK, so they're correctly included in the label text.
- if p.at(R_BRACK) {
- return Some(text);
- }
-
- text.push_str(p.cur_text());
- p.bump(p.cur());
- }
-}
-
-/// Collect text for link text (e.g., the `text` in `[text](url)` or `[text][label]`).
-/// Per CommonMark, link text CAN contain inline elements - code spans, autolinks, HTML.
-/// `]` inside these constructs does NOT close the link text.
-fn collect_link_text(p: &mut MarkdownParser) -> Option {
- let mut text = String::new();
- let mut bracket_depth = 0usize;
-
- loop {
- if p.at(T![EOF]) || p.at_inline_end() {
- return None;
- }
-
- // Per CommonMark, blank lines terminate link text
- if p.at(NEWLINE) && p.at_blank_line() {
- return None;
- }
-
- // Code spans can contain `]` - skip them entirely.
- // Per CommonMark, `]` inside code spans doesn't terminate link text.
- if p.at(BACKTICK) {
- let opening_count = p.cur_text().len();
- text.push_str(p.cur_text());
- p.bump(p.cur());
-
- // Find matching closing backticks
- let mut found_close = false;
- while !p.at(T![EOF]) && !p.at_inline_end() {
- if p.at(NEWLINE) && p.at_blank_line() {
- break; // Blank line terminates
- }
- if p.at(BACKTICK) && p.cur_text().len() == opening_count {
- text.push_str(p.cur_text());
- p.bump(p.cur());
- found_close = true;
- break;
- }
- text.push_str(p.cur_text());
- p.bump(p.cur());
- }
- if !found_close {
- // Unclosed code span - treat opening backticks as literal
- // (already added to text, continue normally)
- }
- continue;
- }
-
- // Autolinks and inline HTML can contain `]` - skip them entirely.
- // Per CommonMark, `]` inside `<...>` constructs doesn't terminate link text.
- if p.at(L_ANGLE) {
- text.push_str(p.cur_text());
- p.bump(p.cur());
-
- // Consume until `>` or newline
- while !p.at(T![EOF]) && !p.at_inline_end() && !p.at(R_ANGLE) {
- if p.at(NEWLINE) {
- // Newlines end autolinks/HTML tags
- break;
- }
- text.push_str(p.cur_text());
- p.bump(p.cur());
- }
- if p.at(R_ANGLE) {
- text.push_str(p.cur_text());
- p.bump(p.cur());
- }
- continue;
- }
-
- if p.at(L_BRACK) {
- bracket_depth += 1;
- text.push_str(p.cur_text());
- p.bump(p.cur());
- continue;
- }
-
- if p.at(R_BRACK) {
- if bracket_depth == 0 {
- return Some(text);
- }
- bracket_depth -= 1;
- text.push_str(p.cur_text());
- p.bump(p.cur());
- continue;
- }
-
- text.push_str(p.cur_text());
- p.bump(p.cur());
- }
-}
-
-fn bump_textual_link_def(p: &mut MarkdownParser) {
- use crate::lexer::MarkdownLexContext;
-
- let item = p.start();
- p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::LinkDefinition);
- item.complete(p, MD_TEXTUAL);
-}
-fn is_whitespace_token(p: &MarkdownParser) -> bool {
- let text = p.cur_text();
- !text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t')
-}
-
-fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool {
- p.lookahead(|p| {
- let mut saw_whitespace = false;
- while is_title_separator_token(p) {
- bump_link_def_separator(p);
- saw_whitespace = true;
- }
- saw_whitespace && get_title_close_char(p).is_some()
- })
-}
-
-/// Result of validating an inline link.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum InlineLinkValidation {
- /// Link is valid with complete destination
- Valid,
- /// Link is invalid
- Invalid,
- /// Link is valid but destination was truncated due to paren depth limit.
- /// The link should be closed immediately without looking for `)`.
- DepthExceeded,
-}
-
-fn inline_link_is_valid(p: &mut MarkdownParser) -> InlineLinkValidation {
- p.lookahead(|p| {
- if !p.at(L_PAREN) {
- return InlineLinkValidation::Invalid;
- }
-
- p.bump(L_PAREN);
- p.re_lex_link_definition();
-
- let destination_result = scan_inline_link_destination_tokens(p);
-
- // If depth exceeded, link is valid but truncated - no need to check for closing paren
- if destination_result == DestinationScanResult::DepthExceeded {
- return InlineLinkValidation::DepthExceeded;
- }
-
- if destination_result == DestinationScanResult::Invalid {
- return InlineLinkValidation::Invalid;
- }
-
- let mut saw_separator = false;
- while is_title_separator_token(p) {
- skip_link_def_separator_tokens(p);
- saw_separator = true;
- }
- let has_title = saw_separator && get_title_close_char(p).is_some();
- while is_title_separator_token(p) {
- skip_link_def_separator_tokens(p);
- }
-
- if has_title {
- scan_title_content(p, get_title_close_char(p));
- }
-
- while is_title_separator_token(p) {
- skip_link_def_separator_tokens(p);
- }
-
- if p.at(R_PAREN) {
- InlineLinkValidation::Valid
- } else {
- InlineLinkValidation::Invalid
- }
- })
-}
-
-/// Result of scanning a link destination.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum DestinationScanResult {
- /// Destination is valid and complete
- Valid,
- /// Destination is invalid (contains invalid characters, etc.)
- Invalid,
- /// Destination was truncated because paren depth exceeded the limit.
- /// In this case, the link is considered valid but closed at the truncation point.
- DepthExceeded,
-}
-
-fn scan_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult {
- const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH;
- // Skip leading whitespace to match parse_inline_link_destination_tokens behavior
- while is_title_separator_token(p) {
- skip_link_def_separator_tokens(p);
- }
- if p.at(L_ANGLE) {
- p.bump_link_definition();
- let mut pending_escape = false;
- loop {
- if p.at(EOF) || p.at(NEWLINE) {
- return DestinationScanResult::Invalid;
- }
- if p.at(R_ANGLE) {
- if pending_escape {
- if !super::validate_link_destination_text(
- p.cur_text(),
- super::LinkDestinationKind::Enclosed,
- &mut pending_escape,
- ) {
- return DestinationScanResult::Invalid;
- }
- p.bump_link_definition();
- continue;
- }
- p.bump_link_definition();
- return DestinationScanResult::Valid;
- }
- if !super::validate_link_destination_text(
- p.cur_text(),
- super::LinkDestinationKind::Enclosed,
- &mut pending_escape,
- ) {
- return DestinationScanResult::Invalid;
- }
- p.bump_link_definition();
- }
- }
-
- let mut paren_depth: i32 = 0;
- let mut pending_escape = false;
- while !p.at(EOF) && !p.at(NEWLINE) {
- if is_whitespace_token(p) {
- break;
- }
- let text = p.cur_text();
- if !super::validate_link_destination_text(
- text,
- super::LinkDestinationKind::Raw,
- &mut pending_escape,
- ) {
- return DestinationScanResult::Invalid;
- }
- match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) {
- super::ParenDepthResult::Ok(next_depth) => {
- paren_depth = next_depth;
- p.bump_link_definition();
- }
- super::ParenDepthResult::DepthExceeded => {
- // Paren depth exceeded - destination is truncated at this point.
- // Per CommonMark/cmark, the link is still valid but closed here.
- return DestinationScanResult::DepthExceeded;
- }
- super::ParenDepthResult::UnmatchedClose => {
- // Unmatched closing paren - destination ends here normally.
- // The `)` belongs to the enclosing construct (inline link closer).
- break;
- }
- }
- }
- if p.at(EOF) {
- return DestinationScanResult::Invalid;
- }
- if p.at(NEWLINE) {
- return if p.at_blank_line() {
- DestinationScanResult::Invalid
- } else {
- DestinationScanResult::Valid
- };
- }
- DestinationScanResult::Valid
-}
-
-fn scan_title_content(p: &mut MarkdownParser, close_char: Option) {
- let Some(close_char) = close_char else {
- return;
- };
-
- let text = p.cur_text();
- let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char);
-
- p.bump_link_definition();
- if is_complete {
- return;
- }
-
- loop {
- // Stop on EOF or blank line (titles cannot span blank lines per CommonMark)
- if p.at(EOF) || p.at_blank_line() {
- return;
- }
-
- // Continue through single newlines (titles can span non-blank lines)
- if p.at(NEWLINE) {
- skip_link_def_separator_tokens(p);
- continue;
- }
-
- let text = p.cur_text();
- if super::ends_with_unescaped_close(text, close_char) {
- p.bump_link_definition();
- return;
- }
-
- p.bump_link_definition();
- }
-}
-
-fn skip_link_def_separator_tokens(p: &mut MarkdownParser) {
- if p.at(NEWLINE) {
- p.bump(NEWLINE);
- } else {
- p.bump_link_definition();
- }
-}
-
-fn is_title_separator_token(p: &MarkdownParser) -> bool {
- is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line())
-}
-
-fn bump_link_def_separator(p: &mut MarkdownParser) {
- if p.at(NEWLINE) {
- let item = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- item.complete(p, MD_TEXTUAL);
- } else {
- bump_textual_link_def(p);
- }
-}
-
-fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult {
- p.re_lex_link_definition();
- const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH;
-
- if p.at(L_ANGLE) {
- bump_textual_link_def(p);
- let mut pending_escape = false;
- loop {
- if p.at(EOF) || p.at(NEWLINE) {
- return DestinationScanResult::Invalid;
- }
- if p.at(R_ANGLE) {
- if pending_escape {
- if !super::validate_link_destination_text(
- p.cur_text(),
- super::LinkDestinationKind::Enclosed,
- &mut pending_escape,
- ) {
- return DestinationScanResult::Invalid;
- }
- bump_textual_link_def(p);
- continue;
- }
- bump_textual_link_def(p);
- return DestinationScanResult::Valid;
- }
- if !super::validate_link_destination_text(
- p.cur_text(),
- super::LinkDestinationKind::Enclosed,
- &mut pending_escape,
- ) {
- return DestinationScanResult::Invalid;
- }
- bump_textual_link_def(p);
- }
- }
-
- let mut paren_depth: i32 = 0;
- let mut pending_escape = false;
- while is_title_separator_token(p) {
- bump_link_def_separator(p);
- }
- while !p.at(EOF) && !p.at(NEWLINE) {
- if is_whitespace_token(p) {
- break;
- }
-
- let text = p.cur_text();
- if !super::validate_link_destination_text(
- text,
- super::LinkDestinationKind::Raw,
- &mut pending_escape,
- ) {
- return DestinationScanResult::Invalid;
- }
- match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) {
- super::ParenDepthResult::Ok(next_depth) => {
- paren_depth = next_depth;
- bump_textual_link_def(p);
- }
- super::ParenDepthResult::DepthExceeded => {
- // Paren depth exceeded - destination is truncated at this point.
- return DestinationScanResult::DepthExceeded;
- }
- super::ParenDepthResult::UnmatchedClose => {
- // Unmatched closing paren - destination ends here normally.
- // The `)` belongs to the enclosing construct (inline link closer).
- break;
- }
- }
- }
- if p.at(EOF) {
- return DestinationScanResult::Invalid;
- }
- if p.at(NEWLINE) {
- return if p.at_blank_line() {
- DestinationScanResult::Invalid
- } else {
- DestinationScanResult::Valid
- };
- }
- DestinationScanResult::Valid
-}
-
-fn get_title_close_char(p: &MarkdownParser) -> Option {
- let text = p.cur_text();
- if text.starts_with('"') {
- Some('"')
- } else if text.starts_with('\'') {
- Some('\'')
- } else if p.at(L_PAREN) {
- Some(')')
- } else {
- None
- }
-}
-
-fn parse_title_content(p: &mut MarkdownParser, close_char: Option) {
- let Some(close_char) = close_char else {
- return;
- };
-
- let text = p.cur_text();
- let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char);
-
- bump_textual_link_def(p);
- if is_complete {
- return;
- }
-
- loop {
- // Stop on EOF or blank line (titles cannot span blank lines per CommonMark)
- if p.at(EOF) || p.at_blank_line() {
- return;
- }
-
- // Continue through single newlines (titles can span non-blank lines)
- if p.at(NEWLINE) {
- bump_link_def_separator(p);
- continue;
- }
-
- let text = p.cur_text();
- if super::ends_with_unescaped_close(text, close_char) {
- bump_textual_link_def(p);
- return;
- }
-
- bump_textual_link_def(p);
- }
-}
-
-/// Parse inline image (``).
-///
-/// Grammar: `MdInlineImage = '!' '[' alt: MdInlineItemList ']' '(' source: MdInlineItemList ')'`
-///
-/// Note: This is kept for backwards compatibility but `parse_image_or_reference`
-/// is the preferred entry point for image parsing.
-pub(crate) fn parse_inline_image(p: &mut MarkdownParser) -> ParsedSyntax {
- parse_image_or_reference(p)
-}
-
-/// Check if text starting with `<` is valid inline HTML per CommonMark §6.8.
-/// Returns the length of the HTML element if valid, None otherwise.
-///
-/// Valid patterns:
-/// - Open tags: ``, ``, ``
-/// - Close tags: ``
-/// - Comments: ``
-/// - Processing instructions: ` ... ?>`
-/// - Declarations: ``
-/// - CDATA: ``
-pub(crate) fn is_inline_html(text: &str) -> Option {
- let bytes = text.as_bytes();
- if bytes.len() < 2 || bytes[0] != b'<' {
- return None;
- }
-
- // HTML comment:
- // Per CommonMark 0.31.2 §6.8, an HTML comment consists of ``,
- // where text does not start with `>` or `->`, and does not end with `-`.
- // Additionally, `` and `` are valid (degenerate) comments.
- if bytes.starts_with(b" and
- if rest.starts_with(b">") {
- return Some(5); //
- }
- if rest.starts_with(b"->") {
- return Some(6); //
- }
- // Find closing --> after ") {
- let body = &text[4..4 + pos];
- // Body must not end with '-'
- if body.ends_with('-') {
- return None;
- }
- return Some(4 + pos + 3);
- }
- return None;
- }
-
- // Processing instruction: ... ?>
- if bytes.len() >= 2 && bytes[1] == b'?' {
- // Find closing ?>
- if let Some(pos) = text[2..].find("?>") {
- return Some(2 + pos + 2);
- }
- return None;
- }
-
- // CDATA section:
- if bytes.starts_with(b"
- if let Some(pos) = text[9..].find("]]>") {
- return Some(9 + pos + 3);
- }
- return None;
- }
-
- // Declaration:
- // e.g.,
- if bytes.len() >= 3 && bytes[1] == b'!' && bytes[2].is_ascii_alphabetic() {
- // Find closing >
- if let Some(pos) = text[2..].find('>') {
- return Some(2 + pos + 1);
- }
- return None;
- }
-
- // Close tag:
- if bytes.len() >= 4 && bytes[1] == b'/' {
- if !bytes[2].is_ascii_alphabetic() {
- return None;
- }
- // Tag name: [A-Za-z][A-Za-z0-9-]*
- let mut i = 3;
- while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
- i += 1;
- }
- // Skip optional whitespace
- while i < bytes.len()
- && (bytes[i] == b' '
- || bytes[i] == b'\t'
- || bytes[i] == b'\n'
- || bytes[i] == b'\r'
- || bytes[i] == b'\x0c')
- {
- i += 1;
- }
- // Must end with >
- if i < bytes.len() && bytes[i] == b'>' {
- return Some(i + 1);
- }
- return None;
- }
-
- // Open tag: or
- // Defensive bounds check - should be guaranteed by earlier len check but be explicit
- if bytes.len() < 2 || !bytes[1].is_ascii_alphabetic() {
- return None;
- }
-
- // Tag name: [A-Za-z][A-Za-z0-9-]*
- // Note: tag names cannot contain `.` (so is NOT a valid tag)
- let mut i = 2;
- while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
- i += 1;
- }
-
- // After tag name, must have valid boundary: whitespace, >, or /
- // This prevents from being treated as HTML
- if i >= bytes.len() {
- return None;
- }
- let boundary = bytes[i];
- if boundary != b' '
- && boundary != b'\t'
- && boundary != b'\n'
- && boundary != b'\r'
- && boundary != b'\x0c'
- && boundary != b'>'
- && boundary != b'/'
- {
- return None;
- }
-
- // Handle immediate close or self-close
- if boundary == b'>' {
- return Some(i + 1);
- }
- if boundary == b'/' {
- if i + 1 < bytes.len() && bytes[i + 1] == b'>' {
- return Some(i + 2);
- }
- return None;
- }
-
- // Has attributes - validate per CommonMark §6.8
-
- let skip_spaces = |i: &mut usize| -> Option {
- let mut skipped = false;
- while *i < bytes.len() {
- match bytes[*i] {
- b' ' | b'\t' | b'\n' | b'\r' | b'\x0c' => {
- skipped = true;
- *i += 1;
- }
- _ => break,
- }
- }
- Some(skipped)
- };
-
- let is_attr_name_start = |b: u8| b.is_ascii_alphabetic() || b == b'_' || b == b':';
- let is_attr_name_continue =
- |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b == b':' || b == b'.' || b == b'-';
-
- let mut need_space = true;
- // We already know the boundary char was whitespace, so first iteration has space.
- let mut had_space = true;
-
- loop {
- if need_space {
- let s = skip_spaces(&mut i)?;
- had_space = had_space || s;
- }
- need_space = true;
-
- if i >= bytes.len() {
- return None;
- }
-
- // End or self-close
- if bytes[i] == b'>' {
- return Some(i + 1);
- }
- if bytes[i] == b'/' {
- if i + 1 < bytes.len() && bytes[i + 1] == b'>' {
- return Some(i + 2);
- }
- return None;
- }
-
- // Attributes must be separated by whitespace
- if !had_space {
- return None;
- }
-
- // Parse attribute name
- if !is_attr_name_start(bytes[i]) {
- return None;
- }
- i += 1;
- while i < bytes.len() && is_attr_name_continue(bytes[i]) {
- i += 1;
- }
-
- // Optional whitespace and value
- had_space = skip_spaces(&mut i)?;
- if i < bytes.len() && bytes[i] == b'=' {
- i += 1;
- skip_spaces(&mut i)?;
- if i >= bytes.len() {
- return None;
- }
-
- match bytes[i] {
- b'"' => {
- i += 1;
- while i < bytes.len() && bytes[i] != b'"' {
- i += 1;
- }
- if i >= bytes.len() {
- return None;
- }
- i += 1;
- }
- b'\'' => {
- i += 1;
- while i < bytes.len() && bytes[i] != b'\'' {
- i += 1;
- }
- if i >= bytes.len() {
- return None;
- }
- i += 1;
- }
- _ => {
- let start = i;
- while i < bytes.len() {
- let b = bytes[i];
- if b <= b' '
- || b == b'"'
- || b == b'\''
- || b == b'='
- || b == b'<'
- || b == b'>'
- || b == b'`'
- {
- break;
- }
- i += 1;
- }
- if i == start {
- return None;
- }
- }
- }
- // After value, need to find whitespace at top of loop
- had_space = false;
- }
- // If no '=' was found, `had_space` from skip_spaces above carries over
- // as the separator for the next attribute (boolean attribute case).
- }
-}
-
-/// Parse entity or numeric character reference per CommonMark §6.2.
-///
-/// Grammar: MdEntityReference = value: 'md_entity_literal'
-///
-/// Valid patterns:
-/// - Named entity: `&name;` where name is 2-31 alphanumeric chars starting with letter
-/// - Decimal numeric: `digits;` where digits is 1-7 decimal digits
-/// - Hexadecimal: `hex;` or `hex;` where hex is 1-6 hex digits
-///
-/// The lexer has already validated and tokenized valid entity references as
-/// MD_ENTITY_LITERAL tokens. Invalid patterns remain as textual.
-pub(crate) fn parse_entity_reference(p: &mut MarkdownParser) -> ParsedSyntax {
- if !p.at(MD_ENTITY_LITERAL) {
- return Absent;
- }
-
- let m = p.start();
- p.bump(MD_ENTITY_LITERAL);
- Present(m.complete(p, MD_ENTITY_REFERENCE))
-}
-
-/// Parse raw inline HTML per CommonMark §6.8.
-///
-/// Grammar: MdInlineHtml = value: MdInlineItemList
-///
-/// Includes: open tags, close tags, comments, processing instructions,
-/// declarations, and CDATA sections.
-pub(crate) fn parse_inline_html(p: &mut MarkdownParser) -> ParsedSyntax {
- if !p.at(L_ANGLE) {
- return Absent;
- }
-
- // Get the source text starting from current position
- let source = p.source_after_current();
-
- // Check if this is valid inline HTML
- let html_len = match is_inline_html(source) {
- Some(len) => len,
- None => return Absent,
- };
-
- // Per CommonMark §4.3, setext heading underlines take priority over inline HTML.
- // If this HTML tag spans across a line that is a setext underline, treat `<` as literal.
- if crate::syntax::inline_span_crosses_setext(p, html_len) {
- return Absent;
- }
-
- // Valid inline HTML - create the node
- // Use checkpoint so we can rewind if token boundaries don't align
- let checkpoint = p.checkpoint();
- let m = p.start();
-
- // Create content as inline item list containing textual nodes
- let content = p.start();
-
- // Track remaining bytes to consume
- let mut remaining = html_len;
-
- while remaining > 0 && !p.at(T![EOF]) {
- let token_len = p.cur_text().len();
-
- // If the current token is larger than remaining bytes, token boundaries
- // don't align with our validated HTML - rewind and treat as text
- if token_len > remaining {
- m.abandon(p);
- p.rewind(checkpoint);
- return Absent;
- }
-
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
- remaining -= token_len;
- }
-
- content.complete(p, MD_INLINE_ITEM_LIST);
-
- Present(m.complete(p, MD_INLINE_HTML))
-}
-
-/// Check if the text after `<` looks like a URI autolink.
-/// Per CommonMark §6.4: scheme must be 2-32 chars, start with letter,
-/// followed by letters/digits/+/-/., then `:`.
-fn is_uri_autolink(text: &str) -> bool {
- let bytes = text.as_bytes();
- if bytes.is_empty() {
- return false;
- }
-
- // Must start with a letter
- if !bytes[0].is_ascii_alphabetic() {
- return false;
- }
-
- // Find the colon
- let mut colon_pos = None;
- for (i, &b) in bytes.iter().enumerate().skip(1) {
- if b == b':' {
- colon_pos = Some(i);
- break;
- }
- // Scheme chars: letters, digits, +, -, .
- if !b.is_ascii_alphanumeric() && b != b'+' && b != b'-' && b != b'.' {
- return false;
- }
- }
-
- // Scheme must be 2-32 chars and followed by colon
- match colon_pos {
- Some(pos) if (2..=32).contains(&pos) => {
- // Must have content after the colon and no whitespace/< in URI
- let rest = &text[pos + 1..];
- !rest.is_empty()
- && !rest.contains('<')
- && !rest.contains('>')
- && !rest.chars().any(|c| c.is_whitespace())
- }
- _ => false,
- }
-}
-
-/// Check if the text after `<` looks like an email autolink.
-/// Per CommonMark §6.5: local@domain pattern with specific char restrictions.
-fn is_email_autolink(text: &str) -> bool {
- // Must contain exactly one @ not at start or end
- let at_pos = match text.find('@') {
- Some(pos) if pos > 0 && pos < text.len() - 1 => pos,
- _ => return false,
- };
-
- // Check no second @
- if text[at_pos + 1..].contains('@') {
- return false;
- }
-
- // Local part: alphanumerics and .!#$%&'*+/=?^_`{|}~-
- let local = &text[..at_pos];
- for c in local.chars() {
- if !c.is_ascii_alphanumeric()
- && !matches!(
- c,
- '.' | '!'
- | '#'
- | '$'
- | '%'
- | '&'
- | '\''
- | '*'
- | '+'
- | '/'
- | '='
- | '?'
- | '^'
- | '_'
- | '`'
- | '{'
- | '|'
- | '}'
- | '~'
- | '-'
- )
- {
- return false;
- }
- }
-
- // Domain part: alphanumerics and hyphens, dots for subdomains
- let domain = &text[at_pos + 1..];
- if domain.is_empty() || domain.starts_with('.') || domain.ends_with('.') {
- return false;
- }
-
- for c in domain.chars() {
- if !c.is_ascii_alphanumeric() && c != '-' && c != '.' {
- return false;
- }
- }
-
- true
-}
-
-/// Parse an autolink (`` or ``).
-///
-/// Grammar: MdAutolink = '<' value: MdInlineItemList '>'
-///
-/// Per CommonMark §6.4 and §6.5, autolinks are URIs or email addresses
-/// wrapped in angle brackets.
-pub(crate) fn parse_autolink(p: &mut MarkdownParser) -> ParsedSyntax {
- if !p.at(L_ANGLE) {
- return Absent;
- }
-
- // Look ahead to find the closing > and check if content is valid
- let source = p.source_after_current();
-
- // Skip the < and find >
- let after_open = &source[1..];
- let close_pos = match after_open.find('>') {
- Some(pos) => pos,
- None => return Absent, // No closing >
- };
-
- // Check for newline before > (not allowed in autolinks)
- let content = &after_open[..close_pos];
- if content.contains('\n') || content.contains('\r') {
- return Absent;
- }
-
- // Must be either URI or email autolink
- if !is_uri_autolink(content) && !is_email_autolink(content) {
- return Absent;
- }
-
- // Valid autolink - parse it
- let m = p.start();
-
- // <
- p.bump(L_ANGLE);
-
- // Content as inline item list containing textual nodes.
- // Autolinks don't process backslash escapes, but the lexer may combine
- // `\>` into a single escape token. We re-lex in CodeSpan context where
- // backslash is literal, so `\` and `>` are separate tokens.
- p.force_relex_code_span();
-
- let content_m = p.start();
- while !p.at(R_ANGLE) && !p.at(T![EOF]) && !p.at_inline_end() {
- let text_m = p.start();
- p.bump_remap_with_context(
- MD_TEXTUAL_LITERAL,
- crate::lexer::MarkdownLexContext::CodeSpan,
- );
- text_m.complete(p, MD_TEXTUAL);
- }
- content_m.complete(p, MD_INLINE_ITEM_LIST);
-
- // >
- p.expect(R_ANGLE);
-
- // Re-lex back to regular context
- p.force_relex_regular();
-
- Present(m.complete(p, MD_AUTOLINK))
-}
-
-/// Dispatch to the appropriate inline parser based on current token.
-pub(crate) fn parse_any_inline(p: &mut MarkdownParser) -> ParsedSyntax {
- if p.at(MD_HARD_LINE_LITERAL) {
- parse_hard_line(p)
- } else if p.at(BACKTICK) || p.at(T!["```"]) {
- // Try code span, fall back to literal text if no matching closer exists.
- // T!["```"] can appear when backticks are at line start but info string
- // contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
- let result = parse_inline_code(p);
- if result.is_absent() {
- super::parse_textual(p)
- } else {
- result
- }
- } else if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
- // For cases like `***foo***`, the em match starts at the exact token boundary
- // (prefix_len=0) while the strong match starts at offset 1 (prefix_len=1).
- // Try italic first to handle nested emphasis correctly, then try strong.
- let result = parse_inline_italic(p);
- if result.is_present() {
- return result;
- }
- let result = parse_inline_emphasis(p);
- if result.is_present() {
- return result;
- }
- // Neither matched - re-lex to single token and emit just one char as literal.
- // This handles cases like `**foo*` where opener is at offset 1.
- p.force_relex_emphasis_inline();
- super::parse_textual(p)
- } else if p.at(T![*]) || p.at(UNDERSCORE) {
- // Try italic, fall back to literal text if flanking rules fail
- let result = parse_inline_italic(p);
- if result.is_absent() {
- super::parse_textual(p)
- } else {
- result
- }
- } else if p.at(BANG) && p.nth_at(1, L_BRACK) {
- // Try image, fall back to literal text if parsing fails
- let result = parse_inline_image(p);
- if result.is_absent() {
- super::parse_textual(p)
- } else {
- result
- }
- } else if p.at(L_BRACK) {
- // Try link, fall back to literal text if parsing fails
- let result = parse_inline_link(p);
- if result.is_absent() {
- super::parse_textual(p)
- } else {
- result
- }
- } else if p.at(L_ANGLE) {
- // Try autolink first (takes priority per CommonMark)
- let result = parse_autolink(p);
- if result.is_present() {
- return result;
- }
- // Then try inline HTML
- let result = parse_inline_html(p);
- if result.is_present() {
- return result;
- }
- // Fall back to textual
- super::parse_textual(p)
- } else if p.at(MD_ENTITY_LITERAL) {
- // Entity or numeric character reference (already validated by lexer)
- parse_entity_reference(p)
- } else {
- super::parse_textual(p)
- }
-}
diff --git a/crates/biome_markdown_parser/src/syntax/inline/code_span.rs b/crates/biome_markdown_parser/src/syntax/inline/code_span.rs
new file mode 100644
index 000000000000..1f42fe86d948
--- /dev/null
+++ b/crates/biome_markdown_parser/src/syntax/inline/code_span.rs
@@ -0,0 +1,248 @@
+use biome_markdown_syntax::T;
+use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
+use biome_parser::Parser;
+use biome_parser::prelude::ParsedSyntax::{self, *};
+
+use crate::MarkdownParser;
+
+/// Parse a hard line break.
+///
+/// Grammar: MdHardLine = value: 'md_hard_line_literal'
+///
+/// A hard line break is created by either:
+/// - Two or more trailing spaces followed by a newline
+/// - A backslash followed by a newline
+pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax {
+ if !p.at(MD_HARD_LINE_LITERAL) {
+ return Absent;
+ }
+
+ let m = p.start();
+ p.bump(MD_HARD_LINE_LITERAL);
+ Present(m.complete(p, MD_HARD_LINE))
+}
+
+/// Check if there's a matching closing backtick sequence before EOF/blank line.
+///
+/// Per CommonMark §6.1, a code span opener must have a matching closer with the
+/// same number of backticks. If no match exists, the opener should be treated
+/// as literal text, not an unclosed code span.
+///
+/// Returns false if no match found (opener should become literal text).
+fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -> bool {
+ use crate::lexer::MarkdownLexContext;
+
+ p.lookahead(|p| {
+ // Skip the opening backticks (handle both BACKTICK and TRIPLE_BACKTICK)
+ if p.at(T!["```"]) {
+ p.bump(T!["```"]);
+ } else {
+ p.bump(BACKTICK);
+ }
+
+ loop {
+ // EOF = no matching closer found
+ if p.at(T![EOF]) {
+ return false;
+ }
+
+ // Blank line = paragraph boundary, terminates search
+ if p.at(NEWLINE) && p.at_blank_line() {
+ return false;
+ }
+
+ // Per CommonMark §4.3, setext heading underlines take priority over
+ // inline code spans. If crossing a newline would land on a setext
+ // underline, the code span is invalid — the underline forms a heading.
+ if p.at(NEWLINE) {
+ p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
+ if crate::syntax::at_setext_underline_after_newline(p).is_some() {
+ return false;
+ }
+ // Per CommonMark, block interrupts (including list markers) can
+ // terminate paragraphs. A code span cannot cross a block boundary.
+ if crate::syntax::at_block_interrupt(p) || at_list_marker_after_newline(p) {
+ return false;
+ }
+ continue;
+ }
+
+ // Found backticks - check if they match (handle both BACKTICK and TRIPLE_BACKTICK)
+ if p.at(BACKTICK) || p.at(T!["```"]) {
+ let closing_count = p.cur_text().len();
+ if closing_count == opening_count {
+ return true;
+ }
+ // Not matching - continue searching
+ if p.at(T!["```"]) {
+ p.bump(T!["```"]);
+ } else {
+ p.bump(BACKTICK);
+ }
+ continue;
+ }
+
+ // Consume token and continue (use CodeSpan context for proper backslash handling)
+ p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
+ }
+ })
+}
+
+/// Check if we're at a list marker after a newline.
+/// This is used to detect when a code span would cross a list item boundary.
+fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool {
+ // Skip up to 3 spaces of indent (list markers can be indented 0-3 spaces)
+ let mut columns = 0usize;
+ while columns < 4
+ && p.at(MD_TEXTUAL_LITERAL)
+ && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
+ {
+ for c in p.cur_text().chars() {
+ match c {
+ ' ' => columns += 1,
+ '\t' => columns += 4 - (columns % 4),
+ _ => {}
+ }
+ }
+ if columns >= 4 {
+ return false; // Indented code block, not a list marker
+ }
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+
+ // Check for bullet list markers: -, *, +
+ if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) {
+ let marker_text = p.cur_text();
+ if marker_text.len() == 1 {
+ p.bump_any();
+ // Must be followed by space, tab, or EOL
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return true;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ return text.starts_with(' ') || text.starts_with('\t');
+ }
+ }
+ return false;
+ }
+
+ // Check for ordered list marker: digits followed by . or )
+ if p.at(MD_ORDERED_LIST_MARKER) {
+ p.bump(MD_ORDERED_LIST_MARKER);
+ // Must be followed by space, tab, or EOL
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return true;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ return text.starts_with(' ') || text.starts_with('\t');
+ }
+ return false;
+ }
+
+ // Check for textual bullet markers (lexed as MD_TEXTUAL_LITERAL in some contexts)
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text == "-" || text == "*" || text == "+" {
+ p.bump(MD_TEXTUAL_LITERAL);
+ // Must be followed by space, tab, or EOL
+ if p.at(NEWLINE) || p.at(T![EOF]) {
+ return true;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let next = p.cur_text();
+ return next.starts_with(' ') || next.starts_with('\t');
+ }
+ }
+ }
+
+ false
+}
+
+/// Parse inline code span (`` `code` `` or ``` `` `code` `` ```).
+///
+/// Grammar: MdInlineCode = l_tick: '`' content: MdInlineItemList r_tick: '`'
+///
+/// Per CommonMark §6.1:
+/// - Code spans can use multiple backticks to allow literal backticks inside
+/// - The opening and closing backtick strings must be the same length
+/// - Backslash escapes are NOT processed inside code spans (\` is literal `\``)
+/// - If no matching closer exists, the opener is treated as literal text
+pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
+ use crate::lexer::MarkdownLexContext;
+
+ // Handle both BACKTICK and TRIPLE_BACKTICK (T!["```"] ) as code span openers.
+ // TRIPLE_BACKTICK can appear when backticks are at line start but info string
+ // contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
+ let is_backtick = p.at(BACKTICK);
+ let is_triple_backtick = p.at(T!["```"]);
+ if !is_backtick && !is_triple_backtick {
+ return Absent;
+ }
+
+ let opening_count = p.cur_text().len();
+
+ // DESIGN PRINCIPLE #2 & #4: Check for matching closer BEFORE creating any nodes.
+ // If no match exists, return Absent so backticks become literal text.
+ // This avoids synthesizing MD_INLINE_CODE with missing r_tick_token.
+ if !has_matching_code_span_closer(p, opening_count) {
+ return Absent; // Caller will treat backtick as literal MD_TEXTUAL
+ }
+
+ // We have a valid code span - now parse it
+ let m = p.start();
+
+ // Opening backtick(s) - remap TRIPLE_BACKTICK to BACKTICK for consistency
+ if is_triple_backtick {
+ p.bump_remap(BACKTICK);
+ } else {
+ p.bump(BACKTICK);
+ }
+
+ // Content - parse until we find matching closing backticks
+ // Per CommonMark, code spans can span multiple lines (newlines become spaces in output)
+ // All content is lexed in CodeSpan context to keep backslash literal and avoid
+ // hard-line-break detection.
+ let content = p.start();
+ loop {
+ // EOF should not happen (lookahead guaranteed a closer), but handle defensively
+ if p.at(T![EOF]) {
+ break;
+ }
+
+ // DESIGN PRINCIPLE #3: Terminate on blank line (paragraph boundary)
+ if p.at(NEWLINE) {
+ if p.at_blank_line() {
+ break; // Paragraph boundary - stop
+ }
+ // Soft line break - consume NEWLINE as content and continue
+ // Use CodeSpan context so next token is also lexed without escape processing
+ let text_m = p.start();
+ p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
+ text_m.complete(p, MD_TEXTUAL);
+ continue;
+ }
+
+ // Found matching closing backticks (handle both BACKTICK and TRIPLE_BACKTICK)
+ if (p.at(BACKTICK) || p.at(T!["```"])) && p.cur_text().len() == opening_count {
+ break;
+ }
+
+ // DESIGN PRINCIPLE #1: Use CodeSpan context so backslash is literal
+ let text_m = p.start();
+ p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan);
+ text_m.complete(p, MD_TEXTUAL);
+ }
+ content.complete(p, MD_INLINE_ITEM_LIST);
+
+ // Closing backticks (guaranteed to exist due to lookahead check)
+ // Remap TRIPLE_BACKTICK to BACKTICK for consistency
+ if p.at(T!["```"]) {
+ p.bump_remap(BACKTICK);
+ } else {
+ p.bump(BACKTICK);
+ }
+
+ Present(m.complete(p, MD_INLINE_CODE))
+}
diff --git a/crates/biome_markdown_parser/src/syntax/inline/emphasis.rs b/crates/biome_markdown_parser/src/syntax/inline/emphasis.rs
new file mode 100644
index 000000000000..d10fdd4d6baa
--- /dev/null
+++ b/crates/biome_markdown_parser/src/syntax/inline/emphasis.rs
@@ -0,0 +1,704 @@
+use biome_markdown_syntax::MarkdownSyntaxKind;
+use biome_markdown_syntax::T;
+use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
+use biome_parser::Parser;
+use biome_parser::prelude::ParsedSyntax::{self, *};
+use biome_unicode_table::is_unicode_punctuation;
+
+use crate::MarkdownParser;
+use crate::syntax::reference::normalize_reference_label;
+
+// ============================================================================
+// Delimiter Stack Types for Emphasis Parsing
+// ============================================================================
+
+/// Kind of emphasis delimiter (* or _)
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum DelimKind {
+    /// A run of `*` characters
+    Star,
+    /// A run of `_` characters
+    Underscore,
+}
+
+/// A delimiter run collected during the first pass
+///
+/// NOTE: `count` and `start_offset` are mutated by the second pass
+/// (`match_delimiters`) as delimiter characters are consumed by matches.
+#[derive(Debug, Clone)]
+struct DelimRun {
+    /// The delimiter character kind
+    kind: DelimKind,
+    /// Number of delimiter characters in this run
+    count: usize,
+    /// Whether this can open emphasis (left-flanking)
+    can_open: bool,
+    /// Whether this can close emphasis (right-flanking)
+    can_close: bool,
+    /// Byte offset in the source where this run starts
+    start_offset: usize,
+    /// Bracket nesting depth for scoping emphasis within link text.
+    /// Delimiters inside brackets (links) should only match with each other,
+    /// not with delimiters outside the brackets. 0 = outside brackets.
+    label_id: usize,
+}
+
+/// A matched emphasis span (opener + closer)
+///
+/// Offsets are relative to the analyzed inline source slice;
+/// `EmphasisContext` adds `base_offset` when comparing them against
+/// absolute token ranges.
+#[derive(Debug, Clone)]
+struct EmphasisMatch {
+    /// Byte offset where the opener delimiter starts
+    opener_start: usize,
+    /// Byte offset where the closer delimiter starts
+    closer_start: usize,
+    /// Whether this is strong (2 chars) or regular (1 char) emphasis
+    is_strong: bool,
+}
+
+/// Check if a character is Unicode whitespace for flanking rules.
+/// Delegates to `char::is_whitespace` (Unicode `White_Space` property).
+fn is_whitespace(c: char) -> bool {
+    c.is_whitespace()
+}
+
+/// Check if a character is an emphasis delimiter marker (`*` or `_`).
+fn is_emphasis_marker(c: char) -> bool {
+    matches!(c, '*' | '_')
+}
+
+/// Check if a character is Unicode punctuation for flanking rules.
+/// Per CommonMark spec, this includes ASCII punctuation and Unicode punctuation categories.
+/// Thin wrapper over `biome_unicode_table::is_unicode_punctuation`.
+fn is_punctuation(c: char) -> bool {
+    is_unicode_punctuation(c)
+}
+
+/// Length of the backtick run beginning at `start`.
+/// The byte at `start` is counted unconditionally — callers only invoke this
+/// when they have already seen a backtick at `start`.
+fn backtick_run_len(bytes: &[u8], start: usize) -> usize {
+    let mut count = 1;
+    while start + count < bytes.len() && bytes[start + count] == b'`' {
+        count += 1;
+    }
+    count
+}
+
+/// Advance `i` past a backtick code span starting at `*i`.
+///
+/// Consumes the opening run, then scans for a closing run of the SAME length
+/// (per CommonMark, code spans close only on an equal-length run). If no
+/// closer is found, `*i` ends at `bytes.len()` (unterminated span).
+fn skip_code_span(bytes: &[u8], i: &mut usize) {
+    let backtick_count = backtick_run_len(bytes, *i);
+    *i += backtick_count;
+
+    while *i < bytes.len() {
+        if bytes[*i] == b'`' {
+            let close_count = backtick_run_len(bytes, *i);
+            *i += close_count;
+            if close_count == backtick_count {
+                break;
+            }
+        } else {
+            *i += 1;
+        }
+    }
+}
+
+/// Advance `i` past an angle-bracketed construct (potential HTML tag or
+/// autolink) whose `<` is at `*i`. Stops after the closing `>` if found on
+/// the same line; otherwise stops at the newline / end of input without
+/// consuming a closer.
+fn skip_angle_bracket(bytes: &[u8], i: &mut usize) {
+    *i += 1;
+    while *i < bytes.len() && bytes[*i] != b'>' && bytes[*i] != b'\n' {
+        *i += 1;
+    }
+    if *i < bytes.len() && bytes[*i] == b'>' {
+        *i += 1;
+    }
+}
+
+/// Shared flanking test used by the left-/right-flanking helpers.
+///
+/// `primary` is the character on the side being tested (the char after the
+/// run for left-flanking, before it for right-flanking); `secondary` is the
+/// character on the opposite side. `None` means start/end of input.
+fn is_flanking_delimiter(primary: Option<char>, secondary: Option<char>) -> bool {
+    match primary {
+        None => false, // At start/end of input, can't be flanking
+        Some(c) if is_whitespace(c) => false, // Next to whitespace
+        Some(c) if is_emphasis_marker(c) => true,
+        Some(c) if is_punctuation(c) => {
+            // Only flanking if the other side is whitespace or punctuation
+            match secondary {
+                None => true, // Boundary counts as whitespace
+                Some(s) => is_whitespace(s) || is_punctuation(s),
+            }
+        }
+        Some(_) => true, // Not next to whitespace or punctuation = flanking
+    }
+}
+
+/// Check if an opening delimiter is left-flanking per CommonMark rules.
+/// A left-flanking delimiter run is one that is:
+/// - Not followed by Unicode whitespace, AND
+/// - Either (a) not followed by punctuation, OR (b) preceded by whitespace/punctuation
+fn is_left_flanking_delimiter(char_after: Option<char>, char_before: Option<char>) -> bool {
+    is_flanking_delimiter(char_after, char_before)
+}
+
+/// Check if a closing delimiter is right-flanking per CommonMark rules.
+/// A right-flanking delimiter run is one that is:
+/// - Not preceded by Unicode whitespace, AND
+/// - Either (a) not preceded by punctuation, OR (b) followed by whitespace/punctuation
+fn is_right_flanking_delimiter(char_before: Option<char>, char_after: Option<char>) -> bool {
+    is_flanking_delimiter(char_before, char_after)
+}
+
+/// Check if underscore can open emphasis (stricter rules than asterisk).
+/// Per CommonMark 6.2, underscore can open emphasis iff it is left-flanking AND either:
+/// - Not part of a right-flanking delimiter run, OR
+/// - Preceded by a punctuation character
+fn can_underscore_open(char_before: Option<char>, char_after: Option<char>) -> bool {
+    // Must be left-flanking
+    if !is_left_flanking_delimiter(char_after, char_before) {
+        return false;
+    }
+    // If also right-flanking, must be preceded by punctuation
+    if is_right_flanking_delimiter(char_before, char_after) {
+        return matches!(char_before, Some(c) if is_punctuation(c));
+    }
+    true
+}
+
+/// Check if underscore can close emphasis (stricter rules than asterisk).
+/// Per CommonMark 6.2, underscore can close emphasis iff it is right-flanking AND either:
+/// - Not part of a left-flanking delimiter run, OR
+/// - Followed by a punctuation character
+fn can_underscore_close(char_before: Option<char>, char_after: Option<char>) -> bool {
+    // Must be right-flanking
+    if !is_right_flanking_delimiter(char_before, char_after) {
+        return false;
+    }
+    // If also left-flanking, must be followed by punctuation
+    if is_left_flanking_delimiter(char_after, char_before) {
+        return matches!(char_after, Some(c) if is_punctuation(c));
+    }
+    true
+}
+
+// ============================================================================
+// Delimiter Stack Algorithm Implementation
+// ============================================================================
+
+// NOTE(review): the following paragraph documents `collect_delimiter_runs`
+// (defined below) but was detached from it; kept as a plain `//` comment so
+// it no longer attaches to `BracketCheckResult` as a doc comment.
+//
+// Collect all delimiter runs from source text.
+//
+// This is the first pass of the CommonMark emphasis algorithm. It scans
+// the source text and identifies all potential delimiter runs (sequences
+// of `*` or `_`), computing their flanking status.
+/// Result of checking if a bracket forms a valid link.
+/// Contains the closing bracket position if found.
+struct BracketCheckResult {
+    /// Position of the closing `]` (only meaningful when the caller got `Some`)
+    close_pos: usize,
+    /// Whether this is a valid inline link `[...](` or full reference `[...][]`
+    is_inline_or_full_ref: bool,
+}
+
+/// Check if a bracket at position `start` forms a valid link pattern.
+/// Returns the closing bracket position and whether it's an inline link or full reference.
+/// Returns `None` when `start` is not at `[` or no balanced `]` exists.
+fn check_bracket_pattern(bytes: &[u8], start: usize) -> Option<BracketCheckResult> {
+    if start >= bytes.len() || bytes[start] != b'[' {
+        return None;
+    }
+
+    // Find matching ] with proper nesting
+    let mut depth = 1;
+    let mut i = start + 1;
+    while i < bytes.len() && depth > 0 {
+        match bytes[i] {
+            b'[' => depth += 1,
+            b']' => depth -= 1,
+            b'\\' if i + 1 < bytes.len() => i += 1, // Skip escaped char
+            b'`' => {
+                // Skip code spans
+                skip_code_span(bytes, &mut i);
+                continue;
+            }
+            b'<' => {
+                // Skip potential HTML/autolinks
+                skip_angle_bracket(bytes, &mut i);
+                continue;
+            }
+            _ => {}
+        }
+        i += 1;
+    }
+
+    if depth != 0 {
+        return None;
+    }
+
+    // i now points to position after `]`
+    let close_pos = i - 1;
+    let is_inline_or_full_ref = i < bytes.len() && (bytes[i] == b'(' || bytes[i] == b'[');
+
+    Some(BracketCheckResult {
+        close_pos,
+        is_inline_or_full_ref,
+    })
+}
+
+/// Extract label text from a bracket pattern for reference lookup.
+///
+/// `start` is the byte offset of `[` and `close_pos` the byte offset of `]`;
+/// the returned slice is the text strictly between them, or `""` on
+/// inconsistent offsets.
+// NOTE(review): assumes both offsets lie on UTF-8 char boundaries — they come
+// from byte scans anchored on ASCII `[`/`]`, so this holds for valid input.
+fn extract_label_text(source: &str, start: usize, close_pos: usize) -> &str {
+    if start < close_pos && close_pos <= source.len() {
+        &source[start + 1..close_pos]
+    } else {
+        ""
+    }
+}
+
+/// Collect all delimiter runs from source text.
+///
+/// This is the first pass of the CommonMark emphasis algorithm. It scans
+/// the source text and identifies all potential delimiter runs (sequences
+/// of `*` or `_`), computing their flanking status. `reference_checker`
+/// reports whether a normalized label has a link reference definition.
+fn collect_delimiter_runs(
+    source: &str,
+    reference_checker: impl Fn(&str) -> bool,
+) -> Vec<DelimRun> {
+    let mut runs = Vec::new();
+    let bytes = source.as_bytes();
+    let mut i = 0;
+
+    // Pre-compute valid link bracket positions.
+    // A bracket is considered a valid link if:
+    // 1. It's followed by `(` (inline link) or `[` (full reference), OR
+    // 2. It's a shortcut reference with a defined reference (checked via reference_checker)
+    let mut link_bracket_starts = Vec::new();
+    for pos in 0..bytes.len() {
+        if bytes[pos] == b'['
+            && let Some(result) = check_bracket_pattern(bytes, pos)
+        {
+            if result.is_inline_or_full_ref {
+                // Inline link or full reference link
+                link_bracket_starts.push(pos);
+            } else {
+                // Could be a shortcut reference - check if definition exists
+                let label = extract_label_text(source, pos, result.close_pos);
+                let normalized = normalize_reference_label(label);
+                if !normalized.is_empty() && reference_checker(&normalized) {
+                    link_bracket_starts.push(pos);
+                }
+            }
+        }
+    }
+
+    // Track bracket depth, but only for valid link brackets
+    let mut bracket_depth = 0usize;
+    let mut active_link_brackets: Vec<usize> = Vec::new();
+
+    while i < bytes.len() {
+        let b = bytes[i];
+
+        // Track bracket depth for valid links only
+        if b == b'[' && link_bracket_starts.contains(&i) {
+            bracket_depth += 1;
+            active_link_brackets.push(i);
+            i += 1;
+            continue;
+        }
+        // NOTE(review): any `]` pops the innermost active link bracket, even if
+        // that `]` textually belongs to a non-link `[` — confirm with nested
+        // bracket test cases.
+        if b == b']' && !active_link_brackets.is_empty() {
+            bracket_depth = bracket_depth.saturating_sub(1);
+            active_link_brackets.pop();
+            i += 1;
+            continue;
+        }
+
+        // Check for delimiter characters
+        if b == b'*' || b == b'_' {
+            let kind = if b == b'*' {
+                DelimKind::Star
+            } else {
+                DelimKind::Underscore
+            };
+            let start_offset = i;
+
+            // Count consecutive delimiter characters
+            let mut count = 1;
+            while i + count < bytes.len() && bytes[i + count] == b {
+                count += 1;
+            }
+            let end_offset = i + count;
+
+            // Get character before delimiter run
+            let char_before = if start_offset > 0 {
+                // Get the char ending at start_offset
+                let before_slice = &source[..start_offset];
+                before_slice.chars().next_back()
+            } else {
+                None
+            };
+
+            // Get character after delimiter run
+            let char_after = source[end_offset..].chars().next();
+
+            // Compute flanking status
+            let (can_open, can_close) = if kind == DelimKind::Underscore {
+                (
+                    can_underscore_open(char_before, char_after),
+                    can_underscore_close(char_before, char_after),
+                )
+            } else {
+                // Asterisk: can open if left-flanking, can close if right-flanking
+                (
+                    is_left_flanking_delimiter(char_after, char_before),
+                    is_right_flanking_delimiter(char_before, char_after),
+                )
+            };
+
+            runs.push(DelimRun {
+                kind,
+                count,
+                can_open,
+                can_close,
+                start_offset,
+                // Only scope by bracket depth when inside a valid link pattern.
+                // This prevents emphasis from spanning link boundaries, but allows
+                // emphasis to span brackets that don't form valid links.
+                label_id: bracket_depth,
+            });
+
+            i = end_offset;
+        } else if b == b'`' {
+            // Skip code spans - they block emphasis
+            skip_code_span(bytes, &mut i);
+        } else if b == b'<' {
+            // Skip potential HTML tags and autolinks
+            skip_angle_bracket(bytes, &mut i);
+        } else if b == b'\\' && i + 1 < bytes.len() {
+            // Skip escaped characters
+            i += 2;
+        } else {
+            i += 1;
+        }
+    }
+
+    runs
+}
+
+/// Match delimiter runs using the CommonMark algorithm.
+///
+/// This is the second pass. It processes closers from left to right,
+/// searching backward for matching openers. Returns a list of matched
+/// emphasis spans sorted by opener position. Mutates `runs` in place
+/// (consuming delimiter characters from matched runs).
+fn match_delimiters(runs: &mut [DelimRun]) -> Vec<EmphasisMatch> {
+    let mut matches = Vec::new();
+    let mut opener_stack: Vec<usize> = Vec::new();
+
+    for idx in 0..runs.len() {
+        if runs[idx].can_close && runs[idx].count > 0 {
+            loop {
+                let mut opener_stack_pos = None;
+
+                // Search backward for the closest matching opener.
+                // Per CommonMark spec, we find any matching opener first,
+                // then determine strong vs regular based on both counts.
+                for (pos, &opener_idx) in opener_stack.iter().enumerate().rev() {
+                    let opener = &runs[opener_idx];
+                    let closer = &runs[idx];
+
+                    // Only match within same bracket scope (label_id).
+                    // This prevents emphasis from spanning link boundaries.
+                    if opener.label_id != closer.label_id {
+                        continue;
+                    }
+
+                    if opener.kind != closer.kind || !opener.can_open || opener.count == 0 {
+                        continue;
+                    }
+
+                    // Rule of 3: if (opener_count + closer_count) % 3 == 0 and
+                    // the closer can open or the opener can close, skip unless
+                    // both counts are divisible by 3
+                    let opener_count = opener.count;
+                    let closer_count = closer.count;
+                    if (opener.can_close || closer.can_open)
+                        && !closer_count.is_multiple_of(3)
+                        && (opener_count + closer_count).is_multiple_of(3)
+                    {
+                        continue;
+                    }
+
+                    opener_stack_pos = Some(pos);
+                    break;
+                }
+
+                let Some(pos) = opener_stack_pos else { break };
+                let opener_idx = opener_stack[pos];
+                let use_count = if runs[opener_idx].count >= 2 && runs[idx].count >= 2 {
+                    2
+                } else {
+                    1
+                };
+
+                // Openers consume from END of run (leftover stays at beginning).
+                // This ensures for `***foo***`, the inner `**` is consumed leaving `*` at start.
+                let opener_start =
+                    runs[opener_idx].start_offset + runs[opener_idx].count - use_count;
+                // Closers consume from BEGINNING of what remains.
+                let closer_start = runs[idx].start_offset;
+
+                matches.push(EmphasisMatch {
+                    opener_start,
+                    closer_start,
+                    is_strong: use_count == 2,
+                });
+
+                // Opener: reduce count but keep start_offset (leftover is at beginning)
+                runs[opener_idx].count -= use_count;
+                // Closer: reduce count and advance start_offset (leftover is at end)
+                runs[idx].count -= use_count;
+                runs[idx].start_offset += use_count;
+
+                // Remove openers between the matched opener and this closer.
+                opener_stack.truncate(pos + 1);
+                if runs[opener_idx].count == 0 {
+                    opener_stack.pop();
+                }
+
+                // Note: With the "consume from END" algorithm for openers,
+                // crossing matches are no longer an issue because the leftover
+                // chars end up at the beginning of the opener run (wrapping
+                // around the inner match), not at the end (which would cross).
+
+                if runs[idx].count == 0 {
+                    break;
+                }
+            }
+        }
+
+        if runs[idx].can_open && runs[idx].count > 0 {
+            opener_stack.push(idx);
+        }
+    }
+
+    // Sort matches by opener position for nested processing
+    matches.sort_by_key(|m| m.opener_start);
+
+    matches
+}
+
+/// Context for emphasis-aware inline parsing
+#[derive(Debug)]
+pub(crate) struct EmphasisContext {
+    /// Matched emphasis spans, sorted by opener_start
+    matches: Vec<EmphasisMatch>,
+    /// Base offset of the inline content in the source
+    base_offset: usize,
+}
+
+/// Information about a match found within a token's range.
+/// Used when the opener doesn't start at the exact token boundary.
+#[derive(Debug)]
+struct OpenerMatch<'a> {
+    /// The matched emphasis span
+    matched: &'a EmphasisMatch,
+    /// How many bytes of the token precede opener_start (literal prefix to emit)
+    prefix_len: usize,
+}
+
+impl EmphasisContext {
+    /// Create a new emphasis context by analyzing the source text.
+    /// The reference_checker function is used to determine if a bracket pattern
+    /// is a valid shortcut reference link.
+    pub(crate) fn new(
+        source: &str,
+        base_offset: usize,
+        reference_checker: impl Fn(&str) -> bool,
+    ) -> Self {
+        let mut runs = collect_delimiter_runs(source, reference_checker);
+        let matches = match_delimiters(&mut runs);
+        Self {
+            matches,
+            base_offset,
+        }
+    }
+
+    /// Find the *earliest* match whose opener_start is within [token_start, token_end)
+    /// and matches the expected `is_strong` value.
+    /// Returns None if no match found, or the match plus prefix length.
+    ///
+    /// This is used instead of exact offset matching because with the "consume from END"
+    /// algorithm, an opener might start in the middle of a DOUBLE_STAR token.
+    fn opener_within(
+        &self,
+        token_start: usize,
+        token_len: usize,
+        expect_strong: bool,
+    ) -> Option<OpenerMatch<'_>> {
+        let token_end = token_start + token_len;
+        let mut best: Option<OpenerMatch<'_>> = None;
+
+        for m in &self.matches {
+            // Filter by expected emphasis type
+            if m.is_strong != expect_strong {
+                continue;
+            }
+
+            let abs_opener = m.opener_start + self.base_offset;
+            if abs_opener >= token_start && abs_opener < token_end {
+                let candidate = OpenerMatch {
+                    matched: m,
+                    prefix_len: abs_opener - token_start,
+                };
+                // Pick the earliest match (smallest prefix_len)
+                if best
+                    .as_ref()
+                    .is_none_or(|b| candidate.prefix_len < b.prefix_len)
+                {
+                    best = Some(candidate);
+                }
+            }
+        }
+
+        best
+    }
+}
+
+/// Parse emphasis using the delimiter stack matches.
+///
+/// Consults the precomputed `EmphasisContext` to decide whether the current
+/// delimiter token opens a span of the requested strength, then consumes
+/// opener, nested inline content, and closer. Returns `Absent` (consuming
+/// nothing) when no context is installed, the current token is not a
+/// delimiter, or no suitable match starts at the token boundary.
+fn parse_emphasis_from_context(p: &mut MarkdownParser, expect_strong: bool) -> ParsedSyntax {
+    let context = match p.emphasis_context() {
+        Some(context) => context,
+        None => return Absent,
+    };
+
+    // Must be at an emphasis token
+    if !p.at(DOUBLE_STAR) && !p.at(DOUBLE_UNDERSCORE) && !p.at(T![*]) && !p.at(UNDERSCORE) {
+        return Absent;
+    }
+
+    // Get current token info BEFORE any re-lex
+    let token_start = u32::from(p.cur_range().start()) as usize;
+    let token_len: usize = p.cur_range().len().into();
+
+    // Find match within current token's range that has the expected is_strong value
+    let opener_match = match context.opener_within(token_start, token_len, expect_strong) {
+        Some(m) => m,
+        None => return Absent,
+    };
+
+    // If the opener doesn't start at the exact token boundary, return Absent.
+    // The caller (parse_any_inline) will emit literal text, advancing the parser position.
+    // On subsequent calls, we'll eventually be at the correct position with prefix_len == 0.
+    if opener_match.prefix_len > 0 {
+        return Absent;
+    }
+
+    // Extract values before dropping the borrow on context
+    let use_count = if expect_strong { 2 } else { 1 };
+    let closer_offset = opener_match.matched.closer_start + context.base_offset;
+    // Use the correct delimiter character for error messages
+    let is_underscore = p.at(DOUBLE_UNDERSCORE) || p.at(UNDERSCORE);
+    let opener_text = match (expect_strong, is_underscore) {
+        (true, true) => "__",
+        (true, false) => "**",
+        (false, true) => "_",
+        (false, false) => "*",
+    };
+
+    let m = p.start();
+    let opening_range = p.cur_range();
+
+    // Consume opener tokens
+    // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one.
+    // Only re-lex when we need to consume a partial token or single chars.
+    if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) {
+        // Bump the double token as a single unit
+        p.bump_any();
+    } else {
+        // Consume individual tokens
+        for _ in 0..use_count {
+            if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
+                p.force_relex_emphasis_inline();
+            }
+            p.bump_any();
+        }
+    }
+
+    // Parse content until we reach the closer
+    let content = p.start();
+    loop {
+        // EOF always ends content
+        if p.at(T![EOF]) {
+            break;
+        }
+
+        let current_offset = u32::from(p.cur_range().start()) as usize;
+        let current_len: usize = p.cur_range().len().into();
+
+        // Check if closer is AT or WITHIN current token
+        if closer_offset >= current_offset && closer_offset < current_offset + current_len {
+            break;
+        }
+
+        // Check if we've passed the closer (can happen when link parsing consumes past it)
+        if current_offset > closer_offset {
+            break;
+        }
+
+        // Parse nested inline content
+        let result = super::parse_any_inline(p);
+        if result.is_absent() {
+            break;
+        }
+    }
+    content.complete(p, MD_INLINE_ITEM_LIST);
+
+    // Consume closer tokens
+    let mut consumed_closer = 0;
+    while consumed_closer < use_count {
+        if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) {
+            p.bump_any();
+            consumed_closer += 2;
+            continue;
+        }
+        if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
+            p.force_relex_emphasis_inline();
+        }
+        if p.at(T![*]) || p.at(UNDERSCORE) {
+            p.bump_any();
+            consumed_closer += 1;
+        } else {
+            break;
+        }
+    }
+
+    // The full closer could not be consumed — emit a diagnostic but still
+    // complete the node so the tree stays well-formed.
+    if consumed_closer < use_count {
+        p.error(crate::syntax::parse_error::unclosed_emphasis(
+            p,
+            opening_range,
+            opener_text,
+        ));
+    }
+
+    if expect_strong {
+        Present(m.complete(p, MD_INLINE_EMPHASIS))
+    } else {
+        Present(m.complete(p, MD_INLINE_ITALIC))
+    }
+}
+
+/// Parse inline emphasis (bold: `**text**` or `__text__`).
+/// Thin wrapper over `parse_emphasis_from_context` with `expect_strong = true`.
+pub(crate) fn parse_inline_emphasis(p: &mut MarkdownParser) -> ParsedSyntax {
+    parse_emphasis_from_context(p, true)
+}
+
+/// Parse inline italic (`*text*` or `_text_`).
+/// Thin wrapper over `parse_emphasis_from_context` with `expect_strong = false`.
+pub(crate) fn parse_inline_italic(p: &mut MarkdownParser) -> ParsedSyntax {
+    parse_emphasis_from_context(p, false)
+}
+
+/// Build and install an `EmphasisContext` for the inline run starting at the
+/// current token and ending at `stop`, EOF, or the end of inline content.
+/// Returns the value of `set_emphasis_context` — presumably the previously
+/// installed context so callers can restore it; confirm against the parser impl.
+pub(crate) fn set_inline_emphasis_context_until(
+    p: &mut MarkdownParser,
+    stop: MarkdownSyntaxKind,
+) -> Option<EmphasisContext> {
+    let source_len = inline_list_source_len_until(p, stop);
+    let source = p.source_after_current();
+    // Clamp to the available source in case token lengths and raw source disagree.
+    let inline_source = if source_len <= source.len() {
+        &source[..source_len]
+    } else {
+        source
+    };
+    let base_offset = u32::from(p.cur_range().start()) as usize;
+    // Create a reference checker closure that uses the parser's link reference definitions
+    let context = EmphasisContext::new(inline_source, base_offset, |label| {
+        p.has_link_reference_definition(label)
+    });
+    p.set_emphasis_context(Some(context))
+}
+
+/// Measure (in bytes) how much source the inline token stream covers before
+/// hitting `stop`, EOF, or the end of the inline context.
+/// Uses `lookahead` so no tokens are actually consumed.
+fn inline_list_source_len_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> usize {
+    p.lookahead(|p| {
+        let mut len = 0usize;
+
+        loop {
+            if p.at(T![EOF]) || p.at(stop) || p.at_inline_end() {
+                break;
+            }
+
+            // Sums token text lengths only; NOTE(review): assumes no trivia
+            // gaps between inline tokens — confirm against the lexer.
+            len += p.cur_text().len();
+            p.bump(p.cur());
+        }
+
+        len
+    })
+}
diff --git a/crates/biome_markdown_parser/src/syntax/inline/entities.rs b/crates/biome_markdown_parser/src/syntax/inline/entities.rs
new file mode 100644
index 000000000000..94340865c529
--- /dev/null
+++ b/crates/biome_markdown_parser/src/syntax/inline/entities.rs
@@ -0,0 +1,26 @@
+use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
+use biome_parser::Parser;
+use biome_parser::prelude::ParsedSyntax::{self, *};
+
+use crate::MarkdownParser;
+
+/// Parse entity or numeric character reference per CommonMark §6.2.
+///
+/// Grammar: MdEntityReference = value: 'md_entity_literal'
+///
+/// Valid patterns:
+/// - Named entity: `&name;` where name is 2-31 alphanumeric chars starting with letter
+/// - Decimal numeric: `&#digits;` where digits is 1-7 decimal digits
+/// - Hexadecimal: `&#xhex;` or `&#Xhex;` where hex is 1-6 hex digits
+///
+/// The lexer has already validated and tokenized valid entity references as
+/// MD_ENTITY_LITERAL tokens. Invalid patterns remain as textual.
+pub(crate) fn parse_entity_reference(p: &mut MarkdownParser) -> ParsedSyntax {
+    if !p.at(MD_ENTITY_LITERAL) {
+        return Absent;
+    }
+
+    let m = p.start();
+    p.bump(MD_ENTITY_LITERAL);
+    Present(m.complete(p, MD_ENTITY_REFERENCE))
+}
diff --git a/crates/biome_markdown_parser/src/syntax/inline/html.rs b/crates/biome_markdown_parser/src/syntax/inline/html.rs
new file mode 100644
index 000000000000..eed80b102c38
--- /dev/null
+++ b/crates/biome_markdown_parser/src/syntax/inline/html.rs
@@ -0,0 +1,486 @@
+use biome_markdown_syntax::T;
+use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
+use biome_parser::Parser;
+use biome_parser::prelude::ParsedSyntax::{self, *};
+
+use crate::MarkdownParser;
+
+/// Check if text starting with `<` is valid inline HTML per CommonMark §6.8.
+/// Returns the length of the HTML element if valid, None otherwise.
+///
+/// Valid patterns:
+/// - Open tags: ``, ``, ``
+/// - Close tags: ``
+/// - Comments: ``
+/// - Processing instructions: ` ... ?>`
+/// - Declarations: ``
+/// - CDATA: ``
+pub(crate) fn is_inline_html(text: &str) -> Option {
+ let bytes = text.as_bytes();
+ if bytes.len() < 2 || bytes[0] != b'<' {
+ return None;
+ }
+
+ // HTML comment:
+ // Per CommonMark 0.31.2 §6.8, an HTML comment consists of ``,
+ // where text does not start with `>` or `->`, and does not end with `-`.
+ // Additionally, `` and `` are valid (degenerate) comments.
+ if bytes.starts_with(b" and
+ if rest.starts_with(b">") {
+ return Some(5); //
+ }
+ if rest.starts_with(b"->") {
+ return Some(6); //
+ }
+ // Find closing --> after ") {
+ let body = &text[4..4 + pos];
+ // Body must not end with '-'
+ if body.ends_with('-') {
+ return None;
+ }
+ return Some(4 + pos + 3);
+ }
+ return None;
+ }
+
+ // Processing instruction: ... ?>
+ if bytes.len() >= 2 && bytes[1] == b'?' {
+ // Find closing ?>
+ if let Some(pos) = text[2..].find("?>") {
+ return Some(2 + pos + 2);
+ }
+ return None;
+ }
+
+ // CDATA section:
+ if bytes.starts_with(b"
+ if let Some(pos) = text[9..].find("]]>") {
+ return Some(9 + pos + 3);
+ }
+ return None;
+ }
+
+ // Declaration:
+ // e.g.,
+ if bytes.len() >= 3 && bytes[1] == b'!' && bytes[2].is_ascii_alphabetic() {
+ // Find closing >
+ if let Some(pos) = text[2..].find('>') {
+ return Some(2 + pos + 1);
+ }
+ return None;
+ }
+
+ // Close tag:
+ if bytes.len() >= 4 && bytes[1] == b'/' {
+ if !bytes[2].is_ascii_alphabetic() {
+ return None;
+ }
+ // Tag name: [A-Za-z][A-Za-z0-9-]*
+ let mut i = 3;
+ while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
+ i += 1;
+ }
+ // Skip optional whitespace
+ while i < bytes.len()
+ && (bytes[i] == b' '
+ || bytes[i] == b'\t'
+ || bytes[i] == b'\n'
+ || bytes[i] == b'\r'
+ || bytes[i] == b'\x0c')
+ {
+ i += 1;
+ }
+ // Must end with >
+ if i < bytes.len() && bytes[i] == b'>' {
+ return Some(i + 1);
+ }
+ return None;
+ }
+
+ // Open tag: or
+ // Defensive bounds check - should be guaranteed by earlier len check but be explicit
+ if bytes.len() < 2 || !bytes[1].is_ascii_alphabetic() {
+ return None;
+ }
+
+ // Tag name: [A-Za-z][A-Za-z0-9-]*
+ // Note: tag names cannot contain `.` (so is NOT a valid tag)
+ let mut i = 2;
+ while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
+ i += 1;
+ }
+
+ // After tag name, must have valid boundary: whitespace, >, or /
+ // This prevents from being treated as HTML
+ if i >= bytes.len() {
+ return None;
+ }
+ let boundary = bytes[i];
+ if boundary != b' '
+ && boundary != b'\t'
+ && boundary != b'\n'
+ && boundary != b'\r'
+ && boundary != b'\x0c'
+ && boundary != b'>'
+ && boundary != b'/'
+ {
+ return None;
+ }
+
+ // Handle immediate close or self-close
+ if boundary == b'>' {
+ return Some(i + 1);
+ }
+ if boundary == b'/' {
+ if i + 1 < bytes.len() && bytes[i + 1] == b'>' {
+ return Some(i + 2);
+ }
+ return None;
+ }
+
+ // Has attributes - validate per CommonMark §6.8
+
+ let skip_spaces = |i: &mut usize| -> Option {
+ let mut skipped = false;
+ while *i < bytes.len() {
+ match bytes[*i] {
+ b' ' | b'\t' | b'\n' | b'\r' | b'\x0c' => {
+ skipped = true;
+ *i += 1;
+ }
+ _ => break,
+ }
+ }
+ Some(skipped)
+ };
+
+ let is_attr_name_start = |b: u8| b.is_ascii_alphabetic() || b == b'_' || b == b':';
+ let is_attr_name_continue =
+ |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b == b':' || b == b'.' || b == b'-';
+
+ let mut need_space = true;
+ // We already know the boundary char was whitespace, so first iteration has space.
+ let mut had_space = true;
+
+ loop {
+ if need_space {
+ let s = skip_spaces(&mut i)?;
+ had_space = had_space || s;
+ }
+ need_space = true;
+
+ if i >= bytes.len() {
+ return None;
+ }
+
+ // End or self-close
+ if bytes[i] == b'>' {
+ return Some(i + 1);
+ }
+ if bytes[i] == b'/' {
+ if i + 1 < bytes.len() && bytes[i + 1] == b'>' {
+ return Some(i + 2);
+ }
+ return None;
+ }
+
+ // Attributes must be separated by whitespace
+ if !had_space {
+ return None;
+ }
+
+ // Parse attribute name
+ if !is_attr_name_start(bytes[i]) {
+ return None;
+ }
+ i += 1;
+ while i < bytes.len() && is_attr_name_continue(bytes[i]) {
+ i += 1;
+ }
+
+ // Optional whitespace and value
+ had_space = skip_spaces(&mut i)?;
+ if i < bytes.len() && bytes[i] == b'=' {
+ i += 1;
+ skip_spaces(&mut i)?;
+ if i >= bytes.len() {
+ return None;
+ }
+
+ match bytes[i] {
+ b'"' => {
+ i += 1;
+ while i < bytes.len() && bytes[i] != b'"' {
+ i += 1;
+ }
+ if i >= bytes.len() {
+ return None;
+ }
+ i += 1;
+ }
+ b'\'' => {
+ i += 1;
+ while i < bytes.len() && bytes[i] != b'\'' {
+ i += 1;
+ }
+ if i >= bytes.len() {
+ return None;
+ }
+ i += 1;
+ }
+ _ => {
+ let start = i;
+ while i < bytes.len() {
+ let b = bytes[i];
+ if b <= b' '
+ || b == b'"'
+ || b == b'\''
+ || b == b'='
+ || b == b'<'
+ || b == b'>'
+ || b == b'`'
+ {
+ break;
+ }
+ i += 1;
+ }
+ if i == start {
+ return None;
+ }
+ }
+ }
+ // After value, need to find whitespace at top of loop
+ had_space = false;
+ }
+ // If no '=' was found, `had_space` from skip_spaces above carries over
+ // as the separator for the next attribute (boolean attribute case).
+ }
+}
+
+/// Parse raw inline HTML per CommonMark §6.8.
+///
+/// Grammar: MdInlineHtml = value: MdInlineItemList
+///
+/// Includes: open tags, close tags, comments, processing instructions,
+/// declarations, and CDATA sections. Returns `Absent` (consuming nothing)
+/// when the text at `<` is not valid inline HTML, when the span would cross
+/// a setext underline, or when token boundaries don't align with the
+/// validated HTML length.
+pub(crate) fn parse_inline_html(p: &mut MarkdownParser) -> ParsedSyntax {
+    if !p.at(L_ANGLE) {
+        return Absent;
+    }
+
+    // Get the source text starting from current position
+    let source = p.source_after_current();
+
+    // Check if this is valid inline HTML
+    let html_len = match is_inline_html(source) {
+        Some(len) => len,
+        None => return Absent,
+    };
+
+    // Per CommonMark §4.3, setext heading underlines take priority over inline HTML.
+    // If this HTML tag spans across a line that is a setext underline, treat `<` as literal.
+    if crate::syntax::inline_span_crosses_setext(p, html_len) {
+        return Absent;
+    }
+
+    // Valid inline HTML - create the node
+    // Use checkpoint so we can rewind if token boundaries don't align
+    let checkpoint = p.checkpoint();
+    let m = p.start();
+
+    // Create content as inline item list containing textual nodes
+    let content = p.start();
+
+    // Track remaining bytes to consume
+    let mut remaining = html_len;
+
+    while remaining > 0 && !p.at(T![EOF]) {
+        let token_len = p.cur_text().len();
+
+        // If the current token is larger than remaining bytes, token boundaries
+        // don't align with our validated HTML - rewind and treat as text
+        if token_len > remaining {
+            m.abandon(p);
+            p.rewind(checkpoint);
+            return Absent;
+        }
+
+        let text_m = p.start();
+        p.bump_remap(MD_TEXTUAL_LITERAL);
+        text_m.complete(p, MD_TEXTUAL);
+        remaining -= token_len;
+    }
+
+    content.complete(p, MD_INLINE_ITEM_LIST);
+
+    Present(m.complete(p, MD_INLINE_HTML))
+}
+
+/// Check if the text after `<` looks like a URI autolink.
+/// Per CommonMark §6.4: scheme must be 2-32 chars, start with letter,
+/// followed by letters/digits/+/-/., then `:`.
+fn is_uri_autolink(text: &str) -> bool {
+    let bytes = text.as_bytes();
+    if bytes.is_empty() {
+        return false;
+    }
+
+    // Must start with a letter
+    if !bytes[0].is_ascii_alphabetic() {
+        return false;
+    }
+
+    // Find the colon
+    let mut colon_pos = None;
+    for (i, &b) in bytes.iter().enumerate().skip(1) {
+        if b == b':' {
+            colon_pos = Some(i);
+            break;
+        }
+        // Scheme chars: letters, digits, +, -, .
+        if !b.is_ascii_alphanumeric() && b != b'+' && b != b'-' && b != b'.' {
+            return false;
+        }
+    }
+
+    // Scheme must be 2-32 chars and followed by colon
+    // (`pos` is the colon's byte index, which equals the scheme length)
+    match colon_pos {
+        Some(pos) if (2..=32).contains(&pos) => {
+            // Must have content after the colon and no whitespace/< in URI
+            let rest = &text[pos + 1..];
+            !rest.is_empty()
+                && !rest.contains('<')
+                && !rest.contains('>')
+                && !rest.chars().any(|c| c.is_whitespace())
+        }
+        _ => false,
+    }
+}
+
+/// Check if the text after `<` looks like an email autolink.
+/// Per CommonMark §6.5: local@domain pattern with specific char restrictions.
+// NOTE(review): the domain check is simplified relative to the spec's regex
+// (hyphen placement at label edges is not enforced) — confirm intended.
+fn is_email_autolink(text: &str) -> bool {
+    // Must contain exactly one @ not at start or end
+    let at_pos = match text.find('@') {
+        Some(pos) if pos > 0 && pos < text.len() - 1 => pos,
+        _ => return false,
+    };
+
+    // Check no second @
+    if text[at_pos + 1..].contains('@') {
+        return false;
+    }
+
+    // Local part: alphanumerics and .!#$%&'*+/=?^_`{|}~-
+    let local = &text[..at_pos];
+    for c in local.chars() {
+        if !c.is_ascii_alphanumeric()
+            && !matches!(
+                c,
+                '.' | '!'
+                    | '#'
+                    | '$'
+                    | '%'
+                    | '&'
+                    | '\''
+                    | '*'
+                    | '+'
+                    | '/'
+                    | '='
+                    | '?'
+                    | '^'
+                    | '_'
+                    | '`'
+                    | '{'
+                    | '|'
+                    | '}'
+                    | '~'
+                    | '-'
+            )
+        {
+            return false;
+        }
+    }
+
+    // Domain part: alphanumerics and hyphens, dots for subdomains
+    let domain = &text[at_pos + 1..];
+    if domain.is_empty() || domain.starts_with('.') || domain.ends_with('.') {
+        return false;
+    }
+
+    for c in domain.chars() {
+        if !c.is_ascii_alphanumeric() && c != '-' && c != '.' {
+            return false;
+        }
+    }
+
+    true
+}
+
+/// Parse an autolink (`<scheme:uri>` or `<name@example.com>`).
+///
+/// Grammar: MdAutolink = '<' value: MdInlineItemList '>'
+///
+/// Per CommonMark §6.4 and §6.5, autolinks are URIs or email addresses
+/// wrapped in angle brackets.
+pub(crate) fn parse_autolink(p: &mut MarkdownParser) -> ParsedSyntax {
+    if !p.at(L_ANGLE) {
+        return Absent;
+    }
+
+    // Look ahead to find the closing > and check if content is valid
+    let source = p.source_after_current();
+
+    // Skip the < and find >
+    let after_open = &source[1..];
+    let close_pos = match after_open.find('>') {
+        Some(pos) => pos,
+        None => return Absent, // No closing >
+    };
+
+    // Check for newline before > (not allowed in autolinks)
+    let content = &after_open[..close_pos];
+    if content.contains('\n') || content.contains('\r') {
+        return Absent;
+    }
+
+    // Must be either URI or email autolink
+    if !is_uri_autolink(content) && !is_email_autolink(content) {
+        return Absent;
+    }
+
+    // Valid autolink - parse it
+    let m = p.start();
+
+    // <
+    p.bump(L_ANGLE);
+
+    // Content as inline item list containing textual nodes.
+    // Autolinks don't process backslash escapes, but the lexer may combine
+    // `\>` into a single escape token. We re-lex in CodeSpan context where
+    // backslash is literal, so `\` and `>` are separate tokens.
+    p.force_relex_code_span();
+
+    let content_m = p.start();
+    while !p.at(R_ANGLE) && !p.at(T![EOF]) && !p.at_inline_end() {
+        let text_m = p.start();
+        p.bump_remap_with_context(
+            MD_TEXTUAL_LITERAL,
+            crate::lexer::MarkdownLexContext::CodeSpan,
+        );
+        text_m.complete(p, MD_TEXTUAL);
+    }
+    content_m.complete(p, MD_INLINE_ITEM_LIST);
+
+    // >
+    p.expect(R_ANGLE);
+
+    // Re-lex back to regular context
+    p.force_relex_regular();
+
+    Present(m.complete(p, MD_AUTOLINK))
+}
diff --git a/crates/biome_markdown_parser/src/syntax/inline/links.rs b/crates/biome_markdown_parser/src/syntax/inline/links.rs
new file mode 100644
index 000000000000..fc9b11de33de
--- /dev/null
+++ b/crates/biome_markdown_parser/src/syntax/inline/links.rs
@@ -0,0 +1,866 @@
+use biome_markdown_syntax::MarkdownSyntaxKind;
+use biome_markdown_syntax::T;
+use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
+use biome_parser::Parser;
+use biome_parser::prelude::ParsedSyntax::{self, *};
+use biome_rowan::TextRange;
+
+use crate::MarkdownParser;
+use crate::syntax::reference::normalize_reference_label;
+
+use super::{parse_inline_item_list_until, parse_inline_item_list_until_no_links};
+
+/// Parse link starting with `[` - dispatches to inline link or reference link.
+///
+/// After parsing `[text]`:
+/// - If followed by `(` → inline link `[text](url)`
+/// - If followed by `[` → reference link `[text][label]` or `[text][]`
+/// - Otherwise → shortcut reference `[text]`
+///
+/// On failure the shared driver rewinds to its checkpoint, so Absent means
+/// no tokens were consumed.
+pub(crate) fn parse_link_or_reference(p: &mut MarkdownParser) -> ParsedSyntax {
+    parse_link_or_image(p, LinkParseKind::Link)
+}
+
+/// Parse reference link label `[label]` or `[]`.
+///
+/// Grammar: `MdReferenceLinkLabel = '[' label: MdInlineItemList ']'`
+///
+/// Returns Present if `[` and `]` are found (even if empty for collapsed reference).
+/// On failure (missing `]`), rewinds to the checkpoint so no tokens are consumed.
+fn parse_reference_label(p: &mut MarkdownParser) -> ParsedSyntax {
+    if !p.at(L_BRACK) {
+        return Absent;
+    }
+
+    // Checkpoint so we can rewind if ] is missing
+    let checkpoint = p.checkpoint();
+    let m = p.start();
+
+    // [
+    p.bump(L_BRACK);
+
+    // Label content (may be empty for collapsed reference).
+    // Labels are raw text: every token is remapped to a textual literal,
+    // no inline parsing is performed.
+    let label = p.start();
+    while !p.at(R_BRACK) && !p.at_inline_end() {
+        let text_m = p.start();
+        p.bump_remap(MD_TEXTUAL_LITERAL);
+        text_m.complete(p, MD_TEXTUAL);
+    }
+    label.complete(p, MD_INLINE_ITEM_LIST);
+
+    // ]
+    if !p.eat(R_BRACK) {
+        // Missing closing bracket - abandon and rewind to not consume tokens
+        m.abandon(p);
+        p.rewind(checkpoint);
+        return Absent;
+    }
+
+    Present(m.complete(p, MD_REFERENCE_LINK_LABEL))
+}
+
+/// Parse inline link (`[text](url)`).
+///
+/// Grammar: `MdInlineLink = '[' text: MdInlineItemList ']' '(' source: MdInlineItemList ')'`
+///
+/// Note: This is kept for backwards compatibility but `parse_link_or_reference`
+/// is the preferred entry point for link parsing.
+///
+/// Thin delegating wrapper; all dispatch logic lives in `parse_link_or_reference`.
+pub(crate) fn parse_inline_link(p: &mut MarkdownParser) -> ParsedSyntax {
+    parse_link_or_reference(p)
+}
+
+/// Parse image starting with `![` - dispatches to inline image or reference image.
+///
+/// After parsing `![alt]`:
+/// - If followed by `(` → inline image `![alt](url)`
+/// - If followed by `[` → reference image `![alt][label]` or `![alt][]`
+/// - Otherwise → shortcut reference image `![alt]`
+pub(crate) fn parse_image_or_reference(p: &mut MarkdownParser) -> ParsedSyntax {
+    parse_link_or_image(p, LinkParseKind::Image)
+}
+
+/// Whether the shared link/image driver (`parse_link_or_image`) is parsing a
+/// link (`[`) or an image (`![`).
+#[derive(Copy, Clone)]
+enum LinkParseKind {
+    Link,
+    Image,
+}
+
+impl LinkParseKind {
+    /// Whether this construct's opening token(s) start at the cursor.
+    fn starts_here(self, p: &mut MarkdownParser) -> bool {
+        match self {
+            Self::Link => p.at(L_BRACK),
+            Self::Image => p.at(BANG) && p.nth_at(1, L_BRACK),
+        }
+    }
+
+    /// Consume the opening delimiter: `[` for links, `![` for images.
+    fn bump_opening(self, p: &mut MarkdownParser) {
+        if matches!(self, Self::Image) {
+            p.bump(BANG);
+        }
+        p.bump(L_BRACK);
+    }
+
+    /// Speculatively scan ahead for a reference-style link/image label.
+    /// (Fixed: the return type was a bare `Option` missing its generic
+    /// argument, which does not compile.)
+    fn lookahead_reference(self, p: &mut MarkdownParser) -> Option<ReferenceLinkLookahead> {
+        match self {
+            Self::Link => lookahead_reference_link(p),
+            Self::Image => lookahead_reference_image(p),
+        }
+    }
+
+    /// Syntax kind for the inline form (`[text](url)` / `![alt](url)`).
+    fn inline_kind(self) -> MarkdownSyntaxKind {
+        match self {
+            Self::Link => MD_INLINE_LINK,
+            Self::Image => MD_INLINE_IMAGE,
+        }
+    }
+
+    /// Syntax kind for the reference form (`[text][label]` / `![alt][label]`).
+    fn reference_kind(self) -> MarkdownSyntaxKind {
+        match self {
+            Self::Link => MD_REFERENCE_LINK,
+            Self::Image => MD_REFERENCE_IMAGE,
+        }
+    }
+
+    /// Emit the kind-appropriate diagnostic for an unterminated destination.
+    fn report_unclosed_destination(self, p: &mut MarkdownParser, opening_range: TextRange) {
+        match self {
+            Self::Link => p.error(crate::syntax::parse_error::unclosed_link(
+                p,
+                opening_range,
+                "expected `)` to close URL",
+            )),
+            Self::Image => p.error(crate::syntax::parse_error::unclosed_image(
+                p,
+                opening_range,
+                "expected `)` to close image URL",
+            )),
+        }
+    }
+}
+
+/// Shared driver for inline and reference links/images.
+///
+/// Handles `[text](url)`, `[text][label]`, `[text][]`, `[text]`, and the
+/// `![...]` image equivalents. On any failure the parser is rewound to the
+/// saved checkpoint, so Absent never consumes tokens.
+fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyntax {
+    if !kind.starts_here(p) {
+        return Absent;
+    }
+
+    let checkpoint = p.checkpoint();
+    let m = p.start();
+    let opening_range = p.cur_range();
+    let reference = kind.lookahead_reference(p);
+    // Clear any cached lookahead tokens before switching lexing context.
+    p.reset_lookahead();
+
+    kind.bump_opening(p);
+
+    // Link text / alt text
+    let has_nested_link = if matches!(kind, LinkParseKind::Image) {
+        // For images, allow full inline parsing (including links) in alt text.
+        // This lets nested links/images be parsed so their text can be extracted for alt.
+        parse_inline_item_list_until(p, R_BRACK);
+        false
+    } else {
+        parse_inline_item_list_until_no_links(p, R_BRACK)
+    };
+
+    // ] - if missing, rewind and treat [ as literal text.
+    // Per CommonMark, if there's no valid ] to close the link (e.g., all ]
+    // characters are inside code spans or HTML), the [ is literal text.
+    // NOTE: We intentionally do NOT emit an "unclosed link" diagnostic here.
+    // CommonMark treats unmatched `[` as literal text, not an error.
+    if !p.eat(R_BRACK) {
+        m.abandon(p);
+        p.rewind(checkpoint);
+        return Absent;
+    }
+
+    // Per CommonMark, a link (not image) whose text contains another link must fail.
+    // The inner link wins and the outer `[` becomes literal text.
+    if matches!(kind, LinkParseKind::Link) && has_nested_link {
+        m.abandon(p);
+        p.rewind(checkpoint);
+        return Absent;
+    }
+
+    // Now decide based on what follows ]
+    let link_validation = if p.at(L_PAREN) {
+        inline_link_is_valid(p)
+    } else {
+        InlineLinkValidation::Invalid
+    };
+
+    if matches!(
+        link_validation,
+        InlineLinkValidation::Valid | InlineLinkValidation::DepthExceeded
+    ) {
+        // Inline link/image: [text](url) or ![alt](url)
+        // Bump past ( and lex the following tokens in LinkDefinition context
+        // so whitespace separates destination and title.
+        p.expect_with_context(L_PAREN, crate::lexer::MarkdownLexContext::LinkDefinition);
+
+        let destination = p.start();
+        let destination_result = parse_inline_link_destination_tokens(p);
+
+        // When depth exceeded, destination is truncated but link is still valid.
+        // Complete the destination and link immediately without looking for closing paren.
+        if destination_result == DestinationScanResult::DepthExceeded {
+            destination.complete(p, MD_INLINE_ITEM_LIST);
+            p.force_relex_regular();
+            return Present(m.complete(p, kind.inline_kind()));
+        }
+
+        let has_title = inline_title_starts_after_whitespace_tokens(p);
+        while is_title_separator_token(p) {
+            bump_link_def_separator(p);
+        }
+        if destination_result == DestinationScanResult::Invalid {
+            destination.abandon(p);
+            m.abandon(p);
+            p.rewind(checkpoint);
+            p.force_relex_regular();
+            return Absent;
+        }
+        destination.complete(p, MD_INLINE_ITEM_LIST);
+
+        if has_title {
+            let title_m = p.start();
+            let list_m = p.start();
+            parse_title_content(p, get_title_close_char(p));
+            list_m.complete(p, MD_INLINE_ITEM_LIST);
+            title_m.complete(p, MD_LINK_TITLE);
+        }
+
+        // Skip trailing whitespace/newlines before closing paren without creating nodes
+        // (creating nodes would violate the MD_INLINE_LINK grammar which expects exactly 7 children)
+        while is_title_separator_token(p) {
+            skip_link_def_separator_tokens(p);
+        }
+
+        if !p.eat(R_PAREN) {
+            if p.at_inline_end() {
+                kind.report_unclosed_destination(p, opening_range);
+            }
+            m.abandon(p);
+            p.rewind(checkpoint);
+            p.force_relex_regular();
+            return Absent;
+        }
+
+        Present(m.complete(p, kind.inline_kind()))
+    } else if p.at(L_BRACK) {
+        // Reference link/image: [text][label] or [text][]
+        let label = parse_reference_label(p);
+        let reference = reference.filter(|reference| {
+            if label.is_absent() {
+                reference.is_shortcut
+            } else {
+                true
+            }
+        });
+
+        if let Some(reference) = reference
+            && !reference.is_defined(p)
+        {
+            m.abandon(p);
+            p.rewind(checkpoint);
+            // Return Absent - the caller will treat `[` as textual.
+            // Don't consume the whole bracket sequence to avoid consuming
+            // past emphasis closers.
+            return Absent;
+        }
+
+        Present(m.complete(p, kind.reference_kind()))
+    } else {
+        // Shortcut reference: [text] or ![alt]
+        // No label part - the text/alt IS the label for resolution
+        if let Some(reference) = reference
+            && reference.is_shortcut
+            && !reference.is_defined(p)
+        {
+            m.abandon(p);
+            p.rewind(checkpoint);
+            // Return Absent - the caller will treat `[` as textual.
+            // Don't consume the whole bracket sequence to avoid consuming
+            // past emphasis closers.
+            return Absent;
+        }
+        Present(m.complete(p, kind.reference_kind()))
+    }
+}
+
+/// Result of speculatively scanning for a reference link/image.
+struct ReferenceLinkLookahead {
+    /// Raw (un-normalized) label text used to resolve the reference definition.
+    label_raw: String,
+    /// True for shortcut references (`[text]` with no following `[label]`).
+    is_shortcut: bool,
+}
+
+impl ReferenceLinkLookahead {
+    /// Whether a link reference definition exists for the normalized label.
+    fn is_defined(&self, p: &MarkdownParser) -> bool {
+        let normalized = normalize_reference_label(&self.label_raw);
+        p.has_link_reference_definition(&normalized)
+    }
+}
+
+/// Lookahead for a reference link (`[text][label]`, `[text][]`, `[text]`).
+/// (Fixed: the return type was a bare `Option` missing
+/// `<ReferenceLinkLookahead>`, which does not compile.)
+fn lookahead_reference_link(p: &mut MarkdownParser) -> Option<ReferenceLinkLookahead> {
+    lookahead_reference_common(p, false)
+}
+
+/// Lookahead for a reference image (`![alt][label]`, `![alt][]`, `![alt]`).
+/// (Fixed: the return type was a bare `Option` missing
+/// `<ReferenceLinkLookahead>`, which does not compile.)
+fn lookahead_reference_image(p: &mut MarkdownParser) -> Option<ReferenceLinkLookahead> {
+    lookahead_reference_common(p, true)
+}
+
+/// Shared lookahead for reference links and images.
+///
+/// Runs entirely inside `p.lookahead`, so no tokens are consumed. Returns
+/// the raw label (explicit `[label]` if present and non-empty, otherwise the
+/// link text) and whether the match is a shortcut reference.
+/// (Fixed: the return type was a bare `Option` missing its generic argument.)
+fn lookahead_reference_common(
+    p: &mut MarkdownParser,
+    is_image: bool,
+) -> Option<ReferenceLinkLookahead> {
+    p.lookahead(|p| {
+        if is_image {
+            if !p.at(BANG) || !p.nth_at(1, L_BRACK) {
+                return None;
+            }
+            p.bump(BANG);
+        }
+
+        if !p.at(L_BRACK) {
+            return None;
+        }
+
+        p.bump(L_BRACK);
+
+        let link_text = collect_link_text(p)?;
+
+        // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty)
+        let normalized_link = normalize_reference_label(&link_text);
+        if normalized_link.is_empty() {
+            return None;
+        }
+
+        p.bump(R_BRACK);
+
+        // `(` after `]` means inline link, not a reference.
+        if p.at(L_PAREN) {
+            return None;
+        }
+
+        if p.at(L_BRACK) {
+            p.bump(L_BRACK);
+            let label_text = collect_label_text_simple(p);
+            if let Some(label_text) = label_text {
+                let label = if label_text.is_empty() {
+                    // Collapsed reference `[text][]`: text doubles as the label.
+                    link_text.clone()
+                } else {
+                    // Explicit label must also normalize to non-empty
+                    let normalized_label = normalize_reference_label(&label_text);
+                    if normalized_label.is_empty() {
+                        return None;
+                    }
+                    label_text
+                };
+                p.bump(R_BRACK);
+                return Some(ReferenceLinkLookahead {
+                    label_raw: label,
+                    is_shortcut: false,
+                });
+            }
+        }
+
+        Some(ReferenceLinkLookahead {
+            label_raw: link_text,
+            is_shortcut: true,
+        })
+    })
+}
+
+/// Collect text for a link label (e.g., the `label` in `[text][label]`).
+///
+/// Per CommonMark §4.7, link labels have specific rules:
+/// - Unescaped square brackets are NOT allowed inside labels (see example 555)
+/// - Backslash escapes ARE allowed (e.g., `\]` is a literal `]` in the label)
+/// - No inline parsing (backticks, HTML, etc. are literal characters)
+///
+/// We stop at the first R_BRACK token (unescaped `]`). Escaped brackets like `\]`
+/// are lexed as MD_TEXTUAL_LITERAL, not R_BRACK, so they're included in the label.
+/// (Fixed: the return type was a bare `Option`; restored `Option<String>`.)
+fn collect_label_text_simple(p: &mut MarkdownParser) -> Option<String> {
+    let mut text = String::new();
+
+    loop {
+        if p.at(T![EOF]) || p.at_inline_end() {
+            return None;
+        }
+
+        // Blank lines terminate
+        if p.at(NEWLINE) && p.at_blank_line() {
+            return None;
+        }
+
+        // R_BRACK token = unescaped `]` closes the label.
+        // Note: Escaped brackets (`\]`) are lexed as MD_TEXTUAL_LITERAL,
+        // not R_BRACK, so they're correctly included in the label text.
+        if p.at(R_BRACK) {
+            return Some(text);
+        }
+
+        text.push_str(p.cur_text());
+        p.bump(p.cur());
+    }
+}
+
+/// Collect text for link text (e.g., the `text` in `[text](url)` or `[text][label]`).
+/// Per CommonMark, link text CAN contain inline elements - code spans, autolinks, HTML.
+/// `]` inside these constructs does NOT close the link text.
+/// (Fixed: the return type was a bare `Option`; restored `Option<String>`.)
+fn collect_link_text(p: &mut MarkdownParser) -> Option<String> {
+    let mut text = String::new();
+    let mut bracket_depth = 0usize;
+
+    loop {
+        if p.at(T![EOF]) || p.at_inline_end() {
+            return None;
+        }
+
+        // Per CommonMark, blank lines terminate link text
+        if p.at(NEWLINE) && p.at_blank_line() {
+            return None;
+        }
+
+        // Code spans can contain `]` - skip them entirely.
+        // Per CommonMark, `]` inside code spans doesn't terminate link text.
+        if p.at(BACKTICK) {
+            let opening_count = p.cur_text().len();
+            text.push_str(p.cur_text());
+            p.bump(p.cur());
+
+            // Find matching closing backticks (a run of the same length).
+            // If none is found, the opening run has already been appended as
+            // literal text and scanning simply continues.
+            while !p.at(T![EOF]) && !p.at_inline_end() {
+                if p.at(NEWLINE) && p.at_blank_line() {
+                    break; // Blank line terminates
+                }
+                if p.at(BACKTICK) && p.cur_text().len() == opening_count {
+                    text.push_str(p.cur_text());
+                    p.bump(p.cur());
+                    break;
+                }
+                text.push_str(p.cur_text());
+                p.bump(p.cur());
+            }
+            continue;
+        }
+
+        // Autolinks and inline HTML can contain `]` - skip them entirely.
+        // Per CommonMark, `]` inside `<...>` constructs doesn't terminate link text.
+        if p.at(L_ANGLE) {
+            text.push_str(p.cur_text());
+            p.bump(p.cur());
+
+            // Consume until `>` or newline
+            while !p.at(T![EOF]) && !p.at_inline_end() && !p.at(R_ANGLE) {
+                if p.at(NEWLINE) {
+                    // Newlines end autolinks/HTML tags
+                    break;
+                }
+                text.push_str(p.cur_text());
+                p.bump(p.cur());
+            }
+            if p.at(R_ANGLE) {
+                text.push_str(p.cur_text());
+                p.bump(p.cur());
+            }
+            continue;
+        }
+
+        if p.at(L_BRACK) {
+            bracket_depth += 1;
+            text.push_str(p.cur_text());
+            p.bump(p.cur());
+            continue;
+        }
+
+        if p.at(R_BRACK) {
+            if bracket_depth == 0 {
+                return Some(text);
+            }
+            bracket_depth -= 1;
+            text.push_str(p.cur_text());
+            p.bump(p.cur());
+            continue;
+        }
+
+        text.push_str(p.cur_text());
+        p.bump(p.cur());
+    }
+}
+
+/// Bump the current token as an MD_TEXTUAL node, re-lexing the following
+/// input in LinkDefinition context (whitespace-sensitive).
+fn bump_textual_link_def(p: &mut MarkdownParser) {
+    use crate::lexer::MarkdownLexContext;
+
+    let item = p.start();
+    p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::LinkDefinition);
+    item.complete(p, MD_TEXTUAL);
+}
+/// True when the current token is non-empty and consists only of spaces/tabs.
+fn is_whitespace_token(p: &MarkdownParser) -> bool {
+    match p.cur_text() {
+        "" => false,
+        // Space and tab are single-byte ASCII, so a byte scan is equivalent
+        // to a char scan here.
+        text => text.bytes().all(|b| matches!(b, b' ' | b'\t')),
+    }
+}
+
+/// Lookahead: after the destination, is there at least one whitespace/newline
+/// separator followed by a token that can open a link title (`"`, `'`, `(`)?
+/// No tokens are consumed.
+fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool {
+    p.lookahead(|p| {
+        let mut saw_whitespace = false;
+        while is_title_separator_token(p) {
+            bump_link_def_separator(p);
+            saw_whitespace = true;
+        }
+        saw_whitespace && get_title_close_char(p).is_some()
+    })
+}
+
+/// Result of validating an inline link.
+///
+/// Produced by `inline_link_is_valid` before the driver commits to parsing
+/// the `(destination title)` part.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum InlineLinkValidation {
+    /// Link is valid with complete destination
+    Valid,
+    /// Link is invalid
+    Invalid,
+    /// Link is valid but destination was truncated due to paren depth limit.
+    /// The link should be closed immediately without looking for `)`.
+    DepthExceeded,
+}
+
+/// Lookahead validation of `(destination [title])` following `]`.
+/// Runs entirely inside `p.lookahead`, so no tokens are consumed.
+fn inline_link_is_valid(p: &mut MarkdownParser) -> InlineLinkValidation {
+    p.lookahead(|p| {
+        if !p.at(L_PAREN) {
+            return InlineLinkValidation::Invalid;
+        }
+
+        p.bump(L_PAREN);
+        p.re_lex_link_definition();
+
+        let destination_result = scan_inline_link_destination_tokens(p);
+
+        // If depth exceeded, link is valid but truncated - no need to check for closing paren
+        if destination_result == DestinationScanResult::DepthExceeded {
+            return InlineLinkValidation::DepthExceeded;
+        }
+
+        if destination_result == DestinationScanResult::Invalid {
+            return InlineLinkValidation::Invalid;
+        }
+
+        // A title only counts if separated from the destination by whitespace.
+        let mut saw_separator = false;
+        while is_title_separator_token(p) {
+            skip_link_def_separator_tokens(p);
+            saw_separator = true;
+        }
+        let has_title = saw_separator && get_title_close_char(p).is_some();
+        while is_title_separator_token(p) {
+            skip_link_def_separator_tokens(p);
+        }
+
+        if has_title {
+            scan_title_content(p, get_title_close_char(p));
+        }
+
+        while is_title_separator_token(p) {
+            skip_link_def_separator_tokens(p);
+        }
+
+        if p.at(R_PAREN) {
+            InlineLinkValidation::Valid
+        } else {
+            InlineLinkValidation::Invalid
+        }
+    })
+}
+
+/// Result of scanning a link destination.
+///
+/// Shared by the lookahead scanner and the node-emitting parser so both
+/// agree on validity and truncation behavior.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum DestinationScanResult {
+    /// Destination is valid and complete
+    Valid,
+    /// Destination is invalid (contains invalid characters, etc.)
+    Invalid,
+    /// Destination was truncated because paren depth exceeded the limit.
+    /// In this case, the link is considered valid but closed at the truncation point.
+    DepthExceeded,
+}
+
+/// Scan (validate) an inline link destination, consuming tokens without
+/// emitting nodes. Called from `inline_link_is_valid`, which runs it inside
+/// a lookahead so the consumed tokens are discarded.
+fn scan_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult {
+    const MAX_PAREN_DEPTH: i32 = crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH;
+    // Skip leading whitespace to match parse_inline_link_destination_tokens behavior
+    while is_title_separator_token(p) {
+        skip_link_def_separator_tokens(p);
+    }
+    // `<...>`-enclosed destination: closed by the first unescaped `>`,
+    // invalid if a newline or EOF intervenes.
+    if p.at(L_ANGLE) {
+        p.bump_link_definition();
+        let mut pending_escape = false;
+        loop {
+            if p.at(EOF) || p.at(NEWLINE) {
+                return DestinationScanResult::Invalid;
+            }
+            if p.at(R_ANGLE) {
+                if pending_escape {
+                    // `\>` - the escaped `>` does not close the destination.
+                    if !crate::syntax::validate_link_destination_text(
+                        p.cur_text(),
+                        crate::syntax::LinkDestinationKind::Enclosed,
+                        &mut pending_escape,
+                    ) {
+                        return DestinationScanResult::Invalid;
+                    }
+                    p.bump_link_definition();
+                    continue;
+                }
+                p.bump_link_definition();
+                return DestinationScanResult::Valid;
+            }
+            if !crate::syntax::validate_link_destination_text(
+                p.cur_text(),
+                crate::syntax::LinkDestinationKind::Enclosed,
+                &mut pending_escape,
+            ) {
+                return DestinationScanResult::Invalid;
+            }
+            p.bump_link_definition();
+        }
+    }
+
+    // Raw (unenclosed) destination: ends at whitespace or an unmatched `)`.
+    let mut paren_depth: i32 = 0;
+    let mut pending_escape = false;
+    while !p.at(EOF) && !p.at(NEWLINE) {
+        if is_whitespace_token(p) {
+            break;
+        }
+        let text = p.cur_text();
+        if !crate::syntax::validate_link_destination_text(
+            text,
+            crate::syntax::LinkDestinationKind::Raw,
+            &mut pending_escape,
+        ) {
+            return DestinationScanResult::Invalid;
+        }
+        match crate::syntax::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) {
+            crate::syntax::ParenDepthResult::Ok(next_depth) => {
+                paren_depth = next_depth;
+                p.bump_link_definition();
+            }
+            crate::syntax::ParenDepthResult::DepthExceeded => {
+                // Paren depth exceeded - destination is truncated at this point.
+                // Per CommonMark/cmark, the link is still valid but closed here.
+                return DestinationScanResult::DepthExceeded;
+            }
+            crate::syntax::ParenDepthResult::UnmatchedClose => {
+                // Unmatched closing paren - destination ends here normally.
+                // The `)` belongs to the enclosing construct (inline link closer).
+                break;
+            }
+        }
+    }
+    if p.at(EOF) {
+        return DestinationScanResult::Invalid;
+    }
+    if p.at(NEWLINE) {
+        return if p.at_blank_line() {
+            DestinationScanResult::Invalid
+        } else {
+            DestinationScanResult::Valid
+        };
+    }
+    DestinationScanResult::Valid
+}
+
+/// Lookahead variant of `parse_title_content`: consumes title tokens without
+/// emitting nodes, stopping after the token that ends with the unescaped
+/// `close_char`, or at EOF / a blank line.
+/// (Fixed: `close_char` was typed as a bare `Option`; restored `Option<char>`.)
+fn scan_title_content(p: &mut MarkdownParser, close_char: Option<char>) {
+    let Some(close_char) = close_char else {
+        return;
+    };
+
+    // A single token may already contain both the opening and closing
+    // delimiter (e.g. `"title"`); `len() >= 2` rules out the bare opener.
+    let text = p.cur_text();
+    let is_complete = text.len() >= 2 && crate::syntax::ends_with_unescaped_close(text, close_char);
+
+    p.bump_link_definition();
+    if is_complete {
+        return;
+    }
+
+    loop {
+        // Stop on EOF or blank line (titles cannot span blank lines per CommonMark)
+        if p.at(EOF) || p.at_blank_line() {
+            return;
+        }
+
+        // Continue through single newlines (titles can span non-blank lines)
+        if p.at(NEWLINE) {
+            skip_link_def_separator_tokens(p);
+            continue;
+        }
+
+        let text = p.cur_text();
+        if crate::syntax::ends_with_unescaped_close(text, close_char) {
+            p.bump_link_definition();
+            return;
+        }
+
+        p.bump_link_definition();
+    }
+}
+
+/// Skip one separator token (newline or whitespace) without emitting nodes.
+fn skip_link_def_separator_tokens(p: &mut MarkdownParser) {
+    if p.at(NEWLINE) {
+        p.bump(NEWLINE);
+    } else {
+        p.bump_link_definition();
+    }
+}
+
+/// A token that may separate a destination from a title: inline whitespace,
+/// or a single newline that does not start a blank line.
+fn is_title_separator_token(p: &MarkdownParser) -> bool {
+    is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line())
+}
+
+/// Bump one separator token, emitting it as an MD_TEXTUAL node.
+fn bump_link_def_separator(p: &mut MarkdownParser) {
+    if p.at(NEWLINE) {
+        let item = p.start();
+        p.bump_remap(MD_TEXTUAL_LITERAL);
+        item.complete(p, MD_TEXTUAL);
+    } else {
+        bump_textual_link_def(p);
+    }
+}
+
+/// Parse the inline link destination, emitting an MD_TEXTUAL node per token.
+/// Structural mirror of `scan_inline_link_destination_tokens`; keep the two
+/// in sync when changing either.
+fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult {
+    p.re_lex_link_definition();
+    const MAX_PAREN_DEPTH: i32 = crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH;
+
+    // `<...>`-enclosed destination.
+    if p.at(L_ANGLE) {
+        bump_textual_link_def(p);
+        let mut pending_escape = false;
+        loop {
+            if p.at(EOF) || p.at(NEWLINE) {
+                return DestinationScanResult::Invalid;
+            }
+            if p.at(R_ANGLE) {
+                if pending_escape {
+                    // `\>` - the escaped `>` does not close the destination.
+                    if !crate::syntax::validate_link_destination_text(
+                        p.cur_text(),
+                        crate::syntax::LinkDestinationKind::Enclosed,
+                        &mut pending_escape,
+                    ) {
+                        return DestinationScanResult::Invalid;
+                    }
+                    bump_textual_link_def(p);
+                    continue;
+                }
+                bump_textual_link_def(p);
+                return DestinationScanResult::Valid;
+            }
+            if !crate::syntax::validate_link_destination_text(
+                p.cur_text(),
+                crate::syntax::LinkDestinationKind::Enclosed,
+                &mut pending_escape,
+            ) {
+                return DestinationScanResult::Invalid;
+            }
+            bump_textual_link_def(p);
+        }
+    }
+
+    // Raw destination: skip leading separators, then consume until
+    // whitespace or an unmatched `)`.
+    let mut paren_depth: i32 = 0;
+    let mut pending_escape = false;
+    while is_title_separator_token(p) {
+        bump_link_def_separator(p);
+    }
+    while !p.at(EOF) && !p.at(NEWLINE) {
+        if is_whitespace_token(p) {
+            break;
+        }
+
+        let text = p.cur_text();
+        if !crate::syntax::validate_link_destination_text(
+            text,
+            crate::syntax::LinkDestinationKind::Raw,
+            &mut pending_escape,
+        ) {
+            return DestinationScanResult::Invalid;
+        }
+        match crate::syntax::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) {
+            crate::syntax::ParenDepthResult::Ok(next_depth) => {
+                paren_depth = next_depth;
+                bump_textual_link_def(p);
+            }
+            crate::syntax::ParenDepthResult::DepthExceeded => {
+                // Paren depth exceeded - destination is truncated at this point.
+                return DestinationScanResult::DepthExceeded;
+            }
+            crate::syntax::ParenDepthResult::UnmatchedClose => {
+                // Unmatched closing paren - destination ends here normally.
+                // The `)` belongs to the enclosing construct (inline link closer).
+                break;
+            }
+        }
+    }
+    if p.at(EOF) {
+        return DestinationScanResult::Invalid;
+    }
+    if p.at(NEWLINE) {
+        return if p.at_blank_line() {
+            DestinationScanResult::Invalid
+        } else {
+            DestinationScanResult::Valid
+        };
+    }
+    DestinationScanResult::Valid
+}
+
+/// Determine a link title's closing delimiter from the current token:
+/// `"` for double-quoted, `'` for single-quoted, `)` for paren-delimited
+/// titles; None if the token cannot open a title.
+/// (Fixed: the return type was a bare `Option`; restored `Option<char>`.)
+fn get_title_close_char(p: &MarkdownParser) -> Option<char> {
+    let text = p.cur_text();
+    if text.starts_with('"') {
+        Some('"')
+    } else if text.starts_with('\'') {
+        Some('\'')
+    } else if p.at(L_PAREN) {
+        Some(')')
+    } else {
+        None
+    }
+}
+
+/// Parse a link title's content, emitting MD_TEXTUAL nodes, until the token
+/// ending with the unescaped `close_char` is consumed (or EOF / blank line).
+/// (Fixed: `close_char` was typed as a bare `Option`; restored `Option<char>`.)
+fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) {
+    let Some(close_char) = close_char else {
+        return;
+    };
+
+    // A single token may contain both delimiters (e.g. `"title"`);
+    // `len() >= 2` rules out the bare opener.
+    let text = p.cur_text();
+    let is_complete = text.len() >= 2 && crate::syntax::ends_with_unescaped_close(text, close_char);
+
+    bump_textual_link_def(p);
+    if is_complete {
+        return;
+    }
+
+    loop {
+        // Stop on EOF or blank line (titles cannot span blank lines per CommonMark)
+        if p.at(EOF) || p.at_blank_line() {
+            return;
+        }
+
+        // Continue through single newlines (titles can span non-blank lines)
+        if p.at(NEWLINE) {
+            bump_link_def_separator(p);
+            continue;
+        }
+
+        let text = p.cur_text();
+        if crate::syntax::ends_with_unescaped_close(text, close_char) {
+            bump_textual_link_def(p);
+            return;
+        }
+
+        bump_textual_link_def(p);
+    }
+}
+
+/// Parse inline image (`![alt](url)`).
+///
+/// Grammar: `MdInlineImage = '!' '[' alt: MdInlineItemList ']' '(' source: MdInlineItemList ')'`
+///
+/// Note: This is kept for backwards compatibility but `parse_image_or_reference`
+/// is the preferred entry point for image parsing.
+pub(crate) fn parse_inline_image(p: &mut MarkdownParser) -> ParsedSyntax {
+    parse_image_or_reference(p)
+}
diff --git a/crates/biome_markdown_parser/src/syntax/inline/mod.rs b/crates/biome_markdown_parser/src/syntax/inline/mod.rs
new file mode 100644
index 000000000000..8723aabc948f
--- /dev/null
+++ b/crates/biome_markdown_parser/src/syntax/inline/mod.rs
@@ -0,0 +1,331 @@
+//! Inline element parsing for Markdown.
+//!
+//! Handles inline code spans, emphasis (bold/italic), links, images, line breaks, and raw HTML.
+//!
+//! # CommonMark Specification References
+//!
+//! This module implements the following CommonMark 0.31.2 sections:
+//!
+//! - **§6.1 Code spans**: Backtick-delimited inline code (`code`)
+//! - **§6.2 Emphasis and strong emphasis**: `*italic*`, `**bold**`, `_italic_`, `__bold__`
+//! - **§6.3 Links**: `[text](url)` inline links
+//! - **§6.4 Autolinks (URI)**: `<https://example.com>`
+//! - **§6.5 Autolinks (email)**: `<user@example.com>`
+//! - **§6.6 Hard line breaks**: Trailing spaces or backslash before newline
+//! - **§6.7 Soft line breaks**: Single newline within paragraph
+//! - **§6.8 Raw HTML**: `<tag>`, `</tag>`, `<!-- comment -->`, `<?...?>`, `<!DECLARATION>`, `<![CDATA[...]]>`
+//!
+//! # Emphasis Algorithm (§6.2)
+//!
+//! This module implements the CommonMark delimiter stack algorithm for emphasis:
+//!
+//! 1. **First pass**: Collect delimiter runs from the inline content
+//! 2. **Second pass**: Match openers and closers using the delimiter stack algorithm
+//! 3. **Rule of 3**: If (opener_count + closer_count) % 3 == 0 and both can open/close,
+//! skip the match unless both counts are divisible by 3
+//!
+//! # Emphasis Flanking Rules (§6.2)
+//!
+//! A delimiter run is **left-flanking** if:
+//! 1. Not followed by Unicode whitespace, AND
+//! 2. Not followed by punctuation, OR preceded by whitespace/punctuation
+//!
+//! A delimiter run is **right-flanking** if:
+//! 1. Not preceded by Unicode whitespace, AND
+//! 2. Not preceded by punctuation, OR followed by whitespace/punctuation
+//!
+//! Underscore (`_`) has additional intraword restrictions (§6.2 rules 2, 5, 7, 8).
+
+use biome_markdown_syntax::MarkdownSyntaxKind;
+use biome_markdown_syntax::T;
+use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
+use biome_parser::Parser;
+use biome_parser::prelude::ParsedSyntax;
+
+use crate::MarkdownParser;
+
+mod code_span;
+mod emphasis;
+mod entities;
+mod html;
+mod links;
+
+pub(crate) use emphasis::EmphasisContext;
+pub(crate) use html::is_inline_html;
+
+/// Parse inline items for link text until `stop`, with links disallowed
+/// (per CommonMark, links may not nest inside link text). Returns `true`
+/// when a nested link construct was detected, so the caller can fail the
+/// outer link in favor of the inner one.
+fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool {
+    let m = p.start();
+    let prev_context = emphasis::set_inline_emphasis_context_until(p, stop);
+    let mut bracket_depth = 0usize;
+    let mut has_nested_link = false;
+
+    loop {
+        // Per CommonMark, link text can span lines, but blank lines end the link.
+        // Check for blank line (NEWLINE followed by NEWLINE or EOF after optional whitespace)
+        if p.at(NEWLINE) {
+            if p.at_blank_line() {
+                break; // Blank line ends link text
+            }
+            // Single newline inside link text - consume and continue
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if p.at(T![EOF]) {
+            break;
+        }
+
+        // IMPORTANT: Parse constructs that can contain `]` BEFORE checking for stop token.
+        // Per CommonMark, `]` inside code spans, autolinks, and HTML doesn't terminate links.
+
+        // Code spans can contain `]`
+        if p.at(BACKTICK) {
+            if code_span::parse_inline_code(p).is_present() {
+                continue;
+            }
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        // Autolinks and inline HTML can contain `]`
+        if p.at(L_ANGLE) {
+            if html::parse_autolink(p).is_present() {
+                continue;
+            }
+            if html::parse_inline_html(p).is_present() {
+                continue;
+            }
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        // NOW check for stop token (after constructs that can contain it)
+        if p.at(stop) {
+            if bracket_depth == 0 {
+                break;
+            }
+            bracket_depth = bracket_depth.saturating_sub(1);
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if p.at(L_BRACK) {
+            // `[` is literal here, but remember whether it begins something
+            // shaped like a link so the outer link can be rejected.
+            if !has_nested_link && nested_link_starts_here(p) {
+                has_nested_link = true;
+            }
+            bracket_depth += 1;
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if parse_any_inline_no_links(p).is_absent() {
+            break;
+        }
+    }
+
+    m.complete(p, MD_INLINE_ITEM_LIST);
+    p.set_emphasis_context(prev_context);
+    has_nested_link
+}
+
+/// Parse inline items until `stop` token, allowing full inline parsing including links.
+/// Used for image alt text where nested links/images should be fully parsed
+/// so their text content can be extracted for the alt attribute.
+fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) {
+    let m = p.start();
+    let prev_context = emphasis::set_inline_emphasis_context_until(p, stop);
+    let mut bracket_depth = 0usize;
+
+    loop {
+        // Blank lines end the construct; single newlines are consumed as text.
+        if p.at(NEWLINE) {
+            if p.at_blank_line() {
+                break;
+            }
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if p.at(T![EOF]) {
+            break;
+        }
+
+        // Code spans can contain `]`
+        if p.at(BACKTICK) {
+            if code_span::parse_inline_code(p).is_present() {
+                continue;
+            }
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        // Autolinks and inline HTML can contain `]`
+        if p.at(L_ANGLE) {
+            if html::parse_autolink(p).is_present() {
+                continue;
+            }
+            if html::parse_inline_html(p).is_present() {
+                continue;
+            }
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if p.at(stop) {
+            if bracket_depth == 0 {
+                break;
+            }
+            bracket_depth = bracket_depth.saturating_sub(1);
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        // For image alt: allow full inline parsing including links and images
+        if p.at(L_BRACK) {
+            let result = links::parse_link_or_reference(p);
+            if result.is_present() {
+                continue;
+            }
+            // Not a valid link: `[` is literal; track nesting depth.
+            bracket_depth += 1;
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if p.at(BANG) && p.nth_at(1, L_BRACK) {
+            let result = links::parse_image_or_reference(p);
+            if result.is_present() {
+                continue;
+            }
+            let _ = super::parse_textual(p);
+            continue;
+        }
+
+        if parse_any_inline(p).is_absent() {
+            break;
+        }
+    }
+
+    m.complete(p, MD_INLINE_ITEM_LIST);
+    p.set_emphasis_context(prev_context);
+}
+
+/// Lookahead: does the `[` at the cursor open a bracketed span that closes
+/// with `]` immediately followed by `(` or `[` (i.e., shaped like an inline
+/// or reference link)? No tokens are consumed.
+fn nested_link_starts_here(p: &mut MarkdownParser) -> bool {
+    p.lookahead(|p| {
+        if !p.at(L_BRACK) {
+            return false;
+        }
+
+        p.bump(L_BRACK);
+        let mut depth = 0usize;
+
+        loop {
+            if p.at(EOF) || p.at_inline_end() {
+                return false;
+            }
+
+            if p.at(L_BRACK) {
+                depth += 1;
+                p.bump(L_BRACK);
+                continue;
+            }
+
+            if p.at(R_BRACK) {
+                if depth > 0 {
+                    depth -= 1;
+                    p.bump(R_BRACK);
+                    continue;
+                }
+                p.bump(R_BRACK);
+                return p.at(L_PAREN) || p.at(L_BRACK);
+            }
+
+            p.bump(p.cur());
+        }
+    })
+}
+
+/// Like `parse_any_inline`, but `[` is always treated as literal text;
+/// used inside link text, where links may not nest (images still may).
+fn parse_any_inline_no_links(p: &mut MarkdownParser) -> ParsedSyntax {
+    if p.at(L_BRACK) {
+        return super::parse_textual(p);
+    }
+
+    if p.at(BANG) && p.nth_at(1, L_BRACK) {
+        return links::parse_inline_image(p);
+    }
+
+    parse_any_inline(p)
+}
+
+/// Dispatch to the appropriate inline parser based on current token.
+/// Every branch falls back to plain textual content, so a call always
+/// makes progress when a token is available.
+pub(crate) fn parse_any_inline(p: &mut MarkdownParser) -> ParsedSyntax {
+    if p.at(MD_HARD_LINE_LITERAL) {
+        code_span::parse_hard_line(p)
+    } else if p.at(BACKTICK) || p.at(T!["```"]) {
+        // Try code span, fall back to literal text if no matching closer exists.
+        // T!["```"] can appear when backticks are at line start but info string
+        // contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
+        let result = code_span::parse_inline_code(p);
+        if result.is_absent() {
+            super::parse_textual(p)
+        } else {
+            result
+        }
+    } else if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) {
+        // For cases like `***foo***`, the em match starts at the exact token boundary
+        // (prefix_len=0) while the strong match starts at offset 1 (prefix_len=1).
+        // Try italic first to handle nested emphasis correctly, then try strong.
+        let result = emphasis::parse_inline_italic(p);
+        if result.is_present() {
+            return result;
+        }
+        let result = emphasis::parse_inline_emphasis(p);
+        if result.is_present() {
+            return result;
+        }
+        // Neither matched - re-lex to single token and emit just one char as literal.
+        // This handles cases like `**foo*` where opener is at offset 1.
+        p.force_relex_emphasis_inline();
+        super::parse_textual(p)
+    } else if p.at(T![*]) || p.at(UNDERSCORE) {
+        // Try italic, fall back to literal text if flanking rules fail
+        let result = emphasis::parse_inline_italic(p);
+        if result.is_absent() {
+            super::parse_textual(p)
+        } else {
+            result
+        }
+    } else if p.at(BANG) && p.nth_at(1, L_BRACK) {
+        // Try image, fall back to literal text if parsing fails
+        let result = links::parse_inline_image(p);
+        if result.is_absent() {
+            super::parse_textual(p)
+        } else {
+            result
+        }
+    } else if p.at(L_BRACK) {
+        // Try link, fall back to literal text if parsing fails
+        let result = links::parse_inline_link(p);
+        if result.is_absent() {
+            super::parse_textual(p)
+        } else {
+            result
+        }
+    } else if p.at(L_ANGLE) {
+        // Try autolink first (takes priority per CommonMark)
+        let result = html::parse_autolink(p);
+        if result.is_present() {
+            return result;
+        }
+        // Then try inline HTML
+        let result = html::parse_inline_html(p);
+        if result.is_present() {
+            return result;
+        }
+        // Fall back to textual
+        super::parse_textual(p)
+    } else if p.at(MD_ENTITY_LITERAL) {
+        // Entity or numeric character reference (already validated by lexer)
+        entities::parse_entity_reference(p)
+    } else {
+        super::parse_textual(p)
+    }
+}
diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs
index 69d29d3bd40e..ceed5ff8e975 100644
--- a/crates/biome_markdown_parser/src/syntax/link_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/link_block.rs
@@ -112,7 +112,7 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool {
}
// Label must also be non-empty after normalization (e.g., `[\n ]` normalizes to empty)
- let normalized = crate::link_reference::normalize_reference_label(&label_text);
+ let normalized = crate::syntax::reference::normalize_reference_label(&label_text);
if normalized.is_empty() {
return false;
}
diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax/mod.rs
similarity index 99%
rename from crates/biome_markdown_parser/src/syntax.rs
rename to crates/biome_markdown_parser/src/syntax/mod.rs
index 4c521eef6b3d..0d0e3d21ab00 100644
--- a/crates/biome_markdown_parser/src/syntax.rs
+++ b/crates/biome_markdown_parser/src/syntax/mod.rs
@@ -30,6 +30,7 @@ pub mod link_block;
pub mod list;
pub mod parse_error;
pub mod quote;
+pub mod reference;
pub mod thematic_break_block;
use biome_markdown_syntax::kind::MarkdownSyntaxKind;
@@ -508,7 +509,7 @@ fn newline_is_blank_line(p: &MarkdownParser) -> bool {
prev == b'\n' || prev == b'\r'
}
-/// Consume exactly `indent` columns of leading whitespace at line start.
+/// Consume exactly `indent` columns of leading whitespace at line start.
fn consume_indent_prefix(p: &mut MarkdownParser, indent: usize) {
if indent == 0 {
return;
@@ -645,7 +646,7 @@ pub(crate) fn is_dash_only_thematic_break_text(text: &str) -> bool {
/// The byte count includes only the whitespace tokens consumed during the indent skip,
/// NOT the underline token itself. Callers that track byte budgets must subtract this.
///
-/// This is the single source of truth for setext detection in inline contexts.
+/// This is the shared helper for setext detection in inline contexts.
/// Used by `has_matching_code_span_closer`, `parse_inline_html`, and `parse_inline_item_list`.
///
/// Context safety: this function does NOT call `allow_setext_heading` because the token
diff --git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/syntax/reference.rs
similarity index 98%
rename from crates/biome_markdown_parser/src/link_reference.rs
rename to crates/biome_markdown_parser/src/syntax/reference.rs
index 37884700c1ea..d917ba2380f3 100644
--- a/crates/biome_markdown_parser/src/link_reference.rs
+++ b/crates/biome_markdown_parser/src/syntax/reference.rs
@@ -8,7 +8,8 @@ use biome_rowan::{AstNode, Direction};
use crate::MarkdownLosslessTreeSink;
use crate::MarkdownParseOptions;
use crate::parser::MarkdownParser;
-use crate::syntax::parse_document;
+
+use super::parse_document;
/// Normalize a reference label per CommonMark spec.
///
diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs
index 3aaee64c4bc2..d919ec91a672 100644
--- a/crates/biome_markdown_parser/src/to_html.rs
+++ b/crates/biome_markdown_parser/src/to_html.rs
@@ -45,7 +45,7 @@ use biome_rowan::{AstNode, AstNodeList, Direction, TextRange};
use percent_encoding::{AsciiSet, CONTROLS, utf8_percent_encode};
use std::collections::HashMap;
-use crate::link_reference::normalize_reference_label;
+use crate::syntax::reference::normalize_reference_label;
use crate::parser::ListTightness;
// ============================================================================
diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs
index 12d69d4b6b66..c4e672e18a9f 100644
--- a/crates/biome_markdown_parser/tests/spec_test.rs
+++ b/crates/biome_markdown_parser/tests/spec_test.rs
@@ -152,7 +152,7 @@ pub fn quick_test() {
assert_eq!(expected, html, "Example {:03} failed", num);
}
- // Test the 8 failing CommonMark examples
+ // Test the remaining failing CommonMark examples
// TODO: Example 007 still failing - tab expansion issue (produces 3 spaces instead of 2)
// test_example(7, "-\t\tfoo\n", "\n");
test_example(
From 292c747fba45564b1484033ac3aa8d35fb6f634a Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Thu, 29 Jan 2026 06:15:05 -0500
Subject: [PATCH 08/26] refactor(markdown): simplify inline list parsing flow
Refactors newline/quote handling into helpers to clarify control flow. Consolidates stop-token inline parsing and shared lookahead logic.
---
.../src/syntax/inline/mod.rs | 130 +++---
.../biome_markdown_parser/src/syntax/mod.rs | 410 +++++++++---------
crates/biome_markdown_parser/src/to_html.rs | 2 +-
.../biome_markdown_parser/tests/spec_test.rs | 8 +-
4 files changed, 268 insertions(+), 282 deletions(-)
diff --git a/crates/biome_markdown_parser/src/syntax/inline/mod.rs b/crates/biome_markdown_parser/src/syntax/inline/mod.rs
index 8723aabc948f..884968711386 100644
--- a/crates/biome_markdown_parser/src/syntax/inline/mod.rs
+++ b/crates/biome_markdown_parser/src/syntax/inline/mod.rs
@@ -53,7 +53,20 @@ mod links;
pub(crate) use emphasis::EmphasisContext;
pub(crate) use html::is_inline_html;
-fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool {
+enum InlineLinksPolicy {
+ NoLinks,
+ FullLinks,
+}
+
+struct InlineListUntilResult {
+ has_nested_link: bool,
+}
+
+fn parse_inline_item_list_until_impl(
+ p: &mut MarkdownParser,
+ stop: MarkdownSyntaxKind,
+ policy: InlineLinksPolicy,
+) -> InlineListUntilResult {
let m = p.start();
let prev_context = emphasis::set_inline_emphasis_context_until(p, stop);
let mut bracket_depth = 0usize;
@@ -110,87 +123,28 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS
}
if p.at(L_BRACK) {
- if !has_nested_link && nested_link_starts_here(p) {
- has_nested_link = true;
- }
- bracket_depth += 1;
- let _ = super::parse_textual(p);
- continue;
- }
-
- if parse_any_inline_no_links(p).is_absent() {
- break;
- }
- }
-
- m.complete(p, MD_INLINE_ITEM_LIST);
- p.set_emphasis_context(prev_context);
- has_nested_link
-}
-
-/// Parse inline items until `stop` token, allowing full inline parsing including links.
-/// Used for image alt text where nested links/images should be fully parsed
-/// so their text content can be extracted for the alt attribute.
-fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) {
- let m = p.start();
- let prev_context = emphasis::set_inline_emphasis_context_until(p, stop);
- let mut bracket_depth = 0usize;
-
- loop {
- if p.at(NEWLINE) {
- if p.at_blank_line() {
- break;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(T![EOF]) {
- break;
- }
-
- // Code spans can contain `]`
- if p.at(BACKTICK) {
- if code_span::parse_inline_code(p).is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- // Autolinks and inline HTML can contain `]`
- if p.at(L_ANGLE) {
- if html::parse_autolink(p).is_present() {
- continue;
- }
- if html::parse_inline_html(p).is_present() {
- continue;
- }
- let _ = super::parse_textual(p);
- continue;
- }
-
- if p.at(stop) {
- if bracket_depth == 0 {
- break;
- }
- bracket_depth = bracket_depth.saturating_sub(1);
- let _ = super::parse_textual(p);
- continue;
- }
-
- // For image alt: allow full inline parsing including links and images
- if p.at(L_BRACK) {
- let result = links::parse_link_or_reference(p);
- if result.is_present() {
- continue;
+ match policy {
+ InlineLinksPolicy::NoLinks => {
+ if !has_nested_link && nested_link_starts_here(p) {
+ has_nested_link = true;
+ }
+ bracket_depth += 1;
+ let _ = super::parse_textual(p);
+ continue;
+ }
+ InlineLinksPolicy::FullLinks => {
+ let result = links::parse_link_or_reference(p);
+ if result.is_present() {
+ continue;
+ }
+ bracket_depth += 1;
+ let _ = super::parse_textual(p);
+ continue;
+ }
}
- bracket_depth += 1;
- let _ = super::parse_textual(p);
- continue;
}
- if p.at(BANG) && p.nth_at(1, L_BRACK) {
+ if matches!(policy, InlineLinksPolicy::FullLinks) && p.at(BANG) && p.nth_at(1, L_BRACK) {
let result = links::parse_image_or_reference(p);
if result.is_present() {
continue;
@@ -199,13 +153,29 @@ fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind
continue;
}
- if parse_any_inline(p).is_absent() {
+ let parsed = match policy {
+ InlineLinksPolicy::NoLinks => parse_any_inline_no_links(p),
+ InlineLinksPolicy::FullLinks => parse_any_inline(p),
+ };
+ if parsed.is_absent() {
break;
}
}
m.complete(p, MD_INLINE_ITEM_LIST);
p.set_emphasis_context(prev_context);
+ InlineListUntilResult { has_nested_link }
+}
+
+fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool {
+ parse_inline_item_list_until_impl(p, stop, InlineLinksPolicy::NoLinks).has_nested_link
+}
+
+/// Parse inline items until `stop` token, allowing full inline parsing including links.
+/// Used for image alt text where nested links/images should be fully parsed
+/// so their text content can be extracted for the alt attribute.
+fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) {
+ let _ = parse_inline_item_list_until_impl(p, stop, InlineLinksPolicy::FullLinks);
}
fn nested_link_starts_here(p: &mut MarkdownParser) -> bool {
diff --git a/crates/biome_markdown_parser/src/syntax/mod.rs b/crates/biome_markdown_parser/src/syntax/mod.rs
index 0d0e3d21ab00..3566bf4a670c 100644
--- a/crates/biome_markdown_parser/src/syntax/mod.rs
+++ b/crates/biome_markdown_parser/src/syntax/mod.rs
@@ -389,6 +389,12 @@ where
result
}
+enum QuoteBreakKind {
+ None,
+ SetextUnderline,
+ Other,
+}
+
/// Check if we're at an indented code block (4+ spaces of indentation).
///
/// Uses `line_start_leading_indent()` to correctly handle indentation when NEWLINE
@@ -499,6 +505,7 @@ fn has_following_indented_code_line(p: &mut MarkdownParser) -> bool {
}
fn newline_is_blank_line(p: &MarkdownParser) -> bool {
+ // Token stream doesn't expose the prior byte, so read source to detect CR/LF.
let start: usize = p.cur_range().start().into();
if start == 0 {
return true;
@@ -774,6 +781,7 @@ fn real_line_indent_from_source(p: &MarkdownParser) -> usize {
}
fn line_has_quote_prefix(p: &MarkdownParser, depth: usize) -> bool {
+ // Tokens may have consumed whitespace as trivia; scan source to recover columns.
if depth == 0 {
return false;
}
@@ -817,6 +825,204 @@ fn line_has_quote_prefix(p: &MarkdownParser, depth: usize) -> bool {
true
}
+fn classify_quote_break_after_newline(
+ p: &mut MarkdownParser,
+ quote_depth: usize,
+ include_textual_markers: bool,
+) -> QuoteBreakKind {
+ p.lookahead(|p| {
+ consume_quote_prefix_without_virtual(p, quote_depth);
+ with_virtual_line_start(p, p.cur_range().start(), |p| {
+ if p.at(MD_SETEXT_UNDERLINE_LITERAL)
+ || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
+ {
+ QuoteBreakKind::SetextUnderline
+ } else if at_block_interrupt(p)
+ || (include_textual_markers && textual_looks_like_list_marker(p))
+ {
+ QuoteBreakKind::Other
+ } else {
+ QuoteBreakKind::None
+ }
+ })
+ })
+}
+
+enum InlineNewlineAction {
+ Break,
+ Continue,
+}
+
+fn handle_inline_newline(p: &mut MarkdownParser, has_content: bool) -> InlineNewlineAction {
+ if p.at_blank_line() {
+ let text_m = p.start();
+ p.bump_remap(MD_TEXTUAL_LITERAL);
+ text_m.complete(p, MD_TEXTUAL);
+ return InlineNewlineAction::Break;
+ }
+
+ let quote_depth = p.state().block_quote_depth;
+ if quote_depth > 0 {
+ let is_quote_blank_line = p.lookahead(|p| {
+ p.bump(NEWLINE);
+ if is_quote_only_blank_line_from_source(p, quote_depth) {
+ return true;
+ }
+ if !has_quote_prefix(p, quote_depth) {
+ return false;
+ }
+ consume_quote_prefix_without_virtual(p, quote_depth);
+ while p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') {
+ p.bump(MD_TEXTUAL_LITERAL);
+ }
+ p.at(NEWLINE) || p.at(T![EOF])
+ });
+ if is_quote_blank_line {
+ let text_m = p.start();
+ p.bump_remap(MD_TEXTUAL_LITERAL);
+ text_m.complete(p, MD_TEXTUAL);
+ return InlineNewlineAction::Break;
+ }
+ }
+
+ // Not a blank line - this is a soft line break within paragraph
+ // Consume the NEWLINE as textual content (remap to MD_TEXTUAL_LITERAL)
+ let text_m = p.start();
+ p.bump_remap(MD_TEXTUAL_LITERAL);
+ text_m.complete(p, MD_TEXTUAL);
+
+ // If we're inside a block quote, only consume the quote prefix
+ // when it doesn't start a new block (e.g., a nested quote).
+ if quote_depth > 0 && has_quote_prefix(p, quote_depth) {
+ let break_kind = classify_quote_break_after_newline(p, quote_depth, true);
+ match break_kind {
+ QuoteBreakKind::SetextUnderline => {
+ // Consume the quote prefix so the setext underline is visible
+ // to the paragraph parser.
+ consume_quote_prefix(p, quote_depth);
+ return InlineNewlineAction::Break;
+ }
+ QuoteBreakKind::Other => {
+ return InlineNewlineAction::Break;
+ }
+ QuoteBreakKind::None => {
+ consume_quote_prefix(p, quote_depth);
+ }
+ }
+ }
+ if quote_depth > 0 && p.at(R_ANGLE) && !has_quote_prefix(p, quote_depth) {
+ consume_partial_quote_prefix(p, quote_depth);
+ }
+
+ // After crossing a line, check for setext underlines.
+ // For non-list paragraphs, we need to look past up to 3 spaces of indent
+ // to detect setext underlines (CommonMark §4.3).
+ // IMPORTANT: Only break if allow_setext_heading() is true - this ensures
+ // setext underlines outside a blockquote (without >) don't incorrectly
+ // terminate the paragraph (CommonMark example 093).
+ if has_content && p.state().list_item_required_indent == 0 && allow_setext_heading(p) {
+ let is_setext = p.lookahead(|p| at_setext_underline_after_newline(p).is_some());
+ if is_setext {
+ // Skip the indent so parse_paragraph sees the underline
+ p.skip_line_indent(INDENT_CODE_BLOCK_SPACES);
+ return InlineNewlineAction::Break;
+ }
+ }
+
+ // Check if we're at a setext heading underline (already past indent)
+ if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) {
+ return InlineNewlineAction::Break;
+ }
+ if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
+ return InlineNewlineAction::Break;
+ }
+
+ // If we're inside a list item and the next line meets the required indent,
+ // check for block interrupts after skipping that indent. This allows
+ // nested list markers like "\t - baz" to break out of the paragraph.
+ let required_indent = p.state().list_item_required_indent;
+ if required_indent > 0 {
+ // Check for setext underline after indent stripping.
+ // The `---` or `===` may be indented by the list item's required indent,
+ // so we need to look past that indent.
+ let real_indent = real_line_indent_from_source(p);
+ if real_indent >= required_indent {
+ let is_setext = p.lookahead(|p| {
+ p.skip_line_indent(required_indent);
+ p.at(MD_SETEXT_UNDERLINE_LITERAL)
+ || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
+ });
+ if is_setext && has_content {
+ // Skip the indent so parse_paragraph sees the underline
+ p.skip_line_indent(required_indent);
+ return InlineNewlineAction::Break;
+ }
+ }
+
+ let indent = p.line_start_leading_indent();
+ if indent >= required_indent {
+ let interrupts = p.lookahead(|p| {
+ p.skip_line_indent(required_indent);
+ let prev_required = p.state().list_item_required_indent;
+ with_virtual_line_start(p, p.cur_range().start(), |p| {
+ p.state_mut().list_item_required_indent = 0;
+ let breaks = at_block_interrupt(p) || textual_looks_like_list_marker(p);
+ p.state_mut().list_item_required_indent = prev_required;
+ breaks
+ })
+ });
+ if interrupts {
+ return InlineNewlineAction::Break;
+ }
+ }
+ }
+
+ // Check for block-level constructs that can interrupt paragraphs
+ if line_starts_with_fence(p) {
+ return InlineNewlineAction::Break;
+ }
+ if p.at(MD_TEXTUAL_LITERAL) {
+ let text = p.cur_text();
+ if text.starts_with("```") || text.starts_with("~~~") {
+ return InlineNewlineAction::Break;
+ }
+ }
+ if at_block_interrupt(p) {
+ return InlineNewlineAction::Break;
+ }
+
+ // Also check for list markers that appear as textual content.
+ // Inside inline content, '-' is lexed as MD_TEXTUAL_LITERAL, not MINUS,
+ // so at_block_interrupt won't detect them. Per CommonMark §5.1, list
+ // items can interrupt paragraphs (bullet lists always, ordered lists
+ // only if they start with 1).
+ if textual_looks_like_list_marker(p) {
+ return InlineNewlineAction::Break;
+ }
+
+ // Per CommonMark §5.2, when inside a list item, check indentation.
+ // If sufficient indentation, skip it. If insufficient, this is
+ // "lazy continuation" - the content continues without meeting the
+ // indent requirement (at_block_interrupt already checked above).
+ if required_indent > 0 {
+ let indent = p.line_start_leading_indent();
+ if indent >= required_indent {
+ // Sufficient indentation - skip it
+ p.skip_line_indent(required_indent);
+ }
+ // else: Lazy continuation - don't break, don't skip indent.
+ // The at_block_interrupt check above handles real interruptions.
+ // Content continues at its actual position.
+ }
+
+ // For plain paragraphs, strip up to 4 leading spaces on continuation lines.
+ if required_indent == 0 {
+ p.skip_line_indent(INDENT_CODE_BLOCK_SPACES);
+ }
+
+ InlineNewlineAction::Continue
+}
+
/// Parse the inline item list within a block.
///
/// Grammar: MdInlineItemList = AnyMdInline*
@@ -848,196 +1054,12 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) {
// NEWLINE handling: check for blank line (paragraph boundary)
if p.at(NEWLINE) {
- if p.at_blank_line() {
- // Blank line = paragraph boundary
- // Consume this NEWLINE but stop (the second NEWLINE stays for block parser)
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
+ if matches!(
+ handle_inline_newline(p, has_content),
+ InlineNewlineAction::Break
+ ) {
break;
}
-
- let quote_depth = p.state().block_quote_depth;
- if quote_depth > 0 {
- let is_quote_blank_line = p.lookahead(|p| {
- p.bump(NEWLINE);
- if is_quote_only_blank_line_from_source(p, quote_depth) {
- return true;
- }
- if !has_quote_prefix(p, quote_depth) {
- return false;
- }
- consume_quote_prefix_without_virtual(p, quote_depth);
- while p.at(MD_TEXTUAL_LITERAL)
- && p.cur_text().chars().all(|c| c == ' ' || c == '\t')
- {
- p.bump(MD_TEXTUAL_LITERAL);
- }
- p.at(NEWLINE) || p.at(T![EOF])
- });
- if is_quote_blank_line {
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
- break;
- }
- }
-
- // Not a blank line - this is a soft line break within paragraph
- // Consume the NEWLINE as textual content (remap to MD_TEXTUAL_LITERAL)
- let text_m = p.start();
- p.bump_remap(MD_TEXTUAL_LITERAL);
- text_m.complete(p, MD_TEXTUAL);
-
- // If we're inside a block quote, only consume the quote prefix
- // when it doesn't start a new block (e.g., a nested quote).
- if quote_depth > 0 && has_quote_prefix(p, quote_depth) {
- enum QuoteBreakKind {
- None,
- SetextUnderline,
- Other,
- }
-
- let break_kind = p.lookahead(|p| {
- consume_quote_prefix_without_virtual(p, quote_depth);
- with_virtual_line_start(p, p.cur_range().start(), |p| {
- if p.at(MD_SETEXT_UNDERLINE_LITERAL)
- || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
- {
- QuoteBreakKind::SetextUnderline
- } else if at_block_interrupt(p) || textual_looks_like_list_marker(p) {
- QuoteBreakKind::Other
- } else {
- QuoteBreakKind::None
- }
- })
- });
- match break_kind {
- QuoteBreakKind::SetextUnderline => {
- // Consume the quote prefix so the setext underline is visible
- // to the paragraph parser.
- consume_quote_prefix(p, quote_depth);
- break;
- }
- QuoteBreakKind::Other => {
- break;
- }
- QuoteBreakKind::None => {
- consume_quote_prefix(p, quote_depth);
- }
- }
- }
- if quote_depth > 0 && p.at(R_ANGLE) && !has_quote_prefix(p, quote_depth) {
- consume_partial_quote_prefix(p, quote_depth);
- }
-
- // After crossing a line, check for setext underlines.
- // For non-list paragraphs, we need to look past up to 3 spaces of indent
- // to detect setext underlines (CommonMark §4.3).
- // IMPORTANT: Only break if allow_setext_heading() is true - this ensures
- // setext underlines outside a blockquote (without >) don't incorrectly
- // terminate the paragraph (CommonMark example 093).
- if has_content && p.state().list_item_required_indent == 0 && allow_setext_heading(p) {
- let is_setext = p.lookahead(|p| at_setext_underline_after_newline(p).is_some());
- if is_setext {
- // Skip the indent so parse_paragraph sees the underline
- p.skip_line_indent(INDENT_CODE_BLOCK_SPACES);
- break;
- }
- }
-
- // Check if we're at a setext heading underline (already past indent)
- if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) {
- break;
- }
- if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
- break;
- }
-
- // If we're inside a list item and the next line meets the required indent,
- // check for block interrupts after skipping that indent. This allows
- // nested list markers like "\t - baz" to break out of the paragraph.
- let required_indent = p.state().list_item_required_indent;
- if required_indent > 0 {
- // Check for setext underline after indent stripping.
- // The `---` or `===` may be indented by the list item's required indent,
- // so we need to look past that indent.
- let real_indent = real_line_indent_from_source(p);
- if real_indent >= required_indent {
- let is_setext = p.lookahead(|p| {
- p.skip_line_indent(required_indent);
- p.at(MD_SETEXT_UNDERLINE_LITERAL)
- || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
- });
- if is_setext && has_content {
- // Skip the indent so parse_paragraph sees the underline
- p.skip_line_indent(required_indent);
- break;
- }
- }
-
- let indent = p.line_start_leading_indent();
- if indent >= required_indent {
- let interrupts = p.lookahead(|p| {
- p.skip_line_indent(required_indent);
- let prev_required = p.state().list_item_required_indent;
- with_virtual_line_start(p, p.cur_range().start(), |p| {
- p.state_mut().list_item_required_indent = 0;
- let breaks = at_block_interrupt(p) || textual_looks_like_list_marker(p);
- p.state_mut().list_item_required_indent = prev_required;
- breaks
- })
- });
- if interrupts {
- break;
- }
- }
- }
-
- // Check for block-level constructs that can interrupt paragraphs
- if line_starts_with_fence(p) {
- break;
- }
- if p.at(MD_TEXTUAL_LITERAL) {
- let text = p.cur_text();
- if text.starts_with("```") || text.starts_with("~~~") {
- break;
- }
- }
- if at_block_interrupt(p) {
- break;
- }
-
- // Also check for list markers that appear as textual content.
- // Inside inline content, '-' is lexed as MD_TEXTUAL_LITERAL, not MINUS,
- // so at_block_interrupt won't detect them. Per CommonMark §5.1, list
- // items can interrupt paragraphs (bullet lists always, ordered lists
- // only if they start with 1).
- if textual_looks_like_list_marker(p) {
- break;
- }
-
- // Per CommonMark §5.2, when inside a list item, check indentation.
- // If sufficient indentation, skip it. If insufficient, this is
- // "lazy continuation" - the content continues without meeting the
- // indent requirement (at_block_interrupt already checked above).
- if required_indent > 0 {
- let indent = p.line_start_leading_indent();
- if indent >= required_indent {
- // Sufficient indentation - skip it
- p.skip_line_indent(required_indent);
- }
- // else: Lazy continuation - don't break, don't skip indent.
- // The at_block_interrupt check above handles real interruptions.
- // Content continues at its actual position.
- }
-
- // For plain paragraphs, strip up to 4 leading spaces on continuation lines.
- if required_indent == 0 {
- p.skip_line_indent(INDENT_CODE_BLOCK_SPACES);
- }
-
- // Continue parsing on the new line
continue;
}
@@ -1202,16 +1224,8 @@ fn inline_list_source_len(p: &mut MarkdownParser) -> usize {
let quote_depth = p.state().block_quote_depth;
if quote_depth > 0 && has_quote_prefix(p, quote_depth) {
- let breaks_paragraph = p.lookahead(|p| {
- consume_quote_prefix_without_virtual(p, quote_depth);
- with_virtual_line_start(p, p.cur_range().start(), |p| {
- p.at(MD_SETEXT_UNDERLINE_LITERAL)
- || (p.at(MD_THEMATIC_BREAK_LITERAL)
- && is_dash_only_thematic_break(p))
- || at_block_interrupt(p)
- })
- });
- if breaks_paragraph {
+ let break_kind = classify_quote_break_after_newline(p, quote_depth, false);
+ if !matches!(break_kind, QuoteBreakKind::None) {
break;
}
consume_quote_prefix_without_virtual(p, quote_depth);
diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs
index d919ec91a672..a0a4ac4e76b9 100644
--- a/crates/biome_markdown_parser/src/to_html.rs
+++ b/crates/biome_markdown_parser/src/to_html.rs
@@ -45,8 +45,8 @@ use biome_rowan::{AstNode, AstNodeList, Direction, TextRange};
use percent_encoding::{AsciiSet, CONTROLS, utf8_percent_encode};
use std::collections::HashMap;
-use crate::syntax::reference::normalize_reference_label;
use crate::parser::ListTightness;
+use crate::syntax::reference::normalize_reference_label;
// ============================================================================
// Line Handling Utilities
diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs
index c4e672e18a9f..bf84a5beed22 100644
--- a/crates/biome_markdown_parser/tests/spec_test.rs
+++ b/crates/biome_markdown_parser/tests/spec_test.rs
@@ -152,9 +152,11 @@ pub fn quick_test() {
assert_eq!(expected, html, "Example {:03} failed", num);
}
- // Test the remaining failing CommonMark examples
- // TODO: Example 007 still failing - tab expansion issue (produces 3 spaces instead of 2)
- // test_example(7, "-\t\tfoo\n", "\n");
+ test_example(
+ 7,
+ "-\t\tfoo\n",
+ "\n",
+ );
test_example(
42,
"- `one\n- two`\n",
From 70787dd203f2a0135d4a3401fb2c7ac456a4facb Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Thu, 29 Jan 2026 19:43:30 -0500
Subject: [PATCH 09/26] refactor(markdown): standardize imports to use crate::
paths
Convert super:: imports to crate:: style for consistency across
the markdown parser syntax modules.
---
.../src/syntax/fenced_code_block.rs | 2 +-
crates/biome_markdown_parser/src/syntax/header.rs | 2 +-
.../src/syntax/inline/links.rs | 2 +-
crates/biome_markdown_parser/src/syntax/list.rs | 14 +++++++-------
crates/biome_markdown_parser/src/syntax/quote.rs | 4 ++--
.../biome_markdown_parser/src/syntax/reference.rs | 2 +-
6 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs b/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs
index 37bba8046783..21886a8aeec1 100644
--- a/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs
@@ -32,7 +32,7 @@ use biome_parser::{
},
};
-use super::parse_error::unterminated_fenced_code;
+use crate::syntax::parse_error::unterminated_fenced_code;
/// Minimum number of fence characters required per CommonMark §4.5.
const MIN_FENCE_LENGTH: usize = 3;
diff --git a/crates/biome_markdown_parser/src/syntax/header.rs b/crates/biome_markdown_parser/src/syntax/header.rs
index 67803ba9f73d..12c625288da8 100644
--- a/crates/biome_markdown_parser/src/syntax/header.rs
+++ b/crates/biome_markdown_parser/src/syntax/header.rs
@@ -32,7 +32,7 @@ use biome_parser::{
prelude::ParsedSyntax::{self, *},
};
-use super::parse_any_inline;
+use crate::syntax::parse_any_inline;
/// Maximum number of `#` characters allowed in an ATX heading (CommonMark §4.2).
const MAX_HEADER_HASHES: usize = 6;
diff --git a/crates/biome_markdown_parser/src/syntax/inline/links.rs b/crates/biome_markdown_parser/src/syntax/inline/links.rs
index fc9b11de33de..cc5d1006bd4e 100644
--- a/crates/biome_markdown_parser/src/syntax/inline/links.rs
+++ b/crates/biome_markdown_parser/src/syntax/inline/links.rs
@@ -8,7 +8,7 @@ use biome_rowan::TextRange;
use crate::MarkdownParser;
use crate::syntax::reference::normalize_reference_label;
-use super::{parse_inline_item_list_until, parse_inline_item_list_until_no_links};
+use crate::syntax::inline::{parse_inline_item_list_until, parse_inline_item_list_until_no_links};
/// Parse link starting with `[` - dispatches to inline link or reference link.
///
diff --git a/crates/biome_markdown_parser/src/syntax/list.rs b/crates/biome_markdown_parser/src/syntax/list.rs
index d84aff39798a..923b89c455cc 100644
--- a/crates/biome_markdown_parser/src/syntax/list.rs
+++ b/crates/biome_markdown_parser/src/syntax/list.rs
@@ -40,17 +40,17 @@ use biome_parser::prelude::ParsedSyntax::{self, *};
use biome_parser::prelude::{CompletedMarker, Marker, ParseDiagnostic, TokenSet};
use biome_parser::{Parser, token_set};
-use super::quote::{
- consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix,
- parse_quote_block_list,
-};
use biome_rowan::TextRange;
-use super::fenced_code_block::parse_fenced_code_block;
-use super::parse_error::list_nesting_too_deep;
-use super::{at_block_interrupt, at_indent_code_block, is_paragraph_like};
use crate::MarkdownParser;
+use crate::syntax::fenced_code_block::parse_fenced_code_block;
use crate::syntax::parse_any_block_with_indent_code_policy;
+use crate::syntax::parse_error::list_nesting_too_deep;
+use crate::syntax::quote::{
+ consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix,
+ parse_quote_block_list,
+};
+use crate::syntax::{at_block_interrupt, at_indent_code_block, is_paragraph_like};
/// Tokens that start a new block (used for recovery)
const BLOCK_RECOVERY_SET: TokenSet = token_set![
diff --git a/crates/biome_markdown_parser/src/syntax/quote.rs b/crates/biome_markdown_parser/src/syntax/quote.rs
index 13d1d2dd660b..edbcd36a586c 100644
--- a/crates/biome_markdown_parser/src/syntax/quote.rs
+++ b/crates/biome_markdown_parser/src/syntax/quote.rs
@@ -38,9 +38,9 @@ use biome_parser::parse_lists::ParseNodeList;
use biome_parser::parse_recovery::RecoveryResult;
use biome_parser::prelude::ParsedSyntax::{self, *};
-use super::is_paragraph_like;
-use super::parse_error::quote_nesting_too_deep;
use crate::MarkdownParser;
+use crate::syntax::is_paragraph_like;
+use crate::syntax::parse_error::quote_nesting_too_deep;
/// Check if we're at the start of a block quote (`>`).
pub(crate) fn at_quote(p: &mut MarkdownParser) -> bool {
diff --git a/crates/biome_markdown_parser/src/syntax/reference.rs b/crates/biome_markdown_parser/src/syntax/reference.rs
index d917ba2380f3..8add79777dd2 100644
--- a/crates/biome_markdown_parser/src/syntax/reference.rs
+++ b/crates/biome_markdown_parser/src/syntax/reference.rs
@@ -9,7 +9,7 @@ use crate::MarkdownLosslessTreeSink;
use crate::MarkdownParseOptions;
use crate::parser::MarkdownParser;
-use super::parse_document;
+use crate::syntax::parse_document;
/// Normalize a reference label per CommonMark spec.
///
From 5589eea0c1b510db298283237d68aaad6f51ecbd Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Fri, 30 Jan 2026 10:03:54 -0500
Subject: [PATCH 10/26] refactor(markdown): move lexer imports to module scope
---
crates/biome_markdown_parser/src/syntax/inline/code_span.rs | 5 +----
crates/biome_markdown_parser/src/syntax/inline/links.rs | 3 +--
crates/biome_markdown_parser/src/syntax/link_block.rs | 3 +--
3 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/crates/biome_markdown_parser/src/syntax/inline/code_span.rs b/crates/biome_markdown_parser/src/syntax/inline/code_span.rs
index 1f42fe86d948..5147f83efbb2 100644
--- a/crates/biome_markdown_parser/src/syntax/inline/code_span.rs
+++ b/crates/biome_markdown_parser/src/syntax/inline/code_span.rs
@@ -3,6 +3,7 @@ use biome_markdown_syntax::kind::MarkdownSyntaxKind::*;
use biome_parser::Parser;
use biome_parser::prelude::ParsedSyntax::{self, *};
+use crate::lexer::MarkdownLexContext;
use crate::MarkdownParser;
/// Parse a hard line break.
@@ -30,8 +31,6 @@ pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax {
///
/// Returns false if no match found (opener should become literal text).
fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -> bool {
- use crate::lexer::MarkdownLexContext;
-
p.lookahead(|p| {
// Skip the opening backticks (handle both BACKTICK and TRIPLE_BACKTICK)
if p.at(T!["```"]) {
@@ -170,8 +169,6 @@ fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool {
/// - Backslash escapes are NOT processed inside code spans (\` is literal `\``)
/// - If no matching closer exists, the opener is treated as literal text
pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax {
- use crate::lexer::MarkdownLexContext;
-
// Handle both BACKTICK and TRIPLE_BACKTICK (T!["```"] ) as code span openers.
// TRIPLE_BACKTICK can appear when backticks are at line start but info string
// contains backticks, making it not a fenced code block (CommonMark examples 138, 145).
diff --git a/crates/biome_markdown_parser/src/syntax/inline/links.rs b/crates/biome_markdown_parser/src/syntax/inline/links.rs
index cc5d1006bd4e..820a9d9968e0 100644
--- a/crates/biome_markdown_parser/src/syntax/inline/links.rs
+++ b/crates/biome_markdown_parser/src/syntax/inline/links.rs
@@ -5,6 +5,7 @@ use biome_parser::Parser;
use biome_parser::prelude::ParsedSyntax::{self, *};
use biome_rowan::TextRange;
+use crate::lexer::MarkdownLexContext;
use crate::MarkdownParser;
use crate::syntax::reference::normalize_reference_label;
@@ -491,8 +492,6 @@ fn collect_link_text(p: &mut MarkdownParser) -> Option {
}
fn bump_textual_link_def(p: &mut MarkdownParser) {
- use crate::lexer::MarkdownLexContext;
-
let item = p.start();
p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::LinkDefinition);
item.complete(p, MD_TEXTUAL);
diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs
index ceed5ff8e975..82b2d4b06a38 100644
--- a/crates/biome_markdown_parser/src/syntax/link_block.rs
+++ b/crates/biome_markdown_parser/src/syntax/link_block.rs
@@ -24,6 +24,7 @@ use biome_markdown_syntax::MarkdownSyntaxKind::*;
use biome_parser::Parser;
use biome_parser::prelude::ParsedSyntax::{self, *};
+use crate::lexer::MarkdownLexContext;
use crate::MarkdownParser;
/// Maximum label length per CommonMark spec (999 characters).
@@ -542,8 +543,6 @@ fn parse_link_destination(p: &mut MarkdownParser) {
/// Consume the current token as MdTextual using LinkDefinition context.
/// This ensures whitespace produces separate tokens for destination/title parsing.
fn bump_textual_link_def(p: &mut MarkdownParser) {
- use crate::lexer::MarkdownLexContext;
-
let item = p.start();
p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::LinkDefinition);
item.complete(p, MD_TEXTUAL);
From f0933901a55b16a14b9bfbf932915ef8b46f39cc Mon Sep 17 00:00:00 2001
From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com>
Date: Thu, 29 Jan 2026 20:59:55 -0500
Subject: [PATCH 11/26] refactor(markdown): use preorder visitor for to_html
renderer
Switch the test HTML renderer to a preorder visitor and consolidate list/quote handling in a stateful renderer.
---
crates/biome_markdown_parser/src/to_html.rs | 1381 ++++++++++---------
1 file changed, 694 insertions(+), 687 deletions(-)
diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs
index a0a4ac4e76b9..ee49a7555cbc 100644
--- a/crates/biome_markdown_parser/src/to_html.rs
+++ b/crates/biome_markdown_parser/src/to_html.rs
@@ -33,15 +33,25 @@
//! This implementation prioritizes correctness over performance. Each rendering
//! pass may allocate multiple intermediate strings. For production rendering,
//! consider a single-buffer approach using `fmt::Write` or direct string building.
+//!
+//! ## Traversal
+//!
+//! The renderer uses `biome_rowan`'s `.preorder()` visitor to walk the CST and
+//! emit HTML for enter/leave events instead of manual recursive descent. Nodes
+//! that need post-processing (like paragraphs, headers, list items, and
+//! reference links) are buffered until `Leave` so their final HTML wrapper can
+//! be decided with full context.
use biome_markdown_syntax::{
- AnyCodeBlock, AnyContainerBlock, AnyLeafBlock, AnyMdBlock, AnyMdInline, MdAutolink, MdBullet,
- MdBulletListItem, MdDocument, MdEntityReference, MdFencedCodeBlock, MdHeader, MdHtmlBlock,
- MdIndentCodeBlock, MdInlineCode, MdInlineHtml, MdInlineImage, MdInlineLink,
- MdLinkReferenceDefinition, MdLinkTitle, MdOrderedListItem, MdParagraph, MdQuote,
- MdReferenceImage, MdReferenceLink, MdSetextHeader, MdTextual,
+ AnyCodeBlock, AnyLeafBlock, AnyMdBlock, AnyMdInline, MarkdownLanguage, MdAutolink, MdBlockList,
+ MdBullet, MdBulletListItem, MdDocument, MdEntityReference, MdFencedCodeBlock, MdHardLine,
+ MdHeader, MdHtmlBlock, MdIndentCodeBlock, MdInlineCode, MdInlineEmphasis, MdInlineHtml,
+ MdInlineImage, MdInlineItalic, MdInlineItemList, MdInlineLink, MdLinkBlock, MdLinkDestination,
+ MdLinkLabel, MdLinkReferenceDefinition, MdLinkTitle, MdOrderedListItem, MdParagraph, MdQuote,
+ MdReferenceImage, MdReferenceLink, MdReferenceLinkLabel, MdSetextHeader, MdSoftBreak,
+ MdTextual, MdThematicBreakBlock,
};
-use biome_rowan::{AstNode, AstNodeList, Direction, TextRange};
+use biome_rowan::{AstNode, AstNodeList, Direction, SyntaxNode, TextRange, WalkEvent};
use percent_encoding::{AsciiSet, CONTROLS, utf8_percent_encode};
use std::collections::HashMap;
@@ -312,13 +322,7 @@ pub fn document_to_html(
quote_indents: &[crate::parser::QuoteIndent],
) -> String {
let ctx = HtmlRenderContext::new(document, list_tightness, list_item_indents, quote_indents);
- let mut html = String::new();
-
- for block in document.value() {
- render_block(&block, &ctx, &mut html, false, 0, 0);
- }
-
- html
+ HtmlRenderer::new(&ctx).render(document.syntax())
}
// ============================================================================
@@ -360,115 +364,702 @@ fn collect_link_definitions(document: &MdDocument) -> HashMap {
+ ctx: &'a HtmlRenderContext,
+ buffers: Vec,
+ list_stack: Vec,
+ list_item_stack: Vec,
+ quote_indent_stack: Vec,
quote_indent: usize,
-) {
- match block {
- AnyMdBlock::AnyLeafBlock(leaf) => {
- render_leaf_block(leaf, ctx, out, in_tight_list, list_indent, quote_indent);
- }
- AnyMdBlock::AnyContainerBlock(container) => {
- render_container_block(container, ctx, out, list_indent, quote_indent);
- }
- }
+ depth: usize,
+ opaque_depth: Option,
+ skip_children_depth: Option,
+ suppressed_inline_nodes: Vec>>,
}
-/// Render a leaf block to HTML.
-fn render_leaf_block(
- block: &AnyLeafBlock,
- ctx: &HtmlRenderContext,
- out: &mut String,
+struct Buffer {
+ kind: BufferKind,
+ content: String,
+}
+
+enum BufferKind {
+ Root,
+ Paragraph(ParagraphState),
+ Header(HeaderState),
+ SetextHeader(HeaderState),
+ ReferenceLink(ReferenceLinkState),
+ ListItem,
+}
+
+struct ParagraphState {
in_tight_list: bool,
- list_indent: usize,
quote_indent: usize,
-) {
- match block {
- AnyLeafBlock::MdParagraph(para) => {
- render_paragraph(para, ctx, out, in_tight_list, quote_indent);
+ suppress_wrapping: bool,
+}
+
+struct HeaderState {
+ level: usize,
+}
+
+struct ReferenceLinkState {
+ label_display: Option,
+ url: Option,
+ title: Option,
+}
+
+struct ListState {
+ is_tight: bool,
+}
+
+struct ListItemState {
+ is_tight: bool,
+ leading_newline: bool,
+ trim_trailing_newline: bool,
+ is_empty: bool,
+ block_indents: HashMap,
+}
+
+#[derive(Clone, Copy, Default)]
+struct BlockIndent {
+ indent: usize,
+ first_line_column: usize,
+}
+
+impl<'a> HtmlRenderer<'a> {
+ fn new(ctx: &'a HtmlRenderContext) -> Self {
+ Self {
+ ctx,
+ buffers: vec![Buffer {
+ kind: BufferKind::Root,
+ content: String::new(),
+ }],
+ list_stack: Vec::new(),
+ list_item_stack: Vec::new(),
+ quote_indent_stack: Vec::new(),
+ quote_indent: 0,
+ depth: 0,
+ opaque_depth: None,
+ skip_children_depth: None,
+ suppressed_inline_nodes: Vec::new(),
+ }
+ }
+
+ fn render(mut self, root: &SyntaxNode) -> String {
+ for event in root.preorder() {
+ match event {
+ WalkEvent::Enter(node) => {
+ if self.opaque_depth.is_some() {
+ self.depth += 1;
+ continue;
+ }
+ if let Some(skip) = self.skip_children_depth
+ && self.depth > skip
+ {
+ self.depth += 1;
+ continue;
+ }
+
+ self.enter(node);
+ self.depth += 1;
+ }
+ WalkEvent::Leave(node) => {
+ self.depth = self.depth.saturating_sub(1);
+ if let Some(opaque_depth) = self.opaque_depth {
+ if self.depth == opaque_depth {
+ self.opaque_depth = None;
+ }
+ continue;
+ }
+
+ if let Some(skip) = self.skip_children_depth {
+ if self.depth > skip {
+ continue;
+ }
+ if self.depth == skip {
+ self.leave(node);
+ self.skip_children_depth = None;
+ continue;
+ }
+ }
+
+ self.leave(node);
+ }
+ }
+ }
+
+ self.buffers
+ .pop()
+ .map(|buffer| buffer.content)
+ .unwrap_or_default()
+ }
+
+ fn enter(&mut self, node: SyntaxNode) {
+ if MdInlineItemList::cast(node.clone()).is_some()
+ && self
+ .suppressed_inline_nodes
+ .iter()
+ .flatten()
+ .any(|suppressed| *suppressed == node)
+ {
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if MdParagraph::cast(node.clone()).is_some() {
+ let suppress_wrapping = node.parent().and_then(MdHeader::cast).is_some()
+ || node.parent().and_then(MdSetextHeader::cast).is_some();
+ let is_direct_list_item = node
+ .parent()
+ .and_then(MdBlockList::cast)
+ .and_then(|list| list.syntax().parent())
+ .and_then(MdBullet::cast)
+ .is_some();
+ let in_tight_list = is_direct_list_item
+ && self
+ .list_item_stack
+ .last()
+ .is_some_and(|state| state.is_tight);
+ let state = ParagraphState {
+ in_tight_list,
+ quote_indent: self.quote_indent,
+ suppress_wrapping,
+ };
+ self.push_buffer(BufferKind::Paragraph(state));
+ return;
+ }
+
+ if let Some(header) = MdHeader::cast(node.clone()) {
+ let level = header_level(&header);
+ self.push_buffer(BufferKind::Header(HeaderState { level }));
+ return;
+ }
+
+ if let Some(header) = MdSetextHeader::cast(node.clone()) {
+ let level = setext_header_level(&header);
+ self.push_buffer(BufferKind::SetextHeader(HeaderState { level }));
+ return;
+ }
+
+ if let Some(quote) = MdQuote::cast(node.clone()) {
+ self.push_str("\n");
+ let marker_indent = self.ctx.quote_indent(quote.syntax().text_trimmed_range());
+ self.quote_indent += marker_indent;
+ self.quote_indent_stack.push(marker_indent);
+ return;
+ }
+
+ if let Some(list) = MdBulletListItem::cast(node.clone()) {
+ let is_tight = self.ctx.is_list_tight(list.syntax().text_trimmed_range());
+ self.list_stack.push(ListState { is_tight });
+ self.push_str("\n");
+ return;
+ }
+
+ if let Some(list) = MdOrderedListItem::cast(node.clone()) {
+ let is_tight = self.ctx.is_list_tight(list.syntax().text_trimmed_range());
+ self.list_stack.push(ListState { is_tight });
+
+ let start = list
+ .md_bullet_list()
+ .first()
+ .and_then(|bullet| bullet.bullet().ok())
+ .map_or(1, |marker| {
+ let text = marker.text();
+ text.trim_start()
+ .chars()
+ .take_while(|c| c.is_ascii_digit())
+ .collect::()
+ .parse::()
+ .unwrap_or(1)
+ });
+
+ if start == 1 {
+ self.push_str("\n");
+ } else {
+ self.push_str("\n");
+ }
+ return;
+ }
+
+ if let Some(bullet) = MdBullet::cast(node.clone()) {
+ let list_is_tight = self.list_stack.last().is_some_and(|state| state.is_tight);
+ let blocks: Vec<_> = bullet.content().iter().collect();
+ let item_has_blank_line = blocks
+ .windows(2)
+ .any(|pair| is_newline_block(&pair[0]) && is_newline_block(&pair[1]));
+ let is_tight = list_is_tight && !item_has_blank_line;
+ let is_empty = is_empty_content(&blocks);
+
+ let first_is_paragraph = blocks.first().is_some_and(is_paragraph_block);
+ let last_is_paragraph = blocks
+ .iter()
+ .rev()
+ .find(|b| !is_newline_block(b))
+ .is_some_and(is_paragraph_block);
+
+ let leading_newline = !is_tight || !first_is_paragraph;
+ let trim_trailing_newline = is_tight && last_is_paragraph;
+
+ let list_indent = self
+ .ctx
+ .list_item_indent(bullet.syntax().text_trimmed_range());
+ let (indent, first_line_code_indent, first_line_column) = match list_indent {
+ Some(entry) => {
+ let base = list_item_required_indent(entry);
+ let first_line_code =
+ (entry.spaces_after_marker > INDENT_CODE_BLOCK_SPACES).then_some(base);
+ let column = entry.marker_indent + entry.marker_width;
+ (base, first_line_code, column)
+ }
+ None => (0, None, 0),
+ };
+
+ let mut block_indents = HashMap::new();
+ for (idx, block) in blocks.iter().enumerate() {
+ let block_indent = if idx == 0 {
+ match (first_line_code_indent, block) {
+ (
+ Some(code_indent),
+ AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(
+ AnyCodeBlock::MdIndentCodeBlock(_),
+ )),
+ ) => code_indent,
+ _ => indent,
+ }
+ } else {
+ indent
+ };
+
+ let column_for_block = if idx == 0
+ && first_line_code_indent.is_some()
+ && matches!(
+ block,
+ AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(
+ AnyCodeBlock::MdIndentCodeBlock(_)
+ ))
+ ) {
+ first_line_column
+ } else {
+ 0
+ };
+
+ block_indents.insert(
+ block.syntax().text_trimmed_range(),
+ BlockIndent {
+ indent: block_indent,
+ first_line_column: column_for_block,
+ },
+ );
+ }
+
+ self.list_item_stack.push(ListItemState {
+ is_tight,
+ leading_newline,
+ trim_trailing_newline,
+ is_empty,
+ block_indents,
+ });
+ self.push_buffer(BufferKind::ListItem);
+ if is_empty {
+ self.skip_children_depth = Some(self.depth);
+ }
+ return;
+ }
+
+ if let Some(code) = MdFencedCodeBlock::cast(node.clone()) {
+ let block_indent = self.block_indent(code.syntax().text_trimmed_range());
+ let quote_indent = self.quote_indent;
+ render_fenced_code_block(&code, self.out_mut(), block_indent.indent, quote_indent);
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if let Some(code) = MdIndentCodeBlock::cast(node.clone()) {
+ let block_indent = self.block_indent(code.syntax().text_trimmed_range());
+ let quote_indent = self.quote_indent;
+ if block_indent.first_line_column > 0 {
+ render_indented_code_block_in_list(
+ &code,
+ self.out_mut(),
+ block_indent.indent,
+ quote_indent,
+ block_indent.first_line_column,
+ );
+ } else {
+ render_indented_code_block(
+ &code,
+ self.out_mut(),
+ block_indent.indent,
+ quote_indent,
+ );
+ }
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if let Some(html) = MdHtmlBlock::cast(node.clone()) {
+ let is_inline = node
+ .parent()
+ .and_then(biome_markdown_syntax::MdInlineItemList::cast)
+ .is_some();
+ if is_inline {
+ let content = collect_raw_inline_text(&html.content());
+ self.push_str(&content);
+ } else {
+ let block_indent = self.block_indent(html.syntax().text_trimmed_range());
+ let quote_indent = self.quote_indent;
+ render_html_block(&html, self.out_mut(), block_indent.indent, quote_indent);
+ }
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if MdThematicBreakBlock::cast(node.clone()).is_some() {
+ self.push_str("
\n");
+ return;
+ }
+
+ if MdLinkReferenceDefinition::cast(node.clone()).is_some()
+ || MdLinkBlock::cast(node.clone()).is_some()
+ {
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if MdLinkDestination::cast(node.clone()).is_some()
+ || MdLinkLabel::cast(node.clone()).is_some()
+ || MdLinkTitle::cast(node.clone()).is_some()
+ || MdReferenceLinkLabel::cast(node.clone()).is_some()
+ {
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if MdInlineEmphasis::cast(node.clone()).is_some() {
+ self.push_str("");
+ return;
}
- AnyLeafBlock::MdHeader(header) => {
- render_atx_header(header, ctx, out);
+
+ if MdInlineItalic::cast(node.clone()).is_some() {
+ self.push_str("");
+ return;
+ }
+
+ if let Some(code) = MdInlineCode::cast(node.clone()) {
+ render_inline_code(&code, self.out_mut());
+ self.opaque_depth = Some(self.depth);
+ return;
}
- AnyLeafBlock::MdSetextHeader(header) => {
- render_setext_header(header, ctx, out);
+
+ if let Some(link) = MdInlineLink::cast(node.clone()) {
+ let dest = collect_inline_text(&link.destination());
+ let dest = process_link_destination(&dest);
+ self.suppressed_inline_nodes
+ .push(vec![link.destination().syntax().clone()]);
+
+ self.push_str("");
+ return;
+ }
+
+ if let Some(img) = MdInlineImage::cast(node.clone()) {
+ let alt = extract_alt_text(&img.alt(), self.ctx);
+ let dest = collect_inline_text(&img.destination());
+ let dest = process_link_destination(&dest);
+
+ self.push_str("
");
+ self.opaque_depth = Some(self.depth);
+ return;
+ }
+
+ if let Some(link) = MdReferenceLink::cast(node.clone()) {
+ let text_raw = collect_inline_text(&link.text());
+ let (label, label_display) =
+ resolve_reference_label(link.label(), text_raw, |label_node| {
+ collect_inline_text(&label_node.label())
+ });
+ let (url, title) = self
+ .ctx
+ .get_link_definition(&label)
+ .map_or((None, None), |(url, title)| {
+ (Some(url.clone()), title.clone())
+ });
+ self.push_buffer(BufferKind::ReferenceLink(ReferenceLinkState {
+ label_display,
+ url,
+ title,
+ }));
+ return;
+ }
+
+ if let Some(img) = MdReferenceImage::cast(node.clone()) {
+ let alt = extract_alt_text(&img.alt(), self.ctx);
+ let alt_raw = collect_inline_text(&img.alt());
+ let (label, label_display) =
+ resolve_reference_label(img.label(), alt_raw, |label_node| {
+ collect_inline_text(&label_node.label())
+ });
+
+ if let Some((url, title)) = self.ctx.get_link_definition(&label) {
+ self.push_str("
");
+ } else {
+ self.push_str("![");
+ self.push_str(&alt);
+ self.push_str("]");
+ if let Some(label) = label_display {
+ self.push_str("[");
+ self.push_str(&escape_html(&label));
+ self.push_str("]");
+ }
+ }
+
+ self.opaque_depth = Some(self.depth);
+ return;
}
- AnyLeafBlock::AnyCodeBlock(code) => {
- render_code_block(code, out, list_indent, quote_indent);
+
+ if let Some(autolink) = MdAutolink::cast(node.clone()) {
+ render_autolink(&autolink, self.out_mut());
+ self.opaque_depth = Some(self.depth);
+ return;
}
- AnyLeafBlock::MdThematicBreakBlock(_) => {
- out.push_str("
\n");
+
+ if let Some(html) = MdInlineHtml::cast(node.clone()) {
+ render_inline_html(&html, self.out_mut());
+ self.opaque_depth = Some(self.depth);
+ return;
}
- AnyLeafBlock::MdHtmlBlock(html) => {
- render_html_block(html, out, list_indent, quote_indent);
+
+ if let Some(hard) = MdHardLine::cast(node.clone()) {
+ if is_last_inline_item(&node) {
+ if let Ok(token) = hard.value_token()
+ && token.text().starts_with('\\')
+ {
+ self.push_str("\\");
+ }
+ } else {
+ self.push_str("
\n");
+ }
+ return;
}
- AnyLeafBlock::MdLinkReferenceDefinition(_) => {
- // Link reference definitions don't produce output
+
+ if MdSoftBreak::cast(node.clone()).is_some() {
+ self.push_str("\n");
+ return;
}
- AnyLeafBlock::MdLinkBlock(_) => {
- // MdLinkBlock is an internal structure, skip it
+
+ if let Some(text) = MdTextual::cast(node.clone()) {
+ render_textual(&text, self.out_mut());
+ return;
}
- AnyLeafBlock::MdNewline(_) => {
- // Blank lines don't produce output
+
+ if let Some(entity) = MdEntityReference::cast(node) {
+ render_entity_reference(&entity, self.out_mut());
}
}
-}
-/// Render a container block to HTML.
-fn render_container_block(
- block: &AnyContainerBlock,
- ctx: &HtmlRenderContext,
- out: &mut String,
- list_indent: usize,
- quote_indent: usize,
-) {
- match block {
- AnyContainerBlock::MdQuote(quote) => {
- render_blockquote(quote, ctx, out, list_indent, quote_indent);
+ fn leave(&mut self, node: SyntaxNode) {
+ if MdParagraph::cast(node.clone()).is_some() {
+ let buffer = self.pop_buffer();
+ if let BufferKind::Paragraph(state) = buffer.kind {
+ if state.suppress_wrapping {
+ self.push_str(&buffer.content);
+ return;
+ }
+ let mut content = buffer.content;
+ if state.quote_indent > 0 {
+ content = strip_quote_prefixes(&content, state.quote_indent);
+ }
+ let content = strip_paragraph_indent(
+ content.trim_matches(|c| c == ' ' || c == '\n' || c == '\r'),
+ );
+
+ if state.in_tight_list {
+ self.push_str(&content);
+ self.push_str("\n");
+ } else {
+ self.push_str("");
+ self.push_str(&content);
+ self.push_str("
\n");
+ }
+ }
+ return;
+ }
+
+ if MdHeader::cast(node.clone()).is_some() || MdSetextHeader::cast(node.clone()).is_some() {
+ let buffer = self.pop_buffer();
+ if let BufferKind::Header(state) | BufferKind::SetextHeader(state) = buffer.kind {
+ self.push_str("");
+ self.push_str(buffer.content.trim());
+ self.push_str("\n");
+ }
+ return;
+ }
+
+ if MdQuote::cast(node.clone()).is_some() {
+ self.push_str("
\n");
+ if let Some(indent) = self.quote_indent_stack.pop() {
+ self.quote_indent = self.quote_indent.saturating_sub(indent);
+ }
+ return;
+ }
+
+ if MdBulletListItem::cast(node.clone()).is_some() {
+ self.push_str("\n");
+ self.list_stack.pop();
+ return;
+ }
+
+ if MdOrderedListItem::cast(node.clone()).is_some() {
+ self.push_str("\n");
+ self.list_stack.pop();
+ return;
+ }
+
+ if MdBullet::cast(node.clone()).is_some() {
+ let buffer = self.pop_buffer();
+ let state = self.list_item_stack.pop();
+ if let (BufferKind::ListItem, Some(state)) = (buffer.kind, state) {
+ if state.is_empty {
+ self.push_str("\n");
+ return;
+ }
+
+ self.push_str("");
+ if state.leading_newline {
+ self.push_str("\n");
+ }
+
+ let mut content = buffer.content;
+ if state.trim_trailing_newline && content.ends_with('\n') {
+ content.pop();
+ }
+ self.push_str(&content);
+ self.push_str("\n");
+ }
+ return;
+ }
+
+ if MdInlineEmphasis::cast(node.clone()).is_some() {
+ self.push_str("");
+ return;
}
- AnyContainerBlock::MdBulletListItem(list) => {
- render_bullet_list(list, ctx, out, quote_indent);
+
+ if MdInlineItalic::cast(node.clone()).is_some() {
+ self.push_str("");
+ return;
}
- AnyContainerBlock::MdOrderedListItem(list) => {
- render_ordered_list(list, ctx, out, quote_indent);
+
+ if MdInlineLink::cast(node.clone()).is_some() {
+ self.suppressed_inline_nodes.pop();
+ self.push_str("");
+ return;
+ }
+
+ if MdReferenceLink::cast(node).is_some() {
+ let buffer = self.pop_buffer();
+ if let BufferKind::ReferenceLink(state) = buffer.kind {
+ if let Some(url) = state.url {
+ self.push_str(" tags
- out.push_str(&content);
- out.push('\n');
- } else {
- out.push_str("