Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 39 additions & 41 deletions crates/biome_markdown_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,6 @@ fn line_has_quote_prefix(p: &MarkdownParser, depth: usize) -> bool {
fn classify_quote_break_after_newline(
p: &mut MarkdownParser,
quote_depth: usize,
include_textual_markers: bool,
) -> QuoteBreakKind {
p.lookahead(|p| {
consume_quote_prefix_without_virtual(p, quote_depth);
Expand All @@ -840,9 +839,7 @@ fn classify_quote_break_after_newline(
|| (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
{
QuoteBreakKind::SetextUnderline
} else if at_block_interrupt(p)
|| (include_textual_markers && textual_looks_like_list_marker(p))
{
} else if at_block_interrupt(p) || textual_looks_like_list_marker(p) {
QuoteBreakKind::Other
} else {
QuoteBreakKind::None
Expand All @@ -851,6 +848,24 @@ fn classify_quote_break_after_newline(
})
}

/// Reports whether the token at the parser's current position ends the
/// paragraph being scanned.
///
/// The conditions are tried in this order, stopping at the first match:
///
/// 1. a setext underline token for which `allow_setext_heading` holds
///    (considered only when `has_content` is `true`),
/// 2. a thematic-break token accepted by `is_dash_only_thematic_break`
///    (considered only when `has_content` is `true`),
/// 3. a line for which `line_starts_with_fence` holds,
/// 4. a position where `at_block_interrupt` holds,
/// 5. a position where `textual_looks_like_list_marker` holds.
///
/// This is the single shared predicate for the break checks performed by
/// [`handle_inline_newline`] and [`inline_list_source_len`].
fn at_paragraph_break(p: &mut MarkdownParser, has_content: bool) -> bool {
    // Setext underlines and dash-only thematic breaks are only examined when
    // the caller reports preceding content; `&&` short-circuits so neither
    // helper runs when `has_content` is false.
    let closes_preceding_content = has_content
        && ((p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p))
            || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p)));

    // The remaining checks apply regardless of `has_content`; `||` keeps the
    // left-to-right, first-match-wins evaluation order.
    closes_preceding_content
        || line_starts_with_fence(p)
        || at_block_interrupt(p)
        || textual_looks_like_list_marker(p)
}

enum InlineNewlineAction {
Break,
Continue,
Expand Down Expand Up @@ -897,7 +912,7 @@ fn handle_inline_newline(p: &mut MarkdownParser, has_content: bool) -> InlineNew
// If we're inside a block quote, only consume the quote prefix
// when it doesn't start a new block (e.g., a nested quote).
if quote_depth > 0 && has_quote_prefix(p, quote_depth) {
let break_kind = classify_quote_break_after_newline(p, quote_depth, true);
let break_kind = classify_quote_break_after_newline(p, quote_depth);
match break_kind {
QuoteBreakKind::SetextUnderline => {
// Consume the quote prefix so the setext underline is visible
Expand Down Expand Up @@ -932,14 +947,6 @@ fn handle_inline_newline(p: &mut MarkdownParser, has_content: bool) -> InlineNew
}
}

// Check if we're at a setext heading underline (already past indent)
if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) {
return InlineNewlineAction::Break;
}
if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
return InlineNewlineAction::Break;
}

// If we're inside a list item and the next line meets the required indent,
// check for block interrupts after skipping that indent. This allows
// nested list markers like "\t - baz" to break out of the paragraph.
Expand Down Expand Up @@ -980,26 +987,16 @@ fn handle_inline_newline(p: &mut MarkdownParser, has_content: bool) -> InlineNew
}
}

// Check for block-level constructs that can interrupt paragraphs
if line_starts_with_fence(p) {
return InlineNewlineAction::Break;
}
// Check for block-level constructs that can interrupt paragraphs.
// Textual fence tokens (e.g. "```") may not be caught by line_starts_with_fence
// because the lexer emits them as MD_TEXTUAL_LITERAL in inline context.
if p.at(MD_TEXTUAL_LITERAL) {
let text = p.cur_text();
if text.starts_with("```") || text.starts_with("~~~") {
return InlineNewlineAction::Break;
}
}
if at_block_interrupt(p) {
return InlineNewlineAction::Break;
}

// Also check for list markers that appear as textual content.
// Inside inline content, '-' is lexed as MD_TEXTUAL_LITERAL, not MINUS,
// so at_block_interrupt won't detect them. Per CommonMark §5.1, list
// items can interrupt paragraphs (bullet lists always, ordered lists
// only if they start with 1).
if textual_looks_like_list_marker(p) {
if at_paragraph_break(p, has_content) {
return InlineNewlineAction::Break;
}

Expand Down Expand Up @@ -1224,30 +1221,18 @@ fn inline_list_source_len(p: &mut MarkdownParser) -> usize {

let quote_depth = p.state().block_quote_depth;
if quote_depth > 0 && has_quote_prefix(p, quote_depth) {
let break_kind = classify_quote_break_after_newline(p, quote_depth, false);
let break_kind = classify_quote_break_after_newline(p, quote_depth);
if !matches!(break_kind, QuoteBreakKind::None) {
break;
}
consume_quote_prefix_without_virtual(p, quote_depth);
}

if p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) {
break;
}

if p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) {
break;
}

if quote_depth > 0 && p.at(R_ANGLE) && !has_quote_prefix(p, quote_depth) {
consume_partial_quote_prefix_lookahead(p, quote_depth, &mut len);
}

if line_starts_with_fence(p) {
break;
}

if at_block_interrupt(p) {
if at_paragraph_break(p, true) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Prescan doesn't track has_content; both setext/thematic guards are unconditionally enabled.

at_paragraph_break(p, true) at line 1235 hardcodes has_content = true. The parse path (handle_inline_newline) gates the setext/thematic break on the actual has_content flag accumulated during parse. The re-check added at lines 1273–1278 (post-list-indent stripping) has the same gap — its parse-path mirror at lines 965–969 is guarded by if is_setext && has_content.

In practice inline_list_source_len is never called when the inline list starts with a NEWLINE (paragraphs can't start with one), so the risk is limited to emphasis-context sizing in edge cases. But this is a direct parity gap contrary to the PR's stated goal of aligning decision points.

A has_content local flag (mirroring how parse_inline_item_list tracks it) would close the gap:

💡 Sketch — add `has_content` tracking to the prescan
 fn inline_list_source_len(p: &mut MarkdownParser) -> usize {
     p.lookahead(|p| {
         let mut len = 0usize;
+        let mut has_content = false;

         loop {
             // …
             if p.at(NEWLINE) {
                 // …
-                if at_paragraph_break(p, true) {
+                if at_paragraph_break(p, has_content) {
                     break;
                 }
                 // …
                 // After stripping list indent, re-check setext/thematic markers
-                if p.at(MD_SETEXT_UNDERLINE_LITERAL)
-                    || (p.at(MD_THEMATIC_BREAK_LITERAL)
-                        && is_dash_only_thematic_break_text(p.cur_text()))
-                {
-                    break;
-                }
+                if has_content
+                    && (p.at(MD_SETEXT_UNDERLINE_LITERAL)
+                        || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p)))
+                {
+                    break;
+                }
                 continue;
             }
             // …
             len += p.cur_text().len();
             p.bump(p.cur());
+            // mirror has_content tracking from parse_inline_item_list
+            let inline_end: usize = p.cur_range().start().into();
+            // (simplified: any non-NEWLINE token counts as content for prescan purposes)
+            has_content = true;
         }
         len
     })
 }

Also applies to: 1273-1278

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/biome_markdown_parser/src/syntax/mod.rs` at line 1235, The prescan
currently hardcodes has_content=true in calls like at_paragraph_break(p, true),
causing setext/thematic guards to be unconditionally enabled; add a local
mutable has_content flag (initialized false) in the prescan loop (mirror
parse_inline_item_list) and set it true whenever you encounter a non-newline
content token during the prescan, then replace the hardcoded true in
at_paragraph_break and the later re-check (the post-list-indent strip) with this
has_content variable and ensure the same is_setext && has_content gating used in
the parse path (e.g., handle_inline_newline / inline_list_source_len checks) is
applied so setext/thematic logic only runs when real content was seen.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this relevant, @jfmcdowell?

Copy link
Copy Markdown
Contributor Author

@jfmcdowell jfmcdowell Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this issue was real and is fixed in #9313: the prescan mismatch in inline_list_source_len is now gated with has_content for setext/thematic checks (aligned with parse_inline_item_list). Revalidated with parser tests and CommonMark conformance passing. It slipped past the keeper.

break;
}

Expand Down Expand Up @@ -1278,6 +1263,19 @@ fn inline_list_source_len(p: &mut MarkdownParser) -> usize {
len += text.len();
p.bump(MD_TEXTUAL_LITERAL);
}

// After stripping list indent, re-check setext/thematic markers
// to mirror newline handling in the parse path. Without this,
// prescan would include indent bytes and stop one iteration later.
// We intentionally skip the heavier post-indent block-interrupt
// check here; the following non-NEWLINE pass still catches
// interrupts for emphasis-context length calculation.
if p.at(MD_SETEXT_UNDERLINE_LITERAL)
|| (p.at(MD_THEMATIC_BREAK_LITERAL)
&& is_dash_only_thematic_break_text(p.cur_text()))
{
break;
}
}

continue;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
> paragraph line
> - nested bullet

> paragraph line
> 1. nested ordered

> paragraph line
> still paragraph
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
assertion_line: 131
expression: snapshot
---

## Input

```
> paragraph line
> - nested bullet

> paragraph line
> 1. nested ordered

> paragraph line
> still paragraph

```


## AST

```
MdDocument {
bom_token: missing (optional),
value: MdBlockList [
MdQuote {
marker_token: R_ANGLE@0..1 ">" [] [],
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@1..16 "paragraph line" [Skipped(" ")] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@16..17 "\n" [] [],
},
],
hard_line: missing (optional),
},
MdBulletListItem {
md_bullet_list: MdBulletList [
MdBullet {
bullet: MINUS@17..20 "-" [Skipped(">"), Skipped(" ")] [],
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@20..34 " nested bullet" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@34..35 "\n" [] [],
},
],
hard_line: missing (optional),
},
],
},
],
},
],
},
MdNewline {
value_token: NEWLINE@35..36 "\n" [] [],
},
MdQuote {
marker_token: R_ANGLE@36..37 ">" [] [],
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@37..52 "paragraph line" [Skipped(" ")] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@52..53 "\n" [] [],
},
],
hard_line: missing (optional),
},
MdOrderedListItem {
md_bullet_list: MdBulletList [
MdBullet {
bullet: MD_ORDERED_LIST_MARKER@53..57 "1." [Skipped(">"), Skipped(" ")] [],
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@57..72 " nested ordered" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@72..73 "\n" [] [],
},
],
hard_line: missing (optional),
},
],
},
],
},
],
},
MdNewline {
value_token: NEWLINE@73..74 "\n" [] [],
},
MdQuote {
marker_token: R_ANGLE@74..75 ">" [] [],
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@75..90 "paragraph line" [Skipped(" ")] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@90..91 "\n" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@91..108 "still paragraph" [Skipped(">"), Skipped(" ")] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@108..109 "\n" [] [],
},
],
hard_line: missing (optional),
},
],
},
],
eof_token: EOF@109..109 "" [] [],
}
```

## CST

```
0: MD_DOCUMENT@0..109
0: (empty)
1: MD_BLOCK_LIST@0..109
0: MD_QUOTE@0..35
0: R_ANGLE@0..1 ">" [] []
1: MD_BLOCK_LIST@1..35
0: MD_PARAGRAPH@1..17
0: MD_INLINE_ITEM_LIST@1..17
0: MD_TEXTUAL@1..16
0: MD_TEXTUAL_LITERAL@1..16 "paragraph line" [Skipped(" ")] []
1: MD_TEXTUAL@16..17
0: MD_TEXTUAL_LITERAL@16..17 "\n" [] []
1: (empty)
1: MD_BULLET_LIST_ITEM@17..35
0: MD_BULLET_LIST@17..35
0: MD_BULLET@17..35
0: MINUS@17..20 "-" [Skipped(">"), Skipped(" ")] []
1: MD_BLOCK_LIST@20..35
0: MD_PARAGRAPH@20..35
0: MD_INLINE_ITEM_LIST@20..35
0: MD_TEXTUAL@20..34
0: MD_TEXTUAL_LITERAL@20..34 " nested bullet" [] []
1: MD_TEXTUAL@34..35
0: MD_TEXTUAL_LITERAL@34..35 "\n" [] []
1: (empty)
1: MD_NEWLINE@35..36
0: NEWLINE@35..36 "\n" [] []
2: MD_QUOTE@36..73
0: R_ANGLE@36..37 ">" [] []
1: MD_BLOCK_LIST@37..73
0: MD_PARAGRAPH@37..53
0: MD_INLINE_ITEM_LIST@37..53
0: MD_TEXTUAL@37..52
0: MD_TEXTUAL_LITERAL@37..52 "paragraph line" [Skipped(" ")] []
1: MD_TEXTUAL@52..53
0: MD_TEXTUAL_LITERAL@52..53 "\n" [] []
1: (empty)
1: MD_ORDERED_LIST_ITEM@53..73
0: MD_BULLET_LIST@53..73
0: MD_BULLET@53..73
0: MD_ORDERED_LIST_MARKER@53..57 "1." [Skipped(">"), Skipped(" ")] []
1: MD_BLOCK_LIST@57..73
0: MD_PARAGRAPH@57..73
0: MD_INLINE_ITEM_LIST@57..73
0: MD_TEXTUAL@57..72
0: MD_TEXTUAL_LITERAL@57..72 " nested ordered" [] []
1: MD_TEXTUAL@72..73
0: MD_TEXTUAL_LITERAL@72..73 "\n" [] []
1: (empty)
3: MD_NEWLINE@73..74
0: NEWLINE@73..74 "\n" [] []
4: MD_QUOTE@74..109
0: R_ANGLE@74..75 ">" [] []
1: MD_BLOCK_LIST@75..109
0: MD_PARAGRAPH@75..109
0: MD_INLINE_ITEM_LIST@75..109
0: MD_TEXTUAL@75..90
0: MD_TEXTUAL_LITERAL@75..90 "paragraph line" [Skipped(" ")] []
1: MD_TEXTUAL@90..91
0: MD_TEXTUAL_LITERAL@90..91 "\n" [] []
2: MD_TEXTUAL@91..108
0: MD_TEXTUAL_LITERAL@91..108 "still paragraph" [Skipped(">"), Skipped(" ")] []
3: MD_TEXTUAL@108..109
0: MD_TEXTUAL_LITERAL@108..109 "\n" [] []
1: (empty)
2: EOF@109..109 "" [] []

```