Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions crates/biome_markdown_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ use quote::{
at_quote, consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix,
line_has_quote_prefix_at_current, parse_quote,
};
use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block};
use thematic_break_block::{
at_thematic_break_block, parse_thematic_break_block, thematic_break_hides_list_item,
};

use crate::MarkdownParser;

Expand Down Expand Up @@ -289,17 +291,29 @@ pub(crate) fn parse_any_block_with_indent_code_policy(
} else if line_starts_with_fence(p) {
parse_fenced_code_block_force(p)
} else if at_thematic_break_block(p) {
let break_block = try_parse(p, |p| {
let break_block = parse_thematic_break_block(p);
if break_block.is_absent() {
return Err(());
}
Ok(break_block)
});
if let Ok(parsed) = break_block {
parsed
// Per CommonMark §5.2 / §4.1: when the thematic break token starts with
// a bullet marker + space/tab and the rest of the token is an unbroken run
// of 3+ matching break characters, the list item interpretation wins.
// E.g. `- ---` → list item containing <hr />,
// but `- - -` and `- - - -` → thematic break (the rest contains spaces).
let is_hidden_list_item =
p.at(MD_THEMATIC_BREAK_LITERAL) && thematic_break_hides_list_item(p.cur_text());
if is_hidden_list_item {
p.force_relex_thematic_break_parts();
parse_bullet_list_item(p)
} else {
parse_paragraph(p)
let break_block = try_parse(p, |p| {
let break_block = parse_thematic_break_block(p);
if break_block.is_absent() {
return Err(());
}
Ok(break_block)
});
if let Ok(parsed) = break_block {
parsed
} else {
parse_paragraph(p)
}
}
} else if at_header(p) {
// Check for too many hashes BEFORE try_parse (which would lose diagnostics on rewind)
Expand Down
90 changes: 74 additions & 16 deletions crates/biome_markdown_parser/src/syntax/thematic_break_block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,25 @@ use biome_parser::{
Parser,
prelude::ParsedSyntax::{self, *},
};
use biome_unicode_table::Dispatch::{IDT, MIN, MUL};
use biome_unicode_table::lookup_byte;

/// CommonMark requires 3 or more matching characters for thematic breaks.
const THEMATIC_BREAK_MIN_CHARS: usize = 3;

/// Reports whether `byte` is one of the thematic break marker characters:
/// `*`, `-`, or `_`.
///
/// `*` and `-` are recognised through their `MUL` and `MIN` dispatch variants
/// in the `biome_unicode_table` lookup table. `_` is dispatched as `IDT`, a
/// variant it shares with ASCII letters, so that case additionally compares
/// the raw byte.
fn is_break_marker(byte: u8) -> bool {
    let dispatch = lookup_byte(byte);
    // `IDT` alone is not enough: only the literal underscore qualifies.
    matches!(dispatch, MUL | MIN) || (matches!(dispatch, IDT) && byte == b'_')
}

pub(crate) fn at_thematic_break_block(p: &mut MarkdownParser) -> bool {
p.lookahead(|p| {
if p.at_line_start() || p.at_start_of_input() {
Expand All @@ -42,6 +57,47 @@ pub(crate) fn at_thematic_break_block(p: &mut MarkdownParser) -> bool {
})
}

/// Check if a `MD_THEMATIC_BREAK_LITERAL` token text should actually be parsed
/// as a bullet list item whose content is a thematic break.
///
/// Returns `true` when the text can be split as:
/// `bullet_marker` + `space/tab` + `consecutive_thematic_break`
///
/// The payload must be a CONSECUTIVE run of 3+ matching break characters
/// with no internal spaces. This distinguishes:
/// `- ---` → list item (payload `---` is consecutive)
/// `- - -` → thematic break (payload `- -` has internal spaces)
/// `- - - -` → thematic break (payload `- - -` has internal spaces)
///
/// Only bullet markers (`-`, `*`, `+`) are checked — ordered list markers
/// cannot collide with thematic break characters.
pub(crate) fn thematic_break_hides_list_item(text: &str) -> bool {
let bytes = text.as_bytes();
// Need at least: marker (1) + space (1) + 3 break chars = 5 bytes
if bytes.len() < 5 {
return false;
}
if !matches!(bytes[0], b'-' | b'*' | b'+') {
return false;
}
if !matches!(bytes[1], b' ' | b'\t') {
return false;
}

// The payload (after marker + space) must be 3+ consecutive matching
// break characters, optionally followed by trailing whitespace only.
let payload = text[2..].trim_end_matches([' ', '\t']);
let payload_bytes = payload.as_bytes();
if payload_bytes.len() < THEMATIC_BREAK_MIN_CHARS {
return false;
}
let break_char = payload_bytes[0];
if !matches!(break_char, b'-' | b'*' | b'_') {
Comment on lines +80 to +95
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remember to use the lookup table for known characters

return false;
}
payload_bytes.iter().all(|&b| b == break_char)
}

/// Check if the remaining content forms a thematic break pattern.
///
/// Per CommonMark §4.1, a thematic break is 3 or more matching characters
Expand All @@ -60,22 +116,25 @@ fn is_thematic_break_pattern(p: &mut MarkdownParser) -> bool {
// If the entire line segment is a single textual literal, validate it directly.
if p.at(MD_TEXTUAL_LITERAL)
&& p.cur_text()
.chars()
.all(|c| c == ' ' || c == '\t' || c == '*' || c == '-' || c == '_')
.bytes()
.all(|b| b == b' ' || b == b'\t' || is_break_marker(b))
{
let mut break_char = None;
let mut break_byte = None;
let mut break_count = 0usize;

for c in p.cur_text().chars() {
if c == ' ' || c == '\t' {
for b in p.cur_text().bytes() {
if b == b' ' || b == b'\t' {
continue;
}
if let Some(existing) = break_char {
if existing != c {
if !is_break_marker(b) {
return false;
}
if let Some(existing) = break_byte {
if existing != b {
return false;
}
} else {
break_char = Some(c);
break_byte = Some(b);
}
break_count += 1;
}
Expand All @@ -102,11 +161,11 @@ fn is_thematic_break_pattern(p: &mut MarkdownParser) -> bool {
} else if p.at(MD_TEXTUAL_LITERAL) {
let text = p.cur_text();
if text.len() == 1 {
match text.chars().next() {
Some('*') => '*',
Some('-') => '-',
Some('_') => '_',
_ => return false,
let b = text.as_bytes()[0];
if is_break_marker(b) {
b as char
} else {
return false;
}
} else {
return false;
Expand Down Expand Up @@ -229,9 +288,8 @@ fn parse_thematic_break_parts(p: &mut MarkdownParser) {
}

if p.at(MD_TEXTUAL_LITERAL) {
let first_char = p.cur_text().as_bytes().first().copied();
match first_char {
Some(b'*' | b'-' | b'_' | b' ' | b'\t') => {
match p.cur_text().as_bytes().first().copied() {
Some(b) if is_break_marker(b) || b == b' ' || b == b'\t' => {
p.force_relex_thematic_break_parts();
relex_active = true;
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,33 +70,37 @@ MdDocument {
MdNewline {
value_token: NEWLINE@8..9 "\n" [] [],
},
MdThematicBreakBlock {
parts: MdThematicBreakPartList [
MdThematicBreakChar {
value: MINUS@9..10 "-" [] [],
},
MdIndentToken {
md_indent_char_token: MD_INDENT_CHAR@10..11 " " [] [],
},
MdThematicBreakChar {
value: MINUS@11..12 "-" [] [],
},
MdThematicBreakChar {
value: MINUS@12..13 "-" [] [],
},
MdThematicBreakChar {
value: MINUS@13..14 "-" [] [],
},
],
},
MdNewline {
value_token: NEWLINE@14..15 "\n" [] [],
},
MdNewline {
value_token: NEWLINE@15..16 "\n" [] [],
},
MdBulletListItem {
md_bullet_list: MdBulletList [
MdBullet {
prefix: MdListMarkerPrefix {
pre_marker_indent: MdIndentTokenList [],
marker: MINUS@9..10 "-" [] [],
post_marker_space_token: MD_LIST_POST_MARKER_SPACE@10..11 " " [] [],
content_indent: MdIndentTokenList [],
},
content: MdBlockList [
MdThematicBreakBlock {
parts: MdThematicBreakPartList [
MdThematicBreakChar {
value: MINUS@11..12 "-" [] [],
},
MdThematicBreakChar {
value: MINUS@12..13 "-" [] [],
},
MdThematicBreakChar {
value: MINUS@13..14 "-" [] [],
},
],
},
MdNewline {
value_token: NEWLINE@14..15 "\n" [] [],
},
],
},
MdNewline {
value_token: NEWLINE@15..16 "\n" [] [],
},
MdBullet {
prefix: MdListMarkerPrefix {
pre_marker_indent: MdIndentTokenList [],
Expand Down Expand Up @@ -332,25 +336,28 @@ MdDocument {
0: NEWLINE@7..8 "\n" [] []
2: MD_NEWLINE@8..9
0: NEWLINE@8..9 "\n" [] []
3: MD_THEMATIC_BREAK_BLOCK@9..14
0: MD_THEMATIC_BREAK_PART_LIST@9..14
0: MD_THEMATIC_BREAK_CHAR@9..10
0: MINUS@9..10 "-" [] []
1: MD_INDENT_TOKEN@10..11
0: MD_INDENT_CHAR@10..11 " " [] []
2: MD_THEMATIC_BREAK_CHAR@11..12
0: MINUS@11..12 "-" [] []
3: MD_THEMATIC_BREAK_CHAR@12..13
0: MINUS@12..13 "-" [] []
4: MD_THEMATIC_BREAK_CHAR@13..14
0: MINUS@13..14 "-" [] []
4: MD_NEWLINE@14..15
0: NEWLINE@14..15 "\n" [] []
5: MD_NEWLINE@15..16
0: NEWLINE@15..16 "\n" [] []
6: MD_BULLET_LIST_ITEM@16..21
0: MD_BULLET_LIST@16..21
0: MD_BULLET@16..21
3: MD_BULLET_LIST_ITEM@9..21
0: MD_BULLET_LIST@9..21
0: MD_BULLET@9..15
0: MD_LIST_MARKER_PREFIX@9..11
0: MD_INDENT_TOKEN_LIST@9..9
1: MINUS@9..10 "-" [] []
2: MD_LIST_POST_MARKER_SPACE@10..11 " " [] []
3: MD_INDENT_TOKEN_LIST@11..11
1: MD_BLOCK_LIST@11..15
0: MD_THEMATIC_BREAK_BLOCK@11..14
0: MD_THEMATIC_BREAK_PART_LIST@11..14
0: MD_THEMATIC_BREAK_CHAR@11..12
0: MINUS@11..12 "-" [] []
1: MD_THEMATIC_BREAK_CHAR@12..13
0: MINUS@12..13 "-" [] []
2: MD_THEMATIC_BREAK_CHAR@13..14
0: MINUS@13..14 "-" [] []
1: MD_NEWLINE@14..15
0: NEWLINE@14..15 "\n" [] []
1: MD_NEWLINE@15..16
0: NEWLINE@15..16 "\n" [] []
2: MD_BULLET@16..21
0: MD_LIST_MARKER_PREFIX@16..18
0: MD_INDENT_TOKEN_LIST@16..16
1: MINUS@16..17 "-" [] []
Expand All @@ -365,11 +372,11 @@ MdDocument {
0: UNDERSCORE@19..20 "_" [] []
2: MD_THEMATIC_BREAK_CHAR@20..21
0: UNDERSCORE@20..21 "_" [] []
7: MD_NEWLINE@21..22
4: MD_NEWLINE@21..22
0: NEWLINE@21..22 "\n" [] []
8: MD_NEWLINE@22..23
5: MD_NEWLINE@22..23
0: NEWLINE@22..23 "\n" [] []
9: MD_BULLET_LIST_ITEM@23..28
6: MD_BULLET_LIST_ITEM@23..28
0: MD_BULLET_LIST@23..28
0: MD_BULLET@23..28
0: MD_LIST_MARKER_PREFIX@23..25
Expand All @@ -386,11 +393,11 @@ MdDocument {
0: MINUS@26..27 "-" [] []
2: MD_THEMATIC_BREAK_CHAR@27..28
0: MINUS@27..28 "-" [] []
10: MD_NEWLINE@28..29
7: MD_NEWLINE@28..29
0: NEWLINE@28..29 "\n" [] []
11: MD_NEWLINE@29..30
8: MD_NEWLINE@29..30
0: NEWLINE@29..30 "\n" [] []
12: MD_BULLET_LIST_ITEM@30..62
9: MD_BULLET_LIST_ITEM@30..62
0: MD_BULLET_LIST@30..62
0: MD_BULLET@30..37
0: MD_LIST_MARKER_PREFIX@30..32
Expand Down Expand Up @@ -476,7 +483,7 @@ MdDocument {
0: UNDERSCORE@60..61 "_" [] []
4: MD_THEMATIC_BREAK_CHAR@61..62
0: UNDERSCORE@61..62 "_" [] []
13: MD_NEWLINE@62..63
10: MD_NEWLINE@62..63
0: NEWLINE@62..63 "\n" [] []
2: EOF@63..63 "" [] []

Expand Down
Loading
Loading