From 95747c55aa01729b86663998aeb4dc2989fb130b Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:02:18 -0400 Subject: [PATCH 1/4] fix(markdown_parser): prefer list item over thematic break for `- ---` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the lexer produces `MD_THEMATIC_BREAK_LITERAL` for a line like `- ---`, the thematic break interpretation won because it was checked before list items in the block dispatcher. Per CommonMark §5.2/§4.1 (and verified against commonmark.js + markdown-it), when stripping a bullet marker + space from the token leaves content that is itself a valid thematic break (3+ matching chars), the list item interpretation should win. E.g.: - `- ---` → list item containing <hr>
(3 chars remain) - `- - -` → thematic break (only 2 chars remain after marker) The fix adds a parser-side guard (`thematic_break_hides_list_item`) that inspects the token text. When triggered, the token is re-lexed via `ThematicBreakParts` context to expose the individual marker tokens, then list item parsing proceeds normally. --- .../biome_markdown_parser/src/syntax/mod.rs | 36 ++++-- .../src/syntax/thematic_break_block.rs | 41 +++++++ .../ok/thematic_break_in_list.md.snap | 109 ++++++++++-------- .../biome_markdown_parser/tests/spec_test.rs | 43 +++++-- 4 files changed, 159 insertions(+), 70 deletions(-) diff --git a/crates/biome_markdown_parser/src/syntax/mod.rs b/crates/biome_markdown_parser/src/syntax/mod.rs index d778917c0d86..14934aa6165e 100644 --- a/crates/biome_markdown_parser/src/syntax/mod.rs +++ b/crates/biome_markdown_parser/src/syntax/mod.rs @@ -58,7 +58,9 @@ use quote::{ at_quote, consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix, line_has_quote_prefix_at_current, parse_quote, }; -use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; +use thematic_break_block::{ + at_thematic_break_block, parse_thematic_break_block, thematic_break_hides_list_item, +}; use crate::MarkdownParser; @@ -289,17 +291,29 @@ pub(crate) fn parse_any_block_with_indent_code_policy( } else if line_starts_with_fence(p) { parse_fenced_code_block_force(p) } else if at_thematic_break_block(p) { - let break_block = try_parse(p, |p| { - let break_block = parse_thematic_break_block(p); - if break_block.is_absent() { - return Err(()); - } - Ok(break_block) - }); - if let Ok(parsed) = break_block { - parsed + // Per CommonMark §5.2 / §4.1: when the thematic break token starts with + // a bullet marker + space and the remaining content is itself a valid + // thematic break (3+ chars), the list item interpretation wins. + // E.g. `- ---` → list item containing
, + // but `- - -` → thematic break (only 2 chars after marker). + let is_hidden_list_item = + p.at(MD_THEMATIC_BREAK_LITERAL) && thematic_break_hides_list_item(p.cur_text()); + if is_hidden_list_item { + p.force_relex_thematic_break_parts(); + parse_bullet_list_item(p) } else { - parse_paragraph(p) + let break_block = try_parse(p, |p| { + let break_block = parse_thematic_break_block(p); + if break_block.is_absent() { + return Err(()); + } + Ok(break_block) + }); + if let Ok(parsed) = break_block { + parsed + } else { + parse_paragraph(p) + } } } else if at_header(p) { // Check for too many hashes BEFORE try_parse (which would lose diagnostics on rewind) diff --git a/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs b/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs index 89d3bce075d7..b2879871418c 100644 --- a/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs +++ b/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs @@ -42,6 +42,47 @@ pub(crate) fn at_thematic_break_block(p: &mut MarkdownParser) -> bool { }) } +/// Check if a `MD_THEMATIC_BREAK_LITERAL` token text should actually be parsed +/// as a bullet list item whose content is a thematic break. +/// +/// Returns `true` when the text can be split as: +/// `bullet_marker` + `space/tab` + `consecutive_thematic_break` +/// +/// The payload must be a CONSECUTIVE run of 3+ matching break characters +/// with no internal spaces. This distinguishes: +/// `- ---` → list item (payload `---` is consecutive) +/// `- - -` → thematic break (payload `- -` has internal spaces) +/// `- - - -` → thematic break (payload `- - -` has internal spaces) +/// +/// Only bullet markers (`-`, `*`, `+`) are checked — ordered list markers +/// cannot collide with thematic break characters. 
+pub(crate) fn thematic_break_hides_list_item(text: &str) -> bool { + let bytes = text.as_bytes(); + // Need at least: marker (1) + space (1) + 3 break chars = 5 bytes + if bytes.len() < 5 { + return false; + } + if !matches!(bytes[0], b'-' | b'*' | b'+') { + return false; + } + if !matches!(bytes[1], b' ' | b'\t') { + return false; + } + + // The payload (after marker + space) must be 3+ consecutive matching + // break characters, optionally followed by trailing whitespace only. + let payload = text[2..].trim_end_matches([' ', '\t']); + let payload_bytes = payload.as_bytes(); + if payload_bytes.len() < THEMATIC_BREAK_MIN_CHARS { + return false; + } + let break_char = payload_bytes[0]; + if !matches!(break_char, b'-' | b'*' | b'_') { + return false; + } + payload_bytes.iter().all(|&b| b == break_char) +} + /// Check if the remaining content forms a thematic break pattern. /// /// Per CommonMark §4.1, a thematic break is 3 or more matching characters diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/thematic_break_in_list.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/thematic_break_in_list.md.snap index b8a714a0ef85..6e74597ccdac 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/thematic_break_in_list.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/thematic_break_in_list.md.snap @@ -70,33 +70,37 @@ MdDocument { MdNewline { value_token: NEWLINE@8..9 "\n" [] [], }, - MdThematicBreakBlock { - parts: MdThematicBreakPartList [ - MdThematicBreakChar { - value: MINUS@9..10 "-" [] [], - }, - MdIndentToken { - md_indent_char_token: MD_INDENT_CHAR@10..11 " " [] [], - }, - MdThematicBreakChar { - value: MINUS@11..12 "-" [] [], - }, - MdThematicBreakChar { - value: MINUS@12..13 "-" [] [], - }, - MdThematicBreakChar { - value: MINUS@13..14 "-" [] [], - }, - ], - }, - MdNewline { - value_token: NEWLINE@14..15 "\n" [] [], - }, - MdNewline { - value_token: NEWLINE@15..16 "\n" [] [], - }, MdBulletListItem { 
md_bullet_list: MdBulletList [ + MdBullet { + prefix: MdListMarkerPrefix { + pre_marker_indent: MdIndentTokenList [], + marker: MINUS@9..10 "-" [] [], + post_marker_space_token: MD_LIST_POST_MARKER_SPACE@10..11 " " [] [], + content_indent: MdIndentTokenList [], + }, + content: MdBlockList [ + MdThematicBreakBlock { + parts: MdThematicBreakPartList [ + MdThematicBreakChar { + value: MINUS@11..12 "-" [] [], + }, + MdThematicBreakChar { + value: MINUS@12..13 "-" [] [], + }, + MdThematicBreakChar { + value: MINUS@13..14 "-" [] [], + }, + ], + }, + MdNewline { + value_token: NEWLINE@14..15 "\n" [] [], + }, + ], + }, + MdNewline { + value_token: NEWLINE@15..16 "\n" [] [], + }, MdBullet { prefix: MdListMarkerPrefix { pre_marker_indent: MdIndentTokenList [], @@ -332,25 +336,28 @@ MdDocument { 0: NEWLINE@7..8 "\n" [] [] 2: MD_NEWLINE@8..9 0: NEWLINE@8..9 "\n" [] [] - 3: MD_THEMATIC_BREAK_BLOCK@9..14 - 0: MD_THEMATIC_BREAK_PART_LIST@9..14 - 0: MD_THEMATIC_BREAK_CHAR@9..10 - 0: MINUS@9..10 "-" [] [] - 1: MD_INDENT_TOKEN@10..11 - 0: MD_INDENT_CHAR@10..11 " " [] [] - 2: MD_THEMATIC_BREAK_CHAR@11..12 - 0: MINUS@11..12 "-" [] [] - 3: MD_THEMATIC_BREAK_CHAR@12..13 - 0: MINUS@12..13 "-" [] [] - 4: MD_THEMATIC_BREAK_CHAR@13..14 - 0: MINUS@13..14 "-" [] [] - 4: MD_NEWLINE@14..15 - 0: NEWLINE@14..15 "\n" [] [] - 5: MD_NEWLINE@15..16 - 0: NEWLINE@15..16 "\n" [] [] - 6: MD_BULLET_LIST_ITEM@16..21 - 0: MD_BULLET_LIST@16..21 - 0: MD_BULLET@16..21 + 3: MD_BULLET_LIST_ITEM@9..21 + 0: MD_BULLET_LIST@9..21 + 0: MD_BULLET@9..15 + 0: MD_LIST_MARKER_PREFIX@9..11 + 0: MD_INDENT_TOKEN_LIST@9..9 + 1: MINUS@9..10 "-" [] [] + 2: MD_LIST_POST_MARKER_SPACE@10..11 " " [] [] + 3: MD_INDENT_TOKEN_LIST@11..11 + 1: MD_BLOCK_LIST@11..15 + 0: MD_THEMATIC_BREAK_BLOCK@11..14 + 0: MD_THEMATIC_BREAK_PART_LIST@11..14 + 0: MD_THEMATIC_BREAK_CHAR@11..12 + 0: MINUS@11..12 "-" [] [] + 1: MD_THEMATIC_BREAK_CHAR@12..13 + 0: MINUS@12..13 "-" [] [] + 2: MD_THEMATIC_BREAK_CHAR@13..14 + 0: MINUS@13..14 "-" [] [] + 1: 
MD_NEWLINE@14..15 + 0: NEWLINE@14..15 "\n" [] [] + 1: MD_NEWLINE@15..16 + 0: NEWLINE@15..16 "\n" [] [] + 2: MD_BULLET@16..21 0: MD_LIST_MARKER_PREFIX@16..18 0: MD_INDENT_TOKEN_LIST@16..16 1: MINUS@16..17 "-" [] [] @@ -365,11 +372,11 @@ MdDocument { 0: UNDERSCORE@19..20 "_" [] [] 2: MD_THEMATIC_BREAK_CHAR@20..21 0: UNDERSCORE@20..21 "_" [] [] - 7: MD_NEWLINE@21..22 + 4: MD_NEWLINE@21..22 0: NEWLINE@21..22 "\n" [] [] - 8: MD_NEWLINE@22..23 + 5: MD_NEWLINE@22..23 0: NEWLINE@22..23 "\n" [] [] - 9: MD_BULLET_LIST_ITEM@23..28 + 6: MD_BULLET_LIST_ITEM@23..28 0: MD_BULLET_LIST@23..28 0: MD_BULLET@23..28 0: MD_LIST_MARKER_PREFIX@23..25 @@ -386,11 +393,11 @@ MdDocument { 0: MINUS@26..27 "-" [] [] 2: MD_THEMATIC_BREAK_CHAR@27..28 0: MINUS@27..28 "-" [] [] - 10: MD_NEWLINE@28..29 + 7: MD_NEWLINE@28..29 0: NEWLINE@28..29 "\n" [] [] - 11: MD_NEWLINE@29..30 + 8: MD_NEWLINE@29..30 0: NEWLINE@29..30 "\n" [] [] - 12: MD_BULLET_LIST_ITEM@30..62 + 9: MD_BULLET_LIST_ITEM@30..62 0: MD_BULLET_LIST@30..62 0: MD_BULLET@30..37 0: MD_LIST_MARKER_PREFIX@30..32 @@ -476,7 +483,7 @@ MdDocument { 0: UNDERSCORE@60..61 "_" [] [] 4: MD_THEMATIC_BREAK_CHAR@61..62 0: UNDERSCORE@61..62 "_" [] [] - 13: MD_NEWLINE@62..63 + 10: MD_NEWLINE@62..63 0: NEWLINE@62..63 "\n" [] [] 2: EOF@63..63 "" [] [] diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs index d4c34ce710bf..508647d04aec 100644 --- a/crates/biome_markdown_parser/tests/spec_test.rs +++ b/crates/biome_markdown_parser/tests/spec_test.rs @@ -418,12 +418,10 @@ pub fn quick_test() { "\n\n", ); // Reduce: thematic break in list then different marker - // NOTE: `- ---` is a pre-existing Biome bug where it parses as a top-level - // thematic break instead of a list item containing
. test_example( 30013, "- ---\n\n+ item\n", - "
\n\n", + "\n\n", ); // Reduce: setext heading in list then different marker test_example( @@ -459,6 +457,38 @@ pub fn quick_test() { "- outer\n - nested\n lazy line\nhello\n", "\n", ); + + // #region Thematic break vs list item precedence + // + // When a bullet marker + space leaves content that is itself a valid + // thematic break (3+ consecutive matching chars), the list item wins. + // When removing the marker leaves spaced or < 3 chars, it stays a break. + + // `- ---` → list item containing
(3 consecutive dashes after `- `) + test_example(30020, "- ---\n", "\n"); + // `* ***` → list item containing
(3 consecutive stars after `* `) + test_example(30021, "* ***\n", "\n"); + // `+ ___` → list item containing
(3 consecutive underscores after `+ `) + test_example(30022, "+ ___\n", "\n"); + // `- ---` with following content and marker change + test_example( + 30023, + "- ---\n\n+ item\n", + "\n\n", + ); + + // These remain thematic breaks — removing the marker leaves spaced or < 3 chars. + // `- - -` → thematic break (payload `- -` has internal spaces) + test_example(30024, "- - -\n", "
\n"); + // `* * *` → thematic break (payload `* *` has internal spaces) + test_example(30025, "* * *\n", "
\n"); + // Plain `---` → thematic break (no list marker prefix) + test_example(30026, "---\n", "
\n"); + // `***` → thematic break + test_example(30027, "***\n", "
\n"); + // `___` → thematic break (underscore is not a bullet marker) + test_example(30028, "___\n", "
\n"); + // #endregion } fn fuzz_test_example(num: u32, input: &str, expected: &str) { @@ -492,16 +522,12 @@ fn fuzz_mixed_markers_paragraph() { ); } -/// NOTE: `- ---` is parsed by Biome as a top-level thematic break rather than -/// a list item containing `
`. This is a separate pre-existing bug -/// (thematic break precedence over list marker) unrelated to the mixed-marker -/// list-split fix. The expected value here matches Biome's current behavior. #[test] fn fuzz_mixed_markers_thematic_break() { fuzz_test_example( 3, "- ---\n\n+ item\n", - "
\n\n", + "\n\n", ); } @@ -549,3 +575,4 @@ fn fuzz_code_after_list_not_absorbed() { "\n
code here\n
\n", ); } + From a5f958e4beacd2aecbdd2604f11ff47f2b2885e8 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Sun, 12 Apr 2026 18:46:09 +0000 Subject: [PATCH 2/4] [autofix.ci] apply automated fixes --- crates/biome_markdown_parser/tests/spec_test.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs index 508647d04aec..db089d8b807a 100644 --- a/crates/biome_markdown_parser/tests/spec_test.rs +++ b/crates/biome_markdown_parser/tests/spec_test.rs @@ -575,4 +575,3 @@ fn fuzz_code_after_list_not_absorbed() { "\n
code here\n
\n", ); } - From bd64d6901c34dd6f87ed695642350adee1dc331c Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Mon, 13 Apr 2026 08:19:10 -0400 Subject: [PATCH 3/4] refactor(markdown_parser): use lookup_byte for thematic break marker classification Route `*`, `-`, and `_` classification through `biome_unicode_table::lookup_byte` via a shared `is_break_marker` helper, following the project convention. Whitespace checks (`' '`/`'\t'`) are kept explicit since `WHS` is semantically broader than what CommonMark requires here. --- .../src/syntax/thematic_break_block.rs | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs b/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs index b2879871418c..42dd74e19f8e 100644 --- a/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs +++ b/crates/biome_markdown_parser/src/syntax/thematic_break_block.rs @@ -21,10 +21,25 @@ use biome_parser::{ Parser, prelude::ParsedSyntax::{self, *}, }; +use biome_unicode_table::Dispatch::{IDT, MIN, MUL}; +use biome_unicode_table::lookup_byte; /// CommonMark requires 3 or more matching characters for thematic breaks. const THEMATIC_BREAK_MIN_CHARS: usize = 3; +/// Whether `byte` is a thematic break marker character (`*`, `-`, or `_`). +/// +/// Uses the `biome_unicode_table` lookup table for `*` (`MUL`) and `-` (`MIN`). +/// `_` shares the `IDT` dispatch variant with ASCII letters, so an explicit +/// byte check is required to disambiguate. 
+fn is_break_marker(byte: u8) -> bool { + match lookup_byte(byte) { + MUL | MIN => true, + IDT => byte == b'_', + _ => false, + } +} + pub(crate) fn at_thematic_break_block(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { if p.at_line_start() || p.at_start_of_input() { @@ -101,22 +116,25 @@ fn is_thematic_break_pattern(p: &mut MarkdownParser) -> bool { // If the entire line segment is a single textual literal, validate it directly. if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() - .chars() - .all(|c| c == ' ' || c == '\t' || c == '*' || c == '-' || c == '_') + .bytes() + .all(|b| b == b' ' || b == b'\t' || is_break_marker(b)) { - let mut break_char = None; + let mut break_byte = None; let mut break_count = 0usize; - for c in p.cur_text().chars() { - if c == ' ' || c == '\t' { + for b in p.cur_text().bytes() { + if b == b' ' || b == b'\t' { continue; } - if let Some(existing) = break_char { - if existing != c { + if !is_break_marker(b) { + return false; + } + if let Some(existing) = break_byte { + if existing != b { return false; } } else { - break_char = Some(c); + break_byte = Some(b); } break_count += 1; } @@ -143,11 +161,11 @@ fn is_thematic_break_pattern(p: &mut MarkdownParser) -> bool { } else if p.at(MD_TEXTUAL_LITERAL) { let text = p.cur_text(); if text.len() == 1 { - match text.chars().next() { - Some('*') => '*', - Some('-') => '-', - Some('_') => '_', - _ => return false, + let b = text.as_bytes()[0]; + if is_break_marker(b) { + b as char + } else { + return false; } } else { return false; @@ -270,9 +288,8 @@ fn parse_thematic_break_parts(p: &mut MarkdownParser) { } if p.at(MD_TEXTUAL_LITERAL) { - let first_char = p.cur_text().as_bytes().first().copied(); - match first_char { - Some(b'*' | b'-' | b'_' | b' ' | b'\t') => { + match p.cur_text().as_bytes().first().copied() { + Some(b) if is_break_marker(b) || b == b' ' || b == b'\t' => { p.force_relex_thematic_break_parts(); relex_active = true; continue; From 81aaa78efd803c10facd9aba49131819434f1bf6 Mon Sep 
17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Mon, 13 Apr 2026 08:42:05 -0400 Subject: [PATCH 4/4] chore: update module_graph snapshot after upstream rebase --- .../tests/snapshots/test_optional_and_readonly_members.snap | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/biome_module_graph/tests/snapshots/test_optional_and_readonly_members.snap b/crates/biome_module_graph/tests/snapshots/test_optional_and_readonly_members.snap index fd8582ae19d4..c7952f45b3f7 100644 --- a/crates/biome_module_graph/tests/snapshots/test_optional_and_readonly_members.snap +++ b/crates/biome_module_graph/tests/snapshots/test_optional_and_readonly_members.snap @@ -34,7 +34,6 @@ Imports { ``` Config => BindingTypeData { Types Module(0) TypeId(2), - Exported Ranges: (17..23) } ```