Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion crates/biome_markdown_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use super::{MarkdownLexer, TextSize};
use crate::lexer::MarkdownLexContext;
use biome_markdown_syntax::MarkdownSyntaxKind::*;
use biome_parser::lexer::Lexer;
use biome_parser::lexer::{BufferedLexer, Lexer};
use quickcheck_macros::quickcheck;
use std::sync::mpsc::channel;
use std::thread;
Expand Down Expand Up @@ -574,3 +574,34 @@ fn block_quote_simple() {
NEWLINE:1,
}
}

#[test]
fn force_relex_at_line_start_produces_thematic_break() {
    // The input is a blockquote whose content is `---`. Once the `> ` prefix
    // has been lexed, after_newline is false, so the dashes normally come out
    // as MINUS. force_relex_at_line_start must re-lex that same position as
    // if it were a line start and yield MD_THEMATIC_BREAK_LITERAL instead.
    let mut buffered = BufferedLexer::new(MarkdownLexer::from_str("> ---\n"));

    // The quote prefix lexes as `>` (R_ANGLE) followed by the single space,
    // which is reported as MD_TEXTUAL_LITERAL.
    for expected_prefix_kind in [R_ANGLE, MD_TEXTUAL_LITERAL] {
        buffered.next_token(MarkdownLexContext::Regular);
        assert_eq!(buffered.current(), expected_prefix_kind);
    }

    // Baseline: lexed the ordinary way, the token after the prefix is MINUS.
    buffered.next_token(MarkdownLexContext::Regular);
    assert_eq!(buffered.current(), MINUS, "without re-lex, should be MINUS");

    // Re-lexing at line start must both return the new kind and leave it as
    // the buffered lexer's current token.
    let relexed_kind = buffered.force_relex_at_line_start(MarkdownLexContext::Regular);
    assert_eq!(
        relexed_kind, MD_THEMATIC_BREAK_LITERAL,
        "after force_relex_at_line_start, `---` should be MD_THEMATIC_BREAK_LITERAL"
    );
    assert_eq!(buffered.current(), MD_THEMATIC_BREAK_LITERAL);
}
9 changes: 9 additions & 0 deletions crates/biome_markdown_parser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,15 @@ impl<'source> MarkdownParser<'source> {
.force_relex_in_context(MarkdownLexContext::Regular);
}

/// Re-lex the current token in Regular context, treating the position as
/// a line start. After consuming a blockquote prefix, the lexer's
/// `after_newline` flag is false, which prevents it from producing
/// line-start-gated tokens like `MD_THEMATIC_BREAK_LITERAL`. This method
/// overrides that flag so the lexer behaves as if at line start.
///
/// This is a thin parser-level wrapper: it forwards to the token source's
/// `force_relex_at_line_start` and discards whatever that call returns, so
/// callers inspect the re-lexed token through the parser afterwards.
pub(crate) fn force_relex_at_line_start(&mut self) {
// Result intentionally ignored; only the effect on the token source matters here.
self.source.force_relex_at_line_start();
}

/// Force re-lex the current token in CodeSpan context.
/// In this context, backslash is literal (not an escape character).
/// Used for autolinks where `\>` should be `\` + `>` as separate tokens.
Expand Down
8 changes: 6 additions & 2 deletions crates/biome_markdown_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,9 @@ fn classify_quote_break_after_newline(
p.lookahead(|p| {
consume_quote_prefix_without_virtual(p, quote_depth);
with_virtual_line_start(p, p.cur_range().start(), |p| {
// Re-lex at line start so the lexer produces block-level tokens
// (e.g. MD_THEMATIC_BREAK_LITERAL for `---`) instead of MINUS.
p.force_relex_at_line_start();
if p.at(MD_SETEXT_UNDERLINE_LITERAL)
|| (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p))
{
Expand Down Expand Up @@ -865,9 +868,10 @@ fn break_for_quote_prefix_after_inline_newline(p: &mut MarkdownParser, quote_dep
if has_quote_prefix(p, quote_depth) {
let break_kind = classify_quote_break_after_newline(p, quote_depth);
if matches!(break_kind, QuoteBreakKind::SetextUnderline) {
// Consume the quote prefix so the setext underline is visible
// to the paragraph parser.
// Consume the quote prefix and re-lex at line start so the
// paragraph parser sees MD_THEMATIC_BREAK_LITERAL for `---`.
consume_quote_prefix(p, quote_depth);
p.force_relex_at_line_start();
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
match break_kind {
QuoteBreakKind::SetextUnderline | QuoteBreakKind::Other => return true,
Expand Down
85 changes: 82 additions & 3 deletions crates/biome_markdown_parser/src/syntax/quote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ pub(crate) fn parse_quote(p: &mut MarkdownParser) -> ParsedSyntax {
p.state_mut().block_quote_depth += 1;

let marker_space = emit_quote_prefix_node(p);
relex_after_quote_prefix_consumed(p);
p.set_virtual_line_start();

parse_quote_block_list(p);
Expand Down Expand Up @@ -125,6 +126,68 @@ fn emit_quote_prefix_node(p: &mut MarkdownParser) -> bool {
marker_space
}

/// After consuming a quote prefix, selectively re-lex the current token as if
/// it were at line start when the remaining line could form a thematic break.
///
/// Re-lexing unconditionally perturbs ordinary quoted text tokenization by
/// splitting leading spaces into separate tokens. We only need line-start
/// semantics here for thematic-break candidates like `> ---`.
Comment on lines +129 to +134
Copy link
Copy Markdown
Member

@ematipico ematipico Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While I understand the good and technical comment, it doesn't actually explain the criteria of what we check for the thematic line break.

I suggest rewording the docstring with a more concrete approach, or adding some inline comments to the weird parts of the code. For example, the usage of the `all()` function looks weird to me, and is probably wrong (I might be mistaken, but that's why I'm asking for a more down-to-earth comment).

///
/// The check looks only at the **current token**, not at the rest of the
/// line. The re-lex is requested when the parser sits on:
///
/// - a `-`, `*`, `UNDERSCORE` or `DOUBLE_UNDERSCORE` token, or
/// - an `MD_TEXTUAL_LITERAL` whose text passes
///   [`is_thematic_break_candidate_text`], i.e. every non-whitespace byte is
///   the **same** thematic break character (`-`, `*`, or `_`). Per CommonMark
///   §4.1, mixing different break characters (e.g. `_*-`) does **not** form a
///   thematic break.
///
/// Any other current token is left exactly as the lexer produced it.
fn force_relex_thematic_break_after_quote_prefix(p: &mut MarkdownParser) {
    // The dedicated marker tokens qualify on their own.
    let at_marker_token =
        p.at(T![-]) || p.at(T![*]) || p.at(UNDERSCORE) || p.at(DOUBLE_UNDERSCORE);

    if !at_marker_token {
        // Otherwise only a textual token made up solely of one repeated break
        // character (whitespace aside) qualifies; everything else bails out
        // so ordinary quoted text keeps its original tokenization.
        let at_candidate_text =
            p.at(MD_TEXTUAL_LITERAL) && is_thematic_break_candidate_text(p.cur_text());
        if !at_candidate_text {
            return;
        }
    }

    p.force_relex_at_line_start();
}

/// Hook invoked right after a quote prefix has been consumed or emitted.
///
/// Gives the token following the prefix a chance to be re-lexed with
/// line-start semantics. It currently only forwards to
/// `force_relex_thematic_break_after_quote_prefix`, which re-lexes solely
/// when the current token looks like a thematic break candidate.
fn relex_after_quote_prefix_consumed(p: &mut MarkdownParser) {
force_relex_thematic_break_after_quote_prefix(p);
}

/// Decide whether `text` could be (part of) a thematic break.
///
/// Whitespace bytes are ignored. Of the remaining bytes there must be at
/// least one, and all of them must be the **same** thematic break character
/// (`-`, `*`, or `_`). Empty or whitespace-only text, text containing any
/// other byte, and text mixing break characters (e.g. `_*-`) all yield
/// `false`.
fn is_thematic_break_candidate_text(text: &str) -> bool {
    use biome_unicode_table::{
        Dispatch::{IDT, MIN, MUL, WHS},
        lookup_byte,
    };

    // Whitespace (as classified by the shared lookup table) never counts,
    // so drop it up front and look only at the significant bytes.
    let mut significant = text.bytes().filter(|&byte| lookup_byte(byte) != WHS);

    // No significant byte at all means there is no break character.
    let Some(first) = significant.next() else {
        return false;
    };

    // MIN = `-`, MUL = `*`. IDT covers `_` but also letters, so it has to be
    // narrowed to the underscore byte explicitly.
    let first_is_break_char = match lookup_byte(first) {
        MIN | MUL => true,
        IDT => first == b'_',
        _ => false,
    };

    // Once the first significant byte is known to be a break character,
    // every later one must be that exact byte: a different break character
    // and an unrelated byte are both rejected by the same comparison.
    first_is_break_char && significant.all(|byte| byte == first)
}

/// Emit one quote prefix token sequence: [indent?] `>` [optional space/tab].
///
/// Returns whether a post-marker separator was consumed.
Expand Down Expand Up @@ -273,6 +336,7 @@ impl QuoteBlockList {
{
if has_quote_prefix(p, self.depth) {
consume_quote_prefix(p, self.depth);
relex_after_quote_prefix_consumed(p);
self.line_started_with_prefix = true;
} else {
return false;
Expand Down Expand Up @@ -532,15 +596,30 @@ fn parse_code_block_newline(p: &mut MarkdownParser, depth: usize) -> bool {
return false;
}

let continues_code_block = p.lookahead(|p| {
consume_quote_prefix(p, depth);

// Blank lines (consecutive newlines) are allowed in indented code.
if p.at(NEWLINE) {
return true;
}

at_quote_indented_code_start(p)
});

if !continues_code_block {
return false;
}

consume_quote_prefix(p, depth);
relex_after_quote_prefix_consumed(p);

// Blank lines (consecutive newlines) are allowed in indented code
// Blank lines (consecutive newlines) are allowed in indented code.
if p.at(NEWLINE) {
return true;
}

// Next line must still be indented to continue the code block
at_quote_indented_code_start(p)
true
}

/// Parse a single textual token in an indented code block.
Expand Down
8 changes: 8 additions & 0 deletions crates/biome_markdown_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ impl<'source> MarkdownTokenSource<'source> {
self.lexer.force_relex_in_context(context)
}

/// Re-lex the current token in Regular context, treating the position as
/// a line start. This makes the lexer produce line-start-gated tokens
/// like `MD_THEMATIC_BREAK_LITERAL`.
///
/// Returns the `MarkdownSyntaxKind` handed back by the underlying lexer's
/// `force_relex_at_line_start` call.
pub fn force_relex_at_line_start(&mut self) -> MarkdownSyntaxKind {
// The lex context is fixed to Regular; callers cannot choose another one here.
self.lexer
.force_relex_at_line_start(MarkdownLexContext::Regular)
}

pub fn set_force_ordered_list_marker(&mut self, value: bool) {
self.lexer.lexer_mut().set_force_ordered_list_marker(value);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ MdDocument {
post_marker_space_token: MD_QUOTE_POST_MARKER_SPACE@36..37 " " [] [],
},
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdSetextHeader {
content: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@37..40 "Foo" [] [],
},
Expand All @@ -111,20 +111,11 @@ MdDocument {
marker_token: R_ANGLE@41..42 ">" [] [],
post_marker_space_token: MD_QUOTE_POST_MARKER_SPACE@42..43 " " [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@43..44 "-" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@44..45 "-" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@45..46 "-" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@46..47 "\n" [] [],
},
],
hard_line: missing (optional),
underline_token: MD_SETEXT_UNDERLINE_LITERAL@43..46 "---" [] [],
},
MdNewline {
value_token: NEWLINE@46..47 "\n" [] [],
},
],
},
Expand Down Expand Up @@ -242,8 +233,8 @@ MdDocument {
1: R_ANGLE@35..36 ">" [] []
2: MD_QUOTE_POST_MARKER_SPACE@36..37 " " [] []
1: MD_BLOCK_LIST@37..47
0: MD_PARAGRAPH@37..47
0: MD_INLINE_ITEM_LIST@37..47
0: MD_SETEXT_HEADER@37..46
0: MD_INLINE_ITEM_LIST@37..43
0: MD_TEXTUAL@37..40
0: MD_TEXTUAL_LITERAL@37..40 "Foo" [] []
1: MD_TEXTUAL@40..41
Expand All @@ -252,15 +243,9 @@ MdDocument {
0: MD_QUOTE_INDENT_LIST@41..41
1: R_ANGLE@41..42 ">" [] []
2: MD_QUOTE_POST_MARKER_SPACE@42..43 " " [] []
3: MD_TEXTUAL@43..44
0: MD_TEXTUAL_LITERAL@43..44 "-" [] []
4: MD_TEXTUAL@44..45
0: MD_TEXTUAL_LITERAL@44..45 "-" [] []
5: MD_TEXTUAL@45..46
0: MD_TEXTUAL_LITERAL@45..46 "-" [] []
6: MD_TEXTUAL@46..47
0: MD_TEXTUAL_LITERAL@46..47 "\n" [] []
1: (empty)
1: MD_SETEXT_UNDERLINE_LITERAL@43..46 "---" [] []
1: MD_NEWLINE@46..47
0: NEWLINE@46..47 "\n" [] []
10: MD_NEWLINE@47..48
0: NEWLINE@47..48 "\n" [] []
11: MD_BULLET_LIST_ITEM@48..66
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
> Foo
> ---

> Bar
> ===

> > Nested
> > ---

> Dashes with spaces
> - - -

> Stars
> ***

> Stars with spaces
> * * *

> Underscores
> ___

> Underscores with spaces
> _ _ _

> Mixed break chars are NOT thematic breaks (CommonMark §4.1),
> so this line is a continuation paragraph, not a heading.
> -*_
Loading
Loading