From 903d68adb5f406b3c68e65d3ef755d3d444e85c8 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Mon, 6 Apr 2026 18:01:17 -0400 Subject: [PATCH 1/3] refactor(markdown_parser): deduplicate link helper functions Extract `is_whitespace_token` and `get_title_close_char` to `syntax/mod.rs` as shared `pub(crate)` helpers. Both `link_block.rs` and `inline/links.rs` maintained identical local copies that could silently diverge. --- .../src/syntax/inline/links.rs | 21 +--------- .../src/syntax/link_block.rs | 25 +---------- .../biome_markdown_parser/src/syntax/mod.rs | 42 +++++++++++-------- 3 files changed, 29 insertions(+), 59 deletions(-) diff --git a/crates/biome_markdown_parser/src/syntax/inline/links.rs b/crates/biome_markdown_parser/src/syntax/inline/links.rs index dd4e746fd944..bc0f68019f1d 100644 --- a/crates/biome_markdown_parser/src/syntax/inline/links.rs +++ b/crates/biome_markdown_parser/src/syntax/inline/links.rs @@ -13,7 +13,8 @@ use crate::syntax::parse_error::{unclosed_image, unclosed_link}; use crate::syntax::reference::normalize_reference_label; use crate::syntax::{ LinkDestinationKind, MAX_LINK_DESTINATION_PAREN_DEPTH, ParenDepthResult, - ends_with_unescaped_close, try_update_paren_depth, validate_link_destination_text, + ends_with_unescaped_close, get_title_close_char, is_whitespace_token, try_update_paren_depth, + validate_link_destination_text, }; /// Parse link starting with `[` - dispatches to inline link or reference link. 
@@ -594,11 +595,6 @@ fn bump_textual_link_def(p: &mut MarkdownParser) { item.complete(p, MD_TEXTUAL); } -fn is_whitespace_token(p: &MarkdownParser) -> bool { - let text = p.cur_text(); - !text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t') -} - fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { let mut saw_whitespace = false; @@ -778,19 +774,6 @@ fn bump_link_def_separator(p: &mut MarkdownParser) { } } -fn get_title_close_char(p: &MarkdownParser) -> Option<char> { - let text = p.cur_text(); - if text.starts_with('"') { - Some('"') - } else if text.starts_with('\'') { - Some('\'') - } else if p.at(L_PAREN) { - Some(')') - } else { - None - } -} - fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) { let Some(close_char) = close_char else { return; diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index b8ffe716fa53..09fc2c7ed2f4 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -29,8 +29,8 @@ use crate::lexer::MarkdownLexContext; use crate::syntax::reference::normalize_reference_label; use crate::syntax::{ LinkDestinationKind, MAX_BLOCK_PREFIX_INDENT, MAX_LINK_DESTINATION_PAREN_DEPTH, - ParenDepthResult, ends_with_unescaped_close, try_update_paren_depth, - validate_link_destination_text, + ParenDepthResult, ends_with_unescaped_close, get_title_close_char, is_whitespace_token, + try_update_paren_depth, validate_link_destination_text, }; /// Maximum label length per CommonMark spec (999 characters). @@ -632,21 +632,6 @@ fn parse_link_title(p: &mut MarkdownParser) { m.complete(p, MD_LINK_TITLE); } -/// Get the closing character for a title based on current token. -/// Returns None if not at a title start.
-fn get_title_close_char(p: &MarkdownParser) -> Option<char> { - let text = p.cur_text(); - if text.starts_with('"') { - Some('"') - } else if text.starts_with('\'') { - Some('\'') - } else if p.at(L_PAREN) { - Some(')') - } else { - None - } -} - /// Parse title content until closing delimiter, including trailing whitespace. /// /// Inside title quotes, we use Regular context so whitespace doesn't split tokens. @@ -708,12 +693,6 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) { } } -/// Check if current token is whitespace (space or tab). -fn is_whitespace_token(p: &MarkdownParser) -> bool { - let text = p.cur_text(); - !text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t') -} - /// Consume the current token as an MdTextual node. /// /// This is a helper to reduce boilerplate for the common pattern: diff --git a/crates/biome_markdown_parser/src/syntax/mod.rs b/crates/biome_markdown_parser/src/syntax/mod.rs index 80ed9e4f53a4..9e63f95643c3 100644 --- a/crates/biome_markdown_parser/src/syntax/mod.rs +++ b/crates/biome_markdown_parser/src/syntax/mod.rs @@ -62,6 +62,27 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; use crate::MarkdownParser; +/// Check if current token is whitespace (space or tab). +pub(crate) fn is_whitespace_token(p: &MarkdownParser) -> bool { + let text = p.cur_text(); + !text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t') +} + +/// Get the closing character for a title based on current token. +/// Returns `None` if not at a title start. +pub(crate) fn get_title_close_char(p: &MarkdownParser) -> Option<char> { + let text = p.cur_text(); + if text.starts_with('"') { + Some('"') + } else if text.starts_with('\'') { + Some('\'') + } else if p.at(L_PAREN) { + Some(')') + } else { + None + } +} + /// Maximum paren nesting allowed in link destinations per CommonMark.
pub(crate) const MAX_LINK_DESTINATION_PAREN_DEPTH: i32 = 32; @@ -644,24 +665,11 @@ pub(crate) fn is_dash_only_thematic_break_text(text: &str) -> bool { !text.is_empty() && text.trim().chars().all(|c| c == '-') } -/// Token-based check: is the current line a setext underline? -/// -/// Call after consuming a NEWLINE token. Skips 0–3 columns of leading whitespace -/// (tabs expand to the next tab stop per CommonMark §2.2), then checks for -/// `MD_SETEXT_UNDERLINE_LITERAL` or a dash-only `MD_THEMATIC_BREAK_LITERAL`. -/// -/// Returns `Some(bytes_consumed)` if the line is a setext underline, `None` otherwise. -/// The byte count includes only the whitespace tokens consumed during the indent skip, -/// NOT the underline token itself. Callers that track byte budgets must subtract this. -/// -/// This is the shared helper for setext detection in inline contexts. -/// Used by `has_matching_code_span_closer`, `parse_inline_html`, and `parse_inline_item_list`. +/// Returns `Some(indent_bytes)` if the current line is a setext underline. /// -/// Context safety: this function does NOT call `allow_setext_heading` because the token -/// stream itself encodes context. In blockquotes, `R_ANGLE` tokens appear after NEWLINE -/// before content, so the whitespace-only skip naturally rejects those lines. In list -/// items, the indent reflected in the token stream is the raw line indent, and the -/// `columns < 4` check correctly rejects lines with 4+ columns of leading whitespace. +/// Call this after consuming `NEWLINE`. It skips up to 3 columns of leading +/// whitespace, then checks for a setext underline token or a dash-only thematic +/// break token. The returned byte count covers only the skipped whitespace. 
pub(crate) fn at_setext_underline_after_newline(p: &mut MarkdownParser) -> Option { let mut columns = 0; let mut bytes_consumed = 0; From e7084bb07ddc849402d4526136a95ef05aefa978 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Tue, 7 Apr 2026 16:06:05 -1000 Subject: [PATCH 2/3] docs(markdown_parser): clarify intent of link helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback from @ematipico: - Rename is_whitespace_token to is_space_or_tab_token to make the ASCII-only intent self-evident - Document why it uses space/tab only instead of the Dispatch::WHS lookup table (CommonMark §4.7/§6.3 semantics) - Add CommonMark spec references to get_title_close_char docstring --- .../src/syntax/inline/links.rs | 12 +++++----- .../src/syntax/link_block.rs | 22 +++++++++---------- .../biome_markdown_parser/src/syntax/mod.rs | 18 +++++++++++---- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/crates/biome_markdown_parser/src/syntax/inline/links.rs b/crates/biome_markdown_parser/src/syntax/inline/links.rs index bc0f68019f1d..ae52d226c688 100644 --- a/crates/biome_markdown_parser/src/syntax/inline/links.rs +++ b/crates/biome_markdown_parser/src/syntax/inline/links.rs @@ -13,7 +13,7 @@ use crate::syntax::parse_error::{unclosed_image, unclosed_link}; use crate::syntax::reference::normalize_reference_label; use crate::syntax::{ LinkDestinationKind, MAX_LINK_DESTINATION_PAREN_DEPTH, ParenDepthResult, - ends_with_unescaped_close, get_title_close_char, is_whitespace_token, try_update_paren_depth, + ends_with_unescaped_close, get_title_close_char, is_space_or_tab_token, try_update_paren_depth, validate_link_destination_text, }; @@ -597,12 +597,12 @@ fn bump_textual_link_def(p: &mut MarkdownParser) { fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { - let mut saw_whitespace = false; + let mut saw_separator 
= false; while is_title_separator_token(p) { bump_link_def_separator(p); - saw_whitespace = true; + saw_separator = true; } - saw_whitespace && get_title_close_char(p).is_some() + saw_separator && get_title_close_char(p).is_some() }) } @@ -727,7 +727,7 @@ fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationSc } while !p.at(EOF) && !p.at(NEWLINE) { - if is_whitespace_token(p) { + if is_space_or_tab_token(p) { break; } let text = p.cur_text(); @@ -761,7 +761,7 @@ fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationSc } fn is_title_separator_token(p: &MarkdownParser) -> bool { - is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line()) + is_space_or_tab_token(p) || (p.at(NEWLINE) && !p.at_blank_line()) } fn bump_link_def_separator(p: &mut MarkdownParser) { diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 09fc2c7ed2f4..314fb6099547 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -29,7 +29,7 @@ use crate::lexer::MarkdownLexContext; use crate::syntax::reference::normalize_reference_label; use crate::syntax::{ LinkDestinationKind, MAX_BLOCK_PREFIX_INDENT, MAX_LINK_DESTINATION_PAREN_DEPTH, - ParenDepthResult, ends_with_unescaped_close, get_title_close_char, is_whitespace_token, + ParenDepthResult, ends_with_unescaped_close, get_title_close_char, is_space_or_tab_token, try_update_paren_depth, validate_link_destination_text, }; @@ -427,7 +427,7 @@ pub(crate) fn parse_link_block(p: &mut MarkdownParser) -> ParsedSyntax { // Check for title on next line - need to skip trailing whitespace first // Also validate that the title is complete and has no trailing content let has_valid_title_after_newline = p.lookahead(|p| { - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { p.bump_link_definition(); } if p.at(NEWLINE) && !p.at_blank_line() { @@ -488,14 
+488,14 @@ fn parse_link_destination(p: &mut MarkdownParser) { let list = p.start(); // Include optional whitespace before destination in the destination node. - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { bump_textual_link_def(p); } // Per CommonMark §4.7, destination can be on the next line if p.at(NEWLINE) && !p.at_blank_line() { bump_textual_link_def(p); - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { bump_textual_link_def(p); } } @@ -514,7 +514,7 @@ fn parse_link_destination(p: &mut MarkdownParser) { let mut paren_depth: i32 = 0; while !p.at(EOF) && !p.at(NEWLINE) { - if is_whitespace_token(p) { + if is_space_or_tab_token(p) { break; // Bare destination stops at first whitespace } @@ -550,7 +550,7 @@ fn bump_textual_link_def(p: &mut MarkdownParser) { fn at_link_title(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { // Skip whitespace before title - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { p.bump_link_definition(); } let text = p.cur_text(); @@ -589,7 +589,7 @@ fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) { let list = p.start(); // Include trailing whitespace after destination - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { bump_textual_link_def(p); } @@ -599,7 +599,7 @@ fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) { } // Include leading whitespace on title line - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { bump_textual_link_def(p); } @@ -620,7 +620,7 @@ fn parse_link_title(p: &mut MarkdownParser) { let list = p.start(); // Include optional filler whitespace before title - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { bump_textual_link_def(p); } @@ -653,7 +653,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) { if is_complete { // Consume trailing whitespace after title (before newline) p.re_lex_link_definition(); - while is_whitespace_token(p) { + while is_space_or_tab_token(p)
{ bump_textual_link_def(p); } return; @@ -677,7 +677,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) { bump_textual(p); // Consume trailing whitespace after title (before newline) p.re_lex_link_definition(); - while is_whitespace_token(p) { + while is_space_or_tab_token(p) { bump_textual_link_def(p); } break; diff --git a/crates/biome_markdown_parser/src/syntax/mod.rs b/crates/biome_markdown_parser/src/syntax/mod.rs index 9e63f95643c3..63bc59d0a858 100644 --- a/crates/biome_markdown_parser/src/syntax/mod.rs +++ b/crates/biome_markdown_parser/src/syntax/mod.rs @@ -62,14 +62,24 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; use crate::MarkdownParser; -/// Check if current token is whitespace (space or tab). -pub(crate) fn is_whitespace_token(p: &MarkdownParser) -> bool { +/// Check if current token consists only of ASCII spaces and/or tabs. +/// +/// This intentionally does **not** use `Dispatch::WHS` from the lookup table, +/// which classifies `\n`, `\r`, and other whitespace bytes. CommonMark §4.7 +/// and §6.3 define the separator between a link destination and an optional +/// title as spaces/tabs only — newlines are significant structure there, not +/// whitespace. The lexer uses the same narrow rule for link definitions. +pub(crate) fn is_space_or_tab_token(p: &MarkdownParser) -> bool { let text = p.cur_text(); !text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t') } -/// Get the closing character for a title based on current token. -/// Returns `None` if not at a title start. +/// Get the closing delimiter for a CommonMark link title (§4.7, §6.3). +/// +/// A link title appears after the destination in link reference definitions +/// (`[label]: url "title"`) and inline links (`[text](url "title")`). It may +/// be enclosed in `"…"`, `'…'`, or `(…)`. Returns the expected closing +/// character, or `None` if the current token does not start a title.
pub(crate) fn get_title_close_char(p: &MarkdownParser) -> Option<char> { let text = p.cur_text(); if text.starts_with('"') { From cb4fe38d7aec50023d735b71efe51f141fade6e5 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Tue, 7 Apr 2026 16:45:06 -1000 Subject: [PATCH 3/3] refactor(markdown_parser): use is_space_or_tab_token in link_block lookaheads Replace three inline `text.chars().all(|c| c == ' ' || c == '\t')` checks in skip_whitespace_tokens_tracked and skip_destination_tokens with the shared is_space_or_tab_token helper to prevent drift. --- .../src/syntax/link_block.rs | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 314fb6099547..8f932942dbcb 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -207,14 +207,9 @@ fn skip_whitespace_tokens(p: &mut MarkdownParser) { /// Skip whitespace tokens (spaces/tabs) in lookahead and return whether any were skipped. fn skip_whitespace_tokens_tracked(p: &mut MarkdownParser) -> bool { let mut skipped = false; - while !p.at(EOF) && !p.at(NEWLINE) { - let text = p.cur_text(); - if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { - p.bump_link_definition(); - skipped = true; - } else { - break; - } + while !p.at(EOF) && !p.at(NEWLINE) && is_space_or_tab_token(p) { + p.bump_link_definition(); + skipped = true; } skipped } @@ -239,13 +234,8 @@ enum DestinationResult { /// Skip destination tokens in lookahead. Returns the destination result.
fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult { // Skip optional leading whitespace before destination - while !p.at(EOF) && !p.at(NEWLINE) { - let text = p.cur_text(); - if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { - p.bump_link_definition(); - } else { - break; - } + while !p.at(EOF) && !p.at(NEWLINE) && is_space_or_tab_token(p) { + p.bump_link_definition(); } if p.at(L_ANGLE) { @@ -295,9 +285,8 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult { let mut pending_escape = false; while !p.at(EOF) && !p.at(NEWLINE) { - let text = p.cur_text(); // Stop at whitespace - if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { + if is_space_or_tab_token(p) { if has_content { saw_separator = true; } @@ -312,6 +301,7 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult { break; } + let text = p.cur_text(); if !validate_link_destination_text(text, LinkDestinationKind::Raw, &mut pending_escape) { return DestinationResult::Invalid;