Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 7 additions & 24 deletions crates/biome_markdown_parser/src/syntax/inline/links.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ use crate::syntax::parse_error::{unclosed_image, unclosed_link};
use crate::syntax::reference::normalize_reference_label;
use crate::syntax::{
LinkDestinationKind, MAX_LINK_DESTINATION_PAREN_DEPTH, ParenDepthResult,
ends_with_unescaped_close, try_update_paren_depth, validate_link_destination_text,
ends_with_unescaped_close, get_title_close_char, is_space_or_tab_token, try_update_paren_depth,
validate_link_destination_text,
};

/// Parse link starting with `[` - dispatches to inline link or reference link.
Expand Down Expand Up @@ -594,19 +595,14 @@ fn bump_textual_link_def(p: &mut MarkdownParser) {
item.complete(p, MD_TEXTUAL);
}

/// Report whether the current token is non-empty and made up solely of
/// space (`' '`) and tab (`'\t'`) characters.
fn is_whitespace_token(p: &MarkdownParser) -> bool {
    let token = p.cur_text();
    if token.is_empty() {
        // An empty token never counts as whitespace.
        return false;
    }
    token.chars().all(|ch| matches!(ch, ' ' | '\t'))
}

/// Check, inside `p.lookahead`, whether a link title starts after a run of
/// title-separator tokens.
///
/// Returns `true` only when at least one token accepted by
/// `is_title_separator_token` was bumped and the token that follows is
/// recognized by `get_title_close_char` as a title opener.
// NOTE(review): this span is rendered from a diff without +/- markers, so it
// shows both the old `saw_whitespace` lines and their `saw_separator`
// replacements — confirm against the real file before relying on it.
fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool {
p.lookahead(|p| {
let mut saw_whitespace = false;
let mut saw_separator = false;
// Bump every separator token, remembering that at least one was seen.
while is_title_separator_token(p) {
bump_link_def_separator(p);
saw_whitespace = true;
saw_separator = true;
}
// A title needs a preceding separator AND a recognized opening delimiter.
saw_whitespace && get_title_close_char(p).is_some()
saw_separator && get_title_close_char(p).is_some()
})
}

Expand Down Expand Up @@ -731,7 +727,7 @@ fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationSc
}

while !p.at(EOF) && !p.at(NEWLINE) {
if is_whitespace_token(p) {
if is_space_or_tab_token(p) {
break;
}
let text = p.cur_text();
Expand Down Expand Up @@ -765,7 +761,7 @@ fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationSc
}

/// Report whether the current token can separate a link destination from a
/// title: either a space/tab-only token, or a `NEWLINE` for which
/// `at_blank_line()` is false.
// NOTE(review): this span is rendered from a diff without +/- markers; the
// `is_whitespace_token` line and the `is_space_or_tab_token` line appear to be
// the old and new forms of the same expression — confirm against the real file.
fn is_title_separator_token(p: &MarkdownParser) -> bool {
is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line())
is_space_or_tab_token(p) || (p.at(NEWLINE) && !p.at_blank_line())
}

fn bump_link_def_separator(p: &mut MarkdownParser) {
Expand All @@ -778,19 +774,6 @@ fn bump_link_def_separator(p: &mut MarkdownParser) {
}
}

/// Determine which character would close a link title that starts at the
/// current token.
///
/// A token beginning with `"` or `'` closes with that same quote, an
/// `L_PAREN` token closes with `)`, and anything else yields `None`.
fn get_title_close_char(p: &MarkdownParser) -> Option<char> {
    match p.cur_text().chars().next() {
        // Quoted titles close with the same quote character that opened them.
        Some(quote @ ('"' | '\'')) => Some(quote),
        // Otherwise only a parenthesized title is possible.
        _ => p.at(L_PAREN).then_some(')'),
    }
}

fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) {
let Some(close_char) = close_char else {
return;
Expand Down
69 changes: 19 additions & 50 deletions crates/biome_markdown_parser/src/syntax/link_block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ use crate::lexer::MarkdownLexContext;
use crate::syntax::reference::normalize_reference_label;
use crate::syntax::{
LinkDestinationKind, MAX_BLOCK_PREFIX_INDENT, MAX_LINK_DESTINATION_PAREN_DEPTH,
ParenDepthResult, ends_with_unescaped_close, try_update_paren_depth,
validate_link_destination_text,
ParenDepthResult, ends_with_unescaped_close, get_title_close_char, is_space_or_tab_token,
try_update_paren_depth, validate_link_destination_text,
};

/// Maximum label length per CommonMark spec (999 characters).
Expand Down Expand Up @@ -207,14 +207,9 @@ fn skip_whitespace_tokens(p: &mut MarkdownParser) {
/// Skip whitespace tokens (spaces/tabs) in lookahead and return whether any were skipped.
///
/// Stops at `EOF`, at `NEWLINE`, or at the first token that is not made up
/// solely of spaces/tabs. Each skipped token is consumed with
/// `bump_link_definition`.
// NOTE(review): this span is rendered from a diff without +/- markers, so the
// old loop (manual `cur_text()` check with `if`/`else`/`break`) and the new
// loop (`is_space_or_tab_token` in the `while` condition) are interleaved and
// the braces do not balance — confirm against the real file.
fn skip_whitespace_tokens_tracked(p: &mut MarkdownParser) -> bool {
let mut skipped = false;
while !p.at(EOF) && !p.at(NEWLINE) {
let text = p.cur_text();
// Non-empty token consisting only of spaces/tabs.
if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() {
p.bump_link_definition();
skipped = true;
} else {
break;
}
while !p.at(EOF) && !p.at(NEWLINE) && is_space_or_tab_token(p) {
p.bump_link_definition();
skipped = true;
}
skipped
}
Expand All @@ -239,13 +234,8 @@ enum DestinationResult {
/// Skip destination tokens in lookahead. Returns the destination result.
fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult {
// Skip optional leading whitespace before destination
while !p.at(EOF) && !p.at(NEWLINE) {
let text = p.cur_text();
if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() {
p.bump_link_definition();
} else {
break;
}
while !p.at(EOF) && !p.at(NEWLINE) && is_space_or_tab_token(p) {
p.bump_link_definition();
}

if p.at(L_ANGLE) {
Expand Down Expand Up @@ -295,9 +285,8 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult {
let mut pending_escape = false;

while !p.at(EOF) && !p.at(NEWLINE) {
let text = p.cur_text();
// Stop at whitespace
if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() {
if is_space_or_tab_token(p) {
if has_content {
saw_separator = true;
}
Expand All @@ -312,6 +301,7 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult {
break;
}

let text = p.cur_text();
if !validate_link_destination_text(text, LinkDestinationKind::Raw, &mut pending_escape)
{
return DestinationResult::Invalid;
Expand Down Expand Up @@ -427,7 +417,7 @@ pub(crate) fn parse_link_block(p: &mut MarkdownParser) -> ParsedSyntax {
// Check for title on next line - need to skip trailing whitespace first
// Also validate that the title is complete and has no trailing content
let has_valid_title_after_newline = p.lookahead(|p| {
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
p.bump_link_definition();
}
if p.at(NEWLINE) && !p.at_blank_line() {
Expand Down Expand Up @@ -488,14 +478,14 @@ fn parse_link_destination(p: &mut MarkdownParser) {
let list = p.start();

// Include optional whitespace before destination in the destination node.
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}

// Per CommonMark §4.7, destination can be on the next line
if p.at(NEWLINE) && !p.at_blank_line() {
bump_textual_link_def(p);
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}
}
Expand All @@ -514,7 +504,7 @@ fn parse_link_destination(p: &mut MarkdownParser) {
let mut paren_depth: i32 = 0;

while !p.at(EOF) && !p.at(NEWLINE) {
if is_whitespace_token(p) {
if is_space_or_tab_token(p) {
break; // Bare destination stops at first whitespace
}

Expand Down Expand Up @@ -550,7 +540,7 @@ fn bump_textual_link_def(p: &mut MarkdownParser) {
fn at_link_title(p: &mut MarkdownParser) -> bool {
p.lookahead(|p| {
// Skip whitespace before title
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
p.bump_link_definition();
}
let text = p.cur_text();
Expand Down Expand Up @@ -589,7 +579,7 @@ fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) {
let list = p.start();

// Include trailing whitespace after destination
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}

Expand All @@ -599,7 +589,7 @@ fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) {
}

// Include leading whitespace on title line
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}

Expand All @@ -620,7 +610,7 @@ fn parse_link_title(p: &mut MarkdownParser) {
let list = p.start();

// Include optional filler whitespace before title
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}

Expand All @@ -632,21 +622,6 @@ fn parse_link_title(p: &mut MarkdownParser) {
m.complete(p, MD_LINK_TITLE);
}

/// Look at the current token and report the delimiter that would end a title
/// opened there.
///
/// Yields `Some('"')` or `Some('\'')` when the token text begins with that
/// quote, `Some(')')` when the parser is at `L_PAREN`, and `None` when the
/// token cannot start a title.
fn get_title_close_char(p: &MarkdownParser) -> Option<char> {
    let first_char = p.cur_text().chars().next();
    if let Some(quote @ ('"' | '\'')) = first_char {
        // A quoted title is closed by the quote that opened it.
        return Some(quote);
    }
    // The only remaining title form is the parenthesized one.
    p.at(L_PAREN).then_some(')')
}

/// Parse title content until closing delimiter, including trailing whitespace.
///
/// Inside title quotes, we use Regular context so whitespace doesn't split tokens.
Expand All @@ -668,7 +643,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) {
if is_complete {
// Consume trailing whitespace after title (before newline)
p.re_lex_link_definition();
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}
return;
Expand All @@ -692,7 +667,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) {
bump_textual(p);
// Consume trailing whitespace after title (before newline)
p.re_lex_link_definition();
while is_whitespace_token(p) {
while is_space_or_tab_token(p) {
bump_textual_link_def(p);
}
break;
Expand All @@ -708,12 +683,6 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option<char>) {
}
}

/// Return `true` when the current token has at least one character and every
/// character in it is a space or a tab.
fn is_whitespace_token(p: &MarkdownParser) -> bool {
    let token = p.cur_text();
    let only_spaces_or_tabs = |s: &str| s.chars().all(|ch| matches!(ch, ' ' | '\t'));
    // The emptiness check comes first so an empty token is rejected outright.
    !token.is_empty() && only_spaces_or_tabs(token)
}

/// Consume the current token as an MdTextual node.
///
/// This is a helper to reduce boilerplate for the common pattern:
Expand Down
52 changes: 35 additions & 17 deletions crates/biome_markdown_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,37 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block};

use crate::MarkdownParser;

/// Check if current token consists only of ASCII spaces and/or tabs.
///
/// This intentionally does **not** use `Dispatch::WHS` from the lookup table,
/// which classifies `\n`, `\r`, and other whitespace bytes. CommonMark §4.7
/// and §6.3 define the separator between a link destination and an optional
/// title as spaces/tabs only — newlines are significant structure there, not
/// whitespace. The lexer uses the same narrow rule for link definitions.
pub(crate) fn is_space_or_tab_token(p: &MarkdownParser) -> bool {
let text = p.cur_text();
!text.is_empty() && text.chars().all(|c| c == ' ' || c == '\t')
Comment thread
ematipico marked this conversation as resolved.
}

/// Return the delimiter that closes a CommonMark link title (§4.7, §6.3)
/// beginning at the current token.
///
/// Link titles follow the destination both in link reference definitions
/// (`[label]: url "title"`) and in inline links (`[text](url "title")`), and
/// are wrapped in `"…"`, `'…'`, or `(…)`. The result is the matching closing
/// character, or `None` when the current token does not open a title.
pub(crate) fn get_title_close_char(p: &MarkdownParser) -> Option<char> {
    match p.cur_text().chars().next() {
        // `"…"` and `'…'` titles end with the same quote that started them.
        Some(quote @ ('"' | '\'')) => Some(quote),
        // `(…)` titles are detected by token kind rather than by text.
        _ if p.at(L_PAREN) => Some(')'),
        _ => None,
    }
}

/// Maximum paren nesting allowed in link destinations per CommonMark.
pub(crate) const MAX_LINK_DESTINATION_PAREN_DEPTH: i32 = 32;

Expand Down Expand Up @@ -644,24 +675,11 @@ pub(crate) fn is_dash_only_thematic_break_text(text: &str) -> bool {
!text.is_empty() && text.trim().chars().all(|c| c == '-')
}

/// Token-based check: is the current line a setext underline?
///
/// Call after consuming a NEWLINE token. Skips 0–3 columns of leading whitespace
/// (tabs expand to the next tab stop per CommonMark §2.2), then checks for
/// `MD_SETEXT_UNDERLINE_LITERAL` or a dash-only `MD_THEMATIC_BREAK_LITERAL`.
///
/// Returns `Some(bytes_consumed)` if the line is a setext underline, `None` otherwise.
/// The byte count includes only the whitespace tokens consumed during the indent skip,
/// NOT the underline token itself. Callers that track byte budgets must subtract this.
///
/// This is the shared helper for setext detection in inline contexts.
/// Used by `has_matching_code_span_closer`, `parse_inline_html`, and `parse_inline_item_list`.
/// Returns `Some(indent_bytes)` if the current line is a setext underline.
///
/// Context safety: this function does NOT call `allow_setext_heading` because the token
/// stream itself encodes context. In blockquotes, `R_ANGLE` tokens appear after NEWLINE
/// before content, so the whitespace-only skip naturally rejects those lines. In list
/// items, the indent reflected in the token stream is the raw line indent, and the
/// `columns < 4` check correctly rejects lines with 4+ columns of leading whitespace.
/// Call this after consuming `NEWLINE`. It skips up to 3 columns of leading
/// whitespace, then checks for a setext underline token or a dash-only thematic
/// break token. The returned byte count covers only the skipped whitespace.
pub(crate) fn at_setext_underline_after_newline(p: &mut MarkdownParser) -> Option<usize> {
let mut columns = 0;
let mut bytes_consumed = 0;
Expand Down
Loading